In [1]:
import numpy as np
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Granth
[nltk_data]     Bagadia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
class SkipGram(object):
    def __init__(self, N, window_size):
        self.N = N
        self.X_train = []
        self.y_train = []
        self.window_size = window_size
        self.alpha = 0.001
        self.words = []
        self.word_index = {}

    def initialize(self, V, data):
        self.V = V
        self.W1 = np.random.uniform(-0.8, 0.8, (self.V, self.N))
        self.W2 = np.random.uniform(-0.8, 0.8, (self.N, self.V))
        self.words = data
        for i in range(len(data)):
            self.word_index[data[i]] = i

    def preprocessing(self, corpus):
        stop_words = set(stopwords.words('english'))
        training_data = []
        sentences = corpus.split(".")
        for i in range(len(sentences)):
            sentences[i] = sentences[i].strip()
            sentence = sentences[i].split()
            x = [word.strip(string.punctuation) for word in sentence if word not in stop_words]
            x = [word.lower() for word in x]
            training_data.append(x)
        return training_data

    def prepare_data_for_training(self, corpus):
        sentences = self.preprocessing(corpus)
        data = {}
        for sentence in sentences:
            for word in sentence:
                if word not in data:
                    data[word] = 1
                else:
                    data[word] += 1

        V = len(data)
        data = sorted(list(data.keys()))
        vocab = {data[i]: i for i in range(len(data))}

        for sentence in sentences:
            for i in range(len(sentence)):
                center_word = [0 for _ in range(V)]
                center_word[vocab[sentence[i]]] = 1
                context = [0 for _ in range(V)]
                for j in range(i - self.window_size, i + self.window_size + 1):
                    if i != j and j >= 0 and j < len(sentence):
                        context[vocab[sentence[j]]] += 1
                self.X_train.append(center_word)
                self.y_train.append(context)

        self.initialize(V, data)

    def feed_forward(self, X):
        self.h = np.dot(self.W1.T, X).reshape(self.N, 1)
        self.u = np.dot(self.W2.T, self.h)
        self.y = self.softmax(self.u)
        return self.y

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum(axis=0)

    def backpropagate(self, x, t):
        e = self.y - np.asarray(t).reshape(self.V, 1)
        dLdW2 = np.dot(self.h, e.T)
        X = np.array(x).reshape(self.V, 1)
        dLdW1 = np.dot(X, np.dot(self.W2, e).T)
        self.W2 -= self.alpha * dLdW2
        self.W1 -= self.alpha * dLdW1

    def train(self, epochs):
        for epoch in range(1, epochs + 1):
            self.loss = 0
            for j in range(len(self.X_train)):
                self.feed_forward(self.X_train[j])
                self.backpropagate(self.X_train[j], self.y_train[j])
                C = 0
                for m in range(self.V):
                    if self.y_train[j][m]:
                        self.loss += -1 * self.u[m][0]
                        C += 1
                self.loss += C * np.log(np.sum(np.exp(self.u)))
            if epoch % 1000 == 0:
                print("Epoch", epoch, "loss =", self.loss)
            self.alpha *= 1 / (1 + self.alpha * epoch)

    def predict(self, word, number_of_predictions):
        if word in self.words:
            index = self.word_index[word]
            X = [0 for _ in range(self.V)]
            X[index] = 1
            prediction = self.feed_forward(X)
            output = {}
            for i in range(self.V):
                output[prediction[i][0]] = i
            top_context_words = []
            for k in sorted(output, reverse=True):
                top_context_words.append(self.words[output[k]])
                if len(top_context_words) >= number_of_predictions:
                    break
            return top_context_words
        else:
            print("Word not found in dictionary")

    def compute_similarity(self, vec1, vec2):
        return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

    def rank_words(self, target_vector):
        similarities = {}
        for i in range(self.V):
            word_vector = self.W1[i]
            similarity = self.compute_similarity(target_vector, word_vector)
            similarities[i] = similarity
        ranked_words = sorted(similarities, key=similarities.get, reverse=True)
        return ranked_words

    def compute_mrr_for_window(self, target_word, context_words):
        target_index = self.word_index[target_word]
        target_vector = self.W1[target_index]

        ranked_indices = self.rank_words(target_vector)

        mrr = 0
        for context_word in context_words:
            context_index = self.word_index[context_word]
            rank = ranked_indices.index(context_index) + 1
            mrr += 1 / rank
        return mrr / len(context_words)

    def compute_mrr(self, test_data):
        total_mrr = 0
        for window in test_data:
            target_word, context_words = window[0], window[1:]
            total_mrr += self.compute_mrr_for_window(target_word, context_words)
        average_mrr = total_mrr / len(test_data)
        return average_mrr

    def evaluate_mrr(self, test_corpus):
        test_sentences = self.preprocessing(test_corpus)
        test_data = []
        for sentence in test_sentences:
            for i in range(len(sentence)):
                window = sentence[max(0, i - self.window_size):min(len(sentence), i + self.window_size + 1)]
                test_data.append(window)

        mrr = self.compute_mrr(test_data)
        print("MRR:", mrr)

In [3]:
train_corpus = "The earth revolves around the sun. The moon revolves around the earth."
test_corpus = "The sun revolves around the earth. The earth revolves around the moon."

In [4]:
skipGram = SkipGram(N=50, window_size=2)
skipGram.prepare_data_for_training(train_corpus)
skipGram.train(epochs=10000)

Epoch 1 loss = 58.887011541503476
Epoch 2 loss = 58.64709721840355
Epoch 3 loss = 58.4106670445719
Epoch 4 loss = 58.17789482477999
Epoch 5 loss = 57.948940209437446
Epoch 6 loss = 57.72394810286907
Epoch 7 loss = 57.50304823092566
Epoch 8 loss = 57.28635486861832
Epoch 9 loss = 57.073966725242855
Epoch 10 loss = 56.86596698145871
Epoch 11 loss = 56.66242347010017
Epoch 12 loss = 56.463388990190374
Epoch 13 loss = 56.26890174175414
Epoch 14 loss = 56.07898586760554
Epoch 15 loss = 55.89365208733189
Epoch 16 loss = 55.712898408189616
Epoch 17 loss = 55.536710897544566
Epoch 18 loss = 55.36506450178148
Epoch 19 loss = 55.19792389723173
Epoch 20 loss = 55.03524435955704
Epoch 21 loss = 54.87697263912938
Epoch 22 loss = 54.723047831198315
Epoch 23 loss = 54.573402230983554
Epoch 24 loss = 54.427962165221075
Epoch 25 loss = 54.28664879307899
Epoch 26 loss = 54.14937887071203
Epoch 27 loss = 54.01606547500333
Epoch 28 loss = 53.886618683235106
Epoch 29 loss = 53.760946206511164
Epoch 30 loss

In [5]:
skipGram.evaluate_mrr(test_corpus)

MRR: 0.29374999999999996


In [6]:
print(skipGram.predict("around", 3))
print(skipGram.W1[skipGram.word_index["around"]])

['earth', 'sun', 'revolves']
[ 0.50109605 -0.15694449  0.62142422 -0.72597305 -0.38422686 -0.67202534
  0.01267053 -0.53733874  0.41199657 -0.92059272  0.4848794   0.28024614
  0.36248699 -0.92664583 -0.58658719 -0.72951232 -0.81755205  0.12657269
  0.16030049 -0.40025347 -0.67725591 -0.01423304  0.53823557 -0.11840061
 -0.24944799]
