In [1]:
import numpy as np
from collections import defaultdict

First, I will define the settings for this task, i.e. the hyperparameters of the model.

In [2]:
# Hyperparameters of the word2vec model, collected in a single dict.
settings = dict(
    window_size=2,       # context words taken on each side of the target word
    n=10,                # embedding dimension: every word becomes a vector of length 10
    epochs=150,          # number of full passes over the training data
    learning_rate=0.01,  # step size used for every weight update
)

I will use this short sentence as the training corpus for my model.

In [3]:
# Raw training sentence; it is lower-cased and split into words before training.
text: str = "I would really love to get into JetBrains and learn fun ai"

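Now I will create a `Word2Vec` class that turns the text into one-hot encoded (target, context) training pairs, trains the two weight matrices, and lets us look up word vectors and the most similar words.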

In [4]:
class Word2Vec:
    """Minimal skip-gram word2vec model with a full softmax output, written in NumPy.

    Intended call order: ``generate_training_data`` (builds the vocabulary and the
    training pairs), then ``train``, then ``word_vec`` / ``vec_sim`` for lookups.
    """

    def __init__(self, settings):
        """Store the hyperparameters.

        Args:
            settings: dict with the keys 'window_size', 'n', 'epochs' and
                'learning_rate'.
        """
        self.window_size = settings['window_size']
        self.n = settings['n']
        self.epochs = settings['epochs']
        self.learning_rate = settings['learning_rate']

        # vocabulary state, filled in by generate_training_data
        self.v_count = 0
        self.word_list = []
        self.word_index = {}
        self.index_word = {}

        # model state, filled in by train
        self.w1 = None
        self.w2 = None
        self.loss = 0

    def word2onehot(self, word):
        """Return the one-hot vector (length ``v_count``) of ``word``.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        # one-hot vector in the format [0, 0, 1, 0, 0, ...]
        word_vec = np.zeros(self.v_count)
        word_vec[self.word_index[word]] = 1
        return word_vec

    def generate_training_data(self, text):
        """Build the vocabulary and the one-hot (target, context) training pairs.

        Args:
            text: iterable of sentences, each sentence an iterable of words.

        Returns:
            Object array of shape (number_of_tokens, 2). Column 0 holds the one-hot
            vector of the target word, column 1 the list of one-hot vectors of the
            words within ``window_size`` positions of it in the same sentence.
        """
        # materialise the input once: it is walked twice below, so a generator
        # would otherwise be exhausted after the vocabulary pass
        sentences = [list(sentence) for sentence in text]

        # vocabulary of unique words in order of first appearance
        self.word_list = list(dict.fromkeys(word for sentence in sentences for word in sentence))
        self.v_count = len(self.word_list)  # needed for the matrix sizes

        # word -> index and index -> word lookups
        self.word_index = {word: i for i, word in enumerate(self.word_list)}
        self.index_word = {i: word for i, word in enumerate(self.word_list)}

        # slide a window of size 2*window_size + 1 over every sentence
        pairs = []
        for sentence in sentences:
            sentence_len = len(sentence)

            for i, word in enumerate(sentence):
                # one-hot vector of the target word - the network input
                w_target = self.word2onehot(word)

                # clip the window to the sentence and leave out the target itself
                first = max(0, i - self.window_size)
                last = min(sentence_len, i + self.window_size + 1)
                w_context = [self.word2onehot(sentence[j]) for j in range(first, last) if j != i]

                pairs.append((w_target, w_context))

        # fill the object array cell by cell so the (N, 2) layout never depends on
        # how NumPy would infer a shape from the nested vectors
        training_data = np.empty((len(pairs), 2), dtype=object)
        for row, (w_target, w_context) in enumerate(pairs):
            training_data[row, 0] = w_target
            training_data[row, 1] = w_context

        return training_data

    def train(self, training_data):
        """Train both weight matrices with per-sample gradient descent.

        The weights are re-initialised on every call. After each epoch ``self.loss``
        holds the summed negative log likelihood of that epoch; it is printed every
        10 epochs.

        Args:
            training_data: (target, context) pairs as returned by
                ``generate_training_data``.
        """
        # weight matrices initialised uniformly in [-1, 1):
        # W1 is input -> hidden (V x N), W2 is hidden -> output (N x V)
        self.w1 = np.random.uniform(-1, 1, (self.v_count, self.n))
        self.w2 = np.random.uniform(-1, 1, (self.n, self.v_count))

        for epoch in range(self.epochs):
            self.loss = 0

            for w_target, w_context in training_data:
                # a target without context words (one-token sentence) has nothing
                # to predict; its error sum would be a scalar and break the
                # gradient shapes, so it is skipped
                if len(w_context) == 0:
                    continue
                context = np.asarray(w_context)  # (number of context words, V)

                # forward pass
                h = np.dot(w_target, self.w1)
                u = np.dot(h, self.w2)

                # predicted probability of every vocabulary word
                y_pred = self.softmax(u)

                # prediction error summed over all context words
                EI = np.sum(y_pred - context, axis=0)

                # backpropagation: both gradients use the weights from before the update
                dl_dw1 = np.outer(w_target, np.dot(self.w2, EI))
                dl_dw2 = np.outer(h, EI)

                # weight update
                self.w1 = self.w1 - (self.learning_rate * dl_dw1)
                self.w2 = self.w2 - (self.learning_rate * dl_dw2)

                # negative log likelihood of this sample; the log-sum-exp is shifted
                # by max(u) (as in softmax) so np.exp cannot overflow
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
                context_ids = np.argmax(context, axis=1)
                self.loss += -np.sum(u[context_ids]) + len(w_context) * log_norm

            if epoch % 10 == 0:
                print(f'Epoch: {epoch}, Loss: {self.loss}')

    def softmax(self, x):
        """Softmax of ``x``; the maximum is subtracted first for numerical stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def word_vec(self, word):
        """Return the embedding of ``word``, i.e. its row of W1.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        word_index = self.word_index[word]

        # the embedding is the matrix row with the word's index
        return self.w1[word_index]

    def vec_sim(self, word, top_n):
        """Print the ``top_n`` vocabulary words most cosine-similar to ``word``.

        The query word itself is part of the ranking. Nothing is returned.

        Raises:
            KeyError: if ``word`` is not in the vocabulary.
        """
        v_w1 = self.word_vec(word)
        norm_w1 = np.linalg.norm(v_w1)  # constant for the whole loop
        word_sim = {}

        for i in range(self.v_count):
            v_w2 = self.w1[i]

            # cosine similarity; a zero-length vector is scored 0 instead of nan
            theta_den = norm_w1 * np.linalg.norm(v_w2)
            theta = np.dot(v_w1, v_w2) / theta_den if theta_den else 0.0

            word_sim[self.index_word[i]] = theta

        words_sorted = sorted(word_sim.items(), key=lambda item: item[1], reverse=True)

        for other_word, sim in words_sorted[:top_n]:
            print(f"{other_word}: {sim:.4f}")

In [5]:
# Lower-case the sentence and split it into words. The tokens get their own name
# so that `text` keeps the raw string and this cell can be re-run without failing
# on `.split()` of an already tokenized list.
corpus = [[word.lower() for word in text.split()]]

w2v = Word2Vec(settings)

training_data = w2v.generate_training_data(corpus)

In [6]:
# Seed NumPy's global RNG so that the random weight initialisation - and with it
# the loss curve and the embeddings shown below - is the same on every run.
np.random.seed(42)

w2v.train(training_data)

Epoch: 0, Loss: 122.4132950252059
Epoch: 10, Loss: 102.20198847247079
Epoch: 20, Loss: 91.59023907394975
Epoch: 30, Loss: 84.43484377049194
Epoch: 40, Loss: 79.04009289790562
Epoch: 50, Loss: 74.73456907947579
Epoch: 60, Loss: 71.18339078869968
Epoch: 70, Loss: 68.2030296743959
Epoch: 80, Loss: 65.68839197780146
Epoch: 90, Loss: 63.57364471758648
Epoch: 100, Loss: 61.81164888346612
Epoch: 110, Loss: 60.36366529647496
Epoch: 120, Loss: 59.19466913725407
Epoch: 130, Loss: 58.27015167302643
Epoch: 140, Loss: 57.55180922819144


Now I will check the result of the training by looking at the vector learned for one word.

In [7]:
target_word = "jetbrains"

# word_vec raises KeyError for a word outside the vocabulary (it never returns
# None), so the "not found" case has to be handled as an exception.
try:
    vector = w2v.word_vec(target_word)
except KeyError:
    print("Word not found.")
else:
    print(f"Vector for {target_word}:", vector)

Vector for jetbrains: [-0.40147218 -0.12856303 -0.08856362 -0.05165499  0.02183594  1.4424081
 -0.7713191  -1.07088168 -1.50079747  0.06069808]


In [8]:
# Print the 4 words whose embeddings are closest (by cosine similarity) to the target word.
w2v.vec_sim(target_word, top_n=4)

jetbrains: 1.0000
fun: 0.3253
ai: 0.2895
into: 0.2716
