In [1]:
# Implementing word2vec algorithm using only numpy, no framework. 

# First we implement SkipGram. 
# Second we impplement CBOW.


In [2]:
import numpy as np

In [63]:
corpus = """

The similarity scores you are seeing are a perfect reflection of what the model learned from the only data it has ever seen
The fact that the trained results are different from the random ones proves your entire implementation works
To get more human-like results, you would simply run your exact same code on a much larger text file
"""

corpus_tokens = corpus.split(" ")

unique_words = sorted(list(set(corpus_tokens)))
index_to_word = {i: w for i, w in enumerate(unique_words)}
word_to_index = {w: i for i, w in index_to_word.items()}


embedding_dim = 5
learning_rate = 0.01
epochs = 300
w1 = np.random.uniform(-1, 1, (len(unique_words), embedding_dim))
w2 = np.random.uniform(-1, 1, (embedding_dim, len(unique_words)))

In [70]:
def get_similar_words(input_word="results"):
    similarity_score = {}
    input_word_vector = w1[word_to_index[input_word]]
    
    for word, index in word_to_index.items():
        if word == input_word:
            continue 
        word_vector = w1[word_to_index[word]]
    
        # Cosine Similarity = (A · B) / (||A|| × ||B||)
        cos_sim = np.dot(input_word_vector, word_vector) / (np.linalg.norm(input_word_vector) * np.linalg.norm(word_vector))
        similarity_score[word] = cos_sim

    return similarity_score

In [1]:
# Skip Gram

In [36]:
def generate_pairing_skipgram(corpus_tokens, window_size=2):
    pairs = []

    for i, word in enumerate(corpus_tokens):
        full_context = corpus_tokens[
            max(0, i - window_size): min(len(corpus_tokens), i + window_size + 1)
        ]
        
        context_words = full_context[0: full_context.index(word)] + full_context[full_context.index(word)+1:]

        for cw in context_words:
            pairs.append((word, cw))
    
    return pairs

skipgram_training_set = generate_pairing_skipgram(corpus_tokens)
skipgram_training_set

[('the', 'quick'),
 ('the', 'brown'),
 ('quick', 'the'),
 ('quick', 'brown'),
 ('quick', 'fox'),
 ('brown', 'the'),
 ('brown', 'quick'),
 ('brown', 'fox'),
 ('brown', 'jumps'),
 ('fox', 'quick'),
 ('fox', 'brown'),
 ('fox', 'jumps'),
 ('fox', 'over'),
 ('jumps', 'brown'),
 ('jumps', 'fox'),
 ('jumps', 'over'),
 ('jumps', 'the'),
 ('over', 'fox'),
 ('over', 'jumps'),
 ('over', 'the'),
 ('over', 'lazy'),
 ('the', 'jumps'),
 ('the', 'over'),
 ('the', 'lazy'),
 ('the', 'dog'),
 ('lazy', 'over'),
 ('lazy', 'the'),
 ('lazy', 'dog'),
 ('dog', 'the'),
 ('dog', 'lazy')]

In [38]:
for epoch in range(epochs):

        sum_of_loss = 0
        
        for pair in skipgram_training_set:
            center_word, context_word = pair[0], pair[1]
            center_word_vector = w1[word_to_index[center_word]].reshape(1, -1)

            scores = center_word_vector @ w2
            exp_scores = np.exp(scores)
            
            sum_of_scores = np.sum(exp_scores)

            probabilities = exp_scores / sum_of_scores

            context_word_probability = probabilities[0, word_to_index[context_word]]
            
            loss = -np.log(context_word_probability)
            sum_of_loss += loss

            cw_one_hot = np.zeros((1, len(unique_words)))
            cw_one_hot[0, word_to_index[context_word]] = 1

            grad_score = probabilities - cw_one_hot

            grad_w2 = center_word_vector.T @ grad_score
            grad_w1 = grad_score @ w2.T

            w1[word_to_index[center_word]] -= learning_rate * grad_w1[0]
            w2 -= learning_rate * grad_w2

        print(f"Epoch {epoch + 1}, Loss: {sum_of_loss / len(skipgram_training_set)}")

Epoch 1, Loss: 2.358094011680059
Epoch 2, Loss: 2.2361935458920845
Epoch 3, Loss: 2.135740641242449
Epoch 4, Loss: 2.0514200600444505
Epoch 5, Loss: 1.9798814146038315
Epoch 6, Loss: 1.918978752509616
Epoch 7, Loss: 1.8671844403036069
Epoch 8, Loss: 1.8232291593413215
Epoch 9, Loss: 1.7859348173860001
Epoch 10, Loss: 1.7541820381029918
Epoch 11, Loss: 1.726943436830321
Epoch 12, Loss: 1.703325053871199
Epoch 13, Loss: 1.682586711264009
Epoch 14, Loss: 1.6641382712367558
Epoch 15, Loss: 1.6475209354529499
Epoch 16, Loss: 1.6323833524859668
Epoch 17, Loss: 1.6184586303864674
Epoch 18, Loss: 1.6055448719028929
Epoch 19, Loss: 1.593489772297716
Epoch 20, Loss: 1.5821788858404437
Epoch 21, Loss: 1.5715268738945911
Epoch 22, Loss: 1.5614710399752425
Epoch 23, Loss: 1.5519665481407758
Epoch 24, Loss: 1.5429828285401752
Epoch 25, Loss: 1.5345007690051042
Epoch 26, Loss: 1.5265103696892275
Epoch 27, Loss: 1.519008604947831
Epoch 28, Loss: 1.511997303126542
Epoch 29, Loss: 1.50548093095829
Epoch

In [72]:
# Test skipGram
print(corpus)
input_word = "results"
similar_word = get_similar_words(input_word)
dict(sorted(similar_word.items(), key=lambda item: item[1], reverse=True))



The similarity scores you are seeing are a perfect reflection of what the model learned from the only data it has ever seen
The fact that the trained results are different from the random ones proves your entire implementation works
To get more human-like results, you would simply run your exact same code on a much larger text file



{'results,': np.float64(0.9644569032438622),
 'trained': np.float64(0.8997579933464152),
 'seeing': np.float64(0.8879324904918763),
 'scores': np.float64(0.5849170951964071),
 'perfect': np.float64(0.5699731822649808),
 'larger': np.float64(0.5099492154535696),
 'of': np.float64(0.46667694890313205),
 'data': np.float64(0.45398990553805246),
 'get': np.float64(0.4371552114962526),
 'simply': np.float64(0.3821807652666966),
 'different': np.float64(0.3792789373061041),
 'from': np.float64(0.3400427725860029),
 'entire': np.float64(0.30557846150304807),
 'implementation': np.float64(0.29950883729055017),
 'are': np.float64(0.23888494793271384),
 'ones': np.float64(0.23142707348136896),
 'much': np.float64(0.19257491878690283),
 'that': np.float64(0.16905722693256126),
 'file\n': np.float64(0.14599589210170355),
 'proves': np.float64(0.1327829572455837),
 'run': np.float64(0.11772731088513941),
 'random': np.float64(0.11528228870203448),
 'more': np.float64(0.09626360068371216),
 'learned

In [3]:
# CBOW

In [64]:
def generate_training_set_cbow(corpus_tokens, window_size=2):
    pairs = []

    for i, word in enumerate(corpus_tokens):
        full_context = corpus_tokens[
            max(0, i - window_size): min(len(corpus_tokens), i + window_size + 1)
        ]
        
        context_words = full_context[0: full_context.index(word)] + full_context[full_context.index(word)+1:]

        pairs.append((context_words, word))
    
    return pairs
    
cbow_training_set = generate_training_set_cbow(corpus_tokens)
cbow_training_set

[(['similarity', 'scores'], '\n\nThe'),
 (['\n\nThe', 'scores', 'you'], 'similarity'),
 (['\n\nThe', 'similarity', 'you', 'are'], 'scores'),
 (['similarity', 'scores', 'are', 'seeing'], 'you'),
 (['scores', 'you', 'seeing', 'are'], 'are'),
 (['you', 'are', 'are', 'a'], 'seeing'),
 (['seeing', 'are', 'a', 'perfect'], 'are'),
 (['seeing', 'are', 'perfect', 'reflection'], 'a'),
 (['are', 'a', 'reflection', 'of'], 'perfect'),
 (['a', 'perfect', 'of', 'what'], 'reflection'),
 (['perfect', 'reflection', 'what', 'the'], 'of'),
 (['reflection', 'of', 'the', 'model'], 'what'),
 (['of', 'what', 'model', 'learned'], 'the'),
 (['what', 'the', 'learned', 'from'], 'model'),
 (['the', 'model', 'from', 'the'], 'learned'),
 (['model', 'learned', 'the', 'only'], 'from'),
 (['learned', 'from', 'only', 'data'], 'the'),
 (['from', 'the', 'data', 'it'], 'only'),
 (['the', 'only', 'it', 'has'], 'data'),
 (['only', 'data', 'has', 'ever'], 'it'),
 (['data', 'it', 'ever', 'seen\nThe'], 'has'),
 (['it', 'has', '

In [65]:
for epoch in range(epochs):
    sum_of_loss = 0
    
    for context_words, word in cbow_training_set:
        cw_vectors = [w1[word_to_index[cw]] for cw in context_words]
        avg_cw_vectors = np.sum(cw_vectors, axis=0) / len(context_words)
    
        scores = avg_cw_vectors @ w2
        exp_scores = np.exp(scores)
    
        sum_of_scores = np.sum(exp_scores)
    
        probabilities = exp_scores / sum_of_scores
    
        center_word_probability = probabilities[word_to_index[word]]
        
        loss = -np.log(center_word_probability)
        sum_of_loss += loss 
        
        cw_one_hot = np.zeros((1, len(unique_words)))
        cw_one_hot[0, word_to_index[word]] = 1
    
        grad_score = probabilities - cw_one_hot
    
        grad_w2 = avg_cw_vectors.reshape(1, -1).T @ grad_score
        grad_w1 = grad_score @ w2.T
    
        # to update w1, we need to distribute the learnings for all participants of the context words.
        for context_word in context_words:
            w1[word_to_index[context_word]] -= learning_rate * grad_w1[0]
        w2 -= learning_rate * grad_w2
        
    print(f"Epoch {epoch + 1}, Loss: {sum_of_loss / len(cbow_training_set)}")
    

Epoch 1, Loss: 3.9075400778339806
Epoch 2, Loss: 3.8853502203779695
Epoch 3, Loss: 3.863659435235088
Epoch 4, Loss: 3.842425688790338
Epoch 5, Loss: 3.821610055637859
Epoch 6, Loss: 3.8011764380345725
Epoch 7, Loss: 3.781091318024663
Epoch 8, Loss: 3.761323538045292
Epoch 9, Loss: 3.7418441064595225
Epoch 10, Loss: 3.722626024989313
Epoch 11, Loss: 3.7036441354600846
Epoch 12, Loss: 3.684874983635017
Epoch 13, Loss: 3.6662966982248095
Epoch 14, Loss: 3.6478888834172327
Epoch 15, Loss: 3.6296325234884614
Epoch 16, Loss: 3.611509898241708
Epoch 17, Loss: 3.5935045081728165
Epoch 18, Loss: 3.575601008391789
Epoch 19, Loss: 3.5577851504367124
Epoch 20, Loss: 3.5400437312050084
Epoch 21, Loss: 3.5223645482983486
Epoch 22, Loss: 3.5047363611335527
Epoch 23, Loss: 3.487148857213709
Epoch 24, Loss: 3.4695926229825327
Epoch 25, Loss: 3.452059118701592
Epoch 26, Loss: 3.4345406567951455
Epoch 27, Loss: 3.417030383102089
Epoch 28, Loss: 3.3995222604594586
Epoch 29, Loss: 3.382011054018949
Epoch 3

In [73]:
# Testing Similarity
print(corpus)
input_word = "results"
similar_word = get_similar_words(input_word)
dict(sorted(similar_word.items(), key=lambda item: item[1], reverse=True))




The similarity scores you are seeing are a perfect reflection of what the model learned from the only data it has ever seen
The fact that the trained results are different from the random ones proves your entire implementation works
To get more human-like results, you would simply run your exact same code on a much larger text file



{'results,': np.float64(0.9644569032438622),
 'trained': np.float64(0.8997579933464152),
 'seeing': np.float64(0.8879324904918763),
 'scores': np.float64(0.5849170951964071),
 'perfect': np.float64(0.5699731822649808),
 'larger': np.float64(0.5099492154535696),
 'of': np.float64(0.46667694890313205),
 'data': np.float64(0.45398990553805246),
 'get': np.float64(0.4371552114962526),
 'simply': np.float64(0.3821807652666966),
 'different': np.float64(0.3792789373061041),
 'from': np.float64(0.3400427725860029),
 'entire': np.float64(0.30557846150304807),
 'implementation': np.float64(0.29950883729055017),
 'are': np.float64(0.23888494793271384),
 'ones': np.float64(0.23142707348136896),
 'much': np.float64(0.19257491878690283),
 'that': np.float64(0.16905722693256126),
 'file\n': np.float64(0.14599589210170355),
 'proves': np.float64(0.1327829572455837),
 'run': np.float64(0.11772731088513941),
 'random': np.float64(0.11528228870203448),
 'more': np.float64(0.09626360068371216),
 'learned