In [26]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from typing import List, Tuple, Dict

In [27]:
def build_vocab(sentences: List[str]) -> Dict[str, int]:
    """
    Builds a dictionary mapping words to integer indices.
    """
    vocab = set()
    for sentence in sentences:
        words = sentence.lower().strip().split()
        vocab.update(words)

    return {word: i for i, word in enumerate(sorted(vocab))}

In [28]:
def text_to_indices(sentences: List[str], word_to_int: Dict[str, int]) -> List[List[int]]:
    """
    Converts sentences of text into lists of integer indices.
    """
    corpus_indices = []
    for sentence in sentences:
        words = sentence.lower().strip().split()
        sentence_indices = [word_to_int[word] for word in words if word in word_to_int]
        corpus_indices.append(sentence_indices)
                   
    return corpus_indices

In [29]:
def generate_skipgram_data(corpus_indices: List[List[int]], window_size: int = 2) -> List[Tuple[int, int]]:
    """
    Generates (center_word, context_word) pairs for Skip-gram training.
    """
    training_data = []
    
    for sentence in corpus_indices:
        sentence_length = len(sentence)
        for center_pos, center_word_id in enumerate(sentence):
            for offset in range(-window_size, window_size + 1):
                if offset == 0:
                    continue
                
                context_pos = center_pos + offset
                if 0 <= context_pos < sentence_length:
                    context_word_id = sentence[context_pos]
                    training_data.append((center_word_id, context_word_id))
    
    return training_data

In [30]:
class Word2VecDataset(Dataset):
    def __init__(self, training_data: List[Tuple[int, int]]):
        self.data = training_data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        target, context = self.data[idx]
        return torch.tensor(target, dtype=torch.long), torch.tensor(context, dtype=torch.long)


In [31]:
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size, bias=False)
        
    def forward(self, x):
        embeds = self.embedding(x)  # Shape: (Batch_Size, Embedding_Dim)
        out = self.linear(embeds)   # Shape: (Batch_Size, Vocab_Size)
        return out

In [32]:
EMBEDDING_DIM = 10
WINDOW_SIZE = 2
BATCH_SIZE = 4
EPOCHS = 100
LEARNING_RATE = 0.01

In [33]:
raw_sentences = [
    # Cluster 1: Royalty and Gender (The "King-Queen" attempt)
    "the king is a royal man who rules the kingdom",
    "the queen is a royal woman who rules the kingdom",
    "the king and the queen sit on the throne",
    "the prince is a young man who will be king",
    "the princess is a young woman who will be queen",
    "the man became a king",
    "the woman became a queen",
    "he is a man and she is a woman",
    "the brother is a man and the sister is a woman",
    "kings and queens are royalty",
    
    # Cluster 2: Animals (To separate from royalty)
    "the dog barks at the cat",
    "the cat meows at the dog",
    "the dog chased the cat up the tree",
    "the puppy is a young dog",
    "the kitten is a young cat",
    "dogs and cats are pets",
    "the dog eats meat and the cat drinks milk",
    
    # Cluster 3: Filler to connect grammar (semantics)
    "the man walked the dog",
    "the woman fed the cat",
    "the king has a dog",
    "the queen loves her cat"
] * 10  # Multiply the list by 10 to give the model more "iterations" to see the pairs

word_to_int = build_vocab(raw_sentences)
print(word_to_int)
corpus_indices = text_to_indices(raw_sentences, word_to_int)
print(corpus_indices)
training_data = generate_skipgram_data(corpus_indices, window_size=WINDOW_SIZE)
print(training_data)

{'a': 0, 'and': 1, 'are': 2, 'at': 3, 'barks': 4, 'be': 5, 'became': 6, 'brother': 7, 'cat': 8, 'cats': 9, 'chased': 10, 'dog': 11, 'dogs': 12, 'drinks': 13, 'eats': 14, 'fed': 15, 'has': 16, 'he': 17, 'her': 18, 'is': 19, 'king': 20, 'kingdom': 21, 'kings': 22, 'kitten': 23, 'loves': 24, 'man': 25, 'meat': 26, 'meows': 27, 'milk': 28, 'on': 29, 'pets': 30, 'prince': 31, 'princess': 32, 'puppy': 33, 'queen': 34, 'queens': 35, 'royal': 36, 'royalty': 37, 'rules': 38, 'she': 39, 'sister': 40, 'sit': 41, 'the': 42, 'throne': 43, 'tree': 44, 'up': 45, 'walked': 46, 'who': 47, 'will': 48, 'woman': 49, 'young': 50}
[[42, 20, 19, 0, 36, 25, 47, 38, 42, 21], [42, 34, 19, 0, 36, 49, 47, 38, 42, 21], [42, 20, 1, 42, 34, 41, 29, 42, 43], [42, 31, 19, 0, 50, 25, 47, 48, 5, 20], [42, 32, 19, 0, 50, 49, 47, 48, 5, 34], [42, 25, 6, 0, 20], [42, 49, 6, 0, 34], [17, 19, 0, 25, 1, 39, 19, 0, 49], [42, 7, 19, 0, 25, 1, 42, 40, 19, 0, 49], [22, 1, 35, 2, 37], [42, 11, 4, 3, 42, 8], [42, 8, 27, 3, 42, 11],

In [34]:
dataset = Word2VecDataset(training_data)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [35]:
vocab_size = len(word_to_int)
model = Word2VecModel(vocab_size, EMBEDDING_DIM)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

In [36]:
print("Starting Training...")
for epoch in range(EPOCHS):
    total_loss = 0
    for target_batch, context_batch in dataloader:
        # 1. Forward Pass
        preds = model(target_batch)
        
        # 2. Compute Loss
        loss = loss_fn(preds, context_batch)
        
        # 3. Backward Pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch+1} | Loss: {total_loss/len(dataloader):.4f}")

Starting Training...
Epoch: 10 | Loss: 2.9224
Epoch: 20 | Loss: 2.6060
Epoch: 30 | Loss: 2.4370
Epoch: 40 | Loss: 2.3403
Epoch: 50 | Loss: 2.2799
Epoch: 60 | Loss: 2.2420
Epoch: 70 | Loss: 2.2171
Epoch: 80 | Loss: 2.2017
Epoch: 90 | Loss: 2.1908
Epoch: 100 | Loss: 2.1845


In [39]:
embeddings = model.embedding.weight.detach().numpy()

In [40]:
idx = word_to_int['queen']
vector = embeddings[idx]

print(f"Vector for 'dog':\n{vector}")

Vector for 'dog':
[-0.05130796 -0.5789322   1.40281     0.39650747 -1.4877977  -0.4426972
  2.4739347   3.2943633   0.9444695  -1.4515406 ]


In [41]:
def get_similarity(word1, word2):
    v1 = embeddings[word_to_int[word1]]
    v2 = embeddings[word_to_int[word2]]
    # Cosine Similarity Formula: (A . B) / (||A|| * ||B||)
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

In [42]:
print(get_similarity("queen", "king"))
print(get_similarity("man", "king"))
print(get_similarity("dog", "king"))

0.113034576
0.50585294
0.19964592
