# Continuous Bag of Words

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import itertools
from pathlib import Path
import sys
import os
from datetime import datetime
from tqdm import tqdm

In [10]:
# Use the first CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
print(DEVICE)

cuda


# Dataloader

In [4]:
# CBOW Dataset
class CBOWDataset(Dataset):
    """Sliding-window CBOW samples over a sequence of token indices.

    Sample ``i`` is centred on position ``i + context_size`` of the sequence:
    the target is the token at that position, and the context is the
    ``context_size`` tokens before it followed by the ``context_size`` tokens
    after it.

    Args:
        token_indices: Sequence of integer token ids. Anything sliceable that
            ``torch.as_tensor`` accepts works (list, tuple, 1-D tensor, ...).
        context_size: Number of context tokens taken on each side of the target.

    Raises:
        ValueError: If ``context_size`` is negative.
    """

    def __init__(self, token_indices, context_size):
        if context_size < 0:
            raise ValueError(f"context_size must be non-negative, got {context_size}")
        self.data = token_indices
        self.context_size = context_size

    def __len__(self):
        # Clamp at zero: a corpus shorter than one full window would otherwise
        # give a negative length, which makes the built-in len() raise.
        return max(0, len(self.data) - 2 * self.context_size)

    def __getitem__(self, idx):
        """Return ``(context, target)`` as long tensors for sample ``idx``.

        ``context`` has ``2 * context_size`` elements; ``target`` is a 0-d tensor.
        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        # Without this check an index past the end silently yields a truncated
        # context, because list slicing never raises.
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for CBOWDataset of length {n_samples}")

        c = self.context_size
        # Materialise the whole window once, then drop the centre token.
        # torch.cat (rather than `+`) keeps this a concatenation for any input type.
        window = torch.as_tensor(self.data[idx : idx + 2 * c + 1], dtype=torch.long)
        context = torch.cat((window[:c], window[c + 1:]))
        target = window[c].clone()  # clone so the result never aliases self.data
        return context, target

# Model

In [5]:
# Define CBOW model
class CBOW(nn.Module):
    """Continuous bag-of-words network.

    Looks up an embedding for every context token, averages them, and projects
    the average onto the vocabulary to produce log-probabilities.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Map context token ids ``(batch_size, 2*context_size)`` to
        log-probabilities ``(batch_size, vocab_size)``."""
        context_vectors = self.embeddings(inputs)   # one embedding per context token
        pooled = context_vectors.mean(dim=1)        # average over the context axis
        logits = self.linear(pooled)                # score every vocabulary entry
        return F.log_softmax(logits, dim=1)

# Load Text8

In [6]:
def load_text8(path, max_vocab=50000):
    """Read a whitespace-tokenised corpus and encode it as vocabulary indices.

    The vocabulary holds at most ``max_vocab`` entries: index 0 is the
    ``'<unk>'`` placeholder and the remaining slots go to the most frequent
    words. If the corpus has more distinct words than that, the least frequent
    ones are mapped to ``'<unk>'`` (index 0).

    Args:
        path: Path of the text file to read.
        max_vocab: Maximum vocabulary size, including the ``'<unk>'`` entry.

    Returns:
        A tuple ``(data, word_to_idx, idx_to_word)`` where ``data`` is the
        corpus as a list of indices, ``word_to_idx`` maps word -> index and
        ``idx_to_word`` is the list of words ordered by index.
    """
    unk = '<unk>'
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        words = f.read().split()

    freq = Counter(words)
    # A literal '<unk>' token in the corpus must not claim a second vocabulary
    # slot; otherwise idx_to_word holds a duplicate and word_to_idx['<unk>']
    # stops being 0.
    freq.pop(unk, None)
    most_common = freq.most_common(max(0, max_vocab - 1))

    idx_to_word = [unk] + [w for w, _ in most_common]
    word_to_idx = {w: i for i, w in enumerate(idx_to_word)}

    # Out-of-vocabulary words fall back to index 0 ('<unk>').
    data = [word_to_idx.get(w, 0) for w in words]
    return data, word_to_idx, idx_to_word

# Encode the corpus once; "text8.txt" is a relative path, so it is resolved
# against the current working directory.
data, w2i, i2w = load_text8("text8.txt", max_vocab=50000)

# Training 

In [17]:
model_dir = 'models'
os.makedirs(model_dir, exist_ok=True)

def train(
    data=data,
    w2i=w2i, 
    i2w=i2w,
    context_size=4,
    embed_dim=128,
    batch_size=256,
    epochs=3,
    lr=0.001,
):
    """Train a CBOW model and save a timestamped checkpoint under ``model_dir``.

    Args:
        data: Corpus encoded as a sequence of vocabulary indices.
        w2i: Mapping word -> index; its length sets the vocabulary size.
        i2w: Sequence of words ordered by index (stored in the checkpoint).
        context_size: Number of context tokens on each side of the target.
        embed_dim: Embedding dimensionality.
        batch_size: Mini-batch size.
        epochs: Number of passes over the dataset.
        lr: Adam learning rate.

    Returns:
        Path (str) of the checkpoint file that was written. The checkpoint
        holds the model state dict, both vocabulary mappings, ``embed_dim``
        and ``context_size``.
    """
    vocab_size = len(w2i)
    dataset = CBOWDataset(data, context_size)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = CBOW(vocab_size, embed_dim).to(DEVICE)
    loss_fn = nn.NLLLoss()  # the model already outputs log-probabilities
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # training loop
    for epoch in range(1, epochs+1):
        total_loss = 0.0
        progress_bar = tqdm(enumerate(dataloader, 1), total=len(dataloader), desc=f"Epoch {epoch}", unit="batch", dynamic_ncols=True)

        for step, (contexts, targets) in progress_bar:
            contexts, targets = contexts.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            log_probs = model(contexts)
            loss = loss_fn(log_probs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # running mean shown live on the progress bar
            progress_bar.set_postfix(loss=total_loss / step)

        # Report the epoch mean from the epoch totals instead of a variable that
        # only exists if the inner loop ran; max(..., 1) avoids dividing by
        # zero when the loader yields no batches.
        avg_epoch = total_loss / max(len(dataloader), 1)

        print(f"✓ Epoch {epoch} complete. Average Loss: {avg_epoch:.4f}\n")
    
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'cbow_text8_{timestamp}.pth'
    filepath = os.path.join(model_dir, filename)

    # save
    torch.save({
        'model_state': model.state_dict(),
        'word_to_idx': w2i,
        'idx_to_word': i2w,
        'embed_dim': embed_dim,
        'context_size': context_size
    }, filepath)
    print(f"Model saved {filepath}")

    # Hand the location back so callers do not have to guess the timestamp.
    return filepath

In [18]:
# Run training with the defaults declared in train()'s signature
# (context_size=4, embed_dim=128, batch_size=256, epochs=3, lr=0.001).
train()

Epoch 1:   0%|          | 87/66427 [02:07<26:59:15,  1.46s/batch, loss=10.3]
Epoch 1: 100%|██████████| 66427/66427 [10:23<00:00, 106.56batch/s, loss=6.35]


✓ Epoch 1 complete. Average Loss: 6.3487



Epoch 2:   0%|          | 0/66427 [00:00<?, ?batch/s]

# Load Model after Training

In [7]:
# `filepath` only exists inside train(), so referencing it here fails on a fresh
# kernel. Locate the newest checkpoint on disk instead: train() writes
# 'models/cbow_text8_<YYYYmmdd_HHMMSS>.pth', and that timestamp format sorts
# lexicographically in chronological order.
checkpoints = sorted(Path('models').glob('cbow_text8_*.pth'))
if not checkpoints:
    raise FileNotFoundError("No 'cbow_text8_*.pth' checkpoint found in 'models'; run train() first.")
ckpt_path = checkpoints[-1]
ckpt = torch.load(ckpt_path, map_location=DEVICE)

In [8]:
# Vocabulary lookups stored alongside the weights
idx2word = ckpt["idx_to_word"]
word2idx = ckpt["word_to_idx"]

# Hyperparameters needed to rebuild the network
vocab_size = len(idx2word)
embed_dim = ckpt["embed_dim"]
window_sz = ckpt["context_size"]

# Rebuild the model on the target device in inference mode, then restore the weights
model = CBOW(vocab_size, embed_dim)
model = model.to(DEVICE)
model = model.eval()
model.load_state_dict(ckpt["model_state"])

# Unit-length rows (V, D), so a dot product between rows is their cosine similarity
embedding_matrix = F.normalize(model.embeddings.weight.detach(), dim=1)

# Find top K similar words

In [None]:
def nearest_neighbours(query_word, k=5):
    """Return up to ``k`` vocabulary words closest to ``query_word``.

    Similarity is the dot product between rows of the global
    ``embedding_matrix`` (rows are L2-normalised, so this is cosine similarity).
    The query word itself is excluded from the result.

    Args:
        query_word: Word to look up. A word missing from ``word2idx`` falls
            back to index 0 (the ``'<unk>'`` entry), so the result then
            describes ``'<unk>'`` rather than the requested word.
        k: Maximum number of neighbours to return. Values <= 0 give ``[]``.

    Returns:
        List of words ordered from most to least similar; shorter than ``k``
        when the vocabulary has fewer than ``k + 1`` entries.
    """
    k = max(k, 0)
    q_idx = word2idx.get(query_word, 0)
    q_vec = embedding_matrix[q_idx]                 # (D,)
    cos = torch.mv(embedding_matrix, q_vec)         # cosine similarity to all vocab
    # Ask for one extra hit because the query matches itself, but never for more
    # entries than exist: topk raises when k is out of range.
    n_fetch = min(k + 1, cos.numel())
    topk = cos.topk(n_fetch).indices.tolist()
    topk = [i for i in topk if i != q_idx][:k]
    return [idx2word[i] for i in topk]

print(nearest_neighbours("queen", k=5))

# Evals
By Tyrone on GitHub

In [9]:
from eval_function import word_analogy_test, semantic_similarity_test, category_clustering_test

In [10]:
def evaluate_embeddings(embedding_matrix, word2idx):
    """Run the three imported evaluation tests and combine their scores.

    Returns:
        A tuple ``(overall, analogy_soft, similarity, dissimilarity, clustering)``
        where ``overall`` is the plain mean of the soft analogy, similarity and
        clustering scores. The hard analogy score is computed but not reported.
    """
    _analogy_hard, analogy_soft = word_analogy_test(embedding_matrix, word2idx)
    similarity, dissimilarity = semantic_similarity_test(embedding_matrix, word2idx)
    clustering = category_clustering_test(embedding_matrix, word2idx)

    overall = (analogy_soft + similarity + clustering) / 3

    return overall, analogy_soft, similarity, dissimilarity, clustering

In [None]:
# Prints the tuple (overall, analogy_soft, similarity, dissimilarity, clustering).
print(evaluate_embeddings(embedding_matrix, word2idx))