# Continuous Bag of Words (CBOW) Model

In [16]:
corpus = [
    "Natural Language Processing is a fascinating field of study that has been evolving rapidly over the past few decades.",
    "Machine learning provides powerful tools for automating tasks and making predictions from data.",
    "Text data is often messy and unstructured, which makes it challenging to analyze and understand without the right tools.",
    "Deep learning models have shown remarkable success in understanding complex patterns in data, especially for tasks related to NLP.",
    "I love building machine learning models and experimenting with different techniques to improve their performance.",
    "Clean and properly preprocessed data is essential for building successful machine learning models that generalize well."
]

In [17]:
import pandas as pd
df = pd.read_csv("imdb_reviews.csv")
corpus = df['review'].tolist()
corpus[:5]

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [18]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")

def clean_text(documents: list[str], batch_size: int = 100):

    pattern = re.compile(r"[^\w\s]")
    batch_size = batch_size

    cleaned_docs = []
    for i in range (0, len(documents), batch_size):
        batch_docs = documents[i:i+batch_size]
        docs_text = [pattern.sub("", doc.lower()) for doc in batch_docs]
        for doc_nlp in nlp.pipe(docs_text):
            filtered_text = [token.text for token in doc_nlp if not token.is_stop and token.text.strip()]
            cleaned_docs.append(filtered_text)

    return cleaned_docs

cleaned_corpus = clean_text(corpus[:5])
cleaned_corpus[:5]


[['reviewers',
  'mentioned',
  'watching',
  '1',
  'oz',
  'episode',
  'll',
  'hooked',
  'right',
  'exactly',
  'happened',
  'mebr',
  'br',
  'thing',
  'struck',
  'oz',
  'brutality',
  'unflinching',
  'scenes',
  'violence',
  'set',
  'right',
  'word',
  'trust',
  'faint',
  'hearted',
  'timid',
  'pulls',
  'punches',
  'regards',
  'drugs',
  'sex',
  'violence',
  'hardcore',
  'classic',
  'use',
  'wordbr',
  'br',
  'called',
  'oz',
  'nickname',
  'given',
  'oswald',
  'maximum',
  'security',
  'state',
  'penitentary',
  'focuses',
  'mainly',
  'emerald',
  'city',
  'experimental',
  'section',
  'prison',
  'cells',
  'glass',
  'fronts',
  'face',
  'inwards',
  'privacy',
  'high',
  'agenda',
  'em',
  'city',
  'home',
  'manyaryans',
  'muslims',
  'gangstas',
  'latinos',
  'christians',
  'italians',
  'irish',
  'moreso',
  'scuffles',
  'death',
  'stares',
  'dodgy',
  'dealings',
  'shady',
  'agreements',
  'far',
  'awaybr',
  'br',
  'main',


In [19]:
from collections import Counter

def build_vocab(corpus: list[str]):
    vocab = Counter(term for doc in corpus for term in doc)
    word_to_idx = {word: idx for idx, (word, _) in enumerate(vocab.items())}
    idx_to_word = {idx: word for idx, (word, _) in enumerate(vocab.items())}
    return word_to_idx, idx_to_word

word_to_idx, idx_to_word = build_vocab(cleaned_corpus[:5])
print(word_to_idx)
print(idx_to_word)
print(len(word_to_idx))

{'reviewers': 0, 'mentioned': 1, 'watching': 2, '1': 3, 'oz': 4, 'episode': 5, 'll': 6, 'hooked': 7, 'right': 8, 'exactly': 9, 'happened': 10, 'mebr': 11, 'br': 12, 'thing': 13, 'struck': 14, 'brutality': 15, 'unflinching': 16, 'scenes': 17, 'violence': 18, 'set': 19, 'word': 20, 'trust': 21, 'faint': 22, 'hearted': 23, 'timid': 24, 'pulls': 25, 'punches': 26, 'regards': 27, 'drugs': 28, 'sex': 29, 'hardcore': 30, 'classic': 31, 'use': 32, 'wordbr': 33, 'called': 34, 'nickname': 35, 'given': 36, 'oswald': 37, 'maximum': 38, 'security': 39, 'state': 40, 'penitentary': 41, 'focuses': 42, 'mainly': 43, 'emerald': 44, 'city': 45, 'experimental': 46, 'section': 47, 'prison': 48, 'cells': 49, 'glass': 50, 'fronts': 51, 'face': 52, 'inwards': 53, 'privacy': 54, 'high': 55, 'agenda': 56, 'em': 57, 'home': 58, 'manyaryans': 59, 'muslims': 60, 'gangstas': 61, 'latinos': 62, 'christians': 63, 'italians': 64, 'irish': 65, 'moreso': 66, 'scuffles': 67, 'death': 68, 'stares': 69, 'dodgy': 70, 'deali

In [20]:
def create_context_target_pairs(corpus: list[str], window_size: int = 2):
    pairs = []
    for document in corpus:
        for idx, term in enumerate(document):
            start_idx = max(idx - window_size, 0)
            end_idx = min(idx + window_size + 1, len(document))
            pairs.append(([document[i] for i in range(start_idx, end_idx)], term))

    return pairs

pairs = create_context_target_pairs(cleaned_corpus)
pairs[:5]   

[(['reviewers', 'mentioned', 'watching'], 'reviewers'),
 (['reviewers', 'mentioned', 'watching', '1'], 'mentioned'),
 (['reviewers', 'mentioned', 'watching', '1', 'oz'], 'watching'),
 (['mentioned', 'watching', '1', 'oz', 'episode'], '1'),
 (['watching', '1', 'oz', 'episode', 'll'], 'oz')]

In [21]:
def encode_pairs(pairs: list[str], word_to_idx: dict):
    encoded_pairs = []
    for context, target in pairs:
        context_idx = [word_to_idx[term] for term in context]
        target_idx = word_to_idx[target]
        encoded_pairs.append((context_idx, target_idx))

    return encoded_pairs

encoded_pairs = encode_pairs(pairs, word_to_idx)
encoded_pairs[:5]

[([0, 1, 2], 0),
 ([0, 1, 2, 3], 1),
 ([0, 1, 2, 3, 4], 2),
 ([1, 2, 3, 4, 5], 3),
 ([2, 3, 4, 5, 6], 4)]

In [22]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

def convert_to_dataset(encoded_pairs: list, batch_size: int):
    context_data, target_data = zip(*encoded_pairs)
    context_tensors = [torch.tensor(context, dtype=torch.long) for context in context_data]
    context_tensor = pad_sequence(context_tensors, batch_first=True)
    target_tensor = torch.tensor(target_data, dtype=torch.long)
    dataset = TensorDataset(context_tensor, target_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

def create_dataset(corpus: list[str], batch_size: int, test_size: float = 0.2) -> tuple[DataLoader, DataLoader, dict]:
    cleaned_corpus = clean_text(corpus)
    word_to_idx, idx_to_word = build_vocab(cleaned_corpus)
    pairs = create_context_target_pairs(cleaned_corpus)
    encoded_pairs = encode_pairs(pairs, word_to_idx)
    train_pairs, test_pairs = train_test_split(encoded_pairs, test_size=test_size, random_state=42)
    return convert_to_dataset(train_pairs, batch_size), convert_to_dataset(test_pairs, batch_size), word_to_idx


`nn.Embedding(vocab_size, embedding_dim)`: This creates a lookup table for the embeddings. It takes two arguments:
- `vocab_size`: The size of the vocabulary (number of unique words in the corpus).
- `embedding_dim`: The dimension of the embeddings.

When you pass a tensor of indices to the `Embedding` layer, it looks up the embeddings for each index and returns a tensor of shape `(batch_size, sequence_length, embedding_dim)`.

in the forward pass, the mean of the embeddings is taken and then passed to the linear layer. this is necessary as mean is a good choice for cbow model because:
- Stability: Averaging smooths out noise and produces more stable representations.
- Invariance to Context Size: Whether you have 2 or 10 context words, you get an embedding of the same dimension and approximate magnitude.
- Computational Efficiency: It's simple and fast to compute.
- Theoretical Interpretation: It can be interpreted as estimating the expected context embedding for a given target word.


In [23]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

class CBOWModel(nn.Module):
    def __init__(self, vocab, embedding_dim):
        super(CBOWModel, self).__init__()
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.embeddings = nn.Embedding(self.vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, self.vocab_size)

    def forward(self, context):
        embedded = self.embeddings(context)
        embedded = F.dropout(embedded, p=0.1)
        embedded = embedded.mean(dim=1)
        out = self.linear(embedded)
        return out
    
    def predict(self, context):
        logits = self(context)
        probs = F.softmax(logits, dim=1)
        pred_idx = torch.argmax(probs, dim=1)
        return pred_idx
    
    def cosine_similarity(self, word1_idx, word2_idx):
        word1_embedding = self.embeddings(torch.tensor([word1_idx]))
        word2_embedding = self.embeddings(torch.tensor([word2_idx]))
        similarity = F.cosine_similarity(word1_embedding, word2_embedding)
        return similarity.item()
    
    def similarity(self, word1, word2):
        word1_idx = self.vocab[word1]
        word2_idx = self.vocab[word2]
        return self.cosine_similarity(word1_idx, word2_idx)

def train_model(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader, epochs: int, learning_rate: float):
    
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    loss_function = nn.CrossEntropyLoss()
    losses = []

    for epoch in tqdm(range(epochs), desc="Training", total=epochs):
        total_loss = 0
        test_loss = 0
        model.train()
        for X_train, y_train in train_loader:
            y_logits = model(X_train) # forward pass
            loss = loss_function(y_logits, y_train) # compute loss

            optimizer.zero_grad() # set gradients to zero
            loss.backward() # backward pass to compute gradients that are used to update the weights
            optimizer.step() # update the weights
            total_loss += loss.item() # add the loss to the total loss
        losses.append(total_loss)

        model.eval()
        with torch.inference_mode():
            for X_test, y_test in test_loader:
                y_logits = model(X_test)
                test_loss_fn = loss_function(y_logits, y_test)
                test_loss += test_loss_fn.item()

        if epoch % 10 == 0:
            print(f"Epoch {epoch} | Train Loss: {total_loss} | Test Loss: {test_loss}")

    return losses

In [26]:
train_loader, test_loader, vocab = create_dataset(corpus[:1000], 32)
print(len(vocab))
embedding_dim = 12
model = CBOWModel(vocab=vocab, embedding_dim=embedding_dim)
losses = train_model(model, train_loader, test_loader, epochs=100, learning_rate=0.01)

# save model
torch.save(model.state_dict(), "cbow_model.pt")
# losses

20663


Training:   1%|          | 1/100 [00:12<21:20, 12.94s/it]

Epoch 0 | Train Loss: 22541.599172592163 | Test Loss: 5439.917895317078


Training:  11%|█         | 11/100 [02:23<19:01, 12.83s/it]

Epoch 10 | Train Loss: 13056.894749641418 | Test Loss: 6550.245553016663


Training:  21%|██        | 21/100 [04:07<13:43, 10.43s/it]

Epoch 20 | Train Loss: 11774.263401031494 | Test Loss: 7210.885009765625


Training:  31%|███       | 31/100 [05:55<11:57, 10.40s/it]

Epoch 30 | Train Loss: 11182.8458943367 | Test Loss: 7676.044419765472


Training:  41%|████      | 41/100 [07:32<09:31,  9.69s/it]

Epoch 40 | Train Loss: 10885.312699317932 | Test Loss: 8016.459345817566


Training:  51%|█████     | 51/100 [09:18<08:41, 10.65s/it]

Epoch 50 | Train Loss: 10700.528440237045 | Test Loss: 8272.693334579468


Training:  61%|██████    | 61/100 [11:02<06:47, 10.46s/it]

Epoch 60 | Train Loss: 10564.257967948914 | Test Loss: 8505.10803604126


Training:  71%|███████   | 71/100 [12:56<05:57, 12.32s/it]

Epoch 70 | Train Loss: 10455.07897400856 | Test Loss: 8650.00625038147


Training:  81%|████████  | 81/100 [15:01<03:43, 11.75s/it]

Epoch 80 | Train Loss: 10384.208398103714 | Test Loss: 8808.534088134766


Training:  91%|█████████ | 91/100 [17:09<01:54, 12.72s/it]

Epoch 90 | Train Loss: 10321.743609905243 | Test Loss: 8911.629905223846


Training: 100%|██████████| 100/100 [19:08<00:00, 11.49s/it]


In [16]:
# load model
import torch
# Load the model state dictionary
model_state_dict = torch.load("cbow_model.pt")

# Load the model directly from the state dictionary
model = model_state_dict

# No need to create a new instance and load state dictionary separately
# since the saved model contains all the necessary information

  model_state_dict = torch.load("cbow_model.pt")


In [27]:
import plotly.express as px
import plotly.graph_objects as go

def loss_curve(losses: list[float]):
    fig = px.line(x=range(len(losses)), y=losses, title='Loss Curve')
    fig.show()

loss_curve(losses)


## Evals

In [29]:
analogies = [
    ("man", "woman", "boy", "girl"),
    ("king", "queen", "prince", "princess"),
    ("father", "mother", "brother", "sister"),
    ("doctor", "hospital", "teacher", "school"),
    ("apple", "fruit", "carrot", "vegetable"),
    ("dog", "bark", "cat", "meow"),
    ("book", "read", "movie", "watch"),
    ("water", "drink", "food", "eat"),
    ("car", "drive", "bicycle", "ride"),
    ("Paris", "France", "Berlin", "Germany"),
    ("walk", "walked", "run", "ran"),
    ("eat", "ate", "see", "saw"),
    ("good", "better", "bad", "worse"),
    ("big", "bigger", "small", "smaller"),
    ("child", "children", "man", "men"),
    ("go", "going", "come", "coming"),
    ("happy", "happier", "sad", "sadder"),
    ("think", "thought", "feel", "felt"),
    ("write", "written", "speak", "spoken"),
    ("play", "played", "work", "worked")
]

import numpy as np

def compute_analogy_accuracy(embeddings, word_to_idx, analogies):
    """
    Compute the word analogy accuracy using cosine similarity.
    
    Parameters:
    - embeddings: numpy array of shape (vocab_size, embedding_dim), the word embeddings from the CBOW model
    - word_to_idx: dict mapping words to their indices in the embeddings matrix
    - analogies: list of tuples (A, B, C, D) representing analogies "A is to B as C is to D"
    
    Returns:
    - accuracy: float, percentage of correct predictions
    """
    # Normalize embeddings to unit length for cosine similarity
    norms = np.linalg.norm(embeddings, axis=1)
    print(f"norms: {norms}")
    embeddings_normalized = embeddings / norms[:, np.newaxis]
    print(f"embeddings_normalized: {embeddings_normalized}")
    correct = 0
    total = 0
    avg_similarity = 0
    for A, B, C, D in analogies:
        print(f"A: {A}, B: {B}, C: {C}, D: {D}")
        # Skip analogy if any word is not in the vocabulary
        if A not in word_to_idx or B not in word_to_idx or C not in word_to_idx or D not in word_to_idx:
            print(f"Skipping analogy: {A} : {B} :: {C} : {D} (word not in vocabulary)")
            continue
        
        # Get indices of words A, B, C, D
        idx_A = word_to_idx[A]
        idx_B = word_to_idx[B]
        idx_C = word_to_idx[C]
        idx_D = word_to_idx[D]
        print(idx_A, idx_B, idx_C, idx_D)
        
        # Compute the analogy vector X = vector(B) - vector(A) + vector(C)
        X = embeddings_normalized[idx_B] - embeddings_normalized[idx_A] + embeddings_normalized[idx_C]
        print(f"X: {X}")
        
        similarity_threshold = 0.90
        # similarity between X and D
        similarity = np.dot(X, embeddings_normalized[idx_D])
        print(f"similarity: {similarity}")
        avg_similarity += similarity
        if similarity >= similarity_threshold:
            correct += 1
        total += 1
        
    
    # Handle case where no valid analogies were processed
    if total == 0:
        return 0.0
    
    # Compute accuracy as percentage
    accuracy = correct / total * 100
    avg_similarity /= total
    return accuracy, avg_similarity


embeddings = model.embeddings.weight.data.numpy()
acc, avg_similarity = compute_analogy_accuracy(embeddings, vocab, analogies)
acc, avg_similarity

norms: [ 1.2366066  8.320855   7.452545  ... 19.6043    17.017044  25.644018 ]
embeddings_normalized: [[ 0.10190571  0.10056304  0.10401462 ...  0.05363297 -0.18942468
  -0.36074483]
 [ 0.20849703 -0.03250107 -0.6173669  ... -0.01910633  0.18679707
  -0.42084056]
 [ 0.3873951  -0.11529763 -0.0641541  ...  0.18682756  0.23886132
   0.01504575]
 ...
 [-0.5978849   0.03820896 -0.48145422 ...  0.20628786 -0.3227963
  -0.367883  ]
 [ 0.21028958  0.07459436 -0.36274526 ...  0.34811535 -0.5412496
  -0.53862935]
 [ 0.14371923  0.27519608 -0.08739192 ...  0.32810763 -0.06639595
   0.22749989]]
A: man, B: woman, C: boy, D: girl
785 619 259 1062
X: [ 0.12723988 -0.37427738 -0.1376323  -0.49823642  0.531686    0.2920439
  0.32796383 -0.74903715 -0.1466363   0.33375967 -0.28666532  0.19823392]
similarity: 0.6867948770523071
A: king, B: queen, C: prince, D: princess
3299 5958 15078 4692
X: [ 0.4116267   0.62182826  0.90117663 -0.08440593  0.46371832  0.37712312
  0.5110897  -0.40262944 -0.9548292   

(7.6923076923076925, 0.24254346237732813)

torch.Size([18516, 10])