# Continuous Bag of Words (CBOW) Model

In [1]:
corpus = [
    "Natural Language Processing is a fascinating field of study that has been evolving rapidly over the past few decades.",
    "Machine learning provides powerful tools for automating tasks and making predictions from data.",
    "Text data is often messy and unstructured, which makes it challenging to analyze and understand without the right tools.",
    "Deep learning models have shown remarkable success in understanding complex patterns in data, especially for tasks related to NLP.",
    "I love building machine learning models and experimenting with different techniques to improve their performance.",
    "Clean and properly preprocessed data is essential for building successful machine learning models that generalize well."
]

In [2]:
import pandas as pd
df = pd.read_csv("brown.csv")
corpus = df['tokenized_text'].tolist()
corpus[:5]

['Furthermore , as an encouragement to revisionist thinking , it manifestly is fair to admit that any fraternity has a constitutional right to refuse to accept persons it dislikes .',
 'The Unitarian clergy were an exclusive club of cultivated gentlemen -- as the term was then understood in the Back Bay -- and Parker was definitely not a gentleman , either in theology or in manners .',
 'Ezra Stiles Gannett , an honorable representative of the sanhedrin , addressed himself frankly to the issue in 1845 , insisting that Parker should not be persecuted or calumniated and that in this republic no power to restrain him by force could exist .',
 "Even so , Gannett judiciously argued , the Association could legitimately decide that Parker `` should not be encouraged nor assisted in diffusing his opinions by those who differ from him in regard to their correctness '' .",
 'We today are not entitled to excoriate honest men who believed Parker to be downright pernicious and who barred their pulp

In [3]:
import re
import spacy

nlp = spacy.load("en_core_web_sm")


def clean_text(documents: list[str]):
    cleaned_docs = []
    for doc in documents:
        doc_text = re.sub(r"[^\w\s]", "", doc.lower())
        doc_nlp = nlp(doc_text)
        filtered_text = [token.text for token in doc_nlp if not token.is_stop and token.text.strip()]
        cleaned_docs.append(filtered_text)

    return cleaned_docs

cleaned_corpus = clean_text(corpus[:5])
cleaned_corpus[:5]


[['furthermore',
  'encouragement',
  'revisionist',
  'thinking',
  'manifestly',
  'fair',
  'admit',
  'fraternity',
  'constitutional',
  'right',
  'refuse',
  'accept',
  'persons',
  'dislikes'],
 ['unitarian',
  'clergy',
  'exclusive',
  'club',
  'cultivated',
  'gentlemen',
  'term',
  'understood',
  'bay',
  'parker',
  'definitely',
  'gentleman',
  'theology',
  'manners'],
 ['ezra',
  'stiles',
  'gannett',
  'honorable',
  'representative',
  'sanhedrin',
  'addressed',
  'frankly',
  'issue',
  '1845',
  'insisting',
  'parker',
  'persecuted',
  'calumniated',
  'republic',
  'power',
  'restrain',
  'force',
  'exist'],
 ['gannett',
  'judiciously',
  'argued',
  'association',
  'legitimately',
  'decide',
  'parker',
  'encouraged',
  'assisted',
  'diffusing',
  'opinions',
  'differ',
  'regard',
  'correctness'],
 ['today',
  'entitled',
  'excoriate',
  'honest',
  'men',
  'believed',
  'parker',
  'downright',
  'pernicious',
  'barred',
  'pulpits',
  'dema

In [4]:
from collections import Counter

def build_vocab(corpus: list[str]):
    vocab = Counter(term for doc in corpus for term in doc)
    word_to_idx = {word: idx for idx, (word, _) in enumerate(vocab.items())}
    idx_to_word = {idx: word for idx, (word, _) in enumerate(vocab.items())}
    return word_to_idx, idx_to_word

word_to_idx, idx_to_word = build_vocab(cleaned_corpus[:5])
print(word_to_idx)
print(idx_to_word)
print(len(word_to_idx))

{'furthermore': 0, 'encouragement': 1, 'revisionist': 2, 'thinking': 3, 'manifestly': 4, 'fair': 5, 'admit': 6, 'fraternity': 7, 'constitutional': 8, 'right': 9, 'refuse': 10, 'accept': 11, 'persons': 12, 'dislikes': 13, 'unitarian': 14, 'clergy': 15, 'exclusive': 16, 'club': 17, 'cultivated': 18, 'gentlemen': 19, 'term': 20, 'understood': 21, 'bay': 22, 'parker': 23, 'definitely': 24, 'gentleman': 25, 'theology': 26, 'manners': 27, 'ezra': 28, 'stiles': 29, 'gannett': 30, 'honorable': 31, 'representative': 32, 'sanhedrin': 33, 'addressed': 34, 'frankly': 35, 'issue': 36, '1845': 37, 'insisting': 38, 'persecuted': 39, 'calumniated': 40, 'republic': 41, 'power': 42, 'restrain': 43, 'force': 44, 'exist': 45, 'judiciously': 46, 'argued': 47, 'association': 48, 'legitimately': 49, 'decide': 50, 'encouraged': 51, 'assisted': 52, 'diffusing': 53, 'opinions': 54, 'differ': 55, 'regard': 56, 'correctness': 57, 'today': 58, 'entitled': 59, 'excoriate': 60, 'honest': 61, 'men': 62, 'believed': 6

In [5]:
def create_context_target_pairs(corpus: list[str], window_size: int = 2):
    pairs = []
    for document in corpus:
        for idx, term in enumerate(document):
            start_idx = max(idx - window_size, 0)
            end_idx = min(idx + window_size + 1, len(document))
            pairs.append(([document[i] for i in range(start_idx, end_idx)], term))

    return pairs

pairs = create_context_target_pairs(cleaned_corpus)
pairs[:5]   

[(['furthermore', 'encouragement', 'revisionist'], 'furthermore'),
 (['furthermore', 'encouragement', 'revisionist', 'thinking'],
  'encouragement'),
 (['furthermore', 'encouragement', 'revisionist', 'thinking', 'manifestly'],
  'revisionist'),
 (['encouragement', 'revisionist', 'thinking', 'manifestly', 'fair'],
  'thinking'),
 (['revisionist', 'thinking', 'manifestly', 'fair', 'admit'], 'manifestly')]

In [6]:
def encode_pairs(pairs: list[str], word_to_idx: dict):
    encoded_pairs = []
    for context, target in pairs:
        context_idx = [word_to_idx[term] for term in context]
        target_idx = word_to_idx[target]
        encoded_pairs.append((context_idx, target_idx))

    return encoded_pairs

encoded_pairs = encode_pairs(pairs, word_to_idx)
encoded_pairs[:5]

[([0, 1, 2], 0),
 ([0, 1, 2, 3], 1),
 ([0, 1, 2, 3, 4], 2),
 ([1, 2, 3, 4, 5], 3),
 ([2, 3, 4, 5, 6], 4)]

In [7]:
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

def convert_to_dataset(encoded_pairs: list, batch_size: int):
    context_data, target_data = zip(*encoded_pairs)
    context_tensors = [torch.tensor(context, dtype=torch.long) for context in context_data]
    context_tensor = pad_sequence(context_tensors, batch_first=True)
    target_tensor = torch.tensor(target_data, dtype=torch.long)
    dataset = TensorDataset(context_tensor, target_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    return dataloader

def create_dataset(corpus: list[str], batch_size: int, test_size: float = 0.2) -> tuple[DataLoader, DataLoader, int]:
    cleaned_corpus = clean_text(corpus)
    word_to_idx, idx_to_word = build_vocab(cleaned_corpus)
    pairs = create_context_target_pairs(cleaned_corpus)
    encoded_pairs = encode_pairs(pairs, word_to_idx)
    train_pairs, test_pairs = train_test_split(encoded_pairs, test_size=test_size, random_state=42)
    return convert_to_dataset(train_pairs, batch_size), convert_to_dataset(test_pairs, batch_size), len(word_to_idx)


`nn.Embedding(vocab_size, embedding_dim)`: This creates a lookup table for the embeddings. It takes two arguments:
- `vocab_size`: The size of the vocabulary (number of unique words in the corpus).
- `embedding_dim`: The dimension of the embeddings.

When you pass a tensor of indices to the `Embedding` layer, it looks up the embeddings for each index and returns a tensor of shape `(batch_size, sequence_length, embedding_dim)`.

in the forward pass, the mean of the embeddings is taken and then passed to the linear layer. this is necessary as mean is a good choice for cbow model because:
- Stability: Averaging smooths out noise and produces more stable representations.
- Invariance to Context Size: Whether you have 2 or 10 context words, you get an embedding of the same dimension and approximate magnitude.
- Computational Efficiency: It's simple and fast to compute.
- Theoretical Interpretation: It can be interpreted as estimating the expected context embedding for a given target word.


In [8]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm

class CBOWModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        embedded = self.embeddings(context)
        embedded = F.dropout(embedded, p=0.1)
        embedded = embedded.mean(dim=1)
        out = self.linear(embedded)
        return out
    
    def predict(self, context):
        logits = self(context)
        probs = F.softmax(logits, dim=1)
        pred_idx = torch.argmax(probs, dim=1)
        return pred_idx


def train_model(model: nn.Module, train_loader: DataLoader, test_loader: DataLoader, epochs: int, learning_rate: float):
    
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
    loss_function = nn.CrossEntropyLoss()
    losses = []

    for epoch in tqdm(range(epochs), desc="Training", total=epochs):
        total_loss = 0
        test_loss = 0
        model.train()
        for X_train, y_train in train_loader:
            y_logits = model(X_train) # forward pass
            loss = loss_function(y_logits, y_train) # compute loss

            optimizer.zero_grad() # set gradients to zero
            loss.backward() # backward pass to compute gradients that are used to update the weights
            optimizer.step() # update the weights
            total_loss += loss.item() # add the loss to the total loss
        losses.append(total_loss)

        model.eval()
        with torch.inference_mode():
            for X_test, y_test in test_loader:
                y_logits = model(X_test)
                loss = loss_function(y_logits, y_test)
                test_loss += loss.item()

        if epoch % 100 == 0:
            print(f"Epoch {epoch} | Train Loss: {total_loss} | Test Loss: {test_loss}")

    return losses

In [10]:
train_loader, test_loader, vocab_size = create_dataset(corpus[:20000], 64)
print(vocab_size)
embedding_dim = 10
model = CBOWModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
losses = train_model(model, train_loader, test_loader, epochs=100, learning_rate=0.0001)

# save model
torch.save(model.state_dict(), "cbow_model.pt")
losses

27625


Training:   1%|          | 1/100 [00:29<49:17, 29.87s/it]

Epoch 0 | Train Loss: 21643.038626670837 | Test Loss: 5388.9089012146


Training:  65%|██████▌   | 65/100 [33:54<18:15, 31.30s/it]  


KeyboardInterrupt: 

In [74]:
import plotly.express as px
import plotly.graph_objects as go

def loss_curve(losses: list[float]):
    fig = px.line(x=range(len(losses)), y=losses, title='Loss Curve')
    fig.show()

loss_curve(losses)


## Evals

In [75]:
from torchmetrics import Accuracy, Precision, Recall, F1Score

accuracy_fn = Accuracy(task="multiclass", num_classes=vocab_size)
precision_fn = Precision(task="multiclass", num_classes=vocab_size)
recall_fn = Recall(task="multiclass", num_classes=vocab_size)
f1_fn = F1Score(task="multiclass", num_classes=vocab_size)


X_test, y_test = torch.tensor([], dtype=torch.long), torch.tensor([], dtype=torch.long)
for X, y in test_loader:
    X_test = torch.cat((X_test, X), dim=0)
    y_test = torch.cat((y_test, y), dim=0)


print(X_test.shape)
print(y_test.shape)

y_pred = model.predict(X_test)
y_pred

accuracy = accuracy_fn(y_pred, y_test)
precision = precision_fn(y_pred, y_test)
recall = recall_fn(y_pred, y_test)
f1 = f1_fn(y_pred, y_test)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


torch.Size([17718, 5])
torch.Size([17718])
Accuracy: 0.05152951925992966
Precision: 0.05152951925992966
Recall: 0.05152951925992966
F1 Score: 0.05152951925992966
