[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/xiptos/is_notes/blob/main/word_embeddings_cbow.ipynb)

# Mini Word2Vec (CBOW) in PyTorch

Goal:
- Build word embeddings from a tiny corpus
- Use a CBOW-style model:
    given context words → predict the center word
- Then inspect which words end up close in the embedding space

This is a teaching example: small, simple, and fast.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import math
import random

# Seed both PyTorch's and Python's RNGs so weight initialisation and batch
# shuffling are repeatable from run to run.
torch.manual_seed(42)
random.seed(42)



## 1. Toy corpus
Small corpus with clear semantic structure (royalty vs fruit vs places, etc.)

In [None]:

# Each sentence is lower-cased and whitespace-tokenised, so `corpus` ends up
# as a list of token lists.
corpus = [
    sentence.lower().split()
    for sentence in (
        "king is a strong man",
        "queen is a wise woman",
        "man and woman are humans",
        "king and queen rule the kingdom",
        "paris is a city",
        "france is a country",
        "paris is in france",
        "apple and orange are fruits",
        "apple is red",
        "orange is orange",
    )
]
print(corpus[:2])

In [None]:
# Flatten the corpus into one token stream, in sentence order.
print([token for sentence in corpus for token in sentence])

## 2. Build vocabulary and mappings

In [None]:
# Tally how often each token occurs across all sentences.
word_counts = Counter(token for sentence in corpus for token in sentence)

# Alphabetical order gives every word a stable integer id.
vocab = sorted(word_counts)
vocab_size = len(vocab)

# id -> word comes straight from the sorted list; word -> id is its inverse.
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

print("Vocab size:", vocab_size)
print("Vocab:", vocab)

## 3. Generate CBOW training data

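For window size = 2:
- Input: the 4 context words (2 left, 2 right)
- Target: the center word
- Only positions with a full window on both sides are used, so sentences shorter than 5 words produce no training examples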

In [None]:

# Number of context words taken on each side of the center word.
WINDOW_SIZE = 2

def make_cbow_data(corpus, window_size=2, vocab_index=None):
    """Build (context, target) training pairs for a CBOW model.

    Args:
        corpus: iterable of tokenised sentences (each a list of words).
        window_size: number of context words taken on each side of the
            center word. Must be non-negative.
        vocab_index: optional mapping word -> integer id. When omitted, the
            notebook-level ``word_to_idx`` mapping is used.

    Returns:
        A list of ``(context, target)`` tuples, where ``context`` is a list of
        ``2 * window_size`` word ids (left neighbours followed by right
        neighbours) and ``target`` is the id of the center word.

    Raises:
        ValueError: if ``window_size`` is negative.
        KeyError: if a word in ``corpus`` is missing from the mapping.

    Note:
        Only positions that have a full window on both sides are emitted, so
        every context has the same length. As a consequence, sentences with
        fewer than ``2 * window_size + 1`` words contribute no examples, and
        words that only occur near sentence boundaries never appear as
        targets.
    """
    if window_size < 0:
        raise ValueError(f"window_size must be non-negative, got {window_size}")

    # Resolve the mapping once instead of reaching for the global per word.
    lookup = word_to_idx if vocab_index is None else vocab_index

    data = []
    for sent in corpus:
        indices = [lookup[w] for w in sent]
        # The range is empty when the sentence is too short for a full window.
        for center in range(window_size, len(indices) - window_size):
            left = indices[center - window_size:center]
            right = indices[center + 1:center + 1 + window_size]
            data.append((left + right, indices[center]))
    return data

data = make_cbow_data(corpus, window_size=WINDOW_SIZE)
print("Number of training examples:", len(data))
print("Example (context idx, target idx):", data[0])

# Decode the first training pair back into words so it is human-readable.
ctx_words = list(map(idx_to_word.__getitem__, data[0][0]))
tgt_word = idx_to_word[data[0][1]]
print("Context words:", ctx_words)
print("Target word:", tgt_word)

## 4. Define CBOW model with an Embedding layer

- `nn.Embedding` maps word indices → dense vectors
- We average the context embeddings, then use a linear layer to predict the target word

In [None]:

class CBOW(nn.Module):
    """Continuous bag-of-words model: mean context embedding -> vocab logits."""

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        # Lookup table holding one embed_dim-sized vector per vocabulary id.
        self.emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        # Maps the pooled context vector to one score per vocabulary word.
        self.linear = nn.Linear(in_features=embed_dim, out_features=vocab_size)

    def forward(self, context_idxs):
        """
        Score every vocabulary word as the center word for each context.

        context_idxs: LongTensor of shape (batch_size, 2*window_size)
        Returns unnormalised logits of shape (batch_size, vocab_size).
        """
        context_vectors = self.emb(context_idxs)       # (batch, 2*window, embed_dim)
        averaged = torch.mean(context_vectors, dim=1)  # (batch, embed_dim)
        return self.linear(averaged)

EMBED_DIM = 10  # deliberately small so the vectors stay easy to inspect
model = CBOW(vocab_size=vocab_size, embed_dim=EMBED_DIM)
print(model)

## 5. Training loop

- Loss: cross-entropy between predicted distribution and true target word
- Tiny corpus → few epochs are enough to see structure

In [None]:
# Optimisation settings used by the training loop.
EPOCHS, BATCH_SIZE = 200, 16
lr = 1e-2

# Adam updates every model parameter; the loss compares the vocabulary
# logits against the index of the true center word.
optimizer = optim.Adam(params=model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

def get_batches(data, batch_size):
    """Yield shuffled mini-batches of CBOW examples as tensors.

    Args:
        data: sequence of ``(context, target)`` pairs, where every context is
            a list of word ids of the same length and target is a word id.
        batch_size: maximum number of examples per batch; the final batch may
            be smaller.

    Yields:
        ``(contexts, targets)`` where ``contexts`` is a LongTensor with one
        row per example and ``targets`` is a LongTensor with one id per
        example, in matching order.
    """
    # Shuffle a copy: the caller's list keeps its original order, so
    # iterating for an epoch has no side effect on the dataset itself.
    shuffled = list(data)
    random.shuffle(shuffled)
    for start in range(0, len(shuffled), batch_size):
        batch = shuffled[start:start + batch_size]
        contexts = torch.tensor([c for c, _ in batch], dtype=torch.long)
        targets = torch.tensor([t for _, t in batch], dtype=torch.long)
        yield contexts, targets

for epoch in range(1, EPOCHS + 1):
    total_loss = 0.0  # running sum of per-example losses for this epoch
    for contexts, targets in get_batches(data, BATCH_SIZE):
        optimizer.zero_grad()
        logits = model(contexts)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its size so a smaller final batch does
        # not skew the per-example average reported below.
        total_loss += contexts.shape[0] * loss.item()
    # Report progress only on every 40th epoch.
    if epoch % 40 != 0:
        continue
    avg_loss = total_loss / len(data)
    print(f"Epoch {epoch:3d} | loss = {avg_loss:.4f}")

## 6. Inspect learned embeddings

Let's:
- Extract the embedding matrix
- Define a helper to get nearest neighbors (cosine similarity)

In [None]:

# Trained embedding matrix, shape (vocab_size, EMBED_DIM), taken out of the
# autograd graph for inspection. detach() replaces the legacy `.data`
# attribute: both share storage with the layer's weights, but detach() keeps
# autograd's in-place modification checks intact.
embeddings = model.emb.weight.detach()

def cosine_similarity(a, b):
    """Cosine similarity between `a` and `b` along the last dimension.

    Both inputs are scaled to (approximately) unit length and then combined
    with an element-wise product summed over the last axis; the usual
    broadcasting rules apply to the leading dimensions.
    """
    eps = 1e-8  # keeps the division finite for all-zero vectors

    def _unit(v):
        return v / (v.norm(dim=-1, keepdim=True) + eps)

    return (_unit(a) * _unit(b)).sum(dim=-1)

def most_similar(word, top_k=5):
    """Print the `top_k` vocabulary words closest to `word` by cosine similarity.

    Args:
        word: query word; a message is printed and nothing else happens if it
            is not in the vocabulary.
        top_k: maximum number of neighbours to print. Fewer are printed when
            the vocabulary is too small to supply that many.

    Returns:
        None. Results are printed, one neighbour per line.
    """
    if word not in word_to_idx:
        print(f"'{word}' not in vocab")
        return
    idx = word_to_idx[word]
    word_vec = embeddings[idx].unsqueeze(0)  # (1, D)
    sims = cosine_similarity(word_vec, embeddings)  # (V,)
    # Request one extra candidate because the query word itself is filtered
    # out below. torch.topk raises if k exceeds the number of elements, so
    # clamp k to the vocabulary size.
    k = min(top_k + 1, sims.numel())
    best = torch.topk(sims, k).indices.tolist()
    best = [i for i in best if i != idx][:top_k]
    print(f"\nMost similar to '{word}':")
    for i in best:
        print(f"  {idx_to_word[i]:>8s}  (cosine={sims[i]:.3f})")

In [None]:

# Probe a few words from each theme in the corpus.
for query_word in ("king", "queen", "apple", "paris", "france"):
    most_similar(query_word)

## 7. (Optional) Analogy intuition

Very tiny corpus → analogies will be noisy, but we can still show the idea:

`vector("king") - vector("man") + vector("woman") ≈ vector("queen")`

In [None]:

def analogy(a, b, c, top_k=5):
    """Print candidate answers to the analogy "a is to b as c is to ?".

    The query vector is ``vector(b) - vector(a) + vector(c)``; vocabulary
    words are ranked by cosine similarity to it, and the three input words
    are excluded from the results.

    Args:
        a, b, c: words of the analogy. If any is missing from the vocabulary,
            a message naming the first missing word is printed and the
            function returns.
        top_k: maximum number of candidates to print. Fewer are printed when
            the vocabulary is too small to supply that many.

    Returns:
        None. Results are printed, one candidate per line.
    """
    for w in (a, b, c):
        if w not in word_to_idx:
            print(f"'{w}' not in vocab")
            return
    va = embeddings[word_to_idx[a]]
    vb = embeddings[word_to_idx[b]]
    vc = embeddings[word_to_idx[c]]
    query = vb - va + vc  # b - a + c

    sims = cosine_similarity(query.unsqueeze(0), embeddings)
    # Request three extra candidates because the input words are filtered out
    # below. torch.topk raises if k exceeds the number of elements, so clamp
    # k to the vocabulary size.
    k = min(top_k + 3, sims.numel())
    best = torch.topk(sims, k).indices.tolist()
    # Filter out the original words
    exclude = {word_to_idx[w] for w in (a, b, c)}
    best = [i for i in best if i not in exclude][:top_k]

    print(f"\nAnalogy: {a} → {b}  as  {c} → ?")
    for i in best:
        print(f"  {idx_to_word[i]:>8s}  (cosine={sims[i]:.3f})")

# Query vector is king - man + woman; the intended answer is "queen".
analogy(a="man", b="king", c="woman")