# Word 2 what?

- As usual, we need to import some libraries.
- Make sure you have installed the spaCy English package (if you haven't: `python -m spacy download en_core_web_sm`).

- Tasks:
    - **Task 1**: Implement the `tokenize` function so it returns a list of tokens when given a text.
    - **Task 2**: Now that we have our tokens, let's build our vocabulary! The `build_vocab` function you are implementing should return:
        - A **list** containing our vocabulary
        - A **dictionary** linking each unique token with an index (`{'hi': 0, 'name': 1, 'my': 2, ...}`)

In [None]:

import numpy as np
from collections import Counter
import spacy
import matplotlib.pyplot as plt
import math
from sklearn.decomposition import PCA
import matplotlib.animation as animation
import torch
import torch.nn as nn
import torch.optim as optim

# Load the small English spaCy pipeline.
# If it is missing, install it with: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Example text used throughout this exercise.
# It was generated with DeepSeek; feel free to swap in your own text. A simple
# example was chosen because the models trained here are quite limited.
# The file is read as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('example_text.txt', 'r', encoding='utf-8') as file:
    text = file.read()

def tokenize(text):
    """Turn `text` into a list of tokens (Task 1 - to be implemented).

    Until the task is completed, this stub returns the placeholder string
    "NOT IMPLEMENTED YET" instead of a list of tokens.
    """
    # --- Task 1 begins here ---
    # Placeholder. NOTE(review): presumably meant to hold the spaCy document
    # produced by `nlp` for `text` - confirm against the guidebook.
    doc = "NOT IMPLEMENTED YET"
    # Placeholder: should become the list of tokens.
    return "NOT IMPLEMENTED YET"
    # --- Task 1 ends here ---

# Tokenize once and reuse the result for both reports, so the tokenizer
# is not run twice over the same text.
demo_tokens = tokenize(text)
print("Tokenized Text: ", demo_tokens)
print("We got",len(set(demo_tokens)), "unique tokens")

def build_vocab(tokens, min_count=1):
    """Build the vocabulary for `tokens` (Task 2 - to be implemented).

    Expected result once implemented:
        vocab: a list containing the vocabulary.
        idx: a dictionary linking each unique token to an index,
            e.g. {'hi': 0, 'name': 1, 'my': 2, ...}.

    Until the task is completed, both returned values are the placeholder
    string "NOT IMPLEMENTED YET".
    """
    # --- Task 2 begins here ---
    # NOTE(review): presumably token frequencies (a Counter), filtered with
    # `min_count` - confirm against the guidebook.
    counts = "NOT IMPLEMENTED YET"
    vocab = "NOT IMPLEMENTED YET"  # placeholder for the vocabulary list
    idx = "NOT IMPLEMENTED YET"  # placeholder for the token -> index dictionary
    return vocab, idx
    # --- Task 2 ends here ---
    
# Quick check: print the (vocab, idx) pair that build_vocab returns for the example text
print("Building vocab...", build_vocab(tokenize(text)))

### Helper Functions
- Execute this code; we will need these functions later.

In [None]:
# Softmax function
def softmax(x):
    """Return the softmax of `x`, normalised over all of its elements.

    The maximum of `x` is subtracted before exponentiating so that large
    scores do not overflow; this does not change the result.
    """
    exps = np.exp(x - np.max(x))
    return exps / exps.sum()

# Nearest neighbors function: computes the k nearest neighbors of a given word in the embedding space
def nearest_neighbors(word, vocab, idx, embeddings, k=5):
    """Return the `k` words whose embeddings are most cosine-similar to `word`.

    Args:
        word: the query word.
        vocab: sequence of words; `vocab[n]` is the word stored at row `n`.
        idx: mapping from word to its row index in `embeddings`.
        embeddings: 2-D array with one embedding per row.
        k: maximum number of neighbours to return.

    Returns:
        A list of `(neighbour_word, cosine_similarity)` tuples, most similar
        first. The list is empty when `word` is not in `idx`. The query word
        itself is never part of the result.
    """
    if word not in idx:
        return []
    i = idx[word]
    v = embeddings[i]
    norms = np.linalg.norm(embeddings, axis=1)
    # The small constant keeps the division defined for all-zero vectors.
    sims = embeddings.dot(v) / (norms * np.linalg.norm(v) + 1e-9)
    ranked = np.argsort(-sims)
    # Remove the query row explicitly instead of assuming it is ranked first:
    # with tied similarities (duplicate or zero vectors) it may not be.
    ranked = ranked[ranked != i]
    return [(vocab[n], float(sims[n])) for n in ranked[:k]]

# We use this function to create an animation of the embeddings over time
def animate_embeddings_gif(snapshots, vocab, save_path='skip_grams.gif', n_words=19, fps=12, figsize=(8,8)):
    """Project embedding snapshots to 2-D with PCA and save them as an animated GIF.

    Args:
        snapshots: sequence of 2-D arrays, one per recorded step; every frame
            is sliced using the row count of the first snapshot.
        vocab: word labels; `vocab[k]` labels row `k` of each snapshot.
        save_path: path of the GIF file that is written.
        n_words: number of leading vocabulary rows to draw (capped at len(vocab)).
        fps: frames per second of the GIF.
        figsize: size passed to the matplotlib figure.
    """
    shown = min(n_words, len(vocab))
    rows_per_snapshot = snapshots[0].shape[0]

    # A single PCA is fitted on all snapshots stacked together, so every
    # frame is drawn in the same 2-D coordinate system.
    projected = PCA(n_components=2).fit_transform(np.vstack(snapshots))

    n_frames = len(snapshots)
    frames_coords = []
    for frame_no in range(n_frames):
        first_row = frame_no * rows_per_snapshot
        frame_rows = projected[first_row:first_row + rows_per_snapshot, :2]
        frames_coords.append(frame_rows[:shown])
    labels = vocab[:shown]

    fig, ax = plt.subplots(figsize=figsize)
    ax.set_xticks([])
    ax.set_yticks([])
    opening = frames_coords[0]
    scatter = ax.scatter(opening[:, 0], opening[:, 1], s=40)
    texts = []
    for (x, y), label in zip(opening, labels):
        texts.append(ax.text(x, y, label, fontsize=12))

    # Fix the axis limits to cover every frame so the view stays still.
    margin = 0.5
    every_point = np.vstack(frames_coords)
    xs = every_point[:, 0]
    ys = every_point[:, 1]
    ax.set_xlim(xs.min() - margin, xs.max() + margin)
    ax.set_ylim(ys.min() - margin, ys.max() + margin)

    def update(i):
        # Move the points and their labels to the positions of frame i.
        current = frames_coords[i]
        scatter.set_offsets(current)
        for label_artist, point in zip(texts, current):
            label_artist.set_position((point[0], point[1]))
        ax.set_title(f'Embeddings — frame {i+1}/{n_frames}')
        return [scatter, *texts]

    ani = animation.FuncAnimation(
        fig,
        update,
        frames=n_frames,
        interval=1000 // fps,
        blit=False,
    )

    ani.save(save_path, writer=animation.PillowWriter(fps=fps))
    plt.close(fig)
    print(f"Saved GIF to {save_path}")

# Continuous Bag of Words
- Now that we have our tokens and vocab, we will start creating our simple word2vec model (we will be using the **CBOW** architecture for this part)
- Tasks:
    - Task 1: produce examples of (context_indices, center_index). Implement `generate_cbow_examples`
    - Task 2: create an embedding matrix and an output layer that maps embedding_dim → vocab_size. Initialize weights small and random
    - Task 3: instantiate model, an optimizer (e.g. SGD), and a loss (CrossEntropyLoss)
    - Task 4: forward pass
    - Task 5: compute loss, backpropagate, step optimizer
    - Task 6: return input embeddings and output weights (Hint: transpose when necessary)

In [None]:
def generate_cbow_examples(tokens, idx, window=2):
    """Produce CBOW training examples (Task 1 - to be implemented).

    Each example is expected to be a `(context_indices, center_index)` pair.
    NOTE(review): `window` is presumably the number of tokens taken on each
    side of the center token, and `idx` the token -> index mapping - confirm.

    Until the task is completed, this stub returns an empty list.
    """
    examples = []
    # --- Task 1 begins here ---

    # --- Task 1 ends here ---
    return examples

class CBOW(nn.Module):
    """Continuous Bag of Words model.

    The context token embeddings are averaged and passed through a bias-free
    linear layer that produces one score per vocabulary entry.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output layer.

        Args:
            vocab_size: number of rows of the embedding table and number of
                output scores.
            embedding_dim: size of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_layer = nn.Linear(embedding_dim, vocab_size, bias=False)
        
        # --- Task 2 begins here ---
        with torch.no_grad():
            self.embeddings.weight.data = "NOT IMPLEMENTED YET"
            self.output_layer.weight.data = "NOT IMPLEMENTED YET"
        # --- Task 2 ends here ---
        
    def forward(self, context_indices):
        """Return the vocabulary scores for one set of context indices.

        The embeddings of `context_indices` are averaged over dimension 0
        before being fed to the output layer.
        """
        context_embeds = self.embeddings(context_indices)
        embed = context_embeds.mean(dim=0)
        scores = self.output_layer(embed)
        return scores
    


    # Declared as a static method: the function takes no `self`, so it can be
    # called as `CBOW.train_cbow(...)` as well as through an instance.
    @staticmethod
    def train_cbow(examples, vocab_size, embedding_dim=16, lr=0.05, epochs=200, print_every=50, record_every=1):
        """Train a CBOW model on `examples` (Tasks 3-6 - to be implemented).

        Args:
            examples: list of `(context_indices, center_index)` pairs.
            vocab_size: size of the vocabulary.
            embedding_dim: size of each embedding vector.
            lr: learning rate for the optimizer created in Task 3.
            epochs: number of passes over `examples`.
            print_every: the average loss is printed on epoch 1 and on every
                epoch that is a multiple of this value.
            record_every: a copy of the embedding weights is stored on every
                epoch that is a multiple of this value and on the last epoch;
                a falsy value disables recording.

        Returns:
            `(W_in, W_out, snapshots)` where `snapshots` is the list of
            recorded embedding matrices.
        """

        torch.manual_seed(42)
        
        # The tensors built in the loop below are created on this device, so
        # the model created in Task 3 has to live on the same device.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # --- Task 3 begins here ---
        model = "NOT IMPLEMENTED YET"
        optimizer = "NOT IMPLEMENTED YET"
        criterion = "NOT IMPLEMENTED YET"
        # --- Task 3 ends here ---
        
        snapshots = []
        
        for ep in range(1, epochs + 1):

            # Visit the examples in a fresh random order every epoch. The
            # permutation is converted to plain ints so the list is indexed
            # with Python integers rather than 0-d tensors.
            order = torch.randperm(len(examples)).tolist()
            total_loss = 0.0
            
            for example_i in order:
                context_idxs, center = examples[example_i]
                
                context_tensor = torch.tensor(context_idxs, dtype=torch.long, device=device)
                center_tensor = torch.tensor([center], dtype=torch.long, device=device)
                
                # --- Task 4 begins here ---
                """Something missing here?"""
                scores = "NOT IMPLEMENTED YET"
                loss = "NOT IMPLEMENTED YET"
                # --- Task 4 ends here ---
                
                # --- Task 5 begins here ---
                
                # --- Task 5 ends here ---
            
            # --- For Animation Purposes ---
            if record_every and (ep % record_every == 0 or ep == epochs):
                snapshots.append(model.embeddings.weight.data.cpu().numpy().copy())
            # --- For Animation Purposes ---
            
            if ep % print_every == 0 or ep == 1:
                # max(..., 1) avoids a division by zero when there are no examples.
                avg_loss = total_loss / max(len(examples), 1)
                print(f"CBOW Epoch {ep}/{epochs}  avg_loss={avg_loss:.4f}")

        # --- Task 6 begins here ---
        W_in = "NOT IMPLEMENTED YET"
        W_out = "NOT IMPLEMENTED YET"
        # --- Task 6 ends here ---
        
        return W_in, W_out, snapshots

# Tokenize the example text
tokens = tokenize(text)
# Build the vocabulary list and the token -> index mapping
vocab, idx = build_vocab(tokens)

# Generate the (context_indices, center_index) training examples
cbow_examples = generate_cbow_examples(tokens, idx, window=2)

# Train the CBOW model. train_cbow is defined inside the CBOW class body, so it
# is reached through the class. Embeddings are recorded every epoch (record_every=1).
W_in_cb, W_out_cb, snapshots_cb = CBOW.train_cbow(
    cbow_examples, vocab_size=len(vocab), embedding_dim=16, lr=0.01, epochs=400, print_every=100, record_every=1
)

# Inspect the learned embedding space: print the 5 nearest neighbors of a few
# probe words, or a notice when a probe word is not in the vocabulary
for w in ['cat', 'dog', 'mat', 'sat', 'rug']:
    if w in idx:
        print('Nearest to', w, ':', nearest_neighbors(w, vocab, idx, W_in_cb, k=5))
    else:
        print('\nWord', w, 'not in vocab')

# The Fun Part!
- Now that you've successfully implemented **CBOW**, we'll use the snapshots recorded during training to visualize how tokens move through the embedding space.

In [None]:
# Save the embeddings recorded during CBOW training as cbow.gif, drawing every word in the vocabulary
animate_embeddings_gif(snapshots_cb, vocab, save_path='cbow.gif', n_words=len(vocab), fps=12)


# Let's get Perplexed!
- We are going to calculate the perplexity of **CBOW** (check the guidebook for a quick reminder)
- Task: try to implement `perplexity_cbow`, what does this number mean?

In [None]:

def perplexity_cbow(W_in, W_out, examples):
    """Compute the perplexity of the CBOW model on `examples` (task to be implemented).

    The result is exp(total_nll / count); when `count` is 0 the function
    returns infinity. As long as the task section below does not update
    `total_nll` and `count`, the function therefore returns infinity.
    """
    total_nll = 0.0  # running sum of negative log-likelihoods
    count = 0  # number of examples accumulated into total_nll
    # --- Task begins here ---
    """Something missing here?"""
    # --- Task ends here ---
    return math.exp(total_nll / count) if count > 0 else float('inf')

# Evaluate the trained CBOW weights on the same examples that were used for training
ppl_cb = perplexity_cbow(W_in_cb, W_out_cb, cbow_examples)

# Report the perplexity with four decimal places
print(f"Perplexity using CBOW: {ppl_cb:.4f}")

---

---

# Optional Part (a bit hardcore, no torch challenge, totally unnecessary)
### Skip-Gram (skipping?)
- Now that we have our tokens and vocab (already created for CBOW), we will start creating our simple word2vec model (we will be using the **Skip-gram** architecture for this part)
- Tasks:
    - Task 1: Implement `generate_skipgram_pairs(tokens, idx, window=2)`. 
        - Return a list of **(center_idx, context_idx)** tuples where **center_idx = idx\[token_at_i\]** and context_idx = **idx\[token_at_j\]** for all j in [i-window, i+window] (skip j==i), clipped to sequence bounds. 
        - Skip tokens not in idx. Keep duplicate pairs (frequency matters!). 
        - Example: tokens = ["the","cat","sat"], idx={"the":0,"cat":1,"sat":2}, window=1 -> [(0,1),(1,0),(1,2),(2,1)].
    - Task 2: Now we want to initialize the **input embedding matrix** (`W_in`) and the **output embedding matrix** (`W_out`)
        - Hint: check the guidebook if you haven't already
    - Task 3: Like we did in previous exercises: we want to implement the forward pass and the loss calculation
        - Hint: we will be using the negative log-likelihood loss
    - Task 4: Compute the gradients
    - Task 5: Update the parameters (the ones we initialized in Task 2)


In [None]:
def generate_skipgram_pairs(tokens, idx, window=2):
    """Produce skip-gram training pairs (Task 1 - to be implemented).

    Expected result once implemented: a list of `(center_idx, context_idx)`
    tuples, where `center_idx = idx[tokens[i]]` and `context_idx = idx[tokens[j]]`
    for every j in [i - window, i + window] with j != i, clipped to the
    sequence bounds. Tokens that are not in `idx` are skipped and duplicate
    pairs are kept.

    Example: tokens = ["the", "cat", "sat"], idx = {"the": 0, "cat": 1, "sat": 2},
    window = 1 -> [(0, 1), (1, 0), (1, 2), (2, 1)].

    Until the task is completed, this stub returns an empty list.
    """
    pairs = []
    # --- Task 1 begins here ---

    # --- Task 1 ends here ---
    return pairs

def train_skipgram(pairs, vocab_size, embedding_dim=16, lr=0.05, epochs=200, print_every=50, record_every=1):
    """Train a skip-gram model with plain NumPy (Tasks 2-5 - to be implemented).

    Args:
        pairs: list of `(center_idx, context_idx)` tuples. The list is not
            modified.
        vocab_size: size of the vocabulary.
        embedding_dim: size of each embedding vector.
        lr: learning rate for the parameter updates of Task 5.
        epochs: number of passes over `pairs`.
        print_every: the average loss is printed on epoch 1 and on every
            epoch that is a multiple of this value.
        record_every: a copy of `W_in` is stored on every epoch that is a
            multiple of this value and on the last epoch; a falsy value
            disables recording.

    Returns:
        `(W_in, W_out, snapshots)` where `snapshots` is the list of recorded
        copies of `W_in`.
    """
    # Seeded generator: used for the per-epoch visiting order below and
    # available for the weight initialisation of Task 2.
    rng = np.random.RandomState(1)
    
    # --- Task 2 begins here ---
    W_in = "NOT IMPLEMENTED YET"
    W_out = "NOT IMPLEMENTED YET"
    # --- Task 2 ends here ---
    
    # --- Animation Purposes --- Begin ---
    snapshots = []
    # --- Animation Purposes --- End ---
    
    for ep in range(1, epochs+1):
        # Visit the pairs in a fresh random order every epoch. The order comes
        # from the seeded generator so runs are reproducible, and indices are
        # permuted instead of shuffling `pairs` so the caller's list keeps its order.
        order = rng.permutation(len(pairs))
        total_loss = 0.0
        for pair_i in order:
            center, context = pairs[pair_i]
            
            # --- Task 3 begins here ---
            embed = "NOT IMPLEMENTED YET"
            scores = "NOT IMPLEMENTED YET"
            probs = "NOT IMPLEMENTED YET"
            loss = "NOT IMPLEMENTED YET"
            total_loss += loss
            # --- Task 3 ends here ---

            # --- Task 4 begins here ---
            grad_scores = "NOT IMPLEMENTED YET"
            grad_scores[context] = "NOT IMPLEMENTED YET"
            dW_out = "NOT IMPLEMENTED YET"
            d_embed = "NOT IMPLEMENTED YET"
            # --- Task 4 ends here ---

            # --- Task 5 begins here ---
            W_out = "NOT IMPLEMENTED YET"
            W_in[center] = "NOT IMPLEMENTED YET"
            # --- Task 5 ends here ---

        # --- Animation Purposes --- Begin ---
        if record_every and (ep % record_every == 0 or ep == epochs):
            snapshots.append(W_in.copy())
        # --- Animation Purposes --- End ---

        if ep % print_every == 0 or ep == 1:
            # max(..., 1) avoids a division by zero when there are no pairs.
            avg_loss = total_loss / max(len(pairs), 1)
            print(f"Epoch {ep}/{epochs}  avg_loss={avg_loss:.4f}")

    return W_in, W_out, snapshots


# Tokenize the example text
tokens = tokenize(text)

# Build the vocabulary list and the token -> index mapping
vocab, idx = build_vocab(tokens)

# Generate the (center_idx, context_idx) skip-gram pairs
pairs = generate_skipgram_pairs(tokens, idx, window=2)

# Print the vocabulary size and the number of training pairs
print('vocab size:', len(vocab), 'pairs:', len(pairs))

# Run the training loop; the parameters can be changed freely.
# record_every is left at its default of 1, so W_in is recorded after every epoch.
W_in_sg, W_out_sg, snapshots_sg = train_skipgram(pairs, vocab_size=len(vocab), embedding_dim=16, lr=0.01, epochs=400, print_every=100)

# Inspect the learned embedding space: print the 5 nearest neighbors of a few
# probe words, or a notice when a probe word is not in the vocabulary
for w in ['cat', 'dog', 'mat', 'sat', 'rug']:
    if w in idx:
        print('\nNearest to', w, ':', nearest_neighbors(w, vocab, idx, W_in_sg, k=5))
    else:
        print('\nWord', w, 'not in vocab')

# The Fun Part (Part 2)!
- Now that you have managed to implement **skip-gram**, we are going to use the snapshots we recorded during training to watch our tokens move through the embedding space.

In [None]:
# Reuse the animation helper defined earlier: save the embeddings recorded during
# skip-gram training as skip_grams.gif, drawing every word in the vocabulary
animate_embeddings_gif(snapshots_sg, vocab, save_path='skip_grams.gif', n_words=len(vocab), fps=12)

#### Skip-Gram Perplexity

In [None]:
def perplexity_skipgram(W_in, W_out, pairs):
    """Return the perplexity of the skip-gram model over `pairs`.

    For every `(center, context)` pair the scores are `W_out.T.dot(W_in[center])`,
    turned into probabilities with `softmax`; the negative log of the
    probability given to `context` (plus a small constant that keeps the
    logarithm finite) is accumulated. The result is the exponential of the
    mean of these values, or infinity when `pairs` is empty.
    """
    running_nll = 0.0
    n_seen = 0
    for n_seen, (center, context) in enumerate(pairs, start=1):
        distribution = softmax(W_out.T.dot(W_in[center]))
        running_nll -= np.log(distribution[context] + 1e-9)
    if n_seen == 0:
        return float('inf')
    return math.exp(running_nll / n_seen)

# Evaluate the trained skip-gram weights on the same pairs that were used for training
ppl_sg = perplexity_skipgram(W_in_sg, W_out_sg, pairs)

# Report the perplexity with four decimal places
print(f"\nPerplexity using skip-gram: {ppl_sg:.4f}")