# Task A

In [21]:
import os
import numpy as np
import pandas as pd
import random

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim

from scipy.sparse import dok_matrix, lil_matrix, save_npz, load_npz

nltk.download("punkt")
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

DATA_PATH = "bbc"

[nltk_data] Downloading package punkt to C:\Users\Tushar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tushar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Load and Preprocess (tokenize) data

In [22]:
def load_bbc_dataset(base_path=DATA_PATH):
    docs, labels = [], []
    for category in os.listdir(base_path):
        category_path = os.path.join(base_path, category)
        if os.path.isdir(category_path):
            for fname in os.listdir(category_path):
                fpath = os.path.join(category_path, fname)
                with open(fpath, "r", encoding="latin-1") as f:
                    text = f.read().lower()
                    docs.append(text)
                    labels.append(category)
    return docs, labels

docs, labels = load_bbc_dataset()
print(f"Loaded {len(docs)} documents across {len(set(labels))} categories.")

def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

tokenized_docs = [preprocess(doc) for doc in docs]

vocab = sorted(set([w for doc in tokenized_docs for w in doc]))

window_size = 5
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
V = len(vocab)
print(f"Vocab size: {V}")

Loaded 2225 documents across 5 categories.
Vocab size: 27206


Compute and Maintain PPMI Matrix

In [23]:
ppmi_file = "ppmi_matrix.npz"

def compute_ppmi_mat () :
    cooc_mat = dok_matrix((V, V), dtype=np.float32)
    for doc in tokenized_docs:
        for i, word in enumerate(doc):
            if word not in word2idx: 
                continue
            w_idx = word2idx[word]
            context = doc[max(0, i - window_size): i] + doc[i+1: i+1+window_size]
            for c in context:
                if c in word2idx:
                    c_idx = word2idx[c]
                    cooc_mat[w_idx, c_idx] += 1

    cooc_mat = cooc_mat.tocsr()

    ppmi_mat = lil_matrix(cooc_mat.shape, dtype=np.float32)
    total_count = cooc_mat.sum()
    word_freq = np.array(cooc_mat.sum(axis=1)).flatten()
    context_freq = np.array(cooc_mat.sum(axis=0)).flatten()

    rows, cols = cooc_mat.nonzero()
    for i, j in zip(rows, cols):
        val = cooc_mat[i, j]
        p_ij = val / total_count
        p_i = word_freq[i] / total_count
        p_j = context_freq[j] / total_count
        score = np.log2(p_ij / (p_i * p_j))
        if score > 0:
            ppmi_mat[i, j] = score

    ppmi_mat = ppmi_mat.tocsr()
    return ppmi_mat

if os.path.exists(ppmi_file):
    ppmi_mat = load_npz(ppmi_file)
    print("Loaded PPMI matrix from disk:", ppmi_mat.shape)
else:
    ppmi_mat = compute_ppmi_mat()
    save_npz(ppmi_file, ppmi_mat)
    print("Computed and saved PPMI matrix:", ppmi_mat.shape)


Loaded PPMI matrix from disk: (27206, 27206)


Compute and Maintain SVD Reduced Matrix

In [24]:
svd_file = "svd_matrix.npy"
def compute_svd_mat(d=300):
    svd = TruncatedSVD(n_components=d, random_state=42)
    svd_mat = svd.fit_transform(ppmi_mat)
    return svd_mat

if os.path.exists(svd_file):
    svd_mat = np.load(svd_file)
    print("Loaded SVD-reduced matrix from disk:", svd_mat.shape)
else:
    svd_mat = compute_svd_mat()
    np.save(svd_file, svd_mat)
    print("Computed and saved SVD-reduced matrix:", svd_mat.shape)

Loaded SVD-reduced matrix from disk: (27206, 300)


Defining SkipGram Word2Vec Model

In [25]:
svd_tensor = torch.tensor(svd_mat, dtype=torch.float32)
vocab_size, embedding_dim = svd_tensor.shape
num_negative_samples = 5  

# Create unigram distribution raised to 3/4 for negative sampling
word_freqs = np.array([np.sum(ppmi_mat.getrow(i)) for i in range(vocab_size)])
unigram_dist = word_freqs ** 0.75
unigram_dist = unigram_dist / unigram_dist.sum()

# Convert to torch tensor for efficient sampling
unigram_dist = torch.tensor(unigram_dist)

class SkipGramNegSampling(nn.Module):
    def __init__(self, vocab_size, embedding_dim, svd_tensor):
        super().__init__()
        self.in_embeddings = nn.Embedding.from_pretrained(svd_tensor.clone(), freeze=False)
        self.out_embeddings = nn.Embedding(vocab_size, embedding_dim)
        nn.init.uniform_(self.out_embeddings.weight, -0.5/embedding_dim, 0.5/embedding_dim)

    def forward(self, center_words, pos_context_words, neg_context_words):
        center_embeds = self.in_embeddings(center_words)  # (batch_size, embed_dim)
        pos_embeds = self.out_embeddings(pos_context_words)  # (batch_size, embed_dim)
        neg_embeds = self.out_embeddings(neg_context_words)  # (batch_size, num_neg_samples, embed_dim)

        pos_score = torch.sum(center_embeds * pos_embeds, dim=1)  # (batch_size)
        pos_loss = torch.log(torch.sigmoid(pos_score) + 1e-10)  # avoid log(0)

        neg_score = torch.bmm(neg_embeds.neg(), center_embeds.unsqueeze(2)).squeeze()  # (batch_size, num_neg_samples)
        neg_loss = torch.log(torch.sigmoid(neg_score) + 1e-10).sum(1)  # (batch_size)

        loss = - (pos_loss + neg_loss).mean()
        return loss

In [26]:
def get_negative_samples(batch_size, num_neg_samples, unigram_dist):
    neg_samples = torch.multinomial(unigram_dist, batch_size * num_neg_samples, replacement=True)
    neg_samples = neg_samples.view(batch_size, num_neg_samples)
    return neg_samples

def generate_training_data(tokenized_docs, window_size=5, neg_samples=5):
    pairs = []
    for doc in tokenized_docs:
        idxs = [word2idx[w] for w in doc if w in word2idx]
        for i, w in enumerate(idxs):
            context = idxs[max(0,i-window_size):i] + idxs[i+1:i+1+window_size+1]
            for c in context:
                pairs.append((w, c))
    return pairs


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SkipGramNegSampling(vocab_size, embedding_dim, svd_tensor)
model.to(device)  # Move model to GPU or CPU

optimizer = optim.Adam(model.parameters(), lr=0.01)

training_pairs = generate_training_data(tokenized_docs)
print(f"Training pairs: {len(training_pairs)}")

checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

def save_checkpoint(model, optimizer, epoch, loss, path):
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }, path)

def load_checkpoint(path, model, optimizer):
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    model.train()
    return epoch, loss

def find_last_checkpoint(directory):
    checkpoints = [f for f in os.listdir(directory) if f.endswith('.pt')]
    if not checkpoints:
        return None
    epochs = [int(f.split('_')[1].split('.pt')[0]) for f in checkpoints if '_' in f]
    if not epochs:
        return None
    max_epoch = max(epochs)
    return os.path.join(directory, f"checkpoint_{max_epoch}.pt"), max_epoch

start_epoch = 0
training_epochs = 20

last_ckpt = find_last_checkpoint(checkpoint_dir)
if last_ckpt is not None:
    ckpt_path, start_epoch = last_ckpt
    start_epoch, val_loss = load_checkpoint(ckpt_path, model, optimizer)
    start_epoch += 1
    print(f"Resuming training from epoch {start_epoch}")

batch_size = 128
neg_samples = 5

for epoch in range(start_epoch, training_epochs):
    total_loss = 0
    np.random.shuffle(training_pairs)

    for i in range(0, min(len(training_pairs), 100000), batch_size):
        batch = training_pairs[i:i+batch_size]

        center_batch = torch.tensor([c for c, _ in batch], dtype=torch.long).to(device)
        context_batch = torch.tensor([ctx for _, ctx in batch], dtype=torch.long).to(device)
        neg_batch = torch.tensor(np.random.choice(len(vocab), size=(len(batch), neg_samples), replace=True), dtype=torch.long).to(device)

        optimizer.zero_grad()
        loss = model(center_batch, context_batch, neg_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * len(batch)

    avg_loss = total_loss / min(len(training_pairs), 100000)
    print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")

    ckpt_path = os.path.join(checkpoint_dir, f"checkpoint_{epoch}.pt")
    save_checkpoint(model, optimizer, epoch, avg_loss, ckpt_path)
    print(f"Saved checkpoint: {ckpt_path}")

w2v_embeddings = model.in_embeddings.weight.detach().cpu().numpy()


Training pairs: 5069429
Resuming training from epoch 20


  checkpoint = torch.load(path, map_location=device)


## Eval 1

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

def top_k_similar(word, mat, k=5):
    if word not in word2idx:
        return []
    idx = word2idx[word]

    if hasattr(mat, "tocsr"):  # sparse case
        vec = mat.getrow(idx)        # 1 x V
        sims = cosine_similarity(vec, mat)[0]
    else:  # dense numpy
        sims = cosine_similarity([mat[idx]], mat)[0]

    top_idx = sims.argsort()[::-1][1:k+1]
    return [idx2word[i] for i in top_idx]

query_words = ["market", "film", "election", "football", "computer"]

results = []
for q in query_words:
    vsm_res = top_k_similar(q, ppmi_mat)
    svd_res = top_k_similar(q, svd_mat)
    w2v_res = top_k_similar(q, w2v_embeddings)
    results.append([q, vsm_res, svd_res, w2v_res])

pd.DataFrame(results, columns=["Query", "VSM", "SVD", "Word2Vec"])

Unnamed: 0,Query,VSM,SVD,Word2Vec
0,market,"[stock, share, housing, prices, growth]","[stock, share, analysts, housing, prices]","[stock, share, prices, exchange, analysts]"
1,film,"[festival, directed, director, awards, films]","[directed, festival, movie, films, fuqua]","[festival, directed, movie, films, hollywood]"
2,election,"[general, labour, campaign, party, tories]","[labour, general, campaign, tories, party]","[labour, general, campaign, campaigning, tories]"
3,football,"[league, club, manager, manchester, ferguson]","[club, warding, league, docherty, clubs]","[club, league, champions, coach, manchester]"
4,computer,"[software, users, system, mac, pc]","[pc, software, computers, frees, docking]","[software, pc, computers, use, means]"


In [36]:
def analogy(a, b, c, embeddings, k=1):
    if any(w not in word2idx for w in [a,b,c]):
        return None
    vec = embeddings[word2idx[a]] - embeddings[word2idx[b]] + embeddings[word2idx[c]]
    sims = cosine_similarity([vec], embeddings)[0]
    best = sims.argsort()[::-1]
    for idx in best:
        word = idx2word[idx]
        if word not in [a,b,c]:
            return word

questions = [
    ("business", "profit", "politics"),
    ("britain", "london", "france"),
    ("sport", "football", "tech"),
    ("minister", "government", "player"),
    ("movie", "entertainment", "legislation")

]

for a,b,c in questions:
    print(f"{a} is to {b} as {c} is to {analogy(a,b,c,w2v_embeddings)}")

business is to profit as politics is to would
britain is to london as france is to germany
sport is to football as tech is to explanations
minister is to government as player is to prime
movie is to entertainment as legislation is to starring


In [31]:
query_words = ["mirza"]

results = []
for q in query_words:
    vsm_res = top_k_similar(q, ppmi_mat)
    svd_res = top_k_similar(q, svd_mat)
    w2v_res = top_k_similar(q, w2v_embeddings)
    results.append([q, vsm_res, svd_res, w2v_res])

pd.DataFrame(results, columns=["Query", "VSM", "SVD", "Word2Vec"])

Unnamed: 0,Query,VSM,SVD,Word2Vec
0,mirza,"[sania, bondarenko, jelena, jankovic, alyona]","[sania, hyderabad, bondarenko, jelena, jankovic]","[sania, serena, hyderabad, round, martinez]"
