# Lab Session 4 - Task B: Word2Vec

1. Setup and Data Loading

In [3]:
import os
import re
from collections import Counter
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Data Loading and Preprocessing (from Task A) 
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and len(word) > 2]

def load_data(folder_path):
    all_docs = []
    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category)
        if not os.path.isdir(category_path): continue
        for filename in os.listdir(category_path):
            file_path = os.path.join(category_path, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                all_docs.append(preprocess_text(f.read()))
    return all_docs

BBC_FOLDER = 'bbc'
documents = load_data(BBC_FOLDER)

# Vocabulary Creation (from Task A) 
VOCAB_SIZE = 10000
all_tokens = [token for doc in documents for token in doc]
word_counts = Counter(all_tokens)
vocab = [word for word, count in word_counts.most_common(VOCAB_SIZE)]
word_to_id = {word: i for i, word in enumerate(vocab)}
id_to_word = {i: word for i, word in enumerate(vocab)}
vocab_set = set(vocab)

print(f"Setup complete. Vocabulary size: {len(vocab)}")

Setup complete. Vocabulary size: 10000


 2. Prepare Data for Word2Vec

In [4]:
WINDOW_SIZE = 5
NUM_NEGATIVE_SAMPLES = 5

# Generate skip-gram pairs
pairs = []
for doc in tqdm(documents, desc="Generating training pairs"):
    doc_indices = [word_to_id[word] for word in doc if word in vocab_set]
    for i, target_idx in enumerate(doc_indices):
        start = max(0, i - WINDOW_SIZE)
        end = min(len(doc_indices), i + WINDOW_SIZE + 1)
        context_indices = doc_indices[start:i] + doc_indices[i+1:end]
        for context_idx in context_indices:
            pairs.append((target_idx, context_idx))

# Prepare for negative sampling
word_freqs = np.array([word_counts[word] for word in vocab])
unigram_dist = word_freqs**0.75 / np.sum(word_freqs**0.75)
word_indices = np.arange(VOCAB_SIZE)

class Word2VecDataset(Dataset):
    def __init__(self, pairs, unigram_dist, num_negative_samples):
        self.pairs = pairs
        self.unigram_dist = unigram_dist
        self.num_negative_samples = num_negative_samples

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        target, context = self.pairs[idx]
        negative_samples = np.random.choice(word_indices, size=self.num_negative_samples, p=self.unigram_dist)
        return torch.LongTensor([target]), torch.LongTensor([context]), torch.LongTensor(negative_samples)


Generating training pairs: 100%|██████████| 2225/2225 [00:01<00:00, 1968.31it/s]


3. Define the PyTorch Word2Vec Model

In [5]:
class SkipGramNegativeSampling(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramNegativeSampling, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.init_embed()

    def init_embed(self):
        nn.init.uniform_(self.in_embed.weight, -0.5 / self.in_embed.embedding_dim, 0.5 / self.in_embed.embedding_dim)
        nn.init.uniform_(self.out_embed.weight, -0.5 / self.out_embed.embedding_dim, 0.5 / self.out_embed.embedding_dim)
        
    def forward(self, target_word, context_word, negative_words):
        v_target = self.in_embed(target_word)  
        v_context = self.out_embed(context_word) 
        v_negs = self.out_embed(negative_words)
        
        pos_score = torch.bmm(v_target, v_context.transpose(1, 2)).squeeze(2)
        pos_loss = -torch.log(torch.sigmoid(pos_score) + 1e-8).mean()
        
        neg_score = torch.bmm(v_target.expand(v_negs.size()), v_negs.transpose(1, 2))
        neg_loss = -torch.log(torch.sigmoid(-neg_score) + 1e-8).mean()
        
        return pos_loss + neg_loss

4. Train the Model

In [None]:
EMBEDDING_DIM = 300
BATCH_SIZE = 1024
EPOCHS = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

dataset = Word2VecDataset(pairs, unigram_dist, NUM_NEGATIVE_SAMPLES)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

model = SkipGramNegativeSampling(VOCAB_SIZE, EMBEDDING_DIM).to(device)
optimizer = optim.SparseAdam(model.parameters(), lr=0.001)

for epoch in range(EPOCHS):
    total_loss = 0
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    for target, context, negs in pbar:
        target, context, negs = target.to(device), context.to(device), negs.to(device)
        
        optimizer.zero_grad()
        loss = model(target, context, negs)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        pbar.set_postfix({'loss': loss.item()})
    
    print(f"Epoch {epoch+1}, Average Loss: {total_loss / len(dataloader)}")

word2vec_embeddings = model.in_embed.weight.cpu().detach().numpy()

Using device: cpu


Epoch 1/3: 100%|██████████| 4139/4139 [13:10<00:00,  5.24it/s, loss=1.21]


Epoch 1, Average Loss: 1.2633462648053018


Epoch 2/3: 100%|██████████| 4139/4139 [12:47<00:00,  5.40it/s, loss=1.13]


Epoch 2, Average Loss: 1.1624225196875244


Epoch 3/3: 100%|██████████| 4139/4139 [13:21<00:00,  5.16it/s, loss=1.07]

Epoch 3, Average Loss: 1.1048398629050522





5. Task B.a: Word Similarity Comparison

In [14]:
def find_most_similar(query_word, matrix, word_to_id, id_to_word, top_n=5):
    if query_word not in word_to_id:
        return [("Word not in vocabulary", 0)] * top_n
    query_id = word_to_id[query_word]
    query_vector = matrix[query_id].reshape(1, -1)
    sim_scores = cosine_similarity(query_vector, matrix).flatten()
    top_indices = np.argsort(sim_scores)[::-1][1:top_n+1]
    return [(id_to_word[i], sim_scores[i]) for i in top_indices]

query_words = ['market', 'film', 'election', 'game', 'software']
word2vec_results = []

for word in query_words:
    similar_words = find_most_similar(word, word2vec_embeddings, word_to_id, id_to_word)
    word2vec_results.append(", ".join([w[0] for w in similar_words]))

print("\n--- Word2Vec Similarity Results ---")
for q, res in zip(query_words, word2vec_results):
    print(f"{q}: {res}")


--- Word2Vec Similarity Results ---
market: stock, ntpc, upsurge, fujitsu, steadily
film: berlin, directed, fahrenheit, daylewis, gibsons
election: campaigning, looming, slogan, turnout, polls
game: encounter, warcraft, toshack, kicker, awesome
software: opensource, programs, patents, linux, windows


6. Task B.b: Analogy Questions

In [16]:
def solve_analogy(a, b, c, matrix, word_to_id, id_to_word):
    for word in [a, b, c]:
        if word not in word_to_id:
            return f"Error: '{word}' not in vocabulary."
    vec_a = matrix[word_to_id[a]]
    vec_b = matrix[word_to_id[b]]
    vec_c = matrix[word_to_id[c]]
    result_vec = (vec_a - vec_b + vec_c).reshape(1, -1)
    sim_scores = cosine_similarity(result_vec, matrix).flatten()
    top_indices = np.argsort(sim_scores)[::-1]
    for idx in top_indices:
        word = id_to_word[idx]
        if word not in [a, b, c]:
            return word
    return "No answer found."

analogies = [
    ("business is to profit as politics is to ?", ('business', 'profit', 'politics')),
    ("britain is to london as france is to ?", ('britain', 'london', 'france')),
    ("sport is to football as tech is to ?", ('sport', 'football', 'tech')),
    ("minister is to government as player is to ?", ('minister', 'government', 'player')),
    ("movie is to entertainment as computer is to ?", ('movie', 'entertainment', 'computer'))
]

print("\n Analogy Question Results \n")
for question_text, (a, b, c) in analogies:
    answer = solve_analogy(a, b, c, word2vec_embeddings, word_to_id, id_to_word)
    print(f"{question_text}\n  -> Answer: {answer}\n")


 Analogy Question Results 

business is to profit as politics is to ?
  -> Answer: cardinal

britain is to london as france is to ?
  -> Answer: germany

sport is to football as tech is to ?
  -> Answer: exploiting

minister is to government as player is to ?
  -> Answer: prime

movie is to entertainment as computer is to ?
  -> Answer: mouse

