# Task A: Word2Vec

Verifying that PyTorch can see the CUDA-enabled GPU (the cell also sets the `KMP_DUPLICATE_LIB_OK` workaround before importing torch)


In [1]:
import os

# Set before torch is imported; presumably a workaround for the duplicate
# OpenMP runtime abort seen on some installs -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import torch

# Report whether PyTorch can reach a CUDA device, and which one.
if not torch.cuda.is_available():
    print("Installation issue: PyTorch cannot see your GPU.")
else:
    print("Success! PyTorch can see your CUDA-enabled GPU.")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Success! PyTorch can see your CUDA-enabled GPU.
GPU Name: NVIDIA GeForce GTX 1650


In [2]:
import os
import re
from collections import Counter
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


1. DATA LOADING AND PREPROCESSING

In [3]:
def setup_nltk():
    """Download the NLTK data this notebook needs, if it is not already present.

    Three resources are checked:

    * the English stopword corpus (used by ``preprocess_text``),
    * ``punkt`` (tokenizer models used by older NLTK releases),
    * ``punkt_tab`` (tokenizer tables that newer NLTK releases load from
      ``word_tokenize``; without it tokenization raises ``LookupError``).

    Each resource is only downloaded when the local lookup fails, so calling
    this function repeatedly is cheap.
    """
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')

    # Check both tokenizer resources so the notebook works on old and new NLTK.
    for resource_path, package in (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
    ):
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)

# Lazily populated by _get_stop_words(); None means "not loaded yet".
_STOP_WORDS = None


def _get_stop_words():
    """Return the English stopword set, reading it from NLTK only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def preprocess_text(text):
    """Clean and tokenize a raw document.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is deleted, and the remainder is tokenized with
    ``word_tokenize``. English stopwords and tokens of length <= 2 are dropped.

    Args:
        text: Raw document text.

    Returns:
        list[str]: The surviving tokens, in document order.
    """
    # The stopword list is cached: this function runs once per document and
    # re-reading the corpus file each time is pure overhead.
    stop_words = _get_stop_words()
    text = text.lower()
    # Characters are deleted (not replaced by a space), so "party's" -> "partys".
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and len(word) > 2]

def load_data(folder_path):
    """Load and preprocess every document found one level below ``folder_path``.

    ``folder_path`` is expected to contain one sub-directory per category;
    each regular file inside a category directory is read as UTF-8 (undecodable
    bytes are ignored) and passed through ``preprocess_text``.

    Args:
        folder_path: Root directory of the dataset.

    Returns:
        list[list[str]]: One token list per document, ordered by category name
        and then by file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    all_docs = []
    print(f"Loading data from: {folder_path}")
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and everything derived from it) reproducible.
    for category in sorted(os.listdir(folder_path)):
        category_path = os.path.join(folder_path, category)
        if not os.path.isdir(category_path):
            continue  # top-level files are not categories
        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            if not os.path.isfile(file_path):
                continue  # nested directories would make open() fail
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                all_docs.append(preprocess_text(f.read()))
    return all_docs

2. PYTORCH WORD2VEC IMPLEMENTATION

In [4]:
class Word2VecDataset(Dataset):
    """Skip-gram (target, context) pairs with freshly drawn negative samples.

    Each item is a triple of ``int64`` tensors:
    ``(target, context, negatives)`` with shapes ``(1,)``, ``(1,)`` and
    ``(num_negative_samples,)``. Negatives are drawn with replacement from
    ``word_indices`` according to ``unigram_dist`` using NumPy's global RNG;
    they are not filtered against the target or context word.

    Args:
        pairs: Indexable collection of ``(target_id, context_id)`` pairs.
        unigram_dist: 1-D sequence of sampling probabilities, one per entry of
            ``word_indices``; must be non-negative and sum to 1.
        num_negative_samples: Number of negatives returned per item.
        word_indices: 1-D sequence of candidate word ids to sample from.

    Raises:
        ValueError: If ``unigram_dist`` and ``word_indices`` are empty, differ
            in length, or ``unigram_dist`` is not a probability distribution.
    """
    def __init__(self, pairs, unigram_dist, num_negative_samples, word_indices):
        self.pairs = pairs
        self.unigram_dist = unigram_dist
        self.num_negative_samples = num_negative_samples
        self.word_indices = word_indices

        probs = np.asarray(unigram_dist, dtype=np.float64)
        candidates = np.asarray(word_indices)
        if probs.ndim != 1 or candidates.ndim != 1 or probs.size == 0:
            raise ValueError("unigram_dist and word_indices must be non-empty 1-D sequences")
        if probs.shape[0] != candidates.shape[0]:
            raise ValueError("unigram_dist and word_indices must have the same size")
        if np.any(probs < 0) or not np.isclose(probs.sum(), 1.0):
            raise ValueError("unigram_dist must be non-negative and sum to 1")

        # Build the cumulative distribution once. Sampling per item is then a
        # binary search instead of an O(vocab) pass inside np.random.choice.
        cdf = np.cumsum(probs)
        self._cdf = cdf / cdf[-1]  # last entry is exactly 1.0
        self._candidates = candidates

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        target, context = self.pairs[idx]
        # Inverse-CDF sampling: uniform draws in [0, 1) mapped through the CDF.
        draws = np.random.random_sample(self.num_negative_samples)
        positions = np.searchsorted(self._cdf, draws, side='right')
        negative_samples = self._candidates[positions]
        return (
            torch.tensor([target], dtype=torch.long),
            torch.tensor([context], dtype=torch.long),
            torch.as_tensor(negative_samples, dtype=torch.long),
        )

class SkipGramNegativeSampling(nn.Module):
    """Skip-gram Word2Vec model trained with a negative-sampling loss.

    Two sparse embedding tables are kept: ``in_embed`` for target (centre)
    words and ``out_embed`` for context / negative words.

    Args:
        vocab_size: Number of rows in each embedding table.
        embedding_dim: Dimensionality of the word vectors.
    """
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramNegativeSampling, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.out_embed = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.init_embed()

    def init_embed(self):
        """Initialise input vectors uniformly in +-0.5/dim and output vectors to zero."""
        bound = 0.5 / self.in_embed.embedding_dim
        nn.init.uniform_(self.in_embed.weight, -bound, bound)
        nn.init.zeros_(self.out_embed.weight)

    def forward(self, target_word, context_word, negative_words):
        """Return the scalar negative-sampling loss for a batch.

        Args:
            target_word: Long tensor of target ids, shape ``(B, 1)``.
            context_word: Long tensor of context ids, shape ``(B, 1)``.
            negative_words: Long tensor of negative ids, shape ``(B, K)``.

        Returns:
            0-dim tensor: mean of ``-log sigmoid(target . context)`` plus mean
            of ``-log sigmoid(-target . negative)`` (the negative term is
            averaged, not summed, over the ``K`` negatives).
        """
        v_target = self.in_embed(target_word)      # (B, 1, D)
        v_context = self.out_embed(context_word)   # (B, 1, D)
        v_negs = self.out_embed(negative_words)    # (B, K, D)

        pos_score = torch.bmm(v_target, v_context.transpose(1, 2)).squeeze(2)  # (B, 1)
        # One dot product per negative: (B, K, D) x (B, D, 1) -> (B, K).
        # Expanding the target to (B, K, D) first would produce K identical rows.
        neg_score = torch.bmm(v_negs, v_target.transpose(1, 2)).squeeze(2)

        # logsigmoid is numerically stable and keeps a useful gradient even
        # for strongly wrong scores, unlike log(sigmoid(x) + eps).
        pos_loss = -nn.functional.logsigmoid(pos_score).mean()
        neg_loss = -nn.functional.logsigmoid(-neg_score).mean()

        return pos_loss + neg_loss

3. HELPER FUNCTIONS FOR EVALUATION

In [None]:
def find_most_similar(query_word, matrix, word_to_id, id_to_word, top_n=5):
    """Return the ``top_n`` words whose vectors are most cosine-similar to ``query_word``.

    Args:
        query_word: Word to look up.
        matrix: 2-D array of word vectors, one row per word id.
        word_to_id: Mapping from word to row index in ``matrix``.
        id_to_word: Mapping from row index back to word.
        top_n: Number of neighbours to return.

    Returns:
        list[str]: Up to ``top_n`` neighbour words, most similar first. The
        query word itself is never included. If ``query_word`` is not in
        ``word_to_id``, a list of ``top_n`` copies of
        ``"Word not in vocabulary"`` is returned instead.
    """
    if query_word not in word_to_id:
        return ["Word not in vocabulary"] * top_n
    query_id = word_to_id[query_word]
    query_vector = matrix[query_id].reshape(1, -1)
    sim_scores = cosine_similarity(query_vector, matrix).flatten()
    ranked = np.argsort(sim_scores)[::-1]
    # Exclude the query by id rather than assuming it ranks first: with tied
    # or all-zero vectors another word can sort ahead of it.
    neighbours = [i for i in ranked if i != query_id][:top_n]
    return [id_to_word[i] for i in neighbours]

def solve_analogy(a, b, c, matrix, word_to_id, id_to_word):
    """Solve the analogy 'a is to b as c is to ?' by vector offset.

    The answer is the word (other than ``a``, ``b`` and ``c``) whose vector is
    most cosine-similar to ``vec(b) - vec(a) + vec(c)``.

    Args:
        a, b, c: The three words of the analogy.
        matrix: 2-D array of word vectors, one row per word id.
        word_to_id: Mapping from word to row index in ``matrix``.
        id_to_word: Mapping from row index back to word.

    Returns:
        str: The predicted word; an ``"Error: ..."`` message if any input word
        is out of vocabulary; or ``"No answer found."`` if every word in the
        vocabulary is one of the inputs.
    """
    for word in [a, b, c]:
        if word not in word_to_id:
            return f"Error: '{word}' not in vocabulary."

    # Vector arithmetic: vec(b) - vec(a) + vec(c).
    # The a->b offset is applied to c (e.g. london - britain + france ~ paris).
    vec_a = matrix[word_to_id[a]]
    vec_b = matrix[word_to_id[b]]
    vec_c = matrix[word_to_id[c]]
    result_vec = (vec_b - vec_a + vec_c).reshape(1, -1)

    sim_scores = cosine_similarity(result_vec, matrix).flatten()
    top_indices = np.argsort(sim_scores)[::-1]

    # The input words usually score highest, so skip them.
    for idx in top_indices:
        word = id_to_word[idx]
        if word not in [a, b, c]:
            return word
    return "No answer found."

4. MAIN EXECUTION (20 Epochs)

In [None]:
def main():
    """Train skip-gram Word2Vec on the BBC corpus and print the evaluation.

    Steps: download NLTK data, load and tokenize the documents, build a
    frequency-ranked vocabulary, generate (target, context) pairs within a
    sliding window, train ``SkipGramNegativeSampling`` with ``SparseAdam``,
    then print (a) a nearest-neighbour comparison table and (b) the answers
    to a fixed list of analogy questions.

    Raises:
        ValueError: If the corpus yields no tokens or no training pairs.
    """
    # Hyperparameters
    setup_nltk()
    BBC_FOLDER = 'bbc'
    VOCAB_SIZE = 10000          # upper bound; the real size is len(vocab)

    WINDOW_SIZE = 5
    EMBEDDING_DIM = 300
    NUM_NEGATIVE_SAMPLES = 5
    BATCH_SIZE = 1024
    EPOCHS = 20
    LEARNING_RATE = 0.001
    SEED = 42

    # Seed every RNG involved (negative sampling, shuffling, weight init) so
    # a Restart & Run All reproduces the same embeddings.
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    # Data Loading and Vocab Building
    documents = load_data(BBC_FOLDER)
    word_counts = Counter(token for doc in documents for token in doc)
    vocab = [word for word, _ in word_counts.most_common(VOCAB_SIZE)]
    if not vocab:
        raise ValueError(f"No tokens found under '{BBC_FOLDER}'; cannot train Word2Vec.")
    # most_common() returns fewer than VOCAB_SIZE words for a small corpus, so
    # everything downstream is sized from the vocabulary actually built.
    vocab_size = len(vocab)
    word_to_id = {word: i for i, word in enumerate(vocab)}
    id_to_word = {i: word for i, word in enumerate(vocab)}
    print(f"Vocabulary size: {len(vocab)}")

    # Prepare Data for PyTorch
    print("Generating training pairs for Word2Vec...")
    pairs = []
    for doc in tqdm(documents):
        # Out-of-vocabulary words are dropped before windowing.
        doc_indices = [word_to_id[word] for word in doc if word in word_to_id]
        for i, target_idx in enumerate(doc_indices):
            start = max(0, i - WINDOW_SIZE)
            end = min(len(doc_indices), i + WINDOW_SIZE + 1)
            context_indices = doc_indices[start:i] + doc_indices[i+1:end]
            for context_idx in context_indices:
                pairs.append((target_idx, context_idx))
    if not pairs:
        raise ValueError("No (target, context) training pairs could be generated from the corpus.")

    # Negative-sampling distribution: unigram counts raised to the 0.75 power.
    word_freqs = np.array([word_counts[word] for word in vocab])
    unigram_dist = word_freqs**0.75 / np.sum(word_freqs**0.75)
    word_indices = np.arange(vocab_size)

    dataset = Word2VecDataset(pairs, unigram_dist, NUM_NEGATIVE_SAMPLES, word_indices)
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    # Train the Word2Vec Model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = SkipGramNegativeSampling(vocab_size, EMBEDDING_DIM).to(device)
    # SparseAdam is required because the embeddings are created with sparse=True.
    optimizer = optim.SparseAdam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(EPOCHS):
        total_loss = 0
        pbar = tqdm(dataloader, desc=f"Training Epoch {epoch+1}/{EPOCHS}")
        for target, context, negs in pbar:
            target, context, negs = target.to(device), context.to(device), negs.to(device)
            optimizer.zero_grad()
            loss = model(target, context, negs)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            pbar.set_postfix({'loss': f'{loss.item():.4f}'})
        print(f"Epoch {epoch+1}, Average Loss: {total_loss / len(dataloader):.4f}")

    # The input (target) embeddings are used as the final word vectors.
    word2vec_embeddings = model.in_embed.weight.cpu().detach().numpy()

    #  Part a: Word Similarity Comparison
    print("\n--- Part a: Word Similarity Comparison ---")

    query_words = ['market', 'film', 'election', 'game', 'software']
    placeholder_df = pd.DataFrame({
        "Query Word": [
            "market (business)", "film (entertainment)", "election (politics)",
            "game (sport)", "software (tech)"
        ],
        "Top 5 Similar Words (VSM)": ["..." for _ in range(5)],
        "Top 5 Similar Words (SVD)": ["..." for _ in range(5)]
    })

    # Load VSM/SVD results from Lab 4
    try:
        results_df = pd.read_csv('task_a_results.csv')
        print("Successfully loaded results from Lab 4 (task_a_results.csv)")
    except FileNotFoundError:
        print("ERROR: 'task_a_results.csv' not found. Creating a placeholder table.")
        print("Please run Step 1 to save the file from your Lab 4 notebook for a full comparison.")
        results_df = placeholder_df

    # The Word2Vec column is assigned positionally, so the table must have one
    # row per query word; otherwise fall back instead of crashing after training.
    if len(results_df) != len(query_words):
        print(
            f"WARNING: 'task_a_results.csv' has {len(results_df)} rows but "
            f"{len(query_words)} query words were evaluated. Using a placeholder table."
        )
        results_df = placeholder_df

    w2v_results = []
    for word in query_words:
        similar_words = find_most_similar(word, word2vec_embeddings, word_to_id, id_to_word)
        w2v_results.append(", ".join(similar_words))

    results_df["Top 5 Similar Words (Word2Vec)"] = w2v_results

    print("\nFinal Comparison Table:")
    print(results_df.to_string())

    #  Part b: Analogy Questions
    print("\n\n--- Part b: Analogy Questions ---")
    analogies = [
        ("business is to profit as politics is to ?", ('business', 'profit', 'politics')),
        ("britain is to london as france is to ?", ('britain', 'london', 'france')),
        ("sport is to football as tech is to ?", ('sport', 'football', 'tech')),
        ("minister is to government as player is to ?", ('minister', 'government', 'player')),
        ("movie is to entertainment as computer is to ?", ('movie', 'entertainment', 'computer'))
    ]

    for question_text, (a, b, c) in analogies:
        answer = solve_analogy(a, b, c, word2vec_embeddings, word_to_id, id_to_word)
        print(f"\n{question_text}")
        print(f"  -> Answer: {answer}")

# Run the full pipeline when this cell/script is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

Loading data from: bbc
Vocabulary size: 10000
Generating training pairs for Word2Vec...


100%|██████████| 2225/2225 [00:00<00:00, 3012.31it/s]


Using device: cuda


Training Epoch 1/20: 100%|██████████| 4139/4139 [12:47<00:00,  5.39it/s, loss=1.1757]


Epoch 1, Average Loss: 1.2641


Training Epoch 2/20: 100%|██████████| 4139/4139 [12:39<00:00,  5.45it/s, loss=1.1475]


Epoch 2, Average Loss: 1.1640


Training Epoch 3/20: 100%|██████████| 4139/4139 [13:12<00:00,  5.22it/s, loss=1.0907]


Epoch 3, Average Loss: 1.1067


Training Epoch 4/20: 100%|██████████| 4139/4139 [12:40<00:00,  5.44it/s, loss=1.0555]


Epoch 4, Average Loss: 1.0601


Training Epoch 5/20: 100%|██████████| 4139/4139 [13:01<00:00,  5.29it/s, loss=1.0389]


Epoch 5, Average Loss: 1.0200


Training Epoch 6/20: 100%|██████████| 4139/4139 [12:35<00:00,  5.48it/s, loss=0.9689]


Epoch 6, Average Loss: 0.9852


Training Epoch 7/20: 100%|██████████| 4139/4139 [12:33<00:00,  5.50it/s, loss=0.9233]


Epoch 7, Average Loss: 0.9547


Training Epoch 8/20: 100%|██████████| 4139/4139 [12:37<00:00,  5.47it/s, loss=0.9117]


Epoch 8, Average Loss: 0.9281


Training Epoch 9/20: 100%|██████████| 4139/4139 [12:39<00:00,  5.45it/s, loss=0.9041]


Epoch 9, Average Loss: 0.9056


Training Epoch 10/20: 100%|██████████| 4139/4139 [12:54<00:00,  5.34it/s, loss=0.8493]


Epoch 10, Average Loss: 0.8856


Training Epoch 11/20: 100%|██████████| 4139/4139 [13:00<00:00,  5.30it/s, loss=0.8749]


Epoch 11, Average Loss: 0.8682


Training Epoch 12/20: 100%|██████████| 4139/4139 [12:51<00:00,  5.37it/s, loss=0.8145]


Epoch 12, Average Loss: 0.8532


Training Epoch 13/20: 100%|██████████| 4139/4139 [12:36<00:00,  5.47it/s, loss=0.8575]


Epoch 13, Average Loss: 0.8404


Training Epoch 14/20: 100%|██████████| 4139/4139 [12:30<00:00,  5.51it/s, loss=0.8551]


Epoch 14, Average Loss: 0.8289


Training Epoch 15/20: 100%|██████████| 4139/4139 [12:41<00:00,  5.44it/s, loss=0.8636]


Epoch 15, Average Loss: 0.8190


Training Epoch 16/20: 100%|██████████| 4139/4139 [12:41<00:00,  5.43it/s, loss=0.8551]


Epoch 16, Average Loss: 0.8101


Training Epoch 17/20: 100%|██████████| 4139/4139 [12:36<00:00,  5.47it/s, loss=0.8110]


Epoch 17, Average Loss: 0.8021


Training Epoch 18/20: 100%|██████████| 4139/4139 [12:59<00:00,  5.31it/s, loss=0.7813]


Epoch 18, Average Loss: 0.7948


Training Epoch 19/20: 100%|██████████| 4139/4139 [13:04<00:00,  5.28it/s, loss=0.8009]


Epoch 19, Average Loss: 0.7888


Training Epoch 20/20: 100%|██████████| 4139/4139 [12:59<00:00,  5.31it/s, loss=0.7962]


Epoch 20, Average Loss: 0.7825

--- Part a: Word Similarity Comparison ---
Successfully loaded results from Lab 4 (task_a_results.csv)

Final Comparison Table:
             Query Word                       Top 5 Similar Words (VSM)                     Top 5 Similar Words (SVD)                      Top 5 Similar Words (Word2Vec)
0     market (business)           stock, housing, growth, prices, sales        stock, analysts, share, growth, prices    stock, steadily, capitalisation, wimpey, buoyant
1  film (entertainment)          best, awards, actress, director, films           films, movie, awards, best, actress    cheadle, documentary, starring, directed, moores
2   election (politics)         general, labour, campaign, blair, party    labour, general, partys, campaign, labours     general, presidential, milburn, slogan, labours
3          game (sport)            games, play, players, match, playing          play, games, players, playing, first          encounter, warcraft, kirwan, halo