In [None]:
import os
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, csr_matrix
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity

# PyTorch for neural network models
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# NLTK for natural language processing tasks
import nltk
# nltk.data.find signals a missing resource by raising LookupError, so that is
# the exception to catch. Each resource is probed on its own so that only the
# ones that are actually absent get downloaded.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Plotly for interactive visualizations
import plotly.express as px

# tqdm for progress bars
from tqdm.notebook import tqdm

# --- Configuration ---
# Corpus location and vocabulary limits.
DATASET_PATH = 'bbc'
VOCAB_SIZE = 10_000
MIN_WORD_COUNT = 5

# Context window radius (K=5) and the shared embedding width (d=300) used by
# both the SVD and the Word2Vec models.
WINDOW_SIZE = 5
SVD_DIMS = W2V_DIMS = 300

# --- Set seeds for reproducibility ---
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

print("\nLibraries imported and configuration set.")


Libraries imported and configuration set.


In [2]:
def load_data(path):
    """Loads text documents from the BBC dataset directory.

    Every sub-directory of ``path`` is treated as a category and every
    ``.txt`` file directly inside it is read as one document (decoded as
    latin-1). Entries of ``path`` that are not directories, and files without
    the ``.txt`` suffix, are skipped.

    Args:
        path: Root directory that contains one folder per category.

    Returns:
        list[str]: Document contents, ordered by category name and then by
        file name.
    """
    texts = []
    # os.listdir returns entries in arbitrary order; sorting makes the corpus
    # order (and everything seeded downstream of it) reproducible.
    for category in sorted(os.listdir(path)):
        category_path = os.path.join(path, category)
        if not os.path.isdir(category_path):
            continue
        for filename in sorted(os.listdir(category_path)):
            if filename.endswith('.txt'):
                with open(os.path.join(category_path, filename), 'r', encoding='latin-1') as f:
                    texts.append(f.read())
    return texts

def preprocess(texts):
    """Cleans and tokenizes a list of text documents.

    Each document is lowercased, stripped of ASCII punctuation and digit
    runs, tokenized with ``word_tokenize``, and filtered so that English
    stop words and tokens of two characters or fewer are dropped.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        list[list[str]]: One token list per input document, in input order.
    """
    print("Preprocessing documents...")
    punctuation_pattern = re.compile(f'[{re.escape(string.punctuation)}]')
    digits_pattern = re.compile(r'\d+')
    stop_words = set(stopwords.words('english'))

    def _clean_and_tokenize(document):
        # Punctuation and digits are deleted outright (not replaced by spaces).
        cleaned = punctuation_pattern.sub('', document.lower())
        cleaned = digits_pattern.sub('', cleaned)
        return [
            token
            for token in word_tokenize(cleaned)
            if len(token) > 2 and token not in stop_words
        ]

    tokenized_corpus = [_clean_and_tokenize(document) for document in texts]
    print("Preprocessing complete.")
    return tokenized_corpus

def build_vocabulary(corpus, vocab_size=None, min_count=None):
    """Builds vocabulary and word-to-index mappings.

    Words are ranked by corpus frequency (most frequent first); words seen
    fewer than ``min_count`` times are discarded and at most ``vocab_size``
    of the remaining words are kept.

    Args:
        corpus: Iterable of token lists.
        vocab_size: Maximum number of words to keep. Defaults to the
            module-level ``VOCAB_SIZE`` when omitted.
        min_count: Minimum corpus frequency for a word to be kept. Defaults
            to the module-level ``MIN_WORD_COUNT`` when omitted.

    Returns:
        tuple: ``(vocabulary, word_to_idx, idx_to_word)`` where ``vocabulary``
        is the ordered word list and the two dicts map word -> position and
        position -> word in that list.
    """
    # Resolve defaults at call time so explicit arguments never touch the
    # notebook-level configuration.
    if vocab_size is None:
        vocab_size = VOCAB_SIZE
    if min_count is None:
        min_count = MIN_WORD_COUNT

    print("Building vocabulary...")
    word_counts = Counter(word for doc in corpus for word in doc)

    # most_common() yields (word, count) pairs in descending count order.
    frequent_words = [word for word, count in word_counts.most_common() if count >= min_count]
    vocabulary = frequent_words[:vocab_size]

    # Create word-to-index mappings
    word_to_idx = {word: i for i, word in enumerate(vocabulary)}
    idx_to_word = dict(enumerate(vocabulary))
    print("Vocabulary built.")
    return vocabulary, word_to_idx, idx_to_word

print("Preprocessing functions defined.")

Preprocessing functions defined.


In [3]:
# Data preparation pipeline: read the raw documents from DATASET_PATH, turn
# them into token lists, then derive the vocabulary and its index mappings.
print("--- Starting Data Preparation ---")
docs = load_data(DATASET_PATH)
tokenized_corpus = preprocess(docs)
# vocabulary / word_to_idx / idx_to_word are reused by every later cell.
vocabulary, word_to_idx, idx_to_word = build_vocabulary(tokenized_corpus)
print(f"\nData preparation complete. Vocabulary size: {len(vocabulary)}")

--- Starting Data Preparation ---
Preprocessing documents...
Preprocessing complete.
Building vocabulary...
Vocabulary built.

Data preparation complete. Vocabulary size: 10000


In [4]:
def build_cooccurrence_matrix(corpus, word_to_idx, window_size=5):
    """Builds a word-word co-occurrence matrix.

    Out-of-vocabulary tokens are removed from each document first, so the
    window spans the remaining in-vocabulary tokens. For every position, each
    other token within ``window_size`` positions on either side adds one to
    the cell ``[target, context]``.

    Args:
        corpus: Iterable of token lists.
        word_to_idx: Mapping from word to its row/column index.
        window_size: Number of neighbouring tokens considered on each side.

    Returns:
        scipy.sparse.csr_matrix: ``(V, V)`` float32 count matrix, where ``V``
        is ``len(word_to_idx)``.
    """
    vocab_size = len(word_to_idx)

    print("Building co-occurrence matrix (this may take a moment)...")
    # Accumulate counts in a plain Counter; updating individual cells of a
    # sparse matrix inside the innermost loop is far slower than a dict update.
    pair_counts = Counter()
    for doc in corpus:
        doc_indices = [word_to_idx[word] for word in doc if word in word_to_idx]
        doc_length = len(doc_indices)
        for i, target_idx in enumerate(doc_indices):
            start = max(0, i - window_size)
            end = min(doc_length, i + window_size + 1)
            for j in range(start, end):
                if i != j:
                    pair_counts[(target_idx, doc_indices[j])] += 1

    # Materialise the sparse matrix once from (row, col, count) triplets.
    n_pairs = len(pair_counts)
    rows = np.fromiter((pair[0] for pair in pair_counts), dtype=np.int64, count=n_pairs)
    cols = np.fromiter((pair[1] for pair in pair_counts), dtype=np.int64, count=n_pairs)
    data = np.fromiter(pair_counts.values(), dtype=np.float32, count=n_pairs)
    cooc_matrix = csr_matrix((data, (rows, cols)), shape=(vocab_size, vocab_size), dtype=np.float32)

    print("Co-occurrence matrix built.")
    return cooc_matrix

def calculate_ppmi(cooc_matrix):
    """Calculates the Positive Pointwise Mutual Information (PPMI) matrix.

    For every stored cell ``(w, c)`` the value is
    ``max(0, log2(P(w, c) / (P(w) * P(c)) + 1e-9))`` where ``P(w, c)`` is the
    cell count divided by the matrix total and both marginals are taken from
    the row sums (rows with a zero sum are treated as having count 1 to avoid
    division by zero). Cells with a zero count always yield 0, so only stored
    entries are evaluated and the V x V matrix is never densified.

    Args:
        cooc_matrix: Co-occurrence counts (SciPy sparse matrix or 2-D array).

    Returns:
        scipy.sparse.csr_matrix: PPMI matrix with the same shape as the input.
        Floating-point inputs keep their dtype; other inputs yield float64.
        An input whose total count is zero yields an all-zero matrix.
    """
    counts = csr_matrix(cooc_matrix, copy=True)
    counts.sum_duplicates()  # guarantee one stored value per (row, col)
    out_dtype = counts.dtype if np.issubdtype(counts.dtype, np.floating) else np.float64

    total_cooccurrences = float(counts.data.sum(dtype=np.float64))
    if total_cooccurrences == 0:
        # Nothing co-occurs: every probability is undefined, so PPMI is all zeros.
        return csr_matrix(counts.shape, dtype=out_dtype)

    word_counts = np.asarray(counts.sum(axis=1), dtype=np.float64).ravel()
    word_counts[word_counts == 0] = 1
    p_w = word_counts / total_cooccurrences

    entries = counts.tocoo()
    p_w_c = entries.data.astype(np.float64) / total_cooccurrences
    pmi = np.log2(p_w_c / (p_w[entries.row] * p_w[entries.col]) + 1e-9)
    ppmi_values = np.maximum(0, pmi)  # This step ensures it's Positive PMI

    ppmi_matrix = csr_matrix(
        (ppmi_values.astype(out_dtype), (entries.row, entries.col)),
        shape=counts.shape,
    )
    ppmi_matrix.eliminate_zeros()  # drop cells clipped to zero
    return ppmi_matrix

def get_most_similar(query_word, word_vectors, word_to_idx, idx_to_word, top_n=5):
    """Returns the words whose vectors are most cosine-similar to a query word.

    Args:
        query_word: Word to look up.
        word_vectors: Row-indexable 2-D collection of word vectors (dense
            array or sparse matrix); row ``i`` belongs to ``idx_to_word[i]``.
        word_to_idx: Mapping from word to row index.
        idx_to_word: Mapping from row index to word.
        top_n: Number of neighbours to return.

    Returns:
        list[str]: Up to ``top_n`` neighbour words, most similar first. The
        query word itself is never included. If ``query_word`` is not in
        ``word_to_idx`` the list holds ``top_n`` copies of a
        "not in vocabulary" message instead.
    """
    if query_word not in word_to_idx:
        return [f"'{query_word}' not in vocabulary"] * top_n

    query_idx = word_to_idx[query_word]
    query_vector = word_vectors[query_idx].reshape(1, -1)

    similarities = cosine_similarity(query_vector, word_vectors).flatten()

    # Rank all rows, then drop the query explicitly. Skipping "the first
    # result" is not safe: with tied or all-zero vectors the query is not
    # guaranteed to rank first. A stable sort keeps ties in index order.
    ranked = np.argsort(-similarities, kind='stable')
    top_indices = ranked[ranked != query_idx][:top_n]

    return [idx_to_word[i] for i in top_indices]

print(" VSM & SVD helper functions defined.")

 VSM & SVD helper functions defined.


In [5]:
# Task A pipeline: co-occurrence counts -> PPMI weighting -> truncated SVD.
print("--- Starting Task A: VSM & SVD Execution ---")
print("Building co-occurrence matrix...")
cooc_matrix = build_cooccurrence_matrix(tokenized_corpus, word_to_idx, window_size=WINDOW_SIZE)

print("\nCalculating PPMI matrix...")
ppmi_matrix = calculate_ppmi(cooc_matrix)
print(f"Final PPMI matrix dimensions: {ppmi_matrix.shape}")

# Reduce each PPMI row to SVD_DIMS components; seeded with RANDOM_SEED.
print("\nApplying Truncated SVD...")
svd = TruncatedSVD(n_components=SVD_DIMS, random_state=RANDOM_SEED)
svd_vectors = svd.fit_transform(ppmi_matrix)
print(f"SVD-reduced matrix dimensions: {svd_vectors.shape}")

print("\n Task A models are ready.")

--- Starting Task A: VSM & SVD Execution ---
Building co-occurrence matrix...
Building co-occurrence matrix (this may take a moment)...
Co-occurrence matrix built.

Calculating PPMI matrix...
Final PPMI matrix dimensions: (10000, 10000)

Applying Truncated SVD...
SVD-reduced matrix dimensions: (10000, 300)

 Task A models are ready.


In [6]:
# Compare nearest neighbours of a few probe words under the two Task A
# representations: raw PPMI rows versus the SVD-reduced vectors.
query_words = ['market', 'film', 'election', 'game', 'software']
results_data = []

for word in query_words:
    vsm_sim = get_most_similar(word, ppmi_matrix, word_to_idx, idx_to_word)
    svd_sim = get_most_similar(word, svd_vectors, word_to_idx, idx_to_word)
    # One table row per query word; neighbours are joined into a single string.
    results_data.append({
        "Query Word": word,
        "Top 5 (VSM - PPMI)": ', '.join(vsm_sim),
        "Top 5 (SVD)": ', '.join(svd_sim)
    })
# results_df_A is extended with a Word2Vec column in a later cell.
results_df_A = pd.DataFrame(results_data)
print("--- VSM vs. SVD Similarity Results ---")
display(results_df_A)

--- VSM vs. SVD Similarity Results ---


Unnamed: 0,Query Word,Top 5 (VSM - PPMI),Top 5 (SVD)
0,market,"stock, housing, growth, prices, sales","stock, share, analysts, growth, prices"
1,film,"best, awards, actress, director, actor","films, movie, best, actress, director"
2,election,"general, labour, campaign, blair, party","labour, general, partys, labours, campaign"
3,game,"games, play, players, match, playing","play, games, playing, players, first"
4,software,"microsoft, programs, users, antivirus, windows","programs, microsoft, windows, users, program"


In [7]:
class Word2VecDataset(Dataset):
    """PyTorch Dataset for generating skip-gram pairs and negative samples.

    Each item is ``(target, context, negatives)``: one (target, context) index
    pair taken from a sliding window over the corpus, plus
    ``num_neg_samples`` word indices drawn from the unigram distribution
    raised to the 0.75 power.

    Args:
        corpus: Iterable of token lists.
        word_to_idx: Mapping from word to index. The sampling distribution is
            laid out in index order, so indices are assumed to be exactly
            ``0 .. len(word_to_idx) - 1``.
        word_counts: Mapping from word to corpus frequency; must contain
            every word in ``word_to_idx``.
        window_size: Number of neighbouring tokens considered on each side.
        num_neg_samples: Number of negative samples drawn per item.
    """
    def __init__(self, corpus, word_to_idx, word_counts, window_size=5, num_neg_samples=5):
        self.data = []
        self.num_neg_samples = num_neg_samples

        # Build the distribution from the word_to_idx argument (ordered by
        # index) rather than from any notebook-level vocabulary list, so the
        # class works with whatever mapping it is given.
        words_by_index = sorted(word_to_idx, key=word_to_idx.get)
        freq = np.array([word_counts[word] for word in words_by_index], dtype=np.float64) ** 0.75
        self.sampling_dist = freq / freq.sum()

        # Cumulative distribution, computed once, so __getitem__ can sample
        # by inverse-CDF lookup instead of rebuilding it on every call.
        self._sampling_cdf = np.cumsum(self.sampling_dist)
        if self._sampling_cdf.size:
            self._sampling_cdf /= self._sampling_cdf[-1]

        print("Creating Skip-gram dataset (this might take a minute)...")

        for doc in corpus:
            # Out-of-vocabulary tokens are dropped before windowing.
            doc_indices = [word_to_idx[word] for word in doc if word in word_to_idx]
            doc_length = len(doc_indices)
            for i, target_word_idx in enumerate(doc_indices):
                start = max(0, i - window_size)
                end = min(doc_length, i + window_size + 1)
                for j in range(start, end):
                    if i != j:
                        self.data.append((target_word_idx, doc_indices[j]))
        print("Dataset creation complete.")

    def __len__(self):
        """Returns the number of (target, context) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Returns ``(target, context, negatives)`` tensors for pair ``idx``."""
        target, context = self.data[idx]
        # Draw negative samples from the frequency distribution using NumPy's
        # global RNG: uniform draws in [0, 1) mapped through the stored CDF.
        uniform_draws = np.random.random_sample(self.num_neg_samples)
        neg_samples = np.searchsorted(self._sampling_cdf, uniform_draws, side='right')
        return torch.tensor(target), torch.tensor(context), torch.from_numpy(neg_samples)

class SkipGramNegativeSampling(nn.Module):
    """Skip-gram with Negative Sampling, implemented as a PyTorch module.

    Holds two embedding tables of shape ``(vocab_size, embed_dim)``: one for
    target words and one for context words. ``forward`` returns the scalar
    training loss for a batch.
    """
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.context_embeddings = nn.Embedding(vocab_size, embed_dim)
        self.init_embeds()

    def init_embeds(self):
        """Initializes target vectors uniformly in +/- 0.5/dim and context vectors to zero."""
        bound = 0.5 / self.target_embeddings.embedding_dim
        self.target_embeddings.weight.data.uniform_(-bound, bound)
        # A degenerate uniform range leaves every context weight at zero.
        self.context_embeddings.weight.data.uniform_(-0, 0)

    def forward(self, target, context, neg_samples):
        """Computes the batch loss.

        Args:
            target: Index tensor of target words.
            context: Index tensor of true context words (same length as ``target``).
            neg_samples: Index tensor of negative samples, one row per target.

        Returns:
            Scalar tensor: mean positive-pair loss plus mean negative-pair loss.
        """
        log_sigmoid = torch.nn.functional.logsigmoid

        center_vecs = self.target_embeddings(target)
        true_context_vecs = self.context_embeddings(context)
        negative_vecs = self.context_embeddings(neg_samples)

        # Dot product of each target with its true context word.
        pos_score = (center_vecs * true_context_vecs).sum(dim=1)
        # Dot product of each target with each of its negative samples.
        neg_score = torch.bmm(negative_vecs, center_vecs.unsqueeze(2)).squeeze()

        pos_loss = -log_sigmoid(pos_score).mean()
        neg_loss = -log_sigmoid(-neg_score).mean()
        return pos_loss + neg_loss

print("Word2Vec Model and Dataset classes defined.")

Word2Vec Model and Dataset classes defined.


In [8]:
print("--- Starting Task B: Word2Vec Training ---")

# Get word counts needed for negative sampling distribution
# (counted over the whole tokenized corpus, including out-of-vocabulary words).
full_word_counts = Counter(word for doc in tokenized_corpus for word in doc)

# Create dataset and dataloader
dataset = Word2VecDataset(
    tokenized_corpus, word_to_idx, full_word_counts, window_size=WINDOW_SIZE
)
dataloader = DataLoader(dataset, batch_size=1024, shuffle=True)

# Initialize model and optimizer
model = SkipGramNegativeSampling(vocab_size=len(vocabulary), embed_dim=W2V_DIMS)
optimizer = optim.Adam(model.parameters(), lr=0.005)

# Training loop
num_epochs = 50
model.train()
print("Starting model training...")
for epoch in range(num_epochs):
    total_loss = 0
    # Iterate the dataloader directly (no progress-bar wrapper); one summary
    # line is printed per epoch instead.
    for target, context, neg_samples in dataloader:
        optimizer.zero_grad()
        loss = model(target, context, neg_samples)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(dataloader):.4f}")

# Only the target-side embedding table is kept as the word vectors.
w2v_vectors = model.target_embeddings.weight.data.cpu().numpy()
print("\n Word2Vec model training complete.")

--- Starting Task B: Word2Vec Training ---
Creating Skip-gram dataset (this might take a minute)...
Dataset creation complete.
Starting model training...
Epoch 1, Loss: 1.2512
Epoch 2, Loss: 1.0957
Epoch 3, Loss: 1.0230
Epoch 4, Loss: 0.9925
Epoch 5, Loss: 0.9758
Epoch 6, Loss: 0.9657
Epoch 7, Loss: 0.9597
Epoch 8, Loss: 0.9552
Epoch 9, Loss: 0.9515
Epoch 10, Loss: 0.9496
Epoch 11, Loss: 0.9480
Epoch 12, Loss: 0.9458
Epoch 13, Loss: 0.9457
Epoch 14, Loss: 0.9447
Epoch 15, Loss: 0.9437
Epoch 16, Loss: 0.9436
Epoch 17, Loss: 0.9432
Epoch 18, Loss: 0.9431
Epoch 19, Loss: 0.9428
Epoch 20, Loss: 0.9432
Epoch 21, Loss: 0.9422
Epoch 22, Loss: 0.9423
Epoch 23, Loss: 0.9428
Epoch 24, Loss: 0.9419
Epoch 25, Loss: 0.9427
Epoch 26, Loss: 0.9426
Epoch 27, Loss: 0.9425
Epoch 28, Loss: 0.9427
Epoch 29, Loss: 0.9430
Epoch 30, Loss: 0.9425
Epoch 31, Loss: 0.9431
Epoch 32, Loss: 0.9426
Epoch 33, Loss: 0.9427
Epoch 34, Loss: 0.9430
Epoch 35, Loss: 0.9434
Epoch 36, Loss: 0.9433
Epoch 37, Loss: 0.9432
Epoc

In [9]:
def solve_analogy(w1, w2, w3, word_vectors, word_to_idx, idx_to_word):
    """Solves word analogy: w1 is to w2 as w3 is to ?

    Builds the vector ``w2 - w1 + w3`` and returns the vocabulary word whose
    vector has the highest cosine similarity to it, never returning one of
    the three query words.

    Args:
        w1, w2, w3: The three words of the analogy.
        word_vectors: 2-D array of word vectors, one row per vocabulary index.
        word_to_idx: Mapping from word to row index.
        idx_to_word: Mapping from row index to word.

    Returns:
        str: The best-matching word, or an explanatory message when any of
        the query words is missing from ``word_to_idx``.
    """
    query_terms = (w1, w2, w3)
    if any(term not in word_to_idx for term in query_terms):
        return "One or more words not in vocabulary."

    idx1, idx2, idx3 = (word_to_idx[term] for term in query_terms)
    analogy_vec = word_vectors[idx2] - word_vectors[idx1] + word_vectors[idx3]

    scores = cosine_similarity(analogy_vec.reshape(1, -1), word_vectors).flatten()

    # Mask the query words so none of them can be chosen as the answer.
    scores[[idx1, idx2, idx3]] = -np.inf

    best_idx = np.argmax(scores)
    return idx_to_word[best_idx]

# --- 1. Final Similarity Comparison ---
# Adds a Word2Vec column to the existing results_df_A table (the frame is
# modified in place, so re-running this cell overwrites that column).
print("--- Final Word Similarity Comparison ---")
w2v_sim_results = [', '.join(get_most_similar(q, w2v_vectors, word_to_idx, idx_to_word)) for q in query_words]
results_df_A['Top 5 (Word2Vec)'] = w2v_sim_results
display(results_df_A)

# --- 2. Word Analogy Tasks ---
# Each tuple is (w1, w2, w3), read as "w1 is to w2 as w3 is to ?".
print("\n--- Word Analogy Tasks (Word2Vec) ---")
analogies = [
    ("business", "profit", "politics"),
    ("britain", "london", "france"),
    ("sport", "football", "tech"),
    ("minister", "government", "player"),
    ("movie", "entertainment", "computer")
]
for w1, w2, w3 in analogies:
    answer = solve_analogy(w1, w2, w3, w2v_vectors, word_to_idx, idx_to_word)
    print(f"'{w1}' is to '{w2}' as '{w3}' is to ---> '{answer}'")

--- Final Word Similarity Comparison ---


Unnamed: 0,Query Word,Top 5 (VSM - PPMI),Top 5 (SVD),Top 5 (Word2Vec)
0,market,"stock, housing, growth, prices, sales","stock, share, analysts, growth, prices","stock, shares, value, detroit, housing"
1,film,"best, awards, actress, director, actor","films, movie, best, actress, director","movie, awards, festival, oscar, aviator"
2,election,"general, labour, campaign, blair, party","labour, general, partys, labours, campaign","general, labour, party, blair, campaign"
3,game,"games, play, players, match, playing","play, games, playing, players, first","games, players, played, match, play"
4,software,"microsoft, programs, users, antivirus, windows","programs, microsoft, windows, users, program","programs, microsoft, antivirus, patent, linux"



--- Word Analogy Tasks (Word2Vec) ---
'business' is to 'profit' as 'politics' is to ---> 'carlsberg'
'britain' is to 'london' as 'france' is to ---> 'achilles'
'sport' is to 'football' as 'tech' is to ---> 'talents'
'minister' is to 'government' as 'player' is to ---> 'cord'
'movie' is to 'entertainment' as 'computer' is to ---> 'sina'


In [10]:
def _show_embedding_scatter(points_2d, labels, title):
    """Renders one interactive 2-D scatter plot with per-point hover labels."""
    fig = px.scatter(
        x=points_2d[:, 0], y=points_2d[:, 1],
        hover_name=labels,
        title=title,
        width=800, height=600
    )
    fig.update_traces(mode='markers')
    fig.show()


def visualize_embeddings(vectors, words_to_show, title):
    """Projects vectors to 2D using PCA and t-SNE and creates interactive plots.

    Args:
        vectors: 2-D array with one row per word.
        words_to_show: Labels shown on hover, aligned with the rows of ``vectors``.
        title: Suffix appended to both plot titles.

    Returns:
        None. Two Plotly figures are displayed: the PCA projection first,
        then the t-SNE projection.
    """
    print(f"\nCreating visualization: {title}")
    pca = PCA(n_components=2, random_state=RANDOM_SEED)
    vectors_2d_pca = pca.fit_transform(vectors)
    _show_embedding_scatter(vectors_2d_pca, words_to_show, f"PCA Visualization - {title}")

    # --- t-SNE Visualization ---
    # t-SNE requires perplexity < n_samples, so cap the usual value of 30 for
    # small inputs instead of letting the call fail.
    n_samples = vectors_2d_pca.shape[0]
    perplexity = min(30, max(1, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=RANDOM_SEED, perplexity=perplexity, max_iter=400)
    vectors_2d_tsne = tsne.fit_transform(vectors)
    _show_embedding_scatter(vectors_2d_tsne, words_to_show, f"t-SNE Visualization - {title}")

print("Visualization function defined.")

Visualization function defined.


In [11]:
# Plot the first `words_to_plot_count` vocabulary entries for both models.
words_to_plot_count = 300
plot_words = vocabulary[:words_to_plot_count]
plot_indices = [word_to_idx[word] for word in plot_words]

# Same words and row indices for each model so the plots are comparable.
visualize_embeddings(svd_vectors[plot_indices], plot_words, title="SVD Embeddings")

visualize_embeddings(w2v_vectors[plot_indices], plot_words, title="Word2Vec Embeddings")


Creating visualization: SVD Embeddings



Creating visualization: Word2Vec Embeddings


# In-Lab Evaluation


In [12]:
def visualize_word_neighborhoods(query_words, all_vectors, word_to_idx, idx_to_word, model_name):
    """Plots each query word with its 20 nearest neighbours in a shared 2-D PCA space.

    For every query word the neighbours are looked up with
    ``get_most_similar``; the vectors of all neighbourhoods are stacked,
    projected together with PCA, and drawn as a labelled scatter plot
    coloured by query word. A word that belongs to several neighbourhoods is
    plotted once per neighbourhood.

    Args:
        query_words: Words whose neighbourhoods are plotted.
        all_vectors: 2-D array of word vectors, one row per vocabulary index.
        word_to_idx: Mapping from word to row index.
        idx_to_word: Mapping from row index to word.
        model_name: Name used in the log line and the plot title.

    Returns:
        None. Displays a Plotly figure, or prints a message when none of the
        query words can be plotted.
    """
    words_to_plot = []
    vectors_to_plot = []
    categories = []

    print(f"\n--- Generating Neighborhood Plot for {model_name} ---")

    for query in query_words:
        neighbors = get_most_similar(query, all_vectors, word_to_idx, idx_to_word, top_n=20)
        # Keep only entries that are real vocabulary words (this drops the
        # placeholder strings returned for an unknown query), and derive the
        # labels, vectors and categories from this one filtered list so they
        # always stay aligned.
        words_in_neighborhood = [word for word in [query] + neighbors if word in word_to_idx]

        if not words_in_neighborhood:
            continue

        indices = [word_to_idx[word] for word in words_in_neighborhood]

        words_to_plot.extend(words_in_neighborhood)
        vectors_to_plot.append(all_vectors[indices])
        categories.extend([query] * len(words_in_neighborhood))

    if not vectors_to_plot:
        print("No words found to plot.")
        return

    # Project all neighbourhoods together so they share one coordinate system.
    combined_vectors = np.vstack(vectors_to_plot)
    pca = PCA(n_components=2, random_state=RANDOM_SEED)
    vectors_2d = pca.fit_transform(combined_vectors)

    # Create a DataFrame for plotting
    plot_df = pd.DataFrame({
        'x': vectors_2d[:, 0],
        'y': vectors_2d[:, 1],
        'word': words_to_plot,
        'category': categories
    })

    fig = px.scatter(
        plot_df,
        x='x',
        y='y',
        text='word',
        color='category',
        title=f"PCA of Word Neighborhoods - {model_name}"
    )
    fig.update_traces(textposition='top center', mode='markers+text')
    fig.update_layout(height=700, legend_title_text='Query Word')
    fig.show()

print("Neighborhood visualization function defined.")

Neighborhood visualization function defined.


In [13]:

# Same probe words as the similarity table; re-declared here so this cell
# does not depend on the earlier assignment.
query_words = ['market', 'film', 'election', 'game', 'software']
visualize_word_neighborhoods(query_words, svd_vectors, word_to_idx, idx_to_word, "SVD")
visualize_word_neighborhoods(query_words, w2v_vectors, word_to_idx, idx_to_word, "Word2Vec")


--- Generating Neighborhood Plot for SVD ---



--- Generating Neighborhood Plot for Word2Vec ---
