# Lab Session 4 - Task A: VSM for Word Similarity
Imports and Setup

In [1]:
import os
import re
from collections import Counter
import numpy as np
import pandas as pd
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("Setup Complete")


Setup Complete


Data Loading and Preprocessing

In [2]:
def preprocess_text(text):
    """Cleans and tokenizes text."""
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in stop_words and len(word) > 2]


def load_data(folder_path):
    """Loads and preprocesses all documents from the dataset folder."""
    all_docs = []
    categories = [d for d in os.listdir(folder_path) if os.path.isdir(os.path.join(folder_path, d))]
    print(f"Found categories: {categories}")
    
    for category in tqdm(categories, desc="Loading categories"):
        category_path = os.path.join(folder_path, category)
        for filename in os.listdir(category_path):
            file_path = os.path.join(category_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
                    processed_doc = preprocess_text(text)
                    if processed_doc:
                        all_docs.append(processed_doc)
            except Exception as e:
                print(f"Could not read {file_path}: {e}")
    return all_docs


BBC_FOLDER = 'bbc'
documents = load_data(BBC_FOLDER)
print(f"\nLoaded and processed {len(documents)} documents.")

Found categories: ['business', 'entertainment', 'politics', 'sport', 'tech']


Loading categories: 100%|██████████| 5/5 [00:21<00:00,  4.22s/it]


Loaded and processed 2225 documents.





Build Co-occurrence Matrix

In [3]:
VOCAB_SIZE = 10000
WINDOW_SIZE = 5  

all_tokens = [token for doc in documents for token in doc]
word_counts = Counter(all_tokens)
vocab = [word for word, count in word_counts.most_common(VOCAB_SIZE)]

word_to_id = {word: i for i, word in enumerate(vocab)}
id_to_word = {i: word for i, word in enumerate(vocab)}
vocab_set = set(vocab)
print(f"Vocabulary size: {len(vocab)}")

cooc_matrix = lil_matrix((VOCAB_SIZE, VOCAB_SIZE), dtype=np.float32)

for doc in tqdm(documents, desc="Building Co-occurrence Matrix"):
    doc_indices = [word_to_id[word] for word in doc if word in vocab_set]
    for i, target_idx in enumerate(doc_indices):
        start = max(0, i - WINDOW_SIZE)
        end = min(len(doc_indices), i + WINDOW_SIZE + 1)
        context_indices = doc_indices[start:i] + doc_indices[i+1:end]
        for context_idx in context_indices:
            cooc_matrix[target_idx, context_idx] += 1

print("Co-occurrence matrix built.")

Vocabulary size: 10000


Building Co-occurrence Matrix: 100%|██████████| 2225/2225 [00:17<00:00, 123.98it/s]

Co-occurrence matrix built.





Apply PPMI Weighting

In [5]:
def calculate_ppmi(matrix):
    matrix_csr = matrix.tocsr()
    
    total_cooccurrences = matrix_csr.sum()
    if total_cooccurrences == 0:
        return lil_matrix(matrix.shape, dtype=np.float32)
    
    word_totals = np.array(matrix_csr.sum(axis=1)).flatten()
    context_totals = np.array(matrix_csr.sum(axis=0)).flatten()
    
    p_wc = matrix_csr / total_cooccurrences
    p_w = word_totals / total_cooccurrences
    p_c = context_totals / total_cooccurrences

    ppmi_matrix = lil_matrix(matrix.shape, dtype=np.float32)
    rows, cols = matrix_csr.nonzero()
    
    for r, c in tqdm(zip(rows, cols), desc="Calculating PPMI", total=len(rows)):
        if p_w[r] > 0 and p_c[c] > 0:
            pmi = np.log2(p_wc[r, c] / (p_w[r] * p_c[c]))
            ppmi_matrix[r, c] = max(0, pmi)
            
    return ppmi_matrix


ppmi_matrix = calculate_ppmi(cooc_matrix)
print("\n--- Task A.a Result ---")
print(f"The dimensions of the final PPMI matrix are: {ppmi_matrix.shape}")


Calculating PPMI: 100%|██████████| 2416672/2416672 [00:48<00:00, 49548.36it/s]


--- Task A.a Result ---
The dimensions of the final PPMI matrix are: (10000, 10000)





Apply Truncated SVD


In [6]:
SVD_DIMENSIONS = 300

svd = TruncatedSVD(n_components=SVD_DIMENSIONS, random_state=42)
svd_matrix = svd.fit_transform(ppmi_matrix)

print(f"SVD matrix created with dimensions: {svd_matrix.shape}")


SVD matrix created with dimensions: (10000, 300)


Word Similarity Evaluation

In [7]:
def find_most_similar(query_word, matrix, word_to_id, id_to_word, top_n=5):
    """Finds the top_n most similar words to a query_word in a given matrix."""
    if query_word not in word_to_id:
        return [("Word not in vocabulary", 0)] * top_n
    
    query_id = word_to_id[query_word]
    query_vector = matrix[query_id].reshape(1, -1)
    
    sim_scores = cosine_similarity(query_vector, matrix).flatten()
    
    top_indices = np.argsort(sim_scores)[::-1][1:top_n+1]
    
    return [(id_to_word[i], sim_scores[i]) for i in top_indices]



query_words = {
    'business': 'market',
    'entertainment': 'film',
    'politics': 'election',
    'sport': 'game',
    'tech': 'software'
}

results = []
for category, word in query_words.items():
    vsm_similar = find_most_similar(word, ppmi_matrix, word_to_id, id_to_word)
    svd_similar = find_most_similar(word, svd_matrix, word_to_id, id_to_word)
    
    results.append({
        "Query Word": f"{word} ({category})",
        "Top 5 Similar Words (VSM)": ", ".join([f"{w[0]}" for w in vsm_similar]),
        "Top 5 Similar Words (SVD)": ", ".join([f"{w[0]}" for w in svd_similar])
    })

results_df = pd.DataFrame(results)
print("\nWord Similarity Results:")
print(results_df.to_string(index=False))


Word Similarity Results:
          Query Word                      Top 5 Similar Words (VSM)                    Top 5 Similar Words (SVD)
   market (business)          stock, housing, growth, prices, sales       stock, analysts, share, growth, prices
film (entertainment)         best, awards, actress, director, films          films, movie, awards, best, actress
 election (politics)        general, labour, campaign, blair, party   labour, general, partys, campaign, labours
        game (sport)           games, play, players, match, playing         play, games, players, playing, first
     software (tech) microsoft, programs, users, antivirus, windows programs, microsoft, windows, users, program


In [8]:
# Run this in your Lab 4, Task A notebook
results_df.to_csv('task_a_results.csv', index=False)
print("Lab 4 results saved to task_a_results.csv")

Lab 4 results saved to task_a_results.csv
