### Imports

In [None]:
import re
from collections import defaultdict, Counter
import nltk
from scipy.sparse import lil_matrix, coo_matrix
nltk.download('stopwords')
from nltk.corpus import stopwords
import random
import numpy as np
import math
from tqdm.notebook import tqdm
import time
from scipy.stats import spearmanr
import pandas as pd
from scipy.sparse import save_npz
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive
from sklearn.decomposition import TruncatedSVD
from numpy.linalg import norm
!pip install -U datasets
from datasets import load_dataset

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency res

### Load stopwords

In [None]:
# English stopwords as a set; preprocess() tests every token for membership in it.
stop_words = set(stopwords.words('english'))

### Define Methods

In [None]:
def preprocess(text):
  """Turn raw text into a list of lowercase, letters-only, non-stopword tokens.

  The text is lowercased, every character that is not a-z or whitespace is
  deleted (so digits and punctuation vanish and "well-known" becomes
  "wellknown"), the result is split on whitespace, and tokens found in the
  module-level ``stop_words`` set are dropped.
  """
  letters_only = re.sub(r'[^a-z\s]', '', text.lower())
  tokens = []
  for token in letters_only.split():
    if token in stop_words:
      continue
    tokens.append(token)
  return tokens

def build_vocabulary(corpus, min_count=25):
  """Map every sufficiently frequent token to a dense integer index.

  Args:
    corpus: iterable of two-element items; only the first element (the text)
      is used.
    min_count: a token is kept only if it occurs at least this many times
      after preprocessing (default 25).

  Returns:
    dict of token -> index. Indices are 0..len-1, assigned in the order the
    kept tokens were first seen in the corpus.
  """
  frequencies = Counter()
  for text, _ in tqdm(corpus, desc="Building vocab"):
    frequencies.update(preprocess(text))

  vocabulary = {}
  for token, freq in frequencies.items():
    if freq >= min_count:
      # the next free index is simply the current vocabulary size
      vocabulary[token] = len(vocabulary)
  return vocabulary

def build_matrix(corpus, vocab, window_size=2):
  """Count symmetric-window co-occurrences between in-vocabulary tokens.

  Args:
    corpus: iterable of two-element items; only the first element (the text)
      is used.
    vocab: dict of token -> row/column index.
    window_size: number of neighbours considered on each side of a token.
      The window is applied AFTER stopwords and out-of-vocabulary tokens
      have been removed, so it spans the surviving tokens only.

  Returns:
    (matrix, counts, tot) where
      matrix is a COO matrix of shape (len(vocab), len(vocab)), float32,
        holding how often the column word appeared in the row word's window;
      counts is an int32 array with the number of in-vocabulary occurrences
        of each word;
      tot is the total number of (center, context) pairs counted.
  """
  vocab_size = len(vocab)
  # LIL supports cheap incremental updates; converted to COO on return
  matrix = lil_matrix((vocab_size, vocab_size), dtype=np.float32)
  counts = np.zeros(vocab_size, dtype=np.int32)
  tot = 0

  for sentence, _ in tqdm(corpus, desc='Preprocessing sentences'):
    # preprocess once per sentence and keep only tokens that have an index
    indices = [vocab[word] for word in preprocess(sentence) if word in vocab]
    n_tokens = len(indices)

    for i, center in enumerate(indices):
      counts[center] += 1
      start = max(i - window_size, 0)
      end = min(i + window_size + 1, n_tokens)

      for j in range(start, end):
        if i != j:  # a token is not its own context
          matrix[center, indices[j]] += 1
          tot += 1

  return matrix.tocoo(), counts, tot

def build_pmi(matrix, counts, tot, positive=True):
  """Convert a COO co-occurrence matrix into a (positive) PMI matrix.

  For every stored entry (w1, w2) the value becomes
  log2(p(w1, w2) / (p(w1) * p(w2))) with p(w1, w2) = data / tot and
  p(w) = counts[w] / tot. Entries for which any of the three probabilities
  is not strictly positive are stored as 0.

  The computation is vectorised with NumPy, so no progress bar is shown.

  NOTE(review): the marginals use the same denominator ``tot`` as the joint
  probability; confirm this is the intended normalisation.

  Args:
    matrix: scipy COO matrix (``.row``, ``.col`` and ``.data`` are read).
    counts: per-word occurrence counts, indexable by row/column index.
    tot: normalising total used for all three probabilities.
    positive: if True, negative PMI values are clipped to 0 (PPMI).

  Returns:
    COO matrix with the same shape and the same (row, col) entries as
    ``matrix``, holding float64 PMI values.
  """
  rows, cols = matrix.row, matrix.col
  joint = np.asarray(matrix.data, dtype=np.float64)
  word_counts = np.asarray(counts, dtype=np.float64)
  pmi_data = np.zeros(joint.shape[0], dtype=np.float64)

  if tot > 0 and joint.size:
    p_joint = joint / tot
    p_row = word_counts[rows] / tot
    p_col = word_counts[cols] / tot

    # only take the log where it is defined; everything else stays 0
    valid = (p_joint > 0) & (p_row > 0) & (p_col > 0)
    pmi_data[valid] = np.log2(p_joint[valid] / (p_row[valid] * p_col[valid]))

  if positive:
    np.maximum(pmi_data, 0.0, out=pmi_data)

  pmi_matrix = coo_matrix((pmi_data, (rows, cols)), shape=matrix.shape)
  return pmi_matrix

### Load Dataset, Build Vocab, and Compute PMI Matrix

In [None]:
# Load the "wikitext-103-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
# NOTE(review): this binding is replaced in the next cell by a filtered list of (text, None) pairs.
corpus = dataset["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/157M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
# Keep only non-blank training lines, wrapped as (text, None) pairs because
# build_vocabulary() and build_matrix() unpack two-element items.
corpus = [
    (example["text"], None)
    for example in dataset["train"]
    if example["text"].strip() != ""
]

In [None]:
# Token -> index mapping, using build_vocabulary's default min_count of 25.
vocab = build_vocabulary(corpus)

Building vocab:   0%|          | 0/1165029 [00:00<?, ?it/s]

In [None]:
# Co-occurrence counts with build_matrix's default window_size of 2, plus per-word counts and the pair total.
co_matrix, word_counts, total = build_matrix(corpus, vocab)

Preprocessing sentences:   0%|          | 0/1165029 [00:00<?, ?it/s]

In [None]:
# positive=True clips negative PMI values to 0, i.e. this is a PPMI matrix.
pmi_matrix = build_pmi(co_matrix, word_counts, total, positive=True)

Computing PMI:   0%|          | 0/55229592 [00:00<?, ?it/s]

### Mount Drive & Save Matrix

In [None]:
# Mount Google Drive; the /content/drive/MyDrive/... paths used below live under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Convert to CSR, then write the matrix to Drive as an .npz file.
# NOTE(review): no cell visible here persists `vocab`; a fresh session that only loads this file
# cannot map rows back to words unless the vocabulary is rebuilt — confirm this is intended.
pmi_matrix = pmi_matrix.tocsr()
save_npz('/content/drive/MyDrive/pmi_model.npz', pmi_matrix)

### Loading and Using SVD

In [None]:
# Reload the matrix written by the save cell above.
pmi_matrix = load_npz('/content/drive/MyDrive/pmi_model.npz')

In [None]:
# Reduce every word's PMI row to a 300-dimensional dense vector.
# random_state is pinned so the embeddings (and every score computed from them
# below) are reproducible from run to run.
svd = TruncatedSVD(n_components=300, random_state=42)  # or 100, 200, depending on RAM
reduced_matrix = svd.fit_transform(pmi_matrix)
print(reduced_matrix.shape)

(68630, 300)


### Defining Analogy Methods

In [None]:
def get_vector(word, vocab, reduced_matrix):
    """Return the row of ``reduced_matrix`` for ``word``, or None if the word is not in ``vocab``."""
    if word in vocab:
        row = vocab[word]
        return reduced_matrix[row]
    return None

def find_best_match_fast(query_vec, reduced_matrix, vocab, exclude=None, reverse_vocab=None):
    """Return the vocabulary word whose vector is most cosine-similar to ``query_vec``.

    Args:
        query_vec: 1-D query vector; reshaped to a single row before comparison.
        reduced_matrix: matrix with one row per vocabulary index.
        vocab: dict of word -> row index.
        exclude: optional iterable of words that must not be returned; words
            missing from ``vocab`` are ignored.
        reverse_vocab: optional precomputed dict of row index -> word. When
            omitted it is derived from ``vocab`` on every call, which costs
            O(len(vocab)); callers issuing many queries should build it once
            and pass it in.

    Returns:
        The best-matching word.
    """
    if reverse_vocab is None:
        reverse_vocab = {i: w for w, i in vocab.items()}

    # Use sklearn's version here
    similarities = cosine_similarity(query_vec.reshape(1, -1), reduced_matrix)[0]

    if exclude:
        exclude_indices = [vocab[word] for word in exclude if word in vocab]
        if exclude_indices:
            # -inf guarantees argmax can never pick an excluded row
            similarities[exclude_indices] = -np.inf

    best_idx = int(np.argmax(similarities))
    return reverse_vocab[best_idx]

def solve_analogy(a, b, c, vocab, reduced_matrix):
    """Answer "a is to b as c is to ?" with vector arithmetic.

    Returns the word nearest to ``b - a + c`` (never one of the three inputs),
    or the message string "One or more words not in vocabulary." when any
    input word has no vector.
    """
    vectors = [get_vector(w, vocab, reduced_matrix) for w in (a, b, c)]
    if any(v is None for v in vectors):
        return "One or more words not in vocabulary."

    vec_a, vec_b, vec_c = vectors
    target = vec_b - vec_a + vec_c

    # the input words are excluded so the answer is a new word
    return find_best_match_fast(target, reduced_matrix, vocab, exclude={a, b, c})


### Loading & Pre-processing Analogy Dataset

In [None]:
# Analogy question file on the mounted Drive; parsed by load_analogies_by_section() below.
file_path = "/content/drive/MyDrive/questions-words.txt"

In [None]:
def load_analogies_by_section(filepath):
    """Parse an analogy file into ``{section name: [(a, b, c, d), ...]}``.

    Lines starting with ":" open a section; the name is the rest of the line,
    stripped and lowercased (both ": name" and ":name" are accepted). Every
    other line is lowercased and split on whitespace; it is kept only if it
    has exactly four words.

    Four-word lines that appear before the first header are collected under
    the section "unsectioned" instead of raising KeyError, and a header that
    occurs more than once keeps appending to the same section instead of
    discarding the rows read earlier.

    Args:
        filepath: path of a UTF-8 text file.

    Returns:
        dict of section name -> list of 4-tuples of lowercase words.
    """
    sections = {}
    current_section = "unsectioned"

    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if stripped.startswith(":"):
                current_section = stripped[1:].strip().lower()
                # register the section even if it ends up with no rows
                sections.setdefault(current_section, [])
            else:
                words = stripped.lower().split()
                if len(words) == 4:
                    sections.setdefault(current_section, []).append(tuple(words))
    return sections

In [None]:
def evaluate_by_section(sections, vocab, reduced_matrix):
    """Score the embeddings on analogy questions, section by section.

    For each (a, b, c, expected) the prediction is the word nearest to
    ``b - a + c`` with a, b and c excluded. Questions containing any word
    that is not in ``vocab`` are counted as "missing" and are not scored.

    Args:
        sections: dict of section name -> list of (a, b, c, expected) tuples.
        vocab: dict of word -> row index.
        reduced_matrix: matrix with one row per vocabulary index.

    Returns:
        (results, overall): ``results`` maps each section name to a dict with
        keys "accuracy", "correct", "total" and "missing"; ``overall`` is the
        same dict summed over all sections. Accuracy is 0 when nothing was
        scored.
    """
    results = {}
    total_correct = total_total = total_missing = 0

    for section, analogies in tqdm(sections.items(), desc="Evaluating Sections"):
        correct = total = missing = 0
        for a, b, c, expected in tqdm(analogies, desc=f"{section}", leave=False):
            if any(w not in vocab for w in (a, b, c, expected)):
                missing += 1
                continue

            vec_a = get_vector(a, vocab, reduced_matrix)
            vec_b = get_vector(b, vocab, reduced_matrix)
            vec_c = get_vector(c, vocab, reduced_matrix)
            query_vec = vec_b - vec_a + vec_c

            predicted = find_best_match_fast(query_vec, reduced_matrix, vocab, exclude=[a, b, c])

            if predicted == expected:
                correct += 1
            total += 1

        accuracy = correct / total if total else 0
        results[section] = {
            "accuracy": accuracy,
            "correct": correct,
            "total": total,
            "missing": missing
        }
        total_correct += correct
        total_total += total
        total_missing += missing

    overall = {
        "accuracy": total_correct / total_total if total_total else 0,
        "correct": total_correct,
        "total": total_total,
        "missing": total_missing
    }

    return results, overall

In [None]:
# Parse the analogy file and score the SVD embeddings on every section.
sections = load_analogies_by_section(file_path)
section_results, overall = evaluate_by_section(sections, vocab, reduced_matrix)

# Print results
# One line per section: accuracy, correct/scored counts, and how many questions were skipped for out-of-vocabulary words.
for sec, stats in section_results.items():
    print(f"{sec:30s} - Accuracy: {stats['accuracy']:.4f} ({stats['correct']}/{stats['total']}) Missing: {stats['missing']}")
print("\nOverall Accuracy:")
print(overall)

Evaluating Sections:   0%|          | 0/14 [00:00<?, ?it/s]

capital-common-countries:   0%|          | 0/506 [00:00<?, ?it/s]

capital-world:   0%|          | 0/4524 [00:00<?, ?it/s]

currency:   0%|          | 0/866 [00:00<?, ?it/s]

city-in-state:   0%|          | 0/2467 [00:00<?, ?it/s]

family:   0%|          | 0/506 [00:00<?, ?it/s]

gram1-adjective-to-adverb:   0%|          | 0/992 [00:00<?, ?it/s]

gram2-opposite:   0%|          | 0/812 [00:00<?, ?it/s]

gram3-comparative:   0%|          | 0/1332 [00:00<?, ?it/s]

gram4-superlative:   0%|          | 0/1122 [00:00<?, ?it/s]

gram5-present-participle:   0%|          | 0/1056 [00:00<?, ?it/s]

gram6-nationality-adjective:   0%|          | 0/1599 [00:00<?, ?it/s]

gram7-past-tense:   0%|          | 0/1560 [00:00<?, ?it/s]

gram8-plural:   0%|          | 0/1332 [00:00<?, ?it/s]

gram9-plural-verbs:   0%|          | 0/870 [00:00<?, ?it/s]

capital-common-countries       - Accuracy: 0.4249 (215/506) Missing: 0
capital-world                  - Accuracy: 0.1975 (560/2835) Missing: 1689
currency                       - Accuracy: 0.0097 (2/206) Missing: 660
city-in-state                  - Accuracy: 0.0794 (185/2330) Missing: 137
family                         - Accuracy: 0.4079 (155/380) Missing: 126
gram1-adjective-to-adverb      - Accuracy: 0.0903 (84/930) Missing: 62
gram2-opposite                 - Accuracy: 0.0860 (65/756) Missing: 56
gram3-comparative              - Accuracy: 0.4122 (549/1332) Missing: 0
gram4-superlative              - Accuracy: 0.1109 (110/992) Missing: 130
gram5-present-participle       - Accuracy: 0.4335 (430/992) Missing: 64
gram6-nationality-adjective    - Accuracy: 0.6789 (981/1445) Missing: 154
gram7-past-tense               - Accuracy: 0.2929 (457/1560) Missing: 0
gram8-plural                   - Accuracy: 0.4504 (536/1190) Missing: 142
gram9-plural-verbs             - Accuracy: 0.2672 (217/81

### Testing on SimLex999

In [None]:
# SimLex-999 is read from the mounted Drive as a tab-separated table.
simlex_path = "/content/drive/MyDrive/SimLex-999.txt"
simlex = pd.read_csv(simlex_path, sep='\t')

In [None]:
def clean_word(w):
    """Lowercase ``w`` after dropping everything from its last hyphen onward.

    "sun-n" -> "sun"; a word without a hyphen is only lowercased.
    """
    if "-" not in w:
        return w.lower()
    stem = w[:w.rindex("-")]
    return stem.lower()

def evaluate_words(df, vocab, reduced_matrix, score_name):
    """Spearman correlation between embedding cosine similarity and human scores.

    Args:
        df: DataFrame with columns 'word1', 'word2' and ``score_name``.
        vocab: dict of word -> row index.
        reduced_matrix: matrix with one row per vocabulary index.
        score_name: name of the column holding the human score.

    Returns:
        (corr, n_pairs). Only pairs whose two cleaned words are both in
        ``vocab`` are used; when there are none the result is (None, 0).
    """
    similarities = []
    human_scores = []

    for raw1, raw2, raw_score in zip(df['word1'], df['word2'], df[score_name]):
        w1, w2 = clean_word(raw1), clean_word(raw2)
        score = float(raw_score)

        if w1 in vocab and w2 in vocab:
            vec1 = reduced_matrix[vocab[w1]]
            vec2 = reduced_matrix[vocab[w2]]
            denom = norm(vec1) * norm(vec2)
            # A zero vector would give NaN and turn the whole correlation into
            # NaN; score such pairs as 0, as sklearn's cosine_similarity does.
            sim = vec1 @ vec2 / denom if denom > 0 else 0.0
            similarities.append(sim)
            human_scores.append(score)

    if not similarities:
        return None, 0

    corr, _ = spearmanr(similarities, human_scores)
    return corr, len(similarities)

In [None]:
# Evaluate SimLex-999 separately for each value of its POS column.
pos_tags = simlex['POS'].unique()

for pos in pos_tags:
    subset = simlex[simlex['POS'] == pos]
    corr, count = evaluate_words(subset, vocab, reduced_matrix, 'SimLex999')
    # evaluate_words returns None when no pair of the subset is in the vocabulary
    if corr is not None:
        print(f"POS: {pos:<5} | Spearman: {corr:.4f} | Pairs used: {count}")
    else:
        print(f"POS: {pos:<5} | Not enough valid pairs in vocab.")

POS: A     | Spearman: 0.3264 | Pairs used: 111
POS: N     | Spearman: 0.2928 | Pairs used: 663
POS: V     | Spearman: 0.0382 | Pairs used: 219


### Testing on WordSim353

In [None]:
# Fetch and unpack the archive that should provide the wordsim353_sim_rel/ files read in the next cell.
# NOTE(review): in the recorded run below the server returned a ~5 KB text/html body and tar failed,
# so this cell did not extract anything — verify the URL; the next cell only works if
# wordsim353_sim_rel/ already exists from an earlier session.
!wget https://alfonseca.org/eng/research/pubs/ws353simrel.tar
!tar -xf ws353simrel.tar

--2025-07-03 14:50:24--  https://alfonseca.org/eng/research/pubs/ws353simrel.tar
Resolving alfonseca.org (alfonseca.org)... 172.67.221.145, 104.21.25.16, 2606:4700:3031::6815:1910, ...
Connecting to alfonseca.org (alfonseca.org)|172.67.221.145|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘ws353simrel.tar’

ws353simrel.tar         [ <=>                ]   5.33K  --.-KB/s    in 0s      

2025-07-03 14:50:24 (58.8 MB/s) - ‘ws353simrel.tar’ saved [5458]


gzip: stdin: unexpected end of file
tar: Child returned status 1
tar: Error is not recoverable: exiting now


In [None]:
# Both gold-standard files are tab-separated with no header row: word1, word2, score.
sim_df = pd.read_csv("wordsim353_sim_rel/wordsim_similarity_goldstandard.txt", sep="\t", header=None, names=["word1", "word2", "score"])
rel_df = pd.read_csv("wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt", sep="\t", header=None, names=["word1", "word2", "score"])

In [None]:
# Score the embeddings on the similarity and relatedness subsets of WordSim-353.
sim_corr, sim_count = evaluate_words(sim_df, vocab, reduced_matrix, 'score')
rel_corr, rel_count = evaluate_words(rel_df, vocab, reduced_matrix, 'score')

# evaluate_words returns (None, 0) when no pair is in the vocabulary; formatting
# None with :.4f would raise, so guard as the other evaluation cells do.
if sim_corr is not None:
    print(f"Similarity Set  → Spearman: {sim_corr:.4f} | Pairs used: {sim_count}")
else:
    print("Similarity Set  → Not enough valid pairs in vocab.")

if rel_corr is not None:
    print(f"Relatedness Set → Spearman: {rel_corr:.4f} | Pairs used: {rel_count}")
else:
    print("Relatedness Set → Not enough valid pairs in vocab.")

Similarity Set  → Spearman: 0.6008 | Pairs used: 203
Relatedness Set → Spearman: 0.3768 | Pairs used: 252


### Testing on Additional Benchmarks (RG-65, MEN, MC-30, MTurk-771)

In [None]:
# Download four word-similarity benchmark CSVs (RG-65, MEN, MC-30, MTurk-771) into the working directory.
!wget https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-similarity/monolingual/en/rg-65.csv
!wget https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-similarity/monolingual/en/men.csv
!wget https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-similarity/monolingual/en/mc-30.csv
!wget https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-similarity/monolingual/en/mturk-771.csv

--2025-07-03 14:51:30--  https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-similarity/monolingual/en/rg-65.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1418 (1.4K) [text/plain]
Saving to: ‘rg-65.csv’


2025-07-03 14:51:31 (20.3 MB/s) - ‘rg-65.csv’ saved [1418/1418]

--2025-07-03 14:51:31--  https://raw.githubusercontent.com/vecto-ai/word-benchmarks/master/word-similarity/monolingual/en/men.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94507 (92K) [text/plain]
Saving to: ‘men.csv’


2025-07-03 14:5

In [None]:
# Load the four benchmark files fetched by the wget cell above.
rg65 = pd.read_csv("rg-65.csv")
men = pd.read_csv("men.csv")
mc30 = pd.read_csv("mc-30.csv")
mturk771 = pd.read_csv("mturk-771.csv")

In [None]:
def extract_pos(w):
    """Return the text after the last hyphen of ``w``, or "unk" if ``w`` has no hyphen."""
    _, hyphen, suffix = w.rpartition("-")
    if not hyphen:
        return "unk"
    return suffix

# Tag each MEN word with the suffix after its last hyphen ("unk" when there is none).
men["pos1"] = men["word1"].apply(extract_pos)
men["pos2"] = men["word2"].apply(extract_pos)

# Keep only the pairs whose two tags agree, and expose the shared tag as "pos".
men_pos_matched = men[men["pos1"] == men["pos2"]].copy()
men_pos_matched["pos"] = men_pos_matched["pos1"]

In [None]:
# Benchmark name -> (DataFrame, name of the column holding the human score).
datasets = {
    'RG-65': (rg65, 'similarity'),
    'MEN': (men, 'similarity'),
    'MC-30': (mc30, 'similarity'),
    'MTurk-771': (mturk771, 'similarity')
}

for name, (df, score_col) in datasets.items():
    corr, count = evaluate_words(df, vocab, reduced_matrix, score_col)
    # evaluate_words returns (None, 0) when no pair is in the vocabulary;
    # formatting None with :.4f would raise.
    if corr is not None:
        print(f"{name}: Spearman correlation = {corr:.4f} on {count} pairs")
    else:
        print(f"{name}: Not enough valid pairs in vocab.")

RG-65: Spearman correlation = 0.6567 on 64 pairs
MEN: Spearman correlation = 0.5636 on 2983 pairs
MC-30: Spearman correlation = 0.6785 on 30 pairs
MTurk-771: Spearman correlation = 0.5237 on 769 pairs


In [None]:
# Evaluate the tag-matched MEN pairs separately for each tag.
for pos_tag in men_pos_matched["pos"].unique():
    subset = men_pos_matched[men_pos_matched["pos"] == pos_tag]
    corr, count = evaluate_words(subset, vocab, reduced_matrix, 'similarity')
    # evaluate_words returns None when no pair of the subset is in the vocabulary
    if corr is not None:
        print(f"POS: {pos_tag:<2} | Spearman: {corr:.4f} | Pairs: {count}")
    else:
        print(f"POS: {pos_tag:<2} | Not enough valid pairs.")

POS: n  | Spearman: 0.6091 | Pairs: 1991
POS: j  | Spearman: 0.5623 | Pairs: 96
POS: v  | Spearman: 0.5135 | Pairs: 29


### Cosine Neighbour Space

In [None]:
def get_top_neighbors(word, vocab, reduced_matrix, top_n=10):
    """Return the ``top_n`` words most cosine-similar to ``word``.

    Args:
        word: query word.
        vocab: dict of word -> row index.
        reduced_matrix: matrix with one row per vocabulary index.
        top_n: maximum number of neighbours to return. Values <= 0 yield an
            empty list (previously the stop condition was never met and the
            whole vocabulary was returned).

    Returns:
        List of (word, similarity) tuples, most similar first, never
        containing the query word itself. An empty list is returned, after
        printing a message, when the word is not in the vocabulary or its
        vector is all zeros.
    """
    if word not in vocab:
        print(f"'{word}' not in vocabulary.")
        return []

    index = vocab[word]
    vec = reduced_matrix[index]
    vec_length = norm(vec)
    if vec_length == 0:
        print(f"'{word}' has a zero vector.")
        return []

    if top_n <= 0:
        return []

    vec_norm = vec / vec_length

    # Normalize entire matrix safely (avoid division by 0): zero rows keep a
    # divisor of 1 and therefore end up with similarity 0.
    matrix_norms = np.linalg.norm(reduced_matrix, axis=1, keepdims=True)
    matrix_norms[matrix_norms == 0] = 1
    sims = (reduced_matrix / matrix_norms) @ vec_norm

    # Use reverse vocab for fast lookup
    reverse_vocab = {i: w for w, i in vocab.items()}

    # Walk the indices from most to least similar, skipping the word itself
    top_words = []
    for i in sims.argsort()[::-1]:
        if i == index:
            continue
        word_i = reverse_vocab.get(i)
        if word_i:
            top_words.append((word_i, sims[i]))
            if len(top_words) == top_n:
                break

    return top_words

In [None]:
# Show the five nearest neighbours of "brother" as a quick qualitative check of the embeddings.
neighbors = get_top_neighbors("brother", vocab, reduced_matrix, top_n=5)
for word, sim in neighbors:
    print(f"{word:<15} similarity: {sim:.4f}")

son             similarity: 0.9263
father          similarity: 0.8993
uncle           similarity: 0.8850
cousin          similarity: 0.8708
sons            similarity: 0.8697
