In [1]:
import os
from typing import Dict, List, Tuple

# Try these paths in order (edit if needed)
CANDIDATE_PATHS = [
    "word-test.v1.txt",                 # same directory as notebook
    "./word-test.v1.txt",
    "/mnt/data/word-test.v1.txt",       # common mounted path in this environment
]

SEMANTIC_SECTION = "capital-common-countries"
SYNTACTIC_SECTION = "gram7-past-tense"

def resolve_path(paths):
    for p in paths:
        if os.path.exists(p):
            return p
    raise FileNotFoundError(
        "Could not find word-test.v1.txt. Tried:\n" + "\n".join(paths) +
        "\n\nTip: put the file in the same folder as the notebook OR set ANALOGY_PATH to the correct location."
    )

ANALOGY_PATH = resolve_path(CANDIDATE_PATHS)
print("Using analogy file:", ANALOGY_PATH)

def load_analogy_questions(path: str, keep_sections) -> Dict[str, List[Tuple[str,str,str,str]]]:
    keep_sections = set(keep_sections)
    out = {s: [] for s in keep_sections}
    current = None
    found_sections = set()

    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            if line.startswith(":"):
                current = line[1:].strip()
                found_sections.add(current)
                continue

            if current not in keep_sections:
                continue

            parts = line.split()
            if len(parts) != 4:
                continue
            out[current].append(tuple(parts))

    print("Found sections in file (sample):", sorted(list(found_sections))[:10], "...")
    return out

sections = load_analogy_questions(ANALOGY_PATH, [SEMANTIC_SECTION, SYNTACTIC_SECTION])

print("Semantic questions:", len(sections[SEMANTIC_SECTION]))
print("Syntactic questions:", len(sections[SYNTACTIC_SECTION]))

Using analogy file: word-test.v1.txt
Found sections in file (sample): ['capital-common-countries', 'capital-world', 'city-in-state', 'currency', 'family', 'gram1-adjective-to-adverb', 'gram2-opposite', 'gram3-comparative', 'gram4-superlative', 'gram5-present-participle'] ...
Semantic questions: 506
Syntactic questions: 1560


In [2]:
# Cell 1 — Setup
import numpy as np
from typing import Dict, List, Tuple, Optional

In [3]:
# Cell 2 — Load required sections from the word analogy file

ANALOGY_PATH = "word-test.v1.txt"
SEMANTIC_SECTION = "capital-common-countries"
SYNTACTIC_SECTION = "gram7-past-tense"

def load_analogy_questions(path: str, keep_sections) -> Dict[str, List[Tuple[str,str,str,str]]]:
    keep_sections = set(keep_sections)
    out = {s: [] for s in keep_sections}
    current = None
    
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(":"):
                current = line[1:].strip()
                continue
            if current not in keep_sections:
                continue
            
            parts = line.split()
            if len(parts) != 4:
                continue
            out[current].append(tuple(parts))
    return out

sections = load_analogy_questions(ANALOGY_PATH, [SEMANTIC_SECTION, SYNTACTIC_SECTION])
len(sections[SEMANTIC_SECTION]), len(sections[SYNTACTIC_SECTION])


(506, 1560)

In [4]:
# Cell 3 — Load pretrained GloVe (Gensim)
# NOTE: This requires internet the first time (it downloads the vectors).

import gensim.downloader as api

# Common choices:
#   glove-wiki-gigaword-50
#   glove-wiki-gigaword-100
#   glove-wiki-gigaword-200
#   glove-wiki-gigaword-300
kv = api.load("glove-wiki-gigaword-100")

# quick sanity check
kv.most_similar("king", topn=5)


[('prince', 0.7682328820228577),
 ('queen', 0.7507690787315369),
 ('son', 0.7020888328552246),
 ('brother', 0.6985775232315063),
 ('monarch', 0.6977890729904175)]

In [5]:
# Cell 4 — Fast analogy evaluation on KeyedVectors (cosine via normalized dot)

def evaluate_analogies_kv(kv, questions: List[Tuple[str,str,str,str]]):
    total_used = 0
    correct = 0
    skipped = 0
    
    # Pre-normalize for fast cosine search
    words = kv.index_to_key
    mat = kv.vectors.astype(np.float32)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    mat_norm = mat / np.maximum(norms, 1e-12)

    key_to_index = kv.key_to_index
    
    def has(w): 
        return w in key_to_index

    for a, b, c, d in questions:
        if not (has(a) and has(b) and has(c) and has(d)):
            skipped += 1
            continue
        
        va = mat_norm[key_to_index[a]]
        vb = mat_norm[key_to_index[b]]
        vc = mat_norm[key_to_index[c]]

        q = vb - va + vc
        q = q / max(np.linalg.norm(q), 1e-12)

        sims = mat_norm @ q
        
        # exclude a,b,c
        sims[key_to_index[a]] = -np.inf
        sims[key_to_index[b]] = -np.inf
        sims[key_to_index[c]] = -np.inf
        
        pred_idx = int(np.argmax(sims))
        pred = words[pred_idx]
        
        total_used += 1
        if pred == d:
            correct += 1

    acc = correct / total_used if total_used > 0 else 0.0
    return {
        "used": total_used,
        "correct": correct,
        "accuracy": acc,
        "skipped_oov": skipped,
        "total_in_section": len(questions),
    }

sem_res = evaluate_analogies_kv(kv, sections[SEMANTIC_SECTION])
syn_res = evaluate_analogies_kv(kv, sections[SYNTACTIC_SECTION])

print(f"Semantic ({SEMANTIC_SECTION})  accuracy: {sem_res['accuracy']:.4f}  used={sem_res['used']}  skipped={sem_res['skipped_oov']}")
print(f"Syntactic ({SYNTACTIC_SECTION}) accuracy: {syn_res['accuracy']:.4f}  used={syn_res['used']}  skipped={syn_res['skipped_oov']}")


Semantic (capital-common-countries)  accuracy: 0.0000  used=0  skipped=506
Syntactic (gram7-past-tense) accuracy: 0.5545  used=1560  skipped=0


In [6]:
import re, math, time, random
import numpy as np
from typing import Dict, List, Tuple
np.random.seed(42)
random.seed(42)

In [7]:
ANALOGY_PATH = "word-test.v1.txt"
SEMANTIC_SECTION = "capital-common-countries"
SYNTACTIC_SECTION = "gram7-past-tense"

def load_analogy_questions(path: str, keep_sections) -> Dict[str, List[Tuple[str,str,str,str]]]:
    keep_sections = set(keep_sections)
    out = {s: [] for s in keep_sections}
    current = None
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            if line.startswith(":"):
                current = line[1:].strip()
                continue
            if current not in keep_sections:
                continue
            parts = line.split()
            if len(parts) == 4:
                out[current].append(tuple(parts))
    return out

sections = load_analogy_questions(ANALOGY_PATH, [SEMANTIC_SECTION, SYNTACTIC_SECTION])
print("Semantic questions:", len(sections[SEMANTIC_SECTION]))
print("Syntactic questions:", len(sections[SYNTACTIC_SECTION]))


Semantic questions: 506
Syntactic questions: 1560


In [8]:
# If you have raw text file:
RAW_TEXT_PATH = "archive/SorcerersStone.txt"

def simple_tokenize(text: str) -> List[str]:
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s']", " ", text)
    return text.split()

def load_sentences_from_textfile(path: str, max_lines=None) -> List[List[str]]:
    sents = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for i, line in enumerate(f):
            if max_lines is not None and i >= max_lines:
                break
            toks = simple_tokenize(line)
            if len(toks) >= 2:
                sents.append(toks)
    return sents

sentences = load_sentences_from_textfile(RAW_TEXT_PATH)
print("num sentences:", len(sentences))


num sentences: 3044


In [9]:
def build_vocab(sentences: List[List[str]], min_count: int = 2):
    freq = {}
    for sent in sentences:
        for w in sent:
            freq[w] = freq.get(w, 0) + 1
    vocab = [w for w,c in freq.items() if c >= min_count]
    vocab.sort()
    word2idx = {w:i for i,w in enumerate(vocab)}
    idx2word = vocab
    return word2idx, idx2word, freq

def encode_sentences(sentences: List[List[str]], word2idx: Dict[str,int]) -> List[List[int]]:
    enc = []
    for sent in sentences:
        ids = [word2idx[w] for w in sent if w in word2idx]
        if len(ids) >= 2:
            enc.append(ids)
    return enc

# === YOU MUST have 'sentences' defined by now ===
word2idx, idx2word, freq = build_vocab(sentences, min_count=2)
encoded = encode_sentences(sentences, word2idx)

V = len(idx2word)
print("Vocab size:", V, "Encoded sentences:", len(encoded))


Vocab size: 3355 Encoded sentences: 3033


In [10]:
def sigmoid(x):
    x = np.clip(x, -20, 20)
    return 1.0 / (1.0 + np.exp(-x))

def stable_softmax(x):
    x = x - np.max(x)
    ex = np.exp(np.clip(x, -20, 20))
    return ex / np.sum(ex)

def generate_skipgram_pairs(encoded_sents: List[List[int]], window_size: int):
    pairs = []
    for sent in encoded_sents:
        n = len(sent)
        for i, center in enumerate(sent):
            start = max(0, i - window_size)
            end = min(n, i + window_size + 1)
            for j in range(start, end):
                if j == i:
                    continue
                context = sent[j]
                pairs.append((center, context))
    return pairs

def evaluate_analogies_numpy(vectors: np.ndarray, word2idx: Dict[str,int], idx2word: List[str],
                            questions: List[Tuple[str,str,str,str]]):
    # vectors: (V,D)
    eps = 1e-12
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    mat = vectors / np.maximum(norms, eps)

    used = correct = skipped = 0
    for a,b,c,d in questions:
        if not (a in word2idx and b in word2idx and c in word2idx and d in word2idx):
            skipped += 1
            continue

        ia, ib, ic, id_ = word2idx[a], word2idx[b], word2idx[c], word2idx[d]
        q = mat[ib] - mat[ia] + mat[ic]
        q = q / max(np.linalg.norm(q), eps)

        sims = mat @ q
        sims[ia] = sims[ib] = sims[ic] = -np.inf
        pred = int(np.argmax(sims))

        used += 1
        if pred == id_:
            correct += 1

    acc = correct / used if used > 0 else 0.0
    return {"used": used, "correct": correct, "accuracy": acc, "skipped": skipped}

def eval_sem_syn(vectors, word2idx, idx2word):
    sem = evaluate_analogies_numpy(vectors, word2idx, idx2word, sections[SEMANTIC_SECTION])
    syn = evaluate_analogies_numpy(vectors, word2idx, idx2word, sections[SYNTACTIC_SECTION])
    return sem, syn


In [11]:
def train_skipgram_softmax(encoded_sents, V, D=100, window_size=5, epochs=1, lr=0.05, seed=42):
    rng = np.random.default_rng(seed)
    W_in  = (rng.standard_normal((V, D)) * 0.01).astype(np.float32)
    W_out = (rng.standard_normal((V, D)) * 0.01).astype(np.float32)

    pairs = generate_skipgram_pairs(encoded_sents, window_size)
    rng.shuffle(pairs)

    start_time = time.time()
    last_loss = None

    for ep in range(epochs):
        total_loss = 0.0
        rng.shuffle(pairs)

        for center, target in pairs:
            v = W_in[center]          # (D,)
            scores = W_out @ v        # (V,)
            probs = stable_softmax(scores)

            loss = -math.log(max(probs[target], 1e-12))
            total_loss += loss

            # gradients
            ds = probs
            ds[target] -= 1.0         # (V,)
            grad_v = W_out.T @ ds     # (D,)

            W_out -= lr * (ds[:, None] * v[None, :])
            W_in[center] -= lr * grad_v

        last_loss = total_loss / max(len(pairs), 1)
        print(f"[Skipgram Softmax] epoch {ep+1}/{epochs} avg_loss={last_loss:.4f}")

    elapsed = time.time() - start_time
    return W_in, last_loss, elapsed


In [12]:
def make_unigram_table(freq: Dict[str,int], word2idx: Dict[str,int], power=0.75):
    # returns distribution over indices
    probs = np.zeros(len(word2idx), dtype=np.float64)
    for w,i in word2idx.items():
        probs[i] = (freq[w] ** power)
    probs /= probs.sum()
    return probs

def train_skipgram_neg(encoded_sents, V, freq, word2idx, D=100, window_size=5, epochs=2, lr=0.05, K=5, seed=42):
    rng = np.random.default_rng(seed)
    W_in  = (rng.standard_normal((V, D)) * 0.01).astype(np.float32)
    W_out = (rng.standard_normal((V, D)) * 0.01).astype(np.float32)

    neg_dist = make_unigram_table(freq, word2idx)
    pairs = generate_skipgram_pairs(encoded_sents, window_size)

    start_time = time.time()
    last_loss = None

    for ep in range(epochs):
        total_loss = 0.0
        rng.shuffle(pairs)

        for center, pos in pairs:
            v = W_in[center]     # (D,)

            # positive
            u_pos = W_out[pos]
            score_pos = float(u_pos @ v)
            loss_pos = -math.log(max(sigmoid(score_pos), 1e-12))

            # negatives (sample indices)
            negs = rng.choice(V, size=K, replace=True, p=neg_dist)
            u_negs = W_out[negs]                 # (K,D)
            score_negs = u_negs @ v              # (K,)
            loss_negs = -np.sum(np.log(np.maximum(sigmoid(-score_negs), 1e-12)))

            loss = loss_pos + float(loss_negs)
            total_loss += loss

            # gradients
            g_pos = sigmoid(score_pos) - 1.0     # d/dscore for pos
            grad_v = g_pos * u_pos

            W_out[pos] -= lr * (g_pos * v)

            g_negs = sigmoid(score_negs)         # because loss has -log(sigmoid(-s)) => sigmoid(s)
            grad_v += (g_negs[:, None] * u_negs).sum(axis=0)

            W_out[negs] -= lr * (g_negs[:, None] * v[None, :])

            W_in[center] -= lr * grad_v

        last_loss = total_loss / max(len(pairs), 1)
        print(f"[Skipgram NEG] epoch {ep+1}/{epochs} avg_loss={last_loss:.4f}")

    elapsed = time.time() - start_time
    return W_in, last_loss, elapsed


In [13]:
def build_cooccurrence(encoded_sents, V, window_size=5):
    # Sparse dict: (i,j) -> X_ij
    X = {}
    for sent in encoded_sents:
        n = len(sent)
        for i, wi in enumerate(sent):
            start = max(0, i - window_size)
            end   = min(n, i + window_size + 1)
            for j in range(start, end):
                if j == i:
                    continue
                wj = sent[j]
                dist = abs(j - i)
                inc = 1.0 / dist  # common GloVe weighting
                X[(wi, wj)] = X.get((wi, wj), 0.0) + inc
    return X

def train_glove(encoded_sents, V, D=100, window_size=5, epochs=20, lr=0.05, x_max=100.0, alpha=0.75, seed=42):
    rng = np.random.default_rng(seed)
    W = (rng.standard_normal((V, D)) * 0.01).astype(np.float32)
    C = (rng.standard_normal((V, D)) * 0.01).astype(np.float32)
    bW = np.zeros(V, dtype=np.float32)
    bC = np.zeros(V, dtype=np.float32)

    # AdaGrad accumulators
    gW  = np.ones((V, D), dtype=np.float32)
    gC  = np.ones((V, D), dtype=np.float32)
    gbW = np.ones(V, dtype=np.float32)
    gbC = np.ones(V, dtype=np.float32)

    X = build_cooccurrence(encoded_sents, V, window_size)
    items = list(X.items())

    start_time = time.time()
    last_loss = None

    for ep in range(epochs):
        random.shuffle(items)
        total_loss = 0.0

        for (i, j), xij in items:
            w = (xij / x_max) ** alpha if xij < x_max else 1.0
            logx = math.log(max(xij, 1e-12))

            pred = float(W[i] @ C[j] + bW[i] + bC[j])
            diff = pred - logx
            loss = w * (diff ** 2)
            total_loss += loss

            # gradients
            grad = 2.0 * w * diff
            dWi = grad * C[j]
            dCj = grad * W[i]
            dbWi = grad
            dbCj = grad

            # AdaGrad update
            gW[i]  += dWi * dWi
            gC[j]  += dCj * dCj
            gbW[i] += dbWi * dbWi
            gbC[j] += dbCj * dbCj

            W[i]  -= (lr / np.sqrt(gW[i]))  * dWi
            C[j]  -= (lr / np.sqrt(gC[j]))  * dCj
            bW[i] -= (lr / math.sqrt(float(gbW[i]))) * dbWi
            bC[j] -= (lr / math.sqrt(float(gbC[j]))) * dbCj

        last_loss = total_loss / max(len(items), 1)
        if (ep+1) % max(1, epochs//5) == 0 or ep == 0:
            print(f"[GloVe] epoch {ep+1}/{epochs} avg_loss={last_loss:.4f}")

    elapsed = time.time() - start_time

    # Standard practice: use W + C as final embedding
    vectors = (W + C).astype(np.float32)
    return vectors, last_loss, elapsed


In [14]:
WINDOW_SIZE = 5
D = 100

results = []

# 1) Skipgram Softmax
W_sg, loss_sg, t_sg = train_skipgram_softmax(encoded, V, D=D, window_size=WINDOW_SIZE, epochs=1, lr=0.05)
sem_sg, syn_sg = eval_sem_syn(W_sg, word2idx, idx2word)
results.append(("Skipgram", WINDOW_SIZE, loss_sg, t_sg, syn_sg["accuracy"], sem_sg["accuracy"]))

# 2) Skipgram NEG
W_sgns, loss_sgns, t_sgns = train_skipgram_neg(encoded, V, freq, word2idx, D=D, window_size=WINDOW_SIZE, epochs=2, lr=0.05, K=5)
sem_sgns, syn_sgns = eval_sem_syn(W_sgns, word2idx, idx2word)
results.append(("Skipgram (NEG)", WINDOW_SIZE, loss_sgns, t_sgns, syn_sgns["accuracy"], sem_sgns["accuracy"]))

# 3) GloVe scratch
W_glove, loss_glove, t_glove = train_glove(encoded, V, D=D, window_size=WINDOW_SIZE, epochs=20, lr=0.05)
sem_glove, syn_glove = eval_sem_syn(W_glove, word2idx, idx2word)
results.append(("GloVe (scratch)", WINDOW_SIZE, loss_glove, t_glove, syn_glove["accuracy"], sem_glove["accuracy"]))

# Print table
print("\nModel\t\t\tWindow\tTraining Loss\tTraining Time(s)\tSyntactic Acc\tSemantic Acc")
for r in results:
    print(f"{r[0]:<16}\t{r[1]}\t{r[2]:.4f}\t\t{r[3]:.2f}\t\t\t{r[4]:.4f}\t\t{r[5]:.4f}")


[Skipgram Softmax] epoch 1/1 avg_loss=6.3748
[Skipgram NEG] epoch 1/2 avg_loss=2.7267
[Skipgram NEG] epoch 2/2 avg_loss=2.5282
[GloVe] epoch 1/20 avg_loss=0.0549
[GloVe] epoch 4/20 avg_loss=0.0246
[GloVe] epoch 8/20 avg_loss=0.0197
[GloVe] epoch 12/20 avg_loss=0.0166
[GloVe] epoch 16/20 avg_loss=0.0144
[GloVe] epoch 20/20 avg_loss=0.0129

Model			Window	Training Loss	Training Time(s)	Syntactic Acc	Semantic Acc
Skipgram        	5	6.3748		362.88			0.0000		0.0000
Skipgram (NEG)  	5	2.5282		98.00			0.0040		0.0000
GloVe (scratch) 	5	0.0129		58.02			0.0059		0.0000


In [15]:
# =========================
# ADD-ON 1: GloVe (Gensim) row (4th row)
# ADD-ON 2: WordSim353 CSV -> MSE (+ Spearman, optional)
# =========================

import os, math, time, random, re
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional
from scipy.stats import spearmanr

# ---------- Analogy helpers (works for numpy vectors + your vocab) ----------
def evaluate_analogies_numpy(vectors: np.ndarray, word2idx: Dict[str,int], idx2word: List[str],
                            questions: List[Tuple[str,str,str,str]]):
    eps = 1e-12
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    mat = vectors / np.maximum(norms, eps)

    used = correct = skipped = 0
    for a,b,c,d in questions:
        if not (a in word2idx and b in word2idx and c in word2idx and d in word2idx):
            skipped += 1
            continue

        ia, ib, ic, id_ = word2idx[a], word2idx[b], word2idx[c], word2idx[d]
        q = mat[ib] - mat[ia] + mat[ic]
        q = q / max(np.linalg.norm(q), eps)

        sims = mat @ q
        sims[ia] = sims[ib] = sims[ic] = -np.inf
        pred = int(np.argmax(sims))

        used += 1
        if pred == id_:
            correct += 1

    acc = correct / used if used > 0 else 0.0
    return {"used": used, "correct": correct, "accuracy": acc, "skipped": skipped}

def eval_sem_syn_numpy(vectors, word2idx, idx2word, sections, sem_section, syn_section):
    sem = evaluate_analogies_numpy(vectors, word2idx, idx2word, sections[sem_section])
    syn = evaluate_analogies_numpy(vectors, word2idx, idx2word, sections[syn_section])
    return sem, syn

# ---------- Analogy helpers for gensim KeyedVectors ----------
def evaluate_analogies_kv(kv, questions: List[Tuple[str,str,str,str]]):
    total_used = 0
    correct = 0
    skipped = 0

    words = kv.index_to_key
    mat = kv.vectors.astype(np.float32)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    mat_norm = mat / np.maximum(norms, 1e-12)
    key_to_index = kv.key_to_index

    def has(w): 
        return w in key_to_index

    for a, b, c, d in questions:
        if not (has(a) and has(b) and has(c) and has(d)):
            skipped += 1
            continue

        ia, ib, ic, id_ = key_to_index[a], key_to_index[b], key_to_index[c], key_to_index[d]
        q = mat_norm[ib] - mat_norm[ia] + mat_norm[ic]
        q = q / max(np.linalg.norm(q), 1e-12)

        sims = mat_norm @ q
        sims[ia] = sims[ib] = sims[ic] = -np.inf
        pred_idx = int(np.argmax(sims))
        pred = words[pred_idx]

        total_used += 1
        if pred == d:
            correct += 1

    acc = correct / total_used if total_used > 0 else 0.0
    return {"used": total_used, "correct": correct, "accuracy": acc, "skipped": skipped}

# ---------- WordSim353: load + compute MSE (and Spearman) ----------
def load_wordsim353_csv(path: str) -> pd.DataFrame:
    """
    Robust loader for WordSim353 CSV variants.
    Tries to detect columns for word1, word2, score.
    """
    df = pd.read_csv(path)

    # normalize column names
    cols = {c: re.sub(r"\s+", "", c.strip().lower()) for c in df.columns}
    inv = {v: k for k, v in cols.items()}

    # Common variants: "Word 1", "Word 2", "Human (mean)"
    w1_candidates = ["word1", "word_1", "word 1", "wordone", "worda"]
    w2_candidates = ["word2", "word_2", "word 2", "wordtwo", "wordb"]
    s_candidates  = ["score", "similarity", "human", "human(mean)", "mean", "gold", "rating"]

    def find_col(cands):
        for c in cands:
            key = re.sub(r"\s+", "", c.strip().lower())
            if key in inv:
                return inv[key]
        return None

    w1_col = find_col(w1_candidates) or df.columns[0]
    w2_col = find_col(w2_candidates) or df.columns[1]

    score_col = find_col(s_candidates)
    if score_col is None:
        # fallback: last numeric column
        num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
        if not num_cols:
            raise ValueError("Couldn't find a numeric similarity score column in the WordSim CSV.")
        score_col = num_cols[-1]

    out = df[[w1_col, w2_col, score_col]].copy()
    out.columns = ["word1", "word2", "score"]
    out["word1"] = out["word1"].astype(str).str.strip()
    out["word2"] = out["word2"].astype(str).str.strip()
    out["score"] = pd.to_numeric(out["score"], errors="coerce")
    out = out.dropna(subset=["score"]).reset_index(drop=True)
    return out

def cosine_sim_matrix(vectors: np.ndarray) -> np.ndarray:
    eps = 1e-12
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    return vectors / np.maximum(norms, eps)

def wordsim_metrics_numpy(vectors: np.ndarray, word2idx: Dict[str,int], wordsim_df: pd.DataFrame):
    """
    Returns MSE (after rescaling), Spearman correlation, and counts.
    Rescaling:
      - model cosine in [-1,1] -> [0,1] via (x+1)/2
      - human score normalized to [0,1] via min-max in the dataset
    """
    mat_norm = cosine_sim_matrix(vectors)

    model_sims = []
    human = []
    skipped = 0

    for w1, w2, s in wordsim_df[["word1","word2","score"]].itertuples(index=False):
        if w1 not in word2idx or w2 not in word2idx:
            skipped += 1
            continue
        i, j = word2idx[w1], word2idx[w2]
        sim = float(mat_norm[i] @ mat_norm[j])  # cosine
        model_sims.append(sim)
        human.append(float(s))

    model_sims = np.array(model_sims, dtype=np.float64)
    human = np.array(human, dtype=np.float64)

    if len(human) == 0:
        return {"used": 0, "skipped": skipped, "mse": np.nan, "spearman": np.nan, "p": np.nan}

    # Normalize to comparable scale for MSE
    model_01 = (model_sims + 1.0) / 2.0
    h_min, h_max = float(human.min()), float(human.max())
    human_01 = (human - h_min) / max(h_max - h_min, 1e-12)

    mse = float(np.mean((model_01 - human_01) ** 2))
    rho, p = spearmanr(model_sims, human)  # Spearman is rank-based, so no rescale needed
    return {"used": len(human), "skipped": skipped, "mse": mse, "spearman": float(rho), "p": float(p)}

def wordsim_metrics_kv(kv, wordsim_df: pd.DataFrame):
    words = kv.index_to_key
    mat = kv.vectors.astype(np.float32)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    mat_norm = mat / np.maximum(norms, 1e-12)
    key_to_index = kv.key_to_index

    model_sims = []
    human = []
    skipped = 0

    for w1, w2, s in wordsim_df[["word1","word2","score"]].itertuples(index=False):
        if w1 not in key_to_index or w2 not in key_to_index:
            skipped += 1
            continue
        i, j = key_to_index[w1], key_to_index[w2]
        sim = float(mat_norm[i] @ mat_norm[j])
        model_sims.append(sim)
        human.append(float(s))

    model_sims = np.array(model_sims, dtype=np.float64)
    human = np.array(human, dtype=np.float64)

    if len(human) == 0:
        return {"used": 0, "skipped": skipped, "mse": np.nan, "spearman": np.nan, "p": np.nan}

    model_01 = (model_sims + 1.0) / 2.0
    h_min, h_max = float(human.min()), float(human.max())
    human_01 = (human - h_min) / max(h_max - h_min, 1e-12)

    mse = float(np.mean((model_01 - human_01) ** 2))
    rho, p = spearmanr(model_sims, human)
    return {"used": len(human), "skipped": skipped, "mse": mse, "spearman": float(rho), "p": float(p)}


In [16]:
# =========================
# RUN EVERYTHING (4 rows) + WordSim353 MSE/Spearman
# =========================
# Assumes you already have:
# - word2idx, idx2word, freq, encoded (from your corpus)
# - sections, SEMANTIC_SECTION, SYNTACTIC_SECTION (from analogy file)
# - training functions:
#     train_skipgram_softmax(...)
#     train_skipgram_neg(...)
#     train_glove(...)
#
# Provide your WordSim353 CSV path here:
WORDSIM_PATH = "wordsim353crowd.csv"   # <-- change if needed

wordsim_df = load_wordsim353_csv(WORDSIM_PATH)
print("WordSim rows:", len(wordsim_df))
wordsim_df.head()


WordSim rows: 353


Unnamed: 0,word1,word2,score
0,admission,ticket,5.536
1,alcohol,chemistry,4.125
2,aluminum,metal,6.625
3,announcement,effort,2.0625
4,announcement,news,7.1875


In [17]:
# ---- Train the 3 from-scratch models (your previous functions) ----
WINDOW_SIZE = 5
D = 100

results = []

# 1) Skip-gram Softmax
W_sg, loss_sg, t_sg = train_skipgram_softmax(encoded, len(idx2word), D=D, window_size=WINDOW_SIZE, epochs=1, lr=0.05)
sem_sg, syn_sg = eval_sem_syn_numpy(W_sg, word2idx, idx2word, sections, SEMANTIC_SECTION, SYNTACTIC_SECTION)
ws_sg = wordsim_metrics_numpy(W_sg, word2idx, wordsim_df)
results.append(("Skipgram", WINDOW_SIZE, loss_sg, t_sg, syn_sg["accuracy"], sem_sg["accuracy"], ws_sg["mse"], ws_sg["spearman"]))

# 2) Skip-gram NEG
W_sgns, loss_sgns, t_sgns = train_skipgram_neg(encoded, len(idx2word), freq, word2idx, D=D, window_size=WINDOW_SIZE, epochs=2, lr=0.05, K=5)
sem_sgns, syn_sgns = eval_sem_syn_numpy(W_sgns, word2idx, idx2word, sections, SEMANTIC_SECTION, SYNTACTIC_SECTION)
ws_sgns = wordsim_metrics_numpy(W_sgns, word2idx, wordsim_df)
results.append(("Skipgram (NEG)", WINDOW_SIZE, loss_sgns, t_sgns, syn_sgns["accuracy"], sem_sgns["accuracy"], ws_sgns["mse"], ws_sgns["spearman"]))

# 3) GloVe scratch
W_glove, loss_glove, t_glove = train_glove(encoded, len(idx2word), D=D, window_size=WINDOW_SIZE, epochs=20, lr=0.05)
sem_glove, syn_glove = eval_sem_syn_numpy(W_glove, word2idx, idx2word, sections, SEMANTIC_SECTION, SYNTACTIC_SECTION)
ws_glove = wordsim_metrics_numpy(W_glove, word2idx, wordsim_df)
results.append(("GloVe (scratch)", WINDOW_SIZE, loss_glove, t_glove, syn_glove["accuracy"], sem_glove["accuracy"], ws_glove["mse"], ws_glove["spearman"]))

# 4) GloVe (Gensim) row
# If gensim isn't available / no internet to download, we still create a row with NaNs.
glove_kv = None
gensim_status = "OK"
try:
    import gensim.downloader as api
    glove_kv = api.load("glove-wiki-gigaword-100")
except Exception as e:
    gensim_status = f"UNAVAILABLE: {type(e).__name__}: {e}"

if glove_kv is not None:
    sem_g = evaluate_analogies_kv(glove_kv, sections[SEMANTIC_SECTION])
    syn_g = evaluate_analogies_kv(glove_kv, sections[SYNTACTIC_SECTION])
    ws_g  = wordsim_metrics_kv(glove_kv, wordsim_df)
    results.append(("GloVe (Gensim)", "-", np.nan, np.nan, syn_g["accuracy"], sem_g["accuracy"], ws_g["mse"], ws_g["spearman"]))
else:
    results.append(("GloVe (Gensim)", "-", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan))

print("Gensim status:", gensim_status)


[Skipgram Softmax] epoch 1/1 avg_loss=6.3748
[Skipgram NEG] epoch 1/2 avg_loss=2.7267
[Skipgram NEG] epoch 2/2 avg_loss=2.5282
[GloVe] epoch 1/20 avg_loss=0.0550
[GloVe] epoch 4/20 avg_loss=0.0246
[GloVe] epoch 8/20 avg_loss=0.0198
[GloVe] epoch 12/20 avg_loss=0.0167
[GloVe] epoch 16/20 avg_loss=0.0145
[GloVe] epoch 20/20 avg_loss=0.0129
Gensim status: OK


In [18]:
# ---- Pretty table output ----
print("\nModel\t\t\tWindow\tTrainLoss\tTrainTime(s)\tSynAcc\t\tSemAcc\t\tWordSim MSE\tWordSim Spearman")
for (name, win, loss, tsec, synacc, semacc, mse, rho) in results:
    loss_str = f"{loss:.4f}" if isinstance(loss, (int,float)) and not np.isnan(loss) else "-"
    t_str    = f"{tsec:.2f}" if isinstance(tsec, (int,float)) and not np.isnan(tsec) else "-"
    syn_str  = f"{synacc:.4f}" if isinstance(synacc, (int,float)) and not np.isnan(synacc) else "-"
    sem_str  = f"{semacc:.4f}" if isinstance(semacc, (int,float)) and not np.isnan(semacc) else "-"
    mse_str  = f"{mse:.4f}" if isinstance(mse, (int,float)) and not np.isnan(mse) else "-"
    rho_str  = f"{rho:.4f}" if isinstance(rho, (int,float)) and not np.isnan(rho) else "-"
    print(f"{name:<16}\t{win}\t{loss_str}\t\t{t_str}\t\t{syn_str}\t{sem_str}\t{mse_str}\t\t{rho_str}")



Model			Window	TrainLoss	TrainTime(s)	SynAcc		SemAcc		WordSim MSE	WordSim Spearman
Skipgram        	5	6.3748		361.35		0.0000	0.0000	0.2625		-0.0008
Skipgram (NEG)  	5	2.5282		99.64		0.0040	0.0000	0.2061		0.0383
GloVe (scratch) 	5	0.0129		57.78		0.0079	0.0000	0.1234		0.0630
GloVe (Gensim)  	-	-		-		0.5545	0.0000	0.1552		0.4867


From the data above, we can see that Skipgram (Word2Vec) takes the longest training time and highest training lost due to it having to compute using the whole vocabulary for every training pair (computational workload is higher). Using negative sampling, we can see that it greatly decreases the training loss and training time. Lastly, GloVe converges the fastest and with minimal loss due to optimized weighted global statistic rather than predicting each word at a time. Gensim is pretrained and does not come with training loss or time.

In terms of similarity, GloVe gensim performed the best with the lowest MSE largely duye to a higher scale training.

In [19]:
import json
import numpy as np

# Suppose you have:
#   W_sgns (V,D) and word2idx dict
# OR W_glove (V,D) and word2idx dict
np.save("app/data/embeddings.npy", W_glove.astype(np.float32))   # choose your model here

with open("app/data/word2idx.json", "w", encoding="utf-8") as f:
    json.dump(word2idx, f)

In [20]:
import re

RAW_TEXT_PATH = "archive/SorcerersStone.txt"      # <-- change this
OUTPUT_PATH   = "app/data/corpus.txt"

def clean_text(text):
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()

def split_into_sentences(text):
    # simple sentence splitter (good enough for assignment)
    return re.split(r'(?<=[.!?])\s+', text)

with open(RAW_TEXT_PATH, "r", encoding="utf-8", errors="ignore") as f:
    raw = f.read()

raw = clean_text(raw)
sentences = split_into_sentences(raw)

# Filter very short sentences
sentences = [s for s in sentences if len(s.split()) >= 5]

with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for s in sentences:
        f.write(s + "\n")

print("Saved contexts:", len(sentences))
print("Sample:")
for s in sentences[:5]:
    print("-", s)

Saved contexts: 4403
Sample:
- Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much.
- They were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.
- Dursley was the director of a firm called Grunnings, which made drills.
- He was a big, beefy man with hardly any neck, although he did have a very large mustache.
- Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.
