In [None]:
!pip install gensim

import os
import random
import urllib.request
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec

url = "https://mattmahoney.net/dc/text8.zip"
if not os.path.exists("text8.zip"):
    urllib.request.urlretrieve(url, "text8.zip")

!unzip -o text8.zip

words = open("text8").read().split()
print("Total tokens:", len(words))

# Words seen fewer than `min_count` times are dropped from the vocabulary.
min_count = 5

# Counter is a dict subclass that counts in C and keeps first-seen order,
# so the ids assigned below follow the order words first appear in `words`.
freq = Counter(words)

# vocab: word -> corpus frequency, restricted to sufficiently frequent words.
vocab = {w: c for w, c in freq.items() if c >= min_count}
idx2word = list(vocab)                              # id -> word
word2idx = {w: i for i, w in enumerate(idx2word)}   # word -> id
V = len(vocab)

# Encode the corpus as ids, silently skipping out-of-vocabulary words.
# A single `.get` per token replaces the membership test + second lookup;
# the `is not None` test is required because id 0 is a valid (falsy) id.
tokens = [i for i in map(word2idx.get, words) if i is not None]
print("Vocab size:", V)
print("Filtered tokens:", len(tokens))

# Skip-gram training pairs: every token is paired with each neighbour that
# lies at most `window` positions to its left or right.
window = 2
n_tokens = len(tokens)

# Relative positions of the neighbours, in left-to-right order, excluding
# the centre position itself.
offsets = [delta for delta in range(-window, window + 1) if delta != 0]

# NOTE: every (target, context) pair is materialized up front in one list.
pairs = [
    (center, tokens[pos + delta])
    for pos, center in enumerate(tokens)
    for delta in offsets
    if 0 <= pos + delta < n_tokens  # neighbours past either corpus end are skipped
]

print("Training pairs:", len(pairs))

# Model hyperparameters.
dim = 100    # embedding dimensionality
neg_k = 5    # negative samples drawn per positive pair

# W holds the target-word vectors, C the context-word vectors.
# A training step only touches a handful of rows, so the embeddings emit
# sparse gradients (sparse=True); a dense optimizer would otherwise rewrite
# all V x dim entries of both tables on every single step.
W = nn.Embedding(V, dim, sparse=True)
C = nn.Embedding(V, dim, sparse=True)

# SparseAdam is the Adam variant that accepts sparse gradients and updates
# only the rows that actually received a gradient.
optimizer = optim.SparseAdam(list(W.parameters()) + list(C.parameters()), lr=0.01)

# Skip-gram with negative sampling, trained one (target, context) pair at a
# time. Per-pair objective (maximized):
#   log sigmoid(vt . vc) + sum_k log sigmoid(-vt . v_neg_k)
epochs = 2
max_steps = 2_000_000   # at most this many pairs are consumed per epoch

# logsigmoid evaluates log(sigmoid(x)) in a numerically stable way. The naive
# torch.log(torch.sigmoid(x)) underflows to log(0) = -inf for large negative
# x, which turns the loss into inf and the gradients into NaN.
log_sigmoid = nn.functional.logsigmoid

for epoch in range(epochs):
    # Reshuffle so each epoch trains on a different random subset of pairs.
    random.shuffle(pairs)
    total_loss = 0.0

    for t, c in pairs[:max_steps]:
        optimizer.zero_grad()

        vt = W(torch.tensor(t))   # target-word vector
        vc = C(torch.tensor(c))   # true context-word vector

        # Positive term: pull the observed pair together.
        pos_loss = log_sigmoid(torch.dot(vt, vc))

        # Negative term: push the target away from `neg_k` context vectors
        # whose ids are drawn uniformly from the vocabulary.
        neg_ids = torch.randint(0, V, (neg_k,))
        neg_vecs = C(neg_ids)
        neg_loss = log_sigmoid(-torch.matmul(neg_vecs, vt)).sum()

        # Minimize the negated log-likelihood.
        loss = -(pos_loss + neg_loss)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Reported value is the sum of the per-pair losses over the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.2f}")

# Export the target-word embeddings as a NumPy array (one row per word id).
my_vectors = W.weight.detach().numpy()

# GENSIM
# Reference skip-gram model (sg=1) trained on the same filtered corpus.
#
# `tokens` holds integer ids. Feeding those to gensim would key its
# vocabulary by ints, so string lookups such as wv["king"] would fail.
# Map the ids back to words before training.
# The corpus has no sentence boundaries, so it is cut into fixed-size chunks.
chunk_len = 1000
sentences = [
    [idx2word[i] for i in tokens[start:start + chunk_len]]
    for start in range(0, len(tokens), chunk_len)
]

# Reuse the hyperparameters of the hand-written model so that the two stay
# comparable when one of them is changed.
gensim_model = Word2Vec(
    sentences,
    vector_size=dim,
    window=window,
    min_count=min_count,
    sg=1,
    workers=4,
)

# COSINE SIMILARITY
def cosine(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, same length as ``a``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or
        ``0.0`` when either vector has zero norm (the similarity is
        undefined there; returning 0.0 avoids a division by zero and a NaN
        that would poison any later sorting or averaging).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

# For a few probe words, print the cosine between our vector and gensim's.
# NOTE(review): the two models are trained separately, so presumably their
# coordinate systems are not aligned -- confirm these numbers are meaningful.
print("\nCOSINE SIMILARITY (OURS vs GENSIM)")
for probe in ("king", "queen", "man", "woman"):
    # Only words known to both models can be compared.
    if probe not in word2idx or probe not in gensim_model.wv:
        continue
    ours = my_vectors[word2idx[probe]]
    theirs = gensim_model.wv[probe]
    print(probe, cosine(ours, theirs))

def analogy(a, b, c, model, topn=5):
    """Answer "a is to b as c is to ?" with vector arithmetic.

    Args:
        a, b, c: Query words; all three must be present in ``model``.
        model: Mapping-like embedding store supporting ``model[word]`` and
            exposing its vocabulary as ``model.key_to_index``.
        topn: Number of candidates to return.

    Returns:
        Up to ``topn`` ``(word, similarity)`` tuples, most similar first,
        where similarity is the cosine to ``model[b] - model[a] + model[c]``.
        The three query words are never returned: they usually lie closest
        to the target vector and would otherwise crowd out the real answer.
    """
    target = model[b] - model[a] + model[c]
    query_words = {a, b, c}
    scores = [
        (w, cosine(target, model[w]))
        for w in model.key_to_index
        if w not in query_words
    ]
    return sorted(scores, key=lambda pair: pair[1], reverse=True)[:topn]

# Classic analogy probe, evaluated on the gensim vectors.
print("\nANALOGY: king - man + woman")
analogy_hits = analogy("man", "king", "woman", gensim_model.wv)
print(analogy_hits)

# BIAS DETECTION
# Direction in the gensim space pointing from "woman" towards "man".
man_vec = gensim_model.wv["man"]
woman_vec = gensim_model.wv["woman"]
gender_dir = man_vec - woman_vec

def bias(word):
    """Return the cosine between ``word``'s gensim vector and ``gender_dir``.

    ``gender_dir`` is "man" minus "woman", so positive values mean the word
    points towards "man" and negative values towards "woman".
    """
    word_vec = gensim_model.wv[word]
    return cosine(gender_dir, word_vec)

# Report the gender-direction score for a few occupation words.
print("\nGENDER BIAS")
for occupation in ("programmer", "doctor", "nurse", "homemaker"):
    # Words missing from the gensim vocabulary are skipped silently.
    if occupation not in gensim_model.wv:
        continue
    print(occupation, bias(occupation))


Archive:  text8.zip
  inflating: text8                   
Total tokens: 17005207
Vocabulary size: 71290
Filtered tokens: 16718844
