# Problem 7: Word Vectors - Manual NN (Text8)

**Dataset:** Text8 (200K words)  
**Method:** Neural network trained on PPMI matrix (Manual NumPy)

In [1]:
import numpy as np
from collections import Counter
import re
from tqdm import tqdm

## 1. Preprocessing

In [2]:
def preprocess_text(text):
    """Turn raw text into a list of lowercase, purely alphabetic tokens.

    The text is lowercased, every run of characters other than ``a``-``z`` or
    a space (digits, punctuation, newlines, accented letters, ...) becomes a
    single space, whitespace runs are collapsed, and the result is split on
    whitespace.

    Args:
        text: The raw input string.

    Returns:
        A list of tokens; empty if the text contains no ASCII letters.
    """
    lowered = text.lower()
    letters_only = re.sub(r"[^a-z ]+", ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    tokens = single_spaced.split()
    return tokens

def build_vocabulary(words, vocab_size=5000, min_count=3):
    """Select the most frequent content words and encode the corpus as ids.

    A word is kept when it occurs at least ``min_count`` times, has at least
    three characters and is not in the built-in stop-word list. The kept words
    are ranked by descending frequency (ties keep first-seen order) and the
    top ``vocab_size`` receive ids ``0, 1, 2, ...`` in that order.

    Args:
        words: Sequence of tokens (iterated twice).
        vocab_size: Maximum number of words in the vocabulary.
        min_count: Minimum number of occurrences for a word to be kept.

    Returns:
        A ``(word_to_id, id_to_word, corpus)`` tuple, where ``corpus`` is
        ``words`` mapped to ids with out-of-vocabulary tokens dropped.
    """
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'from',
                  'is', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'this', 'that', 'it', 'as'}

    def is_candidate(token, count):
        return count >= min_count and len(token) >= 3 and token not in stop_words

    frequencies = Counter(words)
    candidates = [pair for pair in frequencies.items() if is_candidate(*pair)]
    # Ascending sort on the negated count is a stable descending sort, so
    # equally frequent words stay in first-seen order.
    candidates.sort(key=lambda pair: -pair[1])
    ranked = candidates[:vocab_size]

    word_to_id = {}
    id_to_word = {}
    for idx, (token, _count) in enumerate(ranked):
        word_to_id[token] = idx
        id_to_word[idx] = token

    corpus = [word_to_id[token] for token in words if token in word_to_id]
    return word_to_id, id_to_word, corpus

## 2. Co-occurrence Matrix

In [3]:
def build_cooccurrence_matrix(corpus, vocab_size, window_size=5):
    """Accumulate distance-weighted co-occurrence counts.

    For every position in ``corpus`` each neighbour at most ``window_size``
    positions away (on either side) adds ``1 / distance`` to the cell
    ``[center_id, neighbour_id]``.

    Args:
        corpus: Sequence of integer word ids, each in ``[0, vocab_size)``.
        vocab_size: Number of rows/columns of the result.
        window_size: Maximum distance between a center word and a context word.

    Returns:
        A ``(vocab_size, vocab_size)`` float32 array of weighted counts.
    """
    cooccur = np.zeros((vocab_size, vocab_size), dtype=np.float32)
    n_tokens = len(corpus)

    for i in tqdm(range(n_tokens), desc="Building co-occurrence"):
        center = corpus[i]
        first = max(0, i - window_size)
        stop = min(n_tokens, i + window_size + 1)

        for j in range(first, stop):
            if j == i:
                # A word is not its own context.
                continue
            # Closer neighbours contribute more.
            cooccur[center, corpus[j]] += 1.0 / abs(i - j)

    return cooccur

## 3. PPMI Computation

In [4]:
def compute_ppmi(cooccur_matrix):
    """Compute a row-normalized Positive PMI matrix from co-occurrence counts.

    For every cell with a positive count,
    ``PMI = log(p_ij / (p_i * p_j + 1e-10))`` where ``p_ij`` is the cell
    divided by the grand total and ``p_i`` / ``p_j`` are the row / column
    marginals divided by the grand total. Negative PMI values are clipped to
    0, the diagonal is zeroed, and each row is divided by its sum so that
    non-empty rows sum to 1.

    Rows whose PPMI values are all zero are returned as all zeros. The input
    is not modified.

    Args:
        cooccur_matrix: 2-D floating-point array of non-negative
            co-occurrence weights.

    Returns:
        An array with the same shape and dtype as ``cooccur_matrix``.
    """
    total = cooccur_matrix.sum()
    ppmi = np.zeros_like(cooccur_matrix)
    if not total > 0:
        # Nothing co-occurs: every PMI is undefined, so return all zeros
        # instead of dividing by zero below.
        return ppmi

    p_word = cooccur_matrix.sum(axis=1, keepdims=True) / total
    p_context = cooccur_matrix.sum(axis=0, keepdims=True) / total
    observed = cooccur_matrix > 0

    joint = cooccur_matrix / total
    expected = p_word * p_context + 1e-10  # epsilon keeps the ratio finite
    # Only take the log where a pair was actually observed; every other cell
    # keeps the 0 that ``ppmi`` was initialised with.
    np.log(joint / expected, out=ppmi, where=observed)
    np.maximum(ppmi, 0, out=ppmi)

    np.fill_diagonal(ppmi, 0)

    row_sums = ppmi.sum(axis=1, keepdims=True)
    # Normalise in place. Rows with a zero sum are skipped by ``where`` and
    # stay all-zero; without an ``out`` array those rows would be left as
    # uninitialized memory.
    np.divide(ppmi, row_sums, out=ppmi, where=row_sums > 0)

    return ppmi

## 4. Neural Network

In [5]:
class WordVectorNN:
    """Two-layer softmax network whose first weight matrix holds the embeddings.

    ``W1`` has shape ``(vocab_size, embedding_dim)`` and row ``i`` is the
    embedding of word ``i``. ``W2`` has shape ``(embedding_dim, vocab_size)``
    and projects an embedding to one logit per vocabulary word. Both are
    float32 and initialised from a normal distribution with mean 0 and
    standard deviation 0.1.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create randomly initialised weights (``W1`` is drawn before ``W2``)."""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        input_shape = (vocab_size, embedding_dim)
        output_shape = (embedding_dim, vocab_size)
        self.W1 = np.random.normal(0, 0.1, input_shape).astype(np.float32)
        self.W2 = np.random.normal(0, 0.1, output_shape).astype(np.float32)

    def softmax(self, x):
        """Return the softmax of ``x``, computed over all of its elements."""
        # Subtracting the maximum avoids overflow in exp().
        shifted = x - np.max(x)
        exp_x = np.exp(shifted)
        denominator = np.sum(exp_x) + 1e-10
        return exp_x / denominator

    def forward(self, word_idx):
        """Return ``(hidden, probs)`` for the word with id ``word_idx``.

        ``hidden`` is row ``word_idx`` of ``W1``; ``probs`` is the softmax of
        ``hidden @ W2``.
        """
        hidden = self.W1[word_idx]
        logits = hidden @ self.W2
        return hidden, self.softmax(logits)

    def backward(self, word_idx, hidden, probs, target, lr):
        """Apply one gradient-descent update to ``W2`` and row ``word_idx`` of ``W1``.

        ``probs - target`` is used as the gradient with respect to the logits.
        """
        error = probs - target
        grad_W2 = np.outer(hidden, error)
        # Both gradients are computed before either weight matrix is changed.
        grad_hidden = error @ self.W2.T
        self.W2 -= lr * grad_W2
        self.W1[word_idx] -= lr * grad_hidden

    def train_step(self, word_idx, target, lr):
        """Run forward + backward for one word and return the cross-entropy loss.

        The loss is computed from the probabilities before the update.
        """
        hidden, probs = self.forward(word_idx)
        log_probs = np.log(probs + 1e-10)  # epsilon guards against log(0)
        loss = -np.sum(target * log_probs)
        self.backward(word_idx, hidden, probs, target, lr)
        return loss

    def get_embedding(self, word_idx):
        """Return row ``word_idx`` of ``W1`` (the word's embedding)."""
        return self.W1[word_idx]

## 5. Training

In [6]:
def train_model(model, ppmi_matrix, epochs=10, lr=0.1):
    """Train ``model`` in place, one randomly ordered pass over all rows per epoch.

    In each epoch every row index of ``ppmi_matrix`` is visited once in a
    freshly shuffled order; the row is passed to ``model.train_step`` as the
    target for that index. The mean loss of the epoch is printed.

    Args:
        model: Object exposing ``train_step(word_idx, target, lr)``.
        ppmi_matrix: 2-D array; row ``i`` is the target for word ``i``.
        epochs: Number of passes over the rows.
        lr: Learning rate forwarded to ``train_step``.
    """
    vocab_size = ppmi_matrix.shape[0]

    for epoch in range(epochs):
        visit_order = np.arange(vocab_size)
        np.random.shuffle(visit_order)

        progress = tqdm(visit_order, desc=f"Epoch {epoch+1}/{epochs}")
        total_loss = sum(
            model.train_step(idx, ppmi_matrix[idx], lr) for idx in progress
        )

        avg_loss = total_loss / vocab_size
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")

## 6. Evaluation

In [7]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    If either vector has zero Euclidean norm the result is ``0``.
    """
    norm1 = np.linalg.norm(v1)
    norm2 = np.linalg.norm(v2)
    if norm1 != 0 and norm2 != 0:
        return np.dot(v1, v2) / (norm1 * norm2)
    # The direction of a zero vector is undefined.
    return 0

def find_similar_words(word, word_to_id, id_to_word, model, top_k=15):
    """Return the ``top_k`` vocabulary words most similar to ``word``.

    Similarity is the cosine similarity between the model's embedding of
    ``word`` and the embedding of every other word id in
    ``range(model.vocab_size)``.

    Args:
        word: The query word.
        word_to_id: Mapping from word to integer id.
        id_to_word: Mapping from integer id to word.
        model: Object exposing ``vocab_size`` and ``get_embedding(idx)``.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted by descending
        similarity, or ``None`` if ``word`` is not in ``word_to_id``.
    """
    if word not in word_to_id:
        return None

    query_idx = word_to_id[word]
    query_vec = model.get_embedding(query_idx)

    # Score every word except the query itself.
    scored = [
        (id_to_word[idx], cosine_similarity(query_vec, model.get_embedding(idx)))
        for idx in range(model.vocab_size)
        if idx != query_idx
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

def evaluate_model(model, test_words, word_to_id, id_to_word):
    """Print the nearest neighbours of each word in ``test_words``.

    Words for which ``find_similar_words`` returns ``None`` or an empty list
    are skipped silently.
    """
    for word in test_words:
        neighbours = find_similar_words(word, word_to_id, id_to_word, model)
        if not neighbours:
            continue

        print(f"\n{word.upper()}:")
        for w, sim in neighbours:
            print(f"  {w:20s} {sim:.4f}")

## 7. Run Pipeline

In [8]:
# End-to-end run: load text -> vocabulary -> co-occurrence -> PPMI -> train -> evaluate.
print("Loading Text8...")
# Explicit encoding so the read does not depend on the platform's locale
# default (the Text8 slice is presumably plain ASCII -- confirm for other corpora).
with open('../text8_200K.txt', 'r', encoding='utf-8') as f:
    text = f.read()

words = preprocess_text(text)
print(f"Total words: {len(words):,}")

print("\nBuilding vocabulary...")
# vocab_size is an upper bound; the actual vocabulary can be smaller once rare
# words, short words and stop words are filtered out.
word_to_id, id_to_word, corpus = build_vocabulary(words, vocab_size=10000)
print(f"Vocab: {len(word_to_id):,}, Corpus: {len(corpus):,}")

print("\nBuilding co-occurrence matrix...")
cooccur = build_cooccurrence_matrix(corpus, len(word_to_id), window_size=5)

print("\nComputing PPMI...")
ppmi = compute_ppmi(cooccur)

print("\nInitializing model...")
# The model is sized from the real vocabulary, not the requested vocab_size.
model = WordVectorNN(vocab_size=len(word_to_id), embedding_dim=200)

print("\nTraining...")
train_model(model, ppmi, epochs=10, lr=0.789)

print("\n" + "="*60)
print("EVALUATION")
print("="*60)
# Words missing from the vocabulary are skipped by evaluate_model.
test_words = ["china", "computer", "phone", "napoleon", "god", "catholic"]
evaluate_model(model, test_words, word_to_id, id_to_word)

Loading Text8...
Total words: 199,999

Building vocabulary...
Vocab: 6,897, Corpus: 121,918

Building co-occurrence matrix...


Building co-occurrence: 100%|██████████| 121918/121918 [00:00<00:00, 469306.59it/s]



Computing PPMI...


Computing PPMI: 100%|██████████| 6897/6897 [00:03<00:00, 2040.74it/s]



Initializing model...

Training...


Epoch 1/10: 100%|██████████| 6897/6897 [00:05<00:00, 1218.80it/s]


Epoch 1/10, Avg Loss: 8.8485


Epoch 2/10: 100%|██████████| 6897/6897 [00:05<00:00, 1236.70it/s]


Epoch 2/10, Avg Loss: 8.7655


Epoch 3/10: 100%|██████████| 6897/6897 [00:05<00:00, 1238.36it/s]


Epoch 3/10, Avg Loss: 8.6648


Epoch 4/10: 100%|██████████| 6897/6897 [00:05<00:00, 1168.63it/s]


Epoch 4/10, Avg Loss: 8.5148


Epoch 5/10: 100%|██████████| 6897/6897 [00:05<00:00, 1213.19it/s]


Epoch 5/10, Avg Loss: 8.2544


Epoch 6/10: 100%|██████████| 6897/6897 [00:05<00:00, 1206.42it/s]


Epoch 6/10, Avg Loss: 7.8358


Epoch 7/10: 100%|██████████| 6897/6897 [00:05<00:00, 1167.63it/s]


Epoch 7/10, Avg Loss: 7.3823


Epoch 8/10: 100%|██████████| 6897/6897 [00:05<00:00, 1215.26it/s]


Epoch 8/10, Avg Loss: 6.9226


Epoch 9/10: 100%|██████████| 6897/6897 [00:05<00:00, 1187.31it/s]


Epoch 9/10, Avg Loss: 6.4757


Epoch 10/10: 100%|██████████| 6897/6897 [00:05<00:00, 1225.92it/s]


Epoch 10/10, Avg Loss: 6.0846

EVALUATION

CHINA:
  japan                0.6247
  mongolia             0.5987
  myanmar              0.5951
  thailand             0.5836
  sri                  0.5798
  korea                0.5774
  india                0.5716
  asia                 0.5541
  buddhism             0.5443
  southeast            0.5393
  singapore            0.5380
  cambodia             0.5344
  regions              0.5312
  lanka                0.5285
  malaysia             0.5279

COMPUTER:
  software             0.4113
  equipment            0.3852
  animated             0.3786
  generated            0.3613
  motion               0.3606
  media                0.3524
  probability          0.3522
  ansi                 0.3436
  control              0.3400
  storage              0.3380
  pages                0.3365
  technique            0.3327
  unusual              0.3309
  navigation           0.3289
  intended             0.3288

PHONE:
  gear                 0.4430
 

The goal is to turn words into meaningful numbers (embeddings) that computers can work with. Word embeddings are simply words converted into vectors of numbers that capture the meaning of each word. The network learns by trying to predict which words appear near each other in text. Words that are used in similar contexts, like "cat" and "dog", end up with similar numeric representations. The pipeline works as follows:
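1. Build a co-occurrence matrix that counts how often words appear near each other (here, within a window of 5 words on each side, with closer neighbours weighted more heavily as 1/distance).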
2. Then convert the raw counts to PPMI (Positive Pointwise Mutual Information), which measures whether two words appear together more often than random chance would predict. This filters out meaningless co-occurrences.
3. Then train the neural network to reconstruct the PPMI matrix. Input: a one-hot encoded vector for a word -> hidden layer (embedding) -> output: a softmax distribution over all vocabulary words, trained with cross-entropy to match that word's row-normalized PPMI values. The hidden layer thus becomes the word embedding.
4. Words used in similar contexts (for example, "cat" and "dog" both appear near "pet", "animal", and "furry") have similar PPMI patterns, so the network learns similar embeddings for them. The network is forced to compress word relationships into small embeddings that capture meaning.
5. Thus each word gets a dense vector (200 numbers in this notebook, set by `embedding_dim=200`) where similar words have similar vectors.
6. Example of the vector arithmetic behind this: $\vec{v}(\text{king}) - \vec{v}(\text{man}) + \vec{v}(\text{woman}) \approx \vec{v}(\text{queen})$.