In [1]:
!pip install gensim nltk


Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/e4/0d/d60f023abd74e1ccd448c97ec9c0d78ddc43a95497c14939a05c5de6f887/gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl.metadata
  Using cached gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting nltk
  Obtaining dependency information for nltk from https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl.metadata
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Obtaining dependency information for numpy<2.0,>=1.18.5 from https://files.pythonhosted.org/packages/ae/8c/ab03a7c25741f9ebc92684a20125fbc9fc1b8e1e700beb9197d750fdff88/numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl.metadata
  Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Obtaining dependency

In [2]:
import nltk
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import reuters
import string

# Download dataset and tokenizer
nltk.download('punkt')
nltk.download('reuters')

# Load Reuters corpus (sample dataset)
sentences = [word_tokenize(reuters.raw(file_id).lower()) for file_id in reuters.fileids()]

# Remove punctuation
sentences = [[word for word in sentence if word.isalnum()] for sentence in sentences]

print(f"Sample sentence: {sentences[0]}")


[nltk_data] Downloading package punkt to /Users/vaibhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to /Users/vaibhav/nltk_data...




In [3]:
# Train Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)

# Save model
model.save("word2vec_reuters.model")


In [6]:
# Load model
model = Word2Vec.load("word2vec_reuters.model")

# Find similar words
word = "rope"
if word in model.wv:
    similar_words = model.wv.most_similar(word, topn=5)
    print(f"Words similar to '{word}': {similar_words}")
else:
    print(f"Word '{word}' not in vocabulary.")


Word 'rope' not in vocabulary.


## Co-occuence matrix 

In [8]:
import random

def initialize_matrices(vocab_size, embedding_dim):
    # Initialize word and context matrices with small random values
    W = [[random.uniform(-0.01, 0.01) for _ in range(embedding_dim)] for _ in range(vocab_size)]
    C = [[random.uniform(-0.01, 0.01) for _ in range(embedding_dim)] for _ in range(vocab_size)]
    return W, C

def matrix_factorization(M, W, C, learning_rate=0.01, epochs=5000):
    vocab_size = len(M)
    embedding_dim = len(W[0])

    for epoch in range(epochs):
        total_loss = 0
        for i in range(vocab_size):
            for j in range(vocab_size):
                if M[i][j] > 0:  # Only update for non-zero co-occurrence
                    # Compute prediction
                    predicted = sum(W[i][k] * C[j][k] for k in range(embedding_dim))
                    error = M[i][j] - predicted

                    # Update word and context vectors using gradient descent
                    for k in range(embedding_dim):
                        W[i][k] += learning_rate * error * C[j][k]
                        C[j][k] += learning_rate * error * W[i][k]

                    total_loss += error ** 2  # Compute loss

        if epoch % 500 == 0:
            print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    return W, C

# Example Co-occurrence Matrix (from previous step)
M = [
    [0, 1, 0, 0, 0, 0, 0, 0],  # "I"
    [1, 0, 1, 1, 0, 0, 0, 0],  # "want"
    [1, 1, 0, 1, 1, 0, 0, 0],                # to
    [0, 1, 1, 0, 1, 1 ,0, 0],                # get
    [0, 0, 1, 1, 0, 1, 1, 0],                # good
    [0, 0, 1, 1, 0, 1, 1, 0],                # at
    [0, 0, 0, 1, 1, 0, 1, 1],  # "deep"
    [0, 0, 0, 0, 1, 1, 0, 1]   # "learning"
]

# Define parameters
vocab_size = len(M)
embedding_dim = 3  # Reduce to 2D for visualization

# Initialize matrices
W, C = initialize_matrices(vocab_size, embedding_dim)

# Train embeddings using matrix factorization
word_embeddings, context_embeddings = matrix_factorization(M, W, C)

print("\nWord Embeddings:")
for i, vec in enumerate(word_embeddings):
    print(f"Word {i}: {vec}")


Epoch 0, Loss: 27.0004
Epoch 500, Loss: 0.0037
Epoch 1000, Loss: 0.0004
Epoch 1500, Loss: 0.0003
Epoch 2000, Loss: 0.0003
Epoch 2500, Loss: 0.0003
Epoch 3000, Loss: 0.0002
Epoch 3500, Loss: 0.0002
Epoch 4000, Loss: 0.0002
Epoch 4500, Loss: 0.0002

Word Embeddings:
Word 0: [0.9650556424386794, 0.1919270311002789, -0.14527167982458658]
Word 1: [0.958565067577809, 0.23804208131819674, -0.17277550622386595]
Word 2: [0.9606294327740452, 0.2535995705445823, -0.11155500357093034]
Word 3: [0.9171117069091419, 0.3547063078973181, -0.17876467230158663]
Word 4: [0.8772820597188794, 0.46108695391963056, -0.12723690801378695]
Word 5: [0.8762592551041872, 0.4620669448888442, -0.131150757772208]
Word 6: [0.8577030222680163, 0.526559685028864, -0.002359915552266447]
Word 7: [0.8391726677668584, 0.555511207741579, -0.052391152516393334]


## Skip gram

In [9]:
import numpy as np
import random

# Sample text corpus
corpus = "I love deep learning and natural language processing"

# Tokenize (split into words)
words = corpus.split()
vocab = list(set(words))  # Unique words
word_to_id = {word: i for i, word in enumerate(vocab)}  # Map word → index
id_to_word = {i: word for word, i in word_to_id.items()}  # Map index → word
vocab_size = len(vocab)

# Parameters
window_size = 2  # How many words on each side we consider as context

# Generate training pairs (target, context)
training_pairs = []

for i, target in enumerate(words):
    target_idx = word_to_id[target]
    start = max(0, i - window_size)
    end = min(len(words), i + window_size + 1)
    
    for j in range(start, end):
        if i != j:  # Avoid pairing the word with itself
            context_idx = word_to_id[words[j]]
            training_pairs.append((target_idx, context_idx))

print("Sample training pairs (word indices):", training_pairs)


Sample training pairs (word indices): [(7, 1), (7, 4), (1, 7), (1, 4), (1, 3), (4, 7), (4, 1), (4, 3), (4, 5), (3, 1), (3, 4), (3, 5), (3, 0), (5, 4), (5, 3), (5, 0), (5, 2), (0, 3), (0, 5), (0, 2), (0, 6), (2, 5), (2, 0), (2, 6), (6, 0), (6, 2)]


In [10]:
# Embedding size (dimension of word vectors)
embedding_dim = 5  

# Initialize weight matrices randomly
W = np.random.randn(vocab_size, embedding_dim) * 0.01
C = np.random.randn(embedding_dim, vocab_size) * 0.01


In [11]:
def softmax(x):
    e_x = np.exp(x - np.max(x))  # Numerical stability
    return e_x / np.sum(e_x)

def forward_pass(target_idx):
    # Lookup word embedding
    v_target = W[target_idx]  # Shape: (embedding_dim,)
    
    # Compute scores (dot product with context matrix)
    scores = np.dot(v_target, C)  # Shape: (vocab_size,)
    
    # Convert to probability distribution using softmax
    y_pred = softmax(scores)  # Shape: (vocab_size,)
    
    return y_pred  # Probability distribution over all words


In [12]:
def compute_loss(y_pred, true_idx):
    return -np.log(y_pred[true_idx])  # Negative log likelihood


In [15]:
def backward_pass(target_idx, context_idx, y_pred, C, learning_rate=0.01):
    # One-hot encoding for the true context word
    y_true = np.zeros(vocab_size)
    y_true[context_idx] = 1  # The actual context word

    # Compute error
    error = y_pred - y_true  # Shape: (vocab_size,)

    # Compute gradients
    dW = np.outer(error, C.T)  # Gradient for W
    dC = np.outer(W[target_idx], error)  # Gradient for C

    # Update parameters
    W[target_idx] -= learning_rate * dW.sum(axis=0)
    C -= learning_rate * dC


In [17]:
# Training parameters
epochs = 1000
learning_rate = 0.01

for epoch in range(epochs):
    loss = 0
    for target_idx, context_idx in training_pairs:
        # Forward pass
        y_pred = forward_pass(target_idx)

        # Compute loss
        loss += compute_loss(y_pred, context_idx)

        # Backward pass (update weights)
        backward_pass(target_idx, context_idx, y_pred, C, learning_rate)

    if epoch % 100 == 0:
        print(f"Epoch {epoch}, Loss: {loss:.4f}")


ValueError: operands could not be broadcast together with shapes (5,) (40,) (5,) 