**Import Libraries**

In [None]:
import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import nltk
from nltk.corpus import reuters, stopwords

# Fetch the two NLTK resources this notebook reads: the Reuters corpus and the stopword lists.
for corpus_name in ('reuters', 'stopwords'):
    nltk.download(corpus_name)

# Seed every random number generator in use (Python, NumPy, PyTorch) so that
# runs are repeatable, which makes debugging and model comparison possible.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<torch._C.Generator at 0x785ea9b1a130>

**Prepare the dataset**

In [None]:

stop_words = set(stopwords.words('english'))

# Build one token list per Reuters file: keep purely alphabetic tokens,
# lower-case them, and drop English stopwords.
sentences = []
for fileid in reuters.fileids():
    alphabetic_lowered = (w.lower() for w in reuters.words(fileid) if w.isalpha())
    words = [token for token in alphabetic_lowered if token not in stop_words]
    sentences.append(words)

print("Total sentences:", len(sentences))

Total sentences: 10788


In [None]:
# Cap the vocabulary size so that training does not need a GPU for a long period.
VOCAB_LIMIT = 10000
UNK_TOKEN = "<UNK>"

# Count how often each token occurs across all token lists.
all_words = []
for sentence in sentences:
    all_words.extend(sentence)
word_counts = Counter(all_words)

# Vocabulary = the UNK placeholder (index 0) followed by the most frequent
# words, VOCAB_LIMIT entries in total.
most_frequent = word_counts.most_common(VOCAB_LIMIT - 1)
vocab = [UNK_TOKEN]
vocab.extend(word for word, _ in most_frequent)

# Forward and reverse lookup tables between words and integer indices.
word2index = {word: idx for idx, word in enumerate(vocab)}
index2word = dict(zip(word2index.values(), word2index.keys()))

vocab_size = len(vocab)

print("Vocabulary size:", vocab_size)
print("UNK index:", word2index[UNK_TOKEN])


Vocabulary size: 10000
UNK index: 0


In [None]:
# Replace every word by its index from word2index so the data is ready for
# model training; words outside the vocabulary fall back to the UNK index.
unk_index = word2index[UNK_TOKEN]
corpus = [
    [word2index.get(word, unk_index) for word in sentence]
    for sentence in sentences
]


**Prepare Train Data**

In [None]:
# dynamic context window; the default half-width is 2
def random_batch(batch_size, corpus, window_size=2):
    """Randomly sample (center word, context word) skip-gram training pairs.

    Each pair is produced by picking a random sentence, a random center
    position that has ``window_size`` tokens on both sides, and a random
    position inside the window around that center. Draws that hit a
    too-short sentence or the center position itself are rejected and retried.

    Args:
        batch_size: number of pairs to return. Values <= 0 return empty lists.
        corpus: sequence of sentences, each a sequence of word indices.
        window_size: number of positions on each side of the center word from
            which the context word may be drawn. Must be >= 1.

    Returns:
        Tuple ``(input_batch, label_batch)`` of two equally long lists: the
        center-word indices and the corresponding context-word indices.

    Raises:
        ValueError: if ``window_size`` is smaller than 1, or if no sentence in
            ``corpus`` has at least ``2 * window_size + 1`` tokens. Without
            these checks the rejection loop below would never terminate.
    """
    input_batch = []
    label_batch = []

    # nothing requested: same result as the loop below, without validating inputs
    if batch_size <= 0:
        return input_batch, label_batch

    if window_size < 1:
        # with an empty window the only candidate is the center word itself,
        # which is always rejected
        raise ValueError("window_size must be at least 1")

    min_length = 2 * window_size + 1
    # any() stops at the first usable sentence, so this check is cheap in practice
    if not any(len(sentence) >= min_length for sentence in corpus):
        raise ValueError(
            f"corpus has no sentence with at least {min_length} tokens "
            f"(required for window_size={window_size})"
        )

    while len(input_batch) < batch_size:

        # pick a random sentence
        sentence = corpus[random.randint(0, len(corpus) - 1)]

        # the sentence must leave room for a full window around the center word
        if len(sentence) < min_length:
            continue

        # pick a random center position that has window_size neighbours on each side
        center_word_pos = random.randint(window_size, len(sentence) - 1 - window_size)

        # pick a random position inside the window; reject the center itself
        context_word_pos = random.choice(
            range(center_word_pos - window_size, center_word_pos + window_size + 1)
        )
        if context_word_pos == center_word_pos:
            continue

        input_batch.append(sentence[center_word_pos])
        label_batch.append(sentence[context_word_pos])

    return input_batch, label_batch


**Implement the Model**

In [None]:

class Skipgram(nn.Module):
    """Skip-gram word2vec model with a full-softmax loss.

    Two embedding tables are kept: one used when a word is the center word and
    one used when it is an outside (context) word. ``forward`` returns the mean
    negative log-probability of the observed outside word given the center
    word, normalised over the words listed in ``all_vocabs``.
    """

    def __init__(self, voc_size, emb_size):
        """Create the two embedding tables.

        Args:
            voc_size: number of rows in each embedding table.
            emb_size: dimensionality of every embedding vector.
        """
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)

    def forward(self, center, outside, all_vocabs):
        """Compute the mean negative log-likelihood for a batch of pairs.

        Args:
            center: LongTensor of center-word indices, expected shape (B,).
            outside: LongTensor of context-word indices, expected shape (B,).
            all_vocabs: LongTensor of shape (B, V) listing, per sample, the
                word indices to normalise over.

        Returns:
            Scalar tensor: ``-mean(score(outside, center) - logsumexp(scores))``.
        """
        center_embedding  = self.embedding_center(center).unsqueeze(2)    # (B, E, 1)
        outside_embedding = self.embedding_outside(outside).unsqueeze(1)  # (B, 1, E)
        all_embedding     = self.embedding_outside(all_vocabs)            # (B, V, E)

        # dot product between each observed outside word and its center word
        positive_score = outside_embedding.bmm(center_embedding).reshape(-1)  # (B,)

        # dot product between every candidate word and the center word
        all_scores = all_embedding.bmm(center_embedding).squeeze(2)           # (B, V)

        # Stay in log space: log(exp(s) / sum(exp(all))) == s - logsumexp(all).
        # Exponentiating first overflows to inf (and the loss to NaN) once the
        # scores become large. Keeping both terms at shape (B,) also avoids
        # broadcasting a (B, 1) numerator against a (B,) denominator into an
        # unintended (B, B) matrix.
        log_normalizer = torch.logsumexp(all_scores, dim=1)                   # (B,)

        loss = -torch.mean(positive_score - log_normalizer)
        return loss


**Set Hyperparameters for Model Training**

In [None]:
# --- model / optimisation hyperparameters ---
embedding_dim = 100
learning_rate = 0.001

# --- batching / sampling hyperparameters ---
batch_size = 128
num_epochs = 5000
window_size = 2

model = Skipgram(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Every vocabulary index, repeated once per batch row: shape (batch_size, vocab_size).
vocab_indices = torch.arange(vocab_size, dtype=torch.long)
all_vocabs = vocab_indices.unsqueeze(0).repeat(batch_size, 1)


**Test the model before training**

In [None]:
# draw one batch, using the configured context window rather than
# random_batch's built-in default
x_batch, y_batch = random_batch(batch_size, corpus, window_size)

# convert the sampled index lists to tensors
input_tensor = torch.LongTensor(x_batch)
label_tensor = torch.LongTensor(y_batch)

# one row of all vocabulary indices per sample in the batch
all_vocabs = torch.LongTensor(range(vocab_size))
all_vocabs = all_vocabs.unsqueeze(0).repeat(input_tensor.size(0), 1)

# This is only a sanity check of the untrained model, so skip building the
# autograd graph.
with torch.no_grad():
    test_loss = model(input_tensor, label_tensor, all_vocabs)
print("Initial test loss (before training):", test_loss.item())


Initial test loss (before training): 38.501251220703125


**Train the Model**

In [None]:
import matplotlib.pyplot as plt

# loss of every training step, in order
losses = []

# The vocabulary index matrix is the same for every step (random_batch returns
# exactly batch_size pairs), so build it once instead of on every iteration.
all_vocabs = torch.LongTensor(range(vocab_size))
all_vocabs = all_vocabs.unsqueeze(0).repeat(batch_size, 1)

start_time = time.time()

# NOTE: each "epoch" below is a single optimizer step on one random batch.
for epoch in range(num_epochs):

    # sample a batch with the configured context window
    x_batch, y_batch = random_batch(batch_size, corpus, window_size)

    input_tensor = torch.LongTensor(x_batch)
    label_tensor = torch.LongTensor(y_batch)

    # forward pass
    loss = model(input_tensor, label_tensor, all_vocabs)

    # backward pass and parameter update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # store the loss
    loss_value = loss.item()
    losses.append(loss_value)

    if (epoch + 1) % 500 == 0:
        print(f"Epoch {epoch+1} | Loss: {loss_value:.6f}")

end_time = time.time()
print(f"\nTraining completed in {end_time - start_time:.2f} seconds")

# Print final loss (there is none when num_epochs is 0)
if losses:
    print(f"Final training loss: {losses[-1]:.6f}")


Epoch 500 | Loss: 23.140018
Epoch 1000 | Loss: 22.115686
Epoch 1500 | Loss: 20.569551
Epoch 2000 | Loss: 19.657822
Epoch 2500 | Loss: 17.945671
Epoch 3000 | Loss: 15.884174
Epoch 3500 | Loss: 14.302668
Epoch 4000 | Loss: 14.190167
Epoch 4500 | Loss: 14.916005
Epoch 5000 | Loss: 13.188715

Training completed in 4456.45 seconds
Final training loss: 13.188715


**Save the Trained Word2Vec model**

In [None]:
MODEL_PATH = "word2vec_skipgram.pth"

# Bundle the weights with the vocabulary mappings and the embedding size so
# the model can be rebuilt from this single file.
checkpoint_payload = dict(
    model_state_dict=model.state_dict(),
    word2index=word2index,
    index2word=index2word,
    embedding_dim=embedding_dim,
)
torch.save(checkpoint_payload, MODEL_PATH)

print("Model saved to", MODEL_PATH)


Model saved to word2vec_skipgram.pth


**Load the model**

In [None]:
# Read the saved checkpoint onto the CPU.
checkpoint = torch.load("word2vec_skipgram.pth", map_location="cpu")

# Restore the vocabulary mappings and the embedding size stored with the weights.
word2index, index2word = checkpoint["word2index"], checkpoint["index2word"]
embedding_dim = checkpoint["embedding_dim"]
vocab_size = len(word2index)

# Rebuild the architecture, load the trained weights and switch to eval mode.
model = Skipgram(vocab_size, embedding_dim)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()


Skipgram(
  (embedding_center): Embedding(10000, 100)
  (embedding_outside): Embedding(10000, 100)
)

**Extract embeddings from the loaded model**

In [None]:
import torch.nn.functional as F

# Use the center-word table as the word vectors and normalise each row
# (dim=1), so that dot products between rows compare directions only.
embeddings = F.normalize(model.embedding_center.weight.data, dim=1)


**Evaluate semantic & syntactic accuracy**

In [None]:
# load the word-analogy dataset and return two lists, for semantic and syntactic evaluation
def load_analogy_dataset(filepath,
                         semantic_sections=("capital-common-countries",),
                         syntactic_sections=("past-tense",)):
    """Read analogy quadruples from a sectioned text file.

    Lines starting with ``:`` are section headers. A header containing one of
    ``semantic_sections`` routes the following lines to the semantic list, one
    containing one of ``syntactic_sections`` routes them to the syntactic
    list, and any other header causes the following lines to be ignored.
    Data lines are lower-cased and split on whitespace; only lines with
    exactly four words are kept.

    Args:
        filepath: path of the UTF-8 encoded analogy file.
        semantic_sections: substring(s) identifying semantic section headers.
            A single string or an iterable of strings.
        syntactic_sections: substring(s) identifying syntactic section
            headers. A single string or an iterable of strings.

    Returns:
        Tuple ``(semantic, syntactic)``; each is a list of 4-word lists.
    """
    # accept a bare string as shorthand for a one-element collection
    if isinstance(semantic_sections, str):
        semantic_sections = (semantic_sections,)
    if isinstance(syntactic_sections, str):
        syntactic_sections = (syntactic_sections,)

    semantic = []
    syntactic = []

    # list that currently receives data lines; None means "skip this section"
    target = None

    with open(filepath, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()

            if line.startswith(":"):
                # semantic markers are checked first, as in the original if/elif order
                if any(marker in line for marker in semantic_sections):
                    target = semantic
                elif any(marker in line for marker in syntactic_sections):
                    target = syntactic
                else:
                    target = None
                continue

            if target is None:
                continue

            words = line.lower().split()
            # malformed or blank lines are dropped silently
            if len(words) == 4:
                target.append(words)

    return semantic, syntactic


In [None]:
import torch
import torch.nn.functional as F

def analogy_accuracy(analogies, embeddings, word2index, index2word):
    """Fraction of analogies ``a : b :: c : d`` whose top prediction equals ``d``.

    For every analogy the query vector ``b - a + c`` is normalised and scored
    against every row of ``embeddings`` with a dot product; the three query
    words are excluded and the best-scoring remaining word is the prediction.
    Analogies containing a word missing from ``word2index`` are skipped and do
    not count towards the total.

    Args:
        analogies: iterable of 4-item sequences ``(a, b, c, d)`` of words.
        embeddings: 2-D tensor with one row per vocabulary index.
        word2index: mapping from word to row index.
        index2word: mapping from row index to word.

    Returns:
        ``correct / total`` over the evaluated analogies, or 0 when none
        could be evaluated.
    """
    hits = 0
    evaluated = 0

    for a, b, c, d in analogies:
        # skip analogies with any out-of-vocabulary word
        if any(word not in word2index for word in (a, b, c, d)):
            continue

        idx_a, idx_b, idx_c = word2index[a], word2index[b], word2index[c]

        # word2vec analogy query: b - a + c
        query = embeddings[idx_b] - embeddings[idx_a] + embeddings[idx_c]
        query = F.normalize(query.unsqueeze(0), dim=1)

        # score the query against every word vector
        scores = torch.matmul(query, embeddings.T).squeeze()

        # the query words themselves must not be returned as the answer
        for excluded in (idx_a, idx_b, idx_c):
            scores[excluded] = -1e9

        best_index = torch.argmax(scores).item()
        if index2word[best_index] == d:
            hits += 1

        evaluated += 1

    return hits / evaluated if evaluated > 0 else 0


In [None]:

# Load both analogy splits, then score each one against the trained embeddings.
analogy_path = "/content/word_analogies_dataset.txt"
semantic, syntactic = load_analogy_dataset(analogy_path)

semantic_acc = analogy_accuracy(semantic, embeddings, word2index, index2word)
syntactic_acc = analogy_accuracy(syntactic, embeddings, word2index, index2word)


In [None]:
# Report both analogy accuracies as fractions in [0, 1], to four decimals.
print(f"Semantic accuracy: {semantic_acc:.4f}")
print(f"Syntactic accuracy: {syntactic_acc:.4f}")


Semantic accuracy: 0.0000
Syntactic accuracy: 0.0000


**Load similarity dataset**

In [None]:
import pandas as pd

# Read the word-similarity pairs and their human ratings into a DataFrame.
similarity_csv = "/content/wordsim353crowd.csv"
sim_df = pd.read_csv(similarity_csv)

# preview the first rows
sim_df.head()


Unnamed: 0,Word 1,Word 2,Human (Mean)
0,admission,ticket,5.536
1,alcohol,chemistry,4.125
2,aluminum,metal,6.625
3,announcement,effort,2.0625
4,announcement,news,7.1875


**Compute dot-product similarities**

In [None]:
model_sims = []
human_sims = []

UNK_INDEX = word2index.get("<UNK>")

# Read the first three columns by position (word 1, word 2, human score).
# itertuples() yields plain tuples, which avoids looking up integer keys on a
# label-indexed row (`row[0]`); pandas deprecates treating such keys as
# positions and emits a FutureWarning for it.
for word_1, word_2, human_rating in sim_df.iloc[:, :3].itertuples(index=False, name=None):
    w1 = str(word_1).lower()
    w2 = str(word_2).lower()
    human_score = float(human_rating)

    # Out-of-vocabulary words fall back to the UNK vector.
    # NOTE(review): a pair in which both words are out-of-vocabulary compares
    # the UNK vector with itself; consider whether such pairs should be skipped.
    idx1 = word2index.get(w1, UNK_INDEX)
    idx2 = word2index.get(w2, UNK_INDEX)

    v1 = embeddings[idx1]
    v2 = embeddings[idx2]

    dot_sim = torch.dot(v1, v2).item()

    model_sims.append(dot_sim)
    human_sims.append(human_score)


  w1 = str(row[0]).lower()
  w2 = str(row[1]).lower()
  human_score = float(row[2])


**Calculate Spearman correlation**

In [None]:
from scipy.stats import spearmanr

# Spearman rank correlation between the model's similarity scores and the
# human ratings collected above; p_value is computed but not reported.
correlation, p_value = spearmanr(model_sims, human_sims)

print(f"Spearman Correlation: {correlation:.4f}")



Spearman Correlation: 0.1131
