# GloVe

Let's work on an implementation of GloVe.

In [None]:
import time
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import nltk
from nltk.corpus import reuters, stopwords

# Fetch the NLTK resources used below: the Reuters corpus and the stopword lists.
nltk.download('reuters')
nltk.download('stopwords')

# Seed the Python, NumPy and PyTorch random number generators so that runs are
# repeatable, which helps with debugging and with comparing models.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)


[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<torch._C.Generator at 0x7a21947e9ff0>

## 1. Load data

In [None]:
stop_words = set(stopwords.words('english'))

# Each Reuters file becomes one list of lowercased, purely alphabetic tokens
# with English stopwords removed.
sentences = []
for fileid in reuters.fileids():
    words = []
    for w in reuters.words(fileid):
        if not w.isalpha():
            continue
        token = w.lower()
        if token not in stop_words:
            words.append(token)
    sentences.append(words)

print("Total sentences:", len(sentences))


Total sentences: 10788


In [None]:
# Cap the vocabulary size to keep training time manageable.
VOCAB_LIMIT = 10000
UNK_TOKEN = "<UNK>"

# Flatten all documents into one token stream and count word frequencies.
all_words = []
for sentence in sentences:
    all_words.extend(sentence)
word_counts = Counter(all_words)

# The unknown token comes first, followed by the most frequent words, so the
# vocabulary holds at most VOCAB_LIMIT entries.
vocab = [UNK_TOKEN]
vocab.extend(word for word, _ in word_counts.most_common(VOCAB_LIMIT - 1))

# Lookup tables in both directions.
word2index = dict(zip(vocab, range(len(vocab))))
index2word = dict((idx, word) for word, idx in word2index.items())

vocab_size = len(vocab)

UNK_INDEX = word2index[UNK_TOKEN]

print("Vocabulary size:", vocab_size)
print("UNK index:", word2index[UNK_TOKEN])


Vocabulary size: 10000
UNK index: 0


In [None]:
# Replace every word by its vocabulary index so the documents are in numeric
# form; words outside the vocabulary are mapped to the <UNK> index.
corpus = [
    [word2index.get(word, word2index[UNK_TOKEN]) for word in sentence]
    for sentence in sentences
]


## 2. Build Co-occurrence Matrix X

Here, we need to count how often two words co-occur within a given window size. We will use a window size of 2 (`WINDOW_SIZE` below).

In [None]:
def generate_skipgrams(corpus, window_size=2):
    """Collect (center, context) pairs from every document in ``corpus``.

    For each position in a document, the position is paired with every other
    position at most ``window_size`` steps to the left or right, clipped at
    the document boundaries. Pairs are emitted in document order, and for
    each center from the leftmost context to the rightmost.

    Args:
        corpus: iterable of indexable sequences (one per document).
        window_size: maximum distance between center and context positions.

    Returns:
        A list of ``(center_item, context_item)`` tuples.
    """
    pairs = []

    for doc in corpus:
        doc_len = len(doc)
        for position, center in enumerate(doc):
            first = max(0, position - window_size)
            stop = min(doc_len, position + window_size + 1)
            # Every neighbour inside the clipped window except the center itself.
            pairs.extend(
                (center, doc[neighbour])
                for neighbour in range(first, stop)
                if neighbour != position
            )

    return pairs


In [None]:
# Context window radius; passed explicitly, and equal to the default of
# generate_skipgrams.
WINDOW_SIZE = 2
# The pairs are built from `sentences` (lowercased word strings), so every
# skip-gram is a tuple of words rather than of vocabulary indices.
skip_grams = generate_skipgrams(sentences, window_size=WINDOW_SIZE)

print("Total skip-grams:", len(skip_grams))


Total skip-grams: 3421336


In [None]:
from collections import defaultdict

# Co-occurrence counts: how many times each (center, context) pair appears in
# the skip-gram list. Counter does the tallying; the result is kept as a
# defaultdict(int) so unseen pairs still read as 0.
X_ik = defaultdict(int, Counter(skip_grams))

print("Total co-occurrence pairs:", len(X_ik))


Total co-occurrence pairs: 1401533


**Weighting function**

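GloVe includes a weighting function $f(X_{ij})$ that down-weights rare co-occurrences and caps the influence of very frequent ones: $f(x) = (x / x_{\max})^{\alpha}$ if $x < x_{\max}$, and $f(x) = 1$ otherwise.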

<img src = "../figures/glove_weighting_func.png" width=400>

In [None]:
x_max = 100
alpha = 0.75

# f(count) = (count / x_max) ** alpha below the cutoff, and 1.0 from x_max upwards.
weighting_dic = {
    pair: (count / x_max) ** alpha if count < x_max else 1.0
    for pair, count in X_ik.items()
}


**Prepare train data**

In [None]:

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    """Draw a random mini-batch of skip-gram pairs for GloVe training.

    Args:
        batch_size: number of pairs to sample (without replacement), so it
            must not exceed ``len(skip_grams)``.
        word_sequence: unused; kept so existing callers keep working.
        skip_grams: sequence of ``(center_word, context_word)`` pairs.
        X_ik: mapping from a pair to its co-occurrence count.
        weighting_dic: mapping from a pair to its GloVe weight.

    Returns:
        A tuple of four NumPy arrays with one single-element row per sampled
        pair: center indices, context indices, log co-occurrence counts and
        weights.

    Word-to-index conversion uses the module-level ``word2index`` and
    ``UNK_INDEX``; words missing from ``word2index`` map to ``UNK_INDEX``.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having imported math before it is called.
    import math

    # An int population is sampled as if it were np.arange(n), so this draws
    # the same indices as sampling from range(n) without materialising it.
    random_index = np.random.choice(len(skip_grams), batch_size, replace=False)

    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []

    # Only the sampled pairs are converted to indices; converting the whole
    # skip-gram list on every call made each batch cost O(len(skip_grams)).
    for index in random_index:
        pair = skip_grams[index]
        center_word, context_word = pair

        random_inputs.append([word2index.get(center_word, UNK_INDEX)])
        random_labels.append([word2index.get(context_word, UNK_INDEX)])

        # GloVe regresses onto the log of the co-occurrence count.
        random_coocs.append([math.log(X_ik[pair])])
        random_weightings.append([weighting_dic[pair]])

    return (
        np.array(random_inputs),
        np.array(random_labels),
        np.array(random_coocs),
        np.array(random_weightings)
    )


**Model**

In [None]:

class Glove(nn.Module):
    """GloVe model: center/context embedding tables plus one bias per word.

    ``forward`` returns the mean weighted squared error between
    ``w_center . w_context + b_center + b_context`` and the supplied target.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the two embedding tables and the two bias tables.

        Args:
            vocab_size: number of rows in every table.
            embedding_dim: width of the word vectors (the biases have width 1).
        """
        super().__init__()
        self.center_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim)

        self.center_bias = nn.Embedding(vocab_size, 1)
        self.context_bias = nn.Embedding(vocab_size, 1)

    def forward(self, center, context, cooc, weighting):
        """Compute the batch loss.

        Args:
            center: index tensor of center words.
            context: index tensor of context words.
            cooc: target values subtracted from the model score (the training
                cell passes log co-occurrence counts).
            weighting: per-pair weights multiplied into the squared error.

        The training cell passes all four as ``(batch, 1)`` tensors; the
        ``squeeze(1)`` calls drop that singleton axis after the lookups.

        Returns:
            A scalar tensor: the mean of the weighted squared errors.
        """
        w_center = self.center_embedding(center).squeeze(1)
        w_context = self.context_embedding(context).squeeze(1)

        b_center = self.center_bias(center).squeeze(1)
        b_context = self.context_bias(context).squeeze(1)

        # Row-wise dot product; keepdim keeps a trailing axis so it lines up
        # with the bias, target and weight tensors.
        score = (w_center * w_context).sum(dim=1, keepdim=True)
        residual = score + b_center + b_context - cooc

        return (weighting * residual.pow(2)).mean()


**Training**

In [None]:
# Hyperparameters for the training run below.
embedding_dim = 100
batch_size = 128
# NOTE: despite the name, the training cell draws one random mini-batch per
# iteration, so this is the number of optimisation steps rather than the
# number of full passes over the data.
num_epochs = 5000
learning_rate = 0.001

model = Glove(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
import math

losses = []
start_time = time.time()

# Each iteration is one optimisation step on a freshly sampled mini-batch.
for epoch in range(num_epochs):

    x, y, cooc, weighting = random_batch(
        batch_size, sentences, skip_grams, X_ik, weighting_dic
    )

    # Indices go in as long tensors, targets and weights as float tensors.
    x_tensor, y_tensor = torch.LongTensor(x), torch.LongTensor(y)
    cooc_tensor = torch.FloatTensor(cooc)
    weighting_tensor = torch.FloatTensor(weighting)

    # Clear stale gradients, then forward, backward, update.
    optimizer.zero_grad()
    loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)
    loss.backward()
    optimizer.step()

    losses.append(loss.item())

    # Progress report every 500 steps.
    if not (epoch + 1) % 500:
        print(f"Epoch {epoch+1} | Loss: {loss.item():.6f}")

end_time = time.time()

print(f"Training completed in {end_time - start_time:.2f} seconds")

print("Final loss:", loss.item())

Epoch 500 | Loss: 20.272104
Epoch 1000 | Loss: 18.815372
Epoch 1500 | Loss: 11.936865
Epoch 2000 | Loss: 9.624596
Epoch 2500 | Loss: 8.206407
Epoch 3000 | Loss: 5.208466
Epoch 3500 | Loss: 5.306939
Epoch 4000 | Loss: 6.229774
Epoch 4500 | Loss: 5.293838
Epoch 5000 | Loss: 8.381731
Training completed in 6627.40 seconds
Final loss: 8.381731033325195


In [None]:
# Use the center-word table as the word vectors (the context table is not
# read here).
embeddings = model.center_embedding.weight.data
print("Embedding matrix shape:", embeddings.shape)


Embedding matrix shape: torch.Size([10000, 100])


**Save the trained GloVe model**

In [None]:
MODEL_PATH = "Glove.pth"

# Store the weights together with the vocabulary mappings and the embedding
# width, so the model can be rebuilt from the file alone.
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        word2index=word2index,
        index2word=index2word,
        embedding_dim=embedding_dim,
    ),
    MODEL_PATH,
)

print("Model saved to", MODEL_PATH)


Model saved to Glove.pth


**Load the model**

In [None]:
# torch.load deserialises with pickle, so only load checkpoint files you trust.
checkpoint = torch.load("Glove.pth", map_location="cpu")

# Restore the vocabulary mappings and the embedding width saved with the weights.
word2index, index2word, embedding_dim = (
    checkpoint[key] for key in ("word2index", "index2word", "embedding_dim")
)
vocab_size = len(word2index)

# Rebuild the architecture, load the trained weights and switch to eval mode.
model = Glove(vocab_size, embedding_dim)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()


Glove(
  (center_embedding): Embedding(10000, 100)
  (context_embedding): Embedding(10000, 100)
  (center_bias): Embedding(10000, 1)
  (context_bias): Embedding(10000, 1)
)

**Extract embeddings from the loaded model**

In [None]:
import torch.nn.functional as F

# Take the center-word vectors and scale every row to unit L2 norm, so a dot
# product between two rows is their cosine similarity.
embeddings = F.normalize(model.center_embedding.weight.data, dim=1)

**Evaluate semantic & syntactic accuracy**

In [None]:
def load_analogy_dataset(filepath):
    """Read the semantic and syntactic analogy questions from a text file.

    Lines starting with ``:`` are section headers. A header containing
    ``capital-common-countries`` starts the semantic section, one containing
    ``past-tense`` starts the syntactic section, and any other header turns
    collection off until the next matching header. Inside a collected
    section, every line that splits into exactly four whitespace-separated
    words is lowercased and kept; other lines are ignored.

    Args:
        filepath: path to the UTF-8 encoded analogy file.

    Returns:
        ``(semantic, syntactic)``: two lists of four-word lists.
    """
    collected = {"semantic": [], "syntactic": []}
    active = None  # key of the section being collected, or None to skip lines

    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()

            if text.startswith(":"):
                # Section header: decide which bucket (if any) follows.
                if "capital-common-countries" in text:
                    active = "semantic"
                elif "past-tense" in text:
                    active = "syntactic"
                else:
                    active = None
                continue

            if active is None:
                continue

            quad = text.lower().split()
            if len(quad) == 4:
                collected[active].append(quad)

    return collected["semantic"], collected["syntactic"]


In [None]:
import torch
import torch.nn.functional as F

def analogy_accuracy(analogies, embeddings, word2index, index2word):
    """Score ``a : b :: c : d`` analogies with the vector-offset method.

    For every analogy whose four words are all in ``word2index``, the target
    vector ``b - a + c`` is compared against every row of ``embeddings`` by
    dot product, the three query words are excluded, and the prediction is
    the best remaining word. Analogies containing an unknown word are
    skipped and do not count towards the total.

    Args:
        analogies: iterable of 4-tuples/lists of words ``(a, b, c, d)``.
        embeddings: 2-D tensor with one row per vocabulary index. The scores
            are cosine similarities only if the rows are unit-normalised
            (the notebook normalises them before calling this function).
        word2index: mapping word -> row index.
        index2word: mapping row index -> word.

    Returns:
        Fraction of scored analogies predicted correctly, as a float;
        ``0.0`` when no analogy could be scored.
    """
    correct = 0
    total = 0

    for a, b, c, d in analogies:
        if any(word not in word2index for word in (a, b, c, d)):
            continue

        idx_a, idx_b, idx_c = word2index[a], word2index[b], word2index[c]

        # Vector-offset analogy: b - a + c
        target_vec = embeddings[idx_b] - embeddings[idx_a] + embeddings[idx_c]
        target_vec = F.normalize(target_vec.unsqueeze(0), dim=1)

        # Similarity with all words; squeeze only the batch axis so the
        # result stays 1-D whatever the vocabulary size.
        similarities = torch.matmul(target_vec, embeddings.T).squeeze(0)

        # Exclude the query words from the candidates.
        similarities[[idx_a, idx_b, idx_c]] = float("-inf")

        predicted_index = torch.argmax(similarities).item()
        if index2word[predicted_index] == d:
            correct += 1

        total += 1

    return correct / total if total > 0 else 0.0


In [None]:
# NOTE(review): absolute path; it looks like a Colab-style location, so adjust
# it when running elsewhere.
semantic, syntactic = load_analogy_dataset("/content/sample_data/word_analogies_dataset.txt")

# Accuracy on the semantic analogy questions.
semantic_acc = analogy_accuracy(
    semantic, embeddings, word2index, index2word
)

# Accuracy on the syntactic analogy questions.
syntactic_acc = analogy_accuracy(
    syntactic, embeddings, word2index, index2word
)

print(f"Semantic accuracy: {semantic_acc:.4f}")
print(f"Syntactic accuracy: {syntactic_acc:.4f}")


Semantic accuracy: 0.0000
Syntactic accuracy: 0.0000


**Load similarity dataset**

In [None]:
import pandas as pd

# Load the word-similarity dataset: word pairs with a mean human rating.
# NOTE(review): absolute path; adjust it when running elsewhere.
sim_df = pd.read_csv("/content/sample_data/wordsim353crowd.csv")

# Preview the first rows.
sim_df.head()


Unnamed: 0,Word 1,Word 2,Human (Mean)
0,admission,ticket,5.536
1,alcohol,chemistry,4.125
2,aluminum,metal,6.625
3,announcement,effort,2.0625
4,announcement,news,7.1875


**Compute dot-product similarities**

In [None]:
model_sims = []
human_sims = []

UNK_INDEX = word2index.get("<UNK>")

# Read the first three columns by position (word 1, word 2, human score).
# itertuples yields plain positional tuples, which avoids integer-key access
# on a label-indexed row (deprecated in pandas and the source of a
# FutureWarning).
for first_word, second_word, rating in sim_df.iloc[:, :3].itertuples(index=False, name=None):
    w1 = str(first_word).lower()
    w2 = str(second_word).lower()
    human_score = float(rating)

    # Words missing from the vocabulary fall back to the <UNK> vector.
    idx1 = word2index.get(w1, UNK_INDEX)
    idx2 = word2index.get(w2, UNK_INDEX)

    v1 = embeddings[idx1]
    v2 = embeddings[idx2]

    # Plain dot product of the two embedding rows.
    dot_sim = torch.dot(v1, v2).item()

    model_sims.append(dot_sim)
    human_sims.append(human_score)


  w1 = str(row[0]).lower()
  w2 = str(row[1]).lower()
  human_score = float(row[2])


**Spearman correlation**

In [None]:
from scipy.stats import spearmanr

# Rank correlation between the model's similarity scores and the human ratings.
correlation, p_value = spearmanr(model_sims, human_sims)

print(f"Spearman Correlation: {correlation:.4f}")
print(f"P-value: {p_value:.4e}")


Spearman Correlation: 0.0993
P-value: 6.2390e-02
