In [1]:
import time

import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import wikipediaapi

In [2]:
# Fetch the plain text of the English Wikipedia article used as the corpus.
wiki = wikipediaapi.Wikipedia(user_agent="waynestalk/1.0", language="en")
page = wiki.page("Oolong")
corpus = page.text

# The sentence/word tokenizers below rely on the "punkt" resource.
nltk.download("punkt")
sentences = nltk.sent_tokenize(corpus)

# One list of lower-cased tokens per sentence; tokens that are not purely
# alphabetic (numbers, punctuation, mixed tokens) are dropped.
tokenized_corpus = [
    [token.lower() for token in nltk.word_tokenize(text) if token.isalpha()]
    for text in sentences
]
tokenized_corpus[:5]

[nltk_data] Downloading package punkt to /Users/wayne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['oolong',
  'uk',
  'us',
  'simplified',
  'chinese',
  '乌龙茶',
  'traditional',
  'chinese',
  '烏龍茶',
  'pinyin',
  'wūlóngchá',
  'tê',
  'black',
  'dragon',
  'tea',
  'is',
  'a',
  'traditional',
  'chinese',
  'tea',
  'camellia',
  'sinensis',
  'produced',
  'through',
  'a',
  'process',
  'that',
  'includes',
  'withering',
  'the',
  'leaves',
  'under',
  'strong',
  'sun',
  'and',
  'allowing',
  'some',
  'oxidation',
  'to',
  'occur',
  'before',
  'curling',
  'and',
  'twisting'],
 ['most',
  'oolong',
  'teas',
  'especially',
  'those',
  'of',
  'fine',
  'quality',
  'involve',
  'unique',
  'tea',
  'plant',
  'cultivars',
  'that',
  'are',
  'exclusively',
  'used',
  'for',
  'particular',
  'varieties'],
 ['the',
  'degree',
  'of',
  'oxidation',
  'which',
  'is',
  'controlled',
  'by',
  'the',
  'length',
  'of',
  'time',
  'between',
  'picking',
  'and',
  'final',
  'drying',
  'can',
  'range',
  'from',
  'to',
  'depending',
  'on',
  'the',


In [3]:
# Distinct tokens of the corpus. `vocab` stays a set so membership tests and
# `len(vocab)` behave as before.
vocab = {word for sentence in tokenized_corpus for word in sentence}

# Number the words in sorted order rather than in set-iteration order: the
# iteration order of a set of strings can change from one interpreter run to
# the next (string hash randomization), which would silently reshuffle every
# index. Sorting makes the word <-> index mapping identical on every run.
index_to_word = dict(enumerate(sorted(vocab)))
word_to_index = {word: i for i, word in index_to_word.items()}
len(vocab)

580

In [4]:
# Symmetric context window: up to `window_size` tokens on each side of the
# centre word, never crossing a sentence boundary.
window_size = 2
vocab_size = len(vocab)
co_occurrence_matrix = torch.zeros((vocab_size, vocab_size))

for sentence in tokenized_corpus:
    # Translate the sentence to vocabulary ids once, then count over ids.
    sentence_ids = [word_to_index[token] for token in sentence]
    for center_pos, center_id in enumerate(sentence_ids):
        window_start = max(0, center_pos - window_size)
        window_stop = min(center_pos + window_size + 1, len(sentence_ids))
        for context_pos in range(window_start, window_stop):
            if context_pos == center_pos:
                # A token is not its own context.
                continue
            co_occurrence_matrix[center_id, sentence_ids[context_pos]] += 1

In [5]:
# Flatten the dense co-occurrence matrix into parallel 1-D tensors holding only
# the observed (count > 0) pairs: row index, column index and the count itself.
#
# torch.nonzero(..., as_tuple=True) returns one int64 index tensor per
# dimension, ordered row-major (row first, then column) -- the same order the
# explicit nested Python loop over (i, j) produced, without vocab_size**2
# per-element tensor lookups.
word_indices, context_indices = torch.nonzero(co_occurrence_matrix > 0, as_tuple=True)

# Gather the counts for those coordinates; kept as float for the loss.
co_occurrences = co_occurrence_matrix[word_indices, context_indices].to(torch.float)

In [6]:
class GloVe(nn.Module):
    """GloVe model whose forward pass returns the weighted least-squares loss.

    Holds two embedding tables (one for the word role, one for the context
    role) and one scalar bias per word for each role. Embeddings are
    initialised uniformly in [-0.5, 0.5]; biases are initialised to zero.

    Args:
        vocab_size: Number of rows in every table.
        embedding_dim: Width of the word and context embedding vectors.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.word_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.context_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.word_bias = nn.Embedding(vocab_size, 1)
        self.context_bias = nn.Embedding(vocab_size, 1)

        nn.init.uniform_(self.word_embedding.weight, a=-0.5, b=0.5)
        nn.init.uniform_(self.context_embedding.weight, a=-0.5, b=0.5)
        nn.init.zeros_(self.word_bias.weight)
        nn.init.zeros_(self.context_bias.weight)

    def forward(self, word_index, context_index, co_occurrence):
        """Return the summed GloVe loss for a set of (word, context, count) triples.

        For every triple the term is
        ``f(x) * (w . c + b_w + b_c - log(x)) ** 2`` with ``f`` given by
        ``weighting_function``; the terms are summed into a scalar tensor.

        Args:
            word_index: Integer tensor of word ids, any shape (including 0-d).
            context_index: Integer tensor of context ids, same shape.
            co_occurrence: Tensor of co-occurrence counts, same shape.

        Returns:
            0-d tensor containing the total loss.
        """
        word_emb = self.word_embedding(word_index)
        context_emb = self.context_embedding(context_index)
        # Drop only the trailing size-1 bias dimension. A bare squeeze() would
        # also collapse a batch dimension of size 1 and change the shape.
        word_b = self.word_bias(word_index).squeeze(-1)
        context_b = self.context_bias(context_index).squeeze(-1)

        weighting = self.weighting_function(co_occurrence)
        # A zero count has weight 0, but log(0) = -inf would turn the term into
        # 0 * inf = NaN. Substitute 1 (log -> 0) so such entries contribute 0.
        safe_counts = torch.where(
            co_occurrence > 0, co_occurrence, torch.ones_like(co_occurrence)
        )
        log_co_occurrence = torch.log(safe_counts)
        # Reduce over the embedding axis, which is always the last one
        # regardless of how many batch dimensions the indices have.
        dot = (word_emb * context_emb).sum(dim=-1)
        loss = weighting * (dot + word_b + context_b - log_co_occurrence) ** 2
        return loss.sum()

    def weighting_function(self, x, x_max=100, alpha=0.75):
        """Return the per-count weight ``(x / x_max) ** alpha``, capped at 1.

        Args:
            x: Tensor of co-occurrence counts.
            x_max: Count at and above which the weight is 1.
            alpha: Exponent applied below ``x_max``.

        Returns:
            Tensor of weights with the same shape as ``x``.
        """
        return torch.where(x < x_max, (x / x_max) ** alpha, torch.ones_like(x))

In [7]:
# Training configuration.
embedding_dim = 1000
epochs = 500
seed = 42

# Seed PyTorch's RNG before the model is built so the uniform embedding
# initialisation is the same on every run.
torch.manual_seed(seed)
model = GloVe(vocab_size, embedding_dim)
optimizer = optim.Adam(model.parameters(), lr=0.01)

# perf_counter() is a monotonic clock intended for measuring elapsed time;
# time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()

for epoch in range(epochs):
    # Full-batch update: every observed (word, context, count) triple is used
    # in each step.
    optimizer.zero_grad()
    loss = model(word_indices, context_indices, co_occurrences)
    loss.backward()
    optimizer.step()

    if epoch == 0 or (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

end_time = time.perf_counter()
print(f"Training time: {end_time - start_time} seconds")

Epoch: 0, Loss: 1147.0933837890625
Epoch: 99, Loss: 0.01006692461669445
Epoch: 199, Loss: 0.0013765881303697824
Epoch: 299, Loss: 0.007692785933613777
Epoch: 399, Loss: 0.031206317245960236
Epoch: 499, Loss: 0.027982018887996674
Training time: 2.2056429386138916 seconds


In [8]:
def get_final_embedding(word):
    """Return the averaged word/context embedding of ``word``.

    Looks ``word`` up in the notebook-level ``word_to_index`` mapping, reads
    its row from both embedding tables of the notebook-level ``model`` and
    returns their element-wise mean as a tensor detached from autograd.

    Raises:
        KeyError: If ``word`` is not a key of ``word_to_index``.
    """
    idx = torch.tensor(word_to_index[word], dtype=torch.long)
    as_word = model.word_embedding(idx)
    as_context = model.context_embedding(idx)
    return (as_word.detach() + as_context.detach()) / 2.0

In [9]:
# Look up the averaged embedding of a single vocabulary word and print it as a
# NumPy array together with its shape.
word = "oolong"
embedding_tensor = get_final_embedding(word)
embedding_vector = embedding_tensor.numpy()
print(f"Embedding {embedding_vector.shape} for '{word}': {embedding_vector}")

Embedding (1000,) for 'oolong': [-4.10067856e-01  4.20374453e-01 -1.72718585e-01 -3.62678617e-02
  3.13591123e-01  2.26124749e-01 -2.13840276e-01 -9.51855630e-02
 -2.43870080e-01 -2.47711688e-01 -2.42738232e-01 -1.46448687e-01
  9.40047204e-02  5.53254187e-02 -4.69003171e-02 -3.96527350e-02
 -1.20700993e-01  1.27118707e-01  3.70532632e-01 -3.51445973e-01
  1.20724082e-01  2.60446221e-03  8.52338597e-02  9.27290544e-02
 -1.21149562e-01  1.44617841e-01 -1.50572687e-01  1.55139163e-01
  3.12368125e-01 -1.59607813e-01 -3.06892246e-02 -1.98293626e-01
  2.16845557e-01  2.67929554e-01 -2.52793640e-01  2.54302651e-01
  7.62687624e-03  9.85463411e-02 -1.08464591e-01 -1.60161778e-01
  2.17930883e-01 -1.67212129e-01  2.85758406e-01  1.87484145e-01
  2.15308666e-01 -3.42692971e-01 -5.13638705e-02 -1.65757626e-01
  1.76723152e-01  1.36814773e-01  1.87422112e-02  1.09966755e-01
  6.19772449e-02  1.54829711e-01  2.79403478e-01 -5.46631664e-02
  9.83600691e-02  1.36221945e-03  1.22198090e-01 -5.101641

In [10]:
# Compare two sentences: each sentence is represented by the mean of its word
# embeddings, and the two means are compared with cosine similarity.
sentence1 = "tea is popular in taiwan".split()
sentence2 = "oolong is famous in taiwan".split()

# Every token must be in the vocabulary; get_final_embedding raises otherwise.
sentence1_embeddings = list(map(get_final_embedding, sentence1))
sentence2_embeddings = list(map(get_final_embedding, sentence2))

# Stack to (num_tokens, embedding_dim) and average over the token axis.
vector1 = torch.stack(sentence1_embeddings).mean(dim=0)
vector2 = torch.stack(sentence2_embeddings).mean(dim=0)

cosine_sim = nn.CosineSimilarity(dim=0)
similarity = cosine_sim(vector1, vector2).item()

print(f"Sentence 1: {sentence1}")
print(f"Sentence 2: {sentence2}")
print(f"Similarity between sentences: {similarity}")

Sentence 1: ['tea', 'is', 'popular', 'in', 'taiwan']
Sentence 2: ['oolong', 'is', 'famous', 'in', 'taiwan']
Similarity between sentences: 0.6013368964195251
