## Model Comparison and Performance Analysis

| **Model**                        | **Window Size** | **Training Loss** | **Training Time** | **Syntactic Accuracy** | **Semantic Accuracy** |
|-----------------------------------|:---------------:|:-----------------:|:---------------------------:|:----------------------:|:---------------------:|
| **Skipgram**                      |        2        |      7.804078     |           0.838465          |           0            |           0           |
| **Skipgram (with negative sampling)** |        2        |      0.252254     |           0.915590          |           0            |           0           |
| **GloVe**                         |        2        |      1.761795     |           0.163522          |           0            |           0           |
| **GloVe (Gensim implementation)** |       NA        |         NA        |              NA             |         0.50           |         0.93          |


## Correlation

| **Model**            | **Skipgram** | **Skipgram negative** | **GloVe** | **GloVe gensim** |
|----------------------|--------------|----------------------------------------|-----------|------------------------------------|
| **Correlation**       |    0.064204  |     -0.05998                           |   0.05199 |            0.54308                |


In [1]:
import numpy as np
import time
import pickle
import torch
import torch.nn as nn
from collections import Counter
import nltk
from nltk.corpus import brown
import torch.optim as optim
from itertools import combinations_with_replacement
import math
import os
import pandas as pd
from scipy.stats import spearmanr
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.corpus import gutenberg


In [2]:
# news_corpus = brown.sents()
# news_corpus = news_corpus[:1000]  # Take the first 1000 sentences
# news_corpus = [[word.lower() for word in sent] for sent in news_corpus]
# print(news_corpus[:5])

# Load only the "news" category sentences from the Brown corpus
news_corpus = brown.sents(categories='news')
# Limit to 10,000 sentences (or less, depending on your task)
news_corpus = news_corpus[:1000]
# news_corpus = news_corpus[:100000] #todo
# Preprocess: Convert all words to lowercase
news_corpus = [[word.lower() for word in sent] for sent in news_corpus]

print(news_corpus[:5]) 

[['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['the', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'city', 'executive', 'committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'city', 'of', 'atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ['the', 'september-october', 'term', 'jury', 'had', 'been', 'charged', 'by', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'to', 'investigate', 'reports', 'of', 'possible', '``', 'irregularities', "''", 'in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'mayor-nominate', 'ivan', 'allen', 'jr.', '.'], ['``', 'only', 'a', 'relative', 'handful', 'of', 'such', 'rep

In [3]:
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]
#assign unique integer
vocabs = list(set(flatten(news_corpus))) #all the words we have in the system - <UNK>

In [4]:
vocabs.append('<UNK>') #append unknown token to vocab

In [5]:
print(vocabs)



In [6]:
len(vocabs)

4273

In [7]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}
word2index['traffic']

1311

In [8]:
index2word = {v:k for k, v in word2index.items()}
index2word[10]

'feeling'

In [9]:
print(f"Default number of threads: {torch.get_num_threads()}")

Default number of threads: 8


In [10]:
torch.set_num_threads(torch.get_num_threads()) 
device = torch.device('cpu')


In [11]:
torch.manual_seed(42)
voc_size   = len(vocabs)
emb_size = batch_size = window_size = 2

## Word2Vec

### 1. Prepare train data

In [12]:
#create pairs of center word, and outside word

def random_batch(batch_size, corpus, windows_size):

    skipgrams = []

    #loop each corpus
    for doc in corpus:
        #look from the 2nd word until second last word
        for i in range(windows_size, len(doc)-windows_size):
            #center word
            center = word2index[doc[i]]
            #outside words = 2 words
            outside = []
            for j in range(i-windows_size, i+windows_size+1):
                outside.append(word2index[doc[j]])
            #for each of these two outside words, we gonna append to a list
            for each_out in outside:
                skipgrams.append([center, each_out])
                #center, outside1;   center, outside2
                
    random_index = np.random.choice(range(len(skipgrams)), batch_size, replace=False)
    
    inputs, labels = [], []
    for index in random_index:
        inputs.append([skipgrams[index][0]])
        labels.append([skipgrams[index][1]])
        
    return np.array(inputs), np.array(labels)

## Model

In [13]:
class Skipgram(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Skipgram, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
    
    def forward(self, center, outside, all_vocabs):
        center_embedding     = self.embedding_center(center)  #(batch_size, 1, emb_size)
        outside_embedding    = self.embedding_center(outside) #(batch_size, 1, emb_size)
        all_vocabs_embedding = self.embedding_center(all_vocabs) #(batch_size, voc_size, emb_size)
        
        top_term = torch.exp(outside_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2))
        #batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1) 

        lower_term = all_vocabs_embedding.bmm(center_embedding.transpose(1, 2)).squeeze(2)
        #batch_size, voc_size, emb_size) @ (batch_size, emb_size, 1) = (batch_size, voc_size, 1) = (batch_size, voc_size) 
        
        lower_term_sum = torch.sum(torch.exp(lower_term), 1)  #(batch_size, 1)
        
        loss = -torch.mean(torch.log(top_term / lower_term_sum))  #scalar
        
        return loss

In [14]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

In [15]:
#prepare all vocabs
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, voc_size).to(device)
all_vocabs

tensor([[   0,    1,    2,  ..., 4270, 4271, 4272],
        [   0,    1,    2,  ..., 4270, 4271, 4272]])

In [16]:
skipgram_model  = Skipgram(voc_size, emb_size).to(device)
optimizer  = optim.Adam(skipgram_model.parameters(), lr=0.001)
# epochs = 10000 #todo
epochs = 10
start_time = time.time()

for epoch in range(epochs):
    #get batch
    input_batch, label_batch = random_batch(batch_size, news_corpus, window_size)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    loss = skipgram_model(input_tensor, label_tensor, all_vocabs)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    # if (epoch + 1) % 1000 == 0: #todo
    print(f"Epoch {epoch+1:6.0f}   |   Loss: {loss:2.6f}")

print(f"Train time: {time.time()-start_time}")

Epoch      1   |   Loss: 8.404066
Epoch      2   |   Loss: 7.567184
Epoch      3   |   Loss: 8.743052
Epoch      4   |   Loss: 8.983487
Epoch      5   |   Loss: 8.476640
Epoch      6   |   Loss: 7.716052
Epoch      7   |   Loss: 8.771048
Epoch      8   |   Loss: 11.064310
Epoch      9   |   Loss: 9.715869
Epoch     10   |   Loss: 9.133204
Train time: 0.8357000350952148


## 3. Negative Sampling

### Unigram distribution

$$P(w)=U(w)^{3/4}/Z$$

In [17]:
z = 0.001

In [18]:
#count
from collections import Counter

word_count = Counter(flatten(news_corpus))
word_count

#get the total number of words
num_total_words = sum([c for w, c in word_count.items()])
num_total_words

22079

In [19]:
unigram_table = []

for v in vocabs:
    uw = word_count[v] / num_total_words
    uw_alpha = int((uw ** 0.75) / z)
    unigram_table.extend([v] * uw_alpha)

Counter(unigram_table)

Counter({'the': 137,
         ',': 89,
         '.': 87,
         'of': 73,
         'to': 62,
         'a': 53,
         'in': 52,
         'and': 47,
         'for': 36,
         'that': 30,
         '``': 29,
         "''": 28,
         'is': 26,
         'he': 26,
         'on': 25,
         'said': 25,
         'be': 24,
         'by': 22,
         'was': 22,
         'would': 21,
         'as': 19,
         'it': 19,
         'has': 17,
         'will': 17,
         'with': 17,
         'his': 15,
         'state': 15,
         'at': 15,
         'not': 14,
         'this': 14,
         'an': 14,
         'been': 12,
         'from': 12,
         'which': 12,
         'who': 12,
         'but': 11,
         'have': 11,
         'are': 11,
         '--': 11,
         'more': 10,
         'new': 10,
         'president': 10,
         'one': 10,
         'city': 10,
         'administration': 10,
         'mr.': 10,
         'they': 9,
         'or': 9,
         'some': 9,
         

## Model

In [20]:
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.shape[0]
    neg_samples = []
    for i in range(batch_size):  #(1, k)
        target_index = targets[i].item()
        nsample      = []
        while (len(nsample) < k):
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).reshape(1, -1))
        
    return torch.cat(neg_samples) #batch_size, k

$$\mathbf{J}_{\text{neg-sample}}(\mathbf{v}_c,o,\mathbf{U})=-\log(\sigma(\mathbf{u}_o^T\mathbf{v}_c))-\sum_{k=1}^K\log(\sigma(-\mathbf{u}_k^T\mathbf{v}_c))$$

In [21]:
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        self.logsigmoid        = nn.LogSigmoid()
    
    def forward(self, center, outside, negative):
        #center, outside:  (bs, 1)
        #negative       :  (bs, k)
        
        center_embed   = self.embedding_center(center) #(bs, 1, emb_size)
        outside_embed  = self.embedding_outside(outside) #(bs, 1, emb_size)
        negative_embed = self.embedding_outside(negative) #(bs, k, emb_size)
        
        uovc           = outside_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, 1)
        ukvc           = -negative_embed.bmm(center_embed.transpose(1, 2)).squeeze(2) #(bs, k)
        ukvc_sum       = torch.sum(ukvc, 1).reshape(-1, 1) #(bs, 1)
        
        loss           = self.logsigmoid(uovc) + self.logsigmoid(ukvc_sum)
        
        return -torch.mean(loss)

### 3. Training

In [22]:
skipgram_neg_model   = SkipgramNegSampling(voc_size, emb_size).to(device)
optimizer  = optim.Adam(skipgram_neg_model.parameters(), lr=0.001)
# epochs = 10000 #todo
epochs = 10
k = 5
start_time = time.time()

for epoch in range(epochs):
    
    #get batch
    input_batch, label_batch = random_batch(batch_size, news_corpus, window_size)
    input_tensor = torch.LongTensor(input_batch).to(device)
    label_tensor = torch.LongTensor(label_batch).to(device)
    
    #predict
    neg_samples = negative_sampling(label_tensor, unigram_table, k).to(device)
    loss = skipgram_neg_model(input_tensor, label_tensor, neg_samples)
    
    #backprogate
    optimizer.zero_grad()
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print the loss
    # if (epoch + 1) % 1000 == 0: #todo
    print(f"Epoch {epoch+1:6.0f} | Loss: {loss:2.6f}")

print(f"Training time: {time.time()-start_time}")

Epoch      1 | Loss: 1.673046
Epoch      2 | Loss: 2.207402
Epoch      3 | Loss: 1.561828
Epoch      4 | Loss: 1.078969
Epoch      5 | Loss: 1.582053
Epoch      6 | Loss: 1.544562
Epoch      7 | Loss: 1.131451
Epoch      8 | Loss: 1.624706
Epoch      9 | Loss: 3.207410
Epoch     10 | Loss: 1.909121
Training time: 0.7606899738311768


## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 2.

In [23]:
X_i = Counter(flatten(news_corpus))
X_i

Counter({'the': 1569,
         ',': 878,
         '.': 857,
         'of': 676,
         'to': 549,
         'a': 440,
         'in': 438,
         'and': 377,
         'for': 266,
         'that': 211,
         '``': 202,
         "''": 191,
         'he': 176,
         'is': 171,
         'on': 165,
         'said': 164,
         'be': 153,
         'by': 143,
         'was': 138,
         'would': 131,
         'it': 119,
         'as': 114,
         'with': 102,
         'has': 100,
         'will': 97,
         'his': 89,
         'at': 84,
         'state': 82,
         'an': 81,
         'not': 78,
         'this': 75,
         'been': 66,
         'which': 65,
         'from': 65,
         'who': 61,
         'are': 58,
         'have': 58,
         '--': 58,
         'but': 57,
         'city': 54,
         'more': 52,
         'mr.': 52,
         'administration': 50,
         'one': 48,
         'new': 48,
         'president': 48,
         'they': 46,
         'had': 45,
  

In [24]:
skip_grams = []

for doc in news_corpus:
    for i in range(2, len(doc)-2):
        center = doc[i]
        outside = [doc[i-2], doc[i-1], doc[i+1], doc[i+2]]
        for each_out in outside:
            skip_grams.append((center, each_out))

skip_grams

[('county', 'the'),
 ('county', 'fulton'),
 ('county', 'grand'),
 ('county', 'jury'),
 ('grand', 'fulton'),
 ('grand', 'county'),
 ('grand', 'jury'),
 ('grand', 'said'),
 ('jury', 'county'),
 ('jury', 'grand'),
 ('jury', 'said'),
 ('jury', 'friday'),
 ('said', 'grand'),
 ('said', 'jury'),
 ('said', 'friday'),
 ('said', 'an'),
 ('friday', 'jury'),
 ('friday', 'said'),
 ('friday', 'an'),
 ('friday', 'investigation'),
 ('an', 'said'),
 ('an', 'friday'),
 ('an', 'investigation'),
 ('an', 'of'),
 ('investigation', 'friday'),
 ('investigation', 'an'),
 ('investigation', 'of'),
 ('investigation', "atlanta's"),
 ('of', 'an'),
 ('of', 'investigation'),
 ('of', "atlanta's"),
 ('of', 'recent'),
 ("atlanta's", 'investigation'),
 ("atlanta's", 'of'),
 ("atlanta's", 'recent'),
 ("atlanta's", 'primary'),
 ('recent', 'of'),
 ('recent', "atlanta's"),
 ('recent', 'primary'),
 ('recent', 'election'),
 ('primary', "atlanta's"),
 ('primary', 'recent'),
 ('primary', 'election'),
 ('primary', 'produced'),
 (

In [25]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('of', 'the'): 376,
         ('the', 'of'): 368,
         (',', 'the'): 189,
         ('the', ','): 181,
         ('the', 'in'): 173,
         ('in', 'the'): 173,
         ('to', 'the'): 162,
         ('the', 'to'): 162,
         (',', ','): 101,
         (',', 'said'): 92,
         ('for', 'the'): 91,
         ('the', 'for'): 91,
         ('of', 'a'): 86,
         (',', 'and'): 84,
         ('and', ','): 82,
         ('a', 'of'): 80,
         ('the', '.'): 79,
         ('on', 'the'): 76,
         ('the', 'on'): 75,
         ('the', 'and'): 74,
         ('and', 'the'): 74,
         ('said', ','): 70,
         ('to', 'a'): 68,
         ('a', 'to'): 66,
         ('that', 'the'): 65,
         ('the', 'that'): 65,
         (',', 'of'): 64,
         ('of', ','): 63,
         ('he', ','): 63,
         (',', 'he'): 63,
         ("''", ','): 54,
         (',', "''"): 54,
         ('a', ','): 54,
         (',', 'a'): 53,
         (',', 'in'): 46,
         ('he', 'said'): 46,
         (

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [26]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [27]:
X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [28]:
def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

## 4. Model

<img src ="../figures/glove.png" width=400>

In [29]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size):
        super(Glove, self).__init__()
        self.embedding_center  = nn.Embedding(voc_size, emb_size)
        self.embedding_outside = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)
    
    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.embedding_center(center) #(batch_size, 1, emb_size)
        outside_embeds = self.embedding_outside(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

## 5. Training

In [30]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
glove_model = Glove(voc_size, embedding_size).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(glove_model.parameters(), lr=0.001)

In [31]:
# Training
# epochs = 10000 #todo
epochs = 10
start_time = time.time()

for epoch in range(epochs):    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, news_corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch).to(device)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch).to(device)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch).to(device)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch).to(device) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = glove_model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()

    # if (epoch + 1) % 1000 == 0: todo
    print(f"Epoch: {epoch + 1} | cost: {loss:.6f}")
    
print(f"Training time: {time.time()-start_time}")

Epoch: 1 | cost: 16.266325
Epoch: 2 | cost: 4.335627
Epoch: 3 | cost: 1.148447
Epoch: 4 | cost: 1.776109
Epoch: 5 | cost: 5.049794
Epoch: 6 | cost: 66.619392
Epoch: 7 | cost: 21.762074
Epoch: 8 | cost: 6.029619
Epoch: 9 | cost: 1.802944
Epoch: 10 | cost: 10.500910
Training time: 0.15765905380249023


## GloVe (Gensim)

For looking at word vectors, we'll use **Gensim**. **Gensim** isn't really a deep learning package. It's a package for for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But its efficient and scalable, and quite widely used.   We gonna use **GloVe** embeddings, downloaded at [the Glove page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip)

In [32]:
glove_file = datapath(os.path.abspath('glove.6B.100d.txt'))  #search on the google
gensim_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [33]:
#return the vectors
gensim_model['coffee'].shape

(100,)

In [34]:
# caculating embeddings
def calculate_word_embedding(model, vocabulary):
    word_embedding = {}  #  dict to store word embeddings

    for term in vocabulary:  
        try:
            term_index = word2index[term]  # find index of word
        except KeyError:  # if word not there
            term_index = word2index['<UNK>']  # use unknown token index

        input_index = torch.LongTensor([word2index[term]])  # convert word index to tensor

        # get center and outside vectors from model
        vector_center = model.embedding_center(input_index)
        vector_outside = model.embedding_outside(input_index)
        # average the two vectors
        final_vector = (vector_center + vector_outside) / 2
        # get the first two values from the vector
        final_vector = final_vector[0][0].item(), final_vector[0][1].item()
        word_embedding[term] = np.array(final_vector)  # store in dict

    return word_embedding  # return word embeddings

# function to get vector for a specific word
def fetch_vector(embedding_dict, term):
    try:
        term_index = word2index[term]  # try getting index of term
    except KeyError:  # if word not there
        term = '<UNK>'  # use unknown token index

    return embedding_dict[term]  # return vector for word


In [36]:
# Compute word embedding for each model
skipgram = calculate_word_embedding(skipgram_model, vocabs)  # embeddings from skipgram model
skipgram_negative = calculate_word_embedding(skipgram_neg_model, vocabs)  # embeddings from skipgram negatives
glove = calculate_word_embedding(glove_model, vocabs) 
vector_storage = {"skipgram": skipgram, "skipgram_negative": skipgram_negative, "glove": glove} # storing all embeddings in one place

In [37]:
# Save the computed embedding to pickles
for vector_set in vector_storage.items():
    with open(f"{vector_set[0]}.pkl", "wb") as file:
        pickle.dump(vector_set[1], file)

In [38]:
# Fetch a specific word embedding
fetch_vector(skipgram_negative, 'berlin')

array([0.34038132, 0.08317519])

In [39]:
# Load dataset
with open("word-test.v1.txt", "r") as file:
    dataset_content = file.read()

In [40]:
# Preprocess dataset
formatted_data = dataset_content.replace("\t", "")  # remove tabs from the data
data_sections = formatted_data.split(': ')  # split the data into sections based on ': '

# get "capital-common-countries" section
capital_pairs = data_sections[1].split('\n')[1:-1]  # split section into lines and ignore first and last line
capital_pairs = [pair.split(" ") for pair in capital_pairs]  # split each line into words (country and capital)

# get the "gram7-past-tense" section
past_tense_pairs = data_sections[12].split('\n')[1:-1]  # split section into lines, ignore first and last line
past_tense_pairs = [pair.split(" ") for pair in past_tense_pairs]  # split each line into words (word pairs)

In [43]:
# cosine similarity
def cosine_similarity(vec1, vec2):
    dot = np.dot(vec1, vec2)  # get dot product of two vectors
    norm1 = np.linalg.norm(vec1)  # calculate vec1 magnitude 
    norm2 = np.linalg.norm(vec2)  # calculate vec2 magnitude
    return dot / (norm1 * norm2)  # return cosine similarity

In [44]:
# find the word in vocab closest to a given vector
def closest_match(vector, embeddings):
    try:
        words_in_vocab = list(embeddings.keys())  # get list of words in embeddings
    except:  # fallback if .keys() not supported
        words_in_vocab = list(embeddings.key_to_index.keys())  # fallback

    similarity_scores = {}  # to store similarity for each word

    for word in words_in_vocab: 
        similarity_scores[word] = cosine_similarity(vector, embeddings[word])  # calc cosine similarity

    return max(similarity_scores, key=similarity_scores.get)  # return word with highest similarity

In [46]:
# calculate analogy tasks accuracy
def calculate_accuracy(pairs, embeddings):
    matches = 0  # counter for correct predictions

    for example in pairs:  # loop through each analogy
        example = [term.lower() for term in example]  # make all terms lowercase

        try:
            # try calculating predicted vector using word embeddings
            predicted_vector = fetch_vector(embeddings, example[1]) - fetch_vector(embeddings, example[0]) + fetch_vector(embeddings, example[2])
            predicted_word = closest_match(predicted_vector, embeddings)  # find closest word to predicted vector
        except KeyError:  # if key error (word not found)
            # fallback: use most_similar method for model-based prediction
            predicted_word = embeddings.most_similar(positive=[example[1], example[2]], negative=[example[0]])[0][0]

        if example[3] == predicted_word:  # check if predicted word matches actual answer
            matches += 1  # if match, increment count

    return matches / len(pairs)  # return accuracy (correct matches / total examples)


In [47]:
# # tesing
# skipgram_sem_acc = calculate_accuracy(capital_pairs, skipgram)
# skipgram_syn_acc = calculate_accuracy(past_tense_pairs, skipgram)

# neg_sem_acc = calculate_accuracy(capital_pairs, skipgram_negative)
# neg_syn_acc = calculate_accuracy(past_tense_pairs, skipgram_negative)

# glove_sem_acc = calculate_accuracy(capital_pairs, glove)
# glove_syn_acc = calculate_accuracy(past_tense_pairs, glove)

# glove_gensim_sem_acc = calculate_accuracy(capital_pairs, gensim_model)
# glove_gensim_syn_acc = calculate_accuracy(past_tense_pairs, gensim_model)

# # Display results
# print("Word2Vec skipgram accuracy")
# print(f"Semantic: {skipgram_sem_acc}")
# print(f"Syntactic: {skipgram_syn_acc}\n")

# print("Word2Vec negative accuracy")
# print(f"Semantic: {neg_sem_acc}")
# print(f"Syntactic: {neg_syn_acc}\n")

# print("GloVe accuracy")
# print(f"Semantic: {glove_sem_acc}")
# print(f"Syntactic: {glove_syn_acc}\n")

# print("GloVe gensim accuracy")
# print(f"Semantic: {glove_gensim_sem_acc}")
# print(f"Syntactic: {glove_gensim_syn_acc}\n")



### I initially selected the "news" category from the NLTK dataset to categorize news articles. However, I encountered an issue where the model's accuracy was 0, likely due to the data or the model's approach. As a result, I decided to switch to using the Brown corpus by utilizing brown.sents(categories='news') for better results. 



In [48]:
# tesing
skipgram_sem_acc = calculate_accuracy(capital_pairs, skipgram)
skipgram_syn_acc = calculate_accuracy(past_tense_pairs, skipgram)

neg_sem_acc = calculate_accuracy(capital_pairs, skipgram_negative)
neg_syn_acc = calculate_accuracy(past_tense_pairs, skipgram_negative)

glove_sem_acc = calculate_accuracy(capital_pairs, glove)
glove_syn_acc = calculate_accuracy(past_tense_pairs, glove)

glove_gensim_sem_acc = calculate_accuracy(capital_pairs, gensim_model)
glove_gensim_syn_acc = calculate_accuracy(past_tense_pairs, gensim_model)

# display results
print("Word2Vec skipgram accuracy")
print(f"Semantic: {skipgram_sem_acc}")
print(f"Syntactic: {skipgram_syn_acc}\n")

print("Word2Vec negative accuracy")
print(f"Semantic: {neg_sem_acc}")
print(f"Syntactic: {neg_syn_acc}\n")

print("GloVe accuracy")
print(f"Semantic: {glove_sem_acc}")
print(f"Syntactic: {glove_syn_acc}\n")

print("GloVe gensim accuracy")
print(f"Semantic: {glove_gensim_sem_acc}")
print(f"Syntactic: {glove_gensim_syn_acc}\n")

Word2Vec skipgram accuracy
Semantic: 0.0
Syntactic: 0.0

Word2Vec negative accuracy
Semantic: 0.0
Syntactic: 0.0

GloVe accuracy
Semantic: 0.0
Syntactic: 0.0

GloVe gensim accuracy
Semantic: 0.5064102564102564
Syntactic: 0.5064102564102564



In [54]:
print(f"Semantic: {glove_gensim_sem_acc}")

Semantic: 0.9387351778656127


In [49]:
# load similarity dataset for similarity correlation
similar_words = pd.read_csv('wordsim_similarity_goldstandard.txt', sep="\t", header=None, names=['word_1', 'word_2', 'similarities'])
similar_words

Unnamed: 0,word_1,word_2,similarities
0,tiger,cat,7.35
1,tiger,tiger,10.00
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77
...,...,...,...
198,rooster,voyage,0.62
199,noon,string,0.54
200,chord,smile,0.54
201,professor,cucumber,0.31


In [50]:
# Creating a method to compute dot products between word embeddings for different models
def compute_dot_product(model, word1, word2):
    word1_vec = fetch_vector(model, word1.lower())
    word2_vec = fetch_vector(model, word2.lower())
    return np.dot(word1_vec, word2_vec)

In [51]:
# Applying the method to calculate dot products between word embeddings
similar_words['skipgram'] = similar_words.apply(lambda row: compute_dot_product(skipgram, row['word_1'], row['word_2']), axis=1)
similar_words['skipgram_negative_sampling'] = similar_words.apply(lambda row: compute_dot_product(skipgram_negative, row['word_1'], row['word_2']), axis=1)
similar_words['glove'] = similar_words.apply(lambda row: compute_dot_product(glove, row['word_1'], row['word_2']), axis=1)
similar_words['glove_gensim'] = similar_words.apply(lambda row: np.dot(gensim_model[row['word_1'].lower()], gensim_model[row['word_2'].lower()]), axis=1)

In [52]:
# Extracting similarity scores and model-specific dot products
wordsim_scores = similar_words['similarities'].to_numpy()
skipgram_scores = similar_words['skipgram'].to_numpy()
negative_sampling_scores = similar_words['skipgram_negative_sampling'].to_numpy()
glove_scores = similar_words['glove'].to_numpy()
gensim_scores = similar_words['glove_gensim'].to_numpy()

In [53]:
# correlations
print(f"Skipgram: {spearmanr(wordsim_scores, skipgram_scores).statistic}")
print(f"Skipgram Negative: {spearmanr(wordsim_scores, negative_sampling_scores).statistic}")
print(f"GloVe: {spearmanr(wordsim_scores, glove_scores).statistic}")
print(f"GloVe gensim: {spearmanr(wordsim_scores, gensim_scores).statistic}")

Skipgram: 0.06420451879921557
Skipgram Negative: -0.059984396233358724
GloVe: 0.05199217528883568
GloVe gensim: 0.5430870624672256
