Before running this notebook, make sure the following files exist in the same directory as the notebook:
1. $\texttt{questions-words.txt}$, which can be downloaded from https://github.com/nicholas-leonard/word2vec/blob/master/questions-words.txt
2. $\texttt{glove.6B.300d.txt}$, which can be downloaded from https://www.kaggle.com/datasets/thanakomsn/glove6b300dtxt

In [1]:
!pip install transformers
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
### Standard libraries
import operator
from tqdm import tqdm
import numpy as np
import pandas as pd

### For GloVe
import gensim
from gensim.test.utils import get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

In [5]:
# Load the GloVe vectors straight from the original text file.
# GloVe files have no "<vocab_size> <dim>" header line; `no_header=True` tells
# gensim (>= 4.0) to infer it, which replaces the deprecated `glove2word2vec`
# conversion and avoids writing a second full copy of the embeddings to a
# temporary file on every run.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'glove.6B.300d.txt', binary=False, no_header=True
)

def glove_analogy(a, b, c, d, kv=None, topn=200000):
    """Score the analogy "a is to b as c is to d" with word embeddings.

    The predicted vector is ``f = emb(c) - emb(a) + emb(b)``; the function
    reports how close the expected answer ``d`` is to ``f``.

    Parameters
    ----------
    a, b, c, d : str
        The four words of the analogy question.
    kv : optional
        Embedding lookup to use. It must support ``word in kv``, ``kv[word]``
        and ``kv.similar_by_vector(vector, topn=..., restrict_vocab=...)``
        returning ``(word, similarity)`` pairs. Defaults to the notebook-level
        ``model``.
    topn : int, optional
        Number of nearest neighbours of ``f`` requested from ``kv`` before
        filtering (default 200000, the value previously hard-coded).

    Returns
    -------
    tuple or str
        ``(cos_sim, rank_of_d, top_10)`` where ``cos_sim`` is the cosine
        similarity between ``emb(d)`` and ``f``, ``rank_of_d`` is the 1-based
        position of ``d`` among the filtered neighbours (``None`` if ``d`` is
        not among them), and ``top_10`` is the first ten filtered neighbours.
        If any of the four words is missing from the vocabulary, the string
        ``'At least one word is not in the vocabulary list'`` is returned.
    """
    if kv is None:
        # Fall back to the embeddings loaded at notebook level.
        kv = model

    if not all(word in kv for word in (a, b, c, d)):
        return 'At least one word is not in the vocabulary list'

    a_emb, b_emb = kv[a], kv[b]
    c_emb, d_emb = kv[c], kv[d]
    f_emb = c_emb - a_emb + b_emb

    ## Cosine similarity of d and f
    cos_sim = np.sum(d_emb * f_emb) / np.sqrt(np.sum(d_emb**2) * np.sum(f_emb**2))

    ## Most similar (select only lower case words; without _; not a question word)
    question_words = {a, b, c}
    neighbours = kv.similar_by_vector(f_emb, topn=topn, restrict_vocab=None)
    most_sim = [
        x for x in neighbours
        if x[0] == x[0].lower() and '_' not in x[0] and x[0] not in question_words
    ]

    # Explicit search instead of indexing into np.where(...) under a bare
    # `except:`, which would also have hidden unrelated errors and Ctrl-C.
    rank_of_d = next(
        (position for position, x in enumerate(most_sim, start=1) if x[0] == d),
        None,
    )
    top_10 = most_sim[:10]

    return (cos_sim, rank_of_d, top_10)

  glove2word2vec('glove.6B.300d.txt', word2vec_glove_file)


In [6]:
# Run every analogy question of questions-words.txt through glove_analogy and
# collect the results. Lines starting with ':' name the category that the
# following questions belong to; every other non-blank line holds four words.
all_in_vocab = []
result = []
categories = []

category = 'none'

with open('questions-words.txt', encoding='utf-8') as file:
    for line in tqdm(file):
        if line.startswith(':'):
            category = line[1:].strip()
            continue

        words = [x.lower() for x in line.split()]
        if not words:
            # Blank line (e.g. a trailing newline at end of file): nothing to score.
            continue

        # A non-blank line that does not hold exactly four words still raises here.
        a, b, c, d = words
        temp = glove_analogy(a, b, c, d)
        if isinstance(temp, str):
            # glove_analogy returns a message string instead of a tuple when
            # at least one word is out of vocabulary; skip those questions.
            continue

        all_in_vocab.append((a, b, c, d))
        result.append(temp)
        categories.append(category)

# Build the results table in one go; column order matches the saved CSV.
glove_res = pd.DataFrame({
    'task': all_in_vocab,
    'cosine': [x[0] for x in result],
    'rank': [x[1] for x in result],
    'top_10': [x[2] for x in result],
    'category': categories,
})

19558it [2:22:48,  2.28it/s]


In [7]:
# Show the results table; as the last expression of the cell it is rendered by the notebook.
glove_res

Unnamed: 0,task,cosine,rank,top_10,category
0,"(athens, greece, baghdad, iraq)",0.723732,1.0,"[(iraq, 0.7237322926521301), (iraqi, 0.6456960...",capital-common-countries
1,"(athens, greece, bangkok, thailand)",0.770864,1.0,"[(thailand, 0.7708642482757568), (thai, 0.5920...",capital-common-countries
2,"(athens, greece, beijing, china)",0.775182,1.0,"[(china, 0.7751821279525757), (chinese, 0.6065...",capital-common-countries
3,"(athens, greece, berlin, germany)",0.749873,1.0,"[(germany, 0.7498731017112732), (german, 0.587...",capital-common-countries
4,"(athens, greece, bern, switzerland)",0.549539,1.0,"[(switzerland, 0.5495386123657227), (liechtens...",capital-common-countries
...,...,...,...,...,...
19539,"(write, writes, talk, talks)",0.319299,54.0,"[(columnist, 0.45770207047462463), (talked, 0....",gram9-plural-verbs
19540,"(write, writes, think, thinks)",0.636422,1.0,"[(thinks, 0.6364215612411499), (knows, 0.57215...",gram9-plural-verbs
19541,"(write, writes, vanish, vanishes)",0.379217,5.0,"[(disappear, 0.4414120316505432), (disappears,...",gram9-plural-verbs
19542,"(write, writes, walk, walks)",0.575540,1.0,"[(walks, 0.57554030418396), (walking, 0.561013...",gram9-plural-verbs


In [8]:
# Save the results table as glove_res.csv in the working directory.
# `index` is left at its default, so the row index is written as the first, unnamed column.
glove_res.to_csv("glove_res.csv")