Before running this notebook, make sure the following file exists in the same directory as the notebook:
1. `questions-words.txt` (the word-analogy question set), which can be downloaded from https://github.com/nicholas-leonard/word2vec/blob/master/questions-words.txt

In [1]:
!pip install transformers
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
### General-purpose libraries (standard library and third-party)
import operator
from tqdm import tqdm
import numpy as np
import pandas as pd


### For BART
from transformers import BartModel, BartTokenizer

2023-05-23 21:14:49.239318: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-23 21:14:49.382417: W tensorflow/tsl/platform/default/dso_loader.cc:66] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/slurm/lib64:
2023-05-23 21:14:49.382459: I tensorflow/tsl/cuda/cudart_stub.cc:28] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-05-23 21:14:49.426012: E tensorflow/tsl/lib/monitoring/collection_registry.cc:81] Cannot register 2 metrics with the same name: /tensorflow/core/bfc_allocator_delay
2023-05-23 21:14:50.787744: W tensorflow/tsl/platform/default/dso_loader.cc:66] 

In [6]:
# Load pretrained BART-base and keep a handle on the encoder's token-embedding
# table, plus the matching tokenizer.
model = BartModel.from_pretrained("facebook/bart-base")
embedding_matrix = model.encoder.embed_tokens.weight
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# The code further down uses a token's *position* in token_list as its row
# index into embedding_matrix, so order the tokens explicitly by token id
# instead of relying on the iteration order of the dict returned by get_vocab().
# NOTE(review): position == id additionally assumes the ids are contiguous
# starting at 0 -- confirm for any tokenizer other than facebook/bart-base.
vocab = tokenizer.get_vocab()
token_list = np.array(sorted(vocab, key=vocab.get))

def convert_word(x):
    if x[0] == 'Ġ':
        return x[1:].lower()
    else:
        return x.lower()
    
# Collapse the vocabulary to unique normalised (marker-free, lower-case) words.
# When several raw tokens normalise to the same word, only the first one is
# kept; lower_case_token_idx remembers its position in token_list so the
# matching embedding row can be selected below.
lower_case_token = []
lower_case_token_idx = []
seen_tokens = set()  # set membership keeps the de-duplication linear

for i, raw_token in enumerate(token_list):
    converted = convert_word(raw_token)
    if converted not in seen_tokens:
        seen_tokens.add(converted)
        lower_case_token.append(converted)
        lower_case_token_idx.append(i)

lower_case_token = np.array(lower_case_token)

# Keep one embedding row per retained word (rows follow lower_case_token order)
# and pre-transpose so that `emb @ emb_mat_npy` scores a query against all words.
embedding_matrix = embedding_matrix.detach().numpy()[lower_case_token_idx,:]
emb_mat_npy = embedding_matrix.T

# word -> embedding vector lookup, plus a set of known words for membership tests.
get_embedding_dict = {lower_case_token[i]: embedding_matrix[i] for i in range(len(lower_case_token))}
token_list_set = set(get_embedding_dict.keys())

def get_most_similar_words(emb, sim_metric):
    """Rank every word of the reduced vocabulary by similarity to ``emb``.

    Reads the module-level ``emb_mat_npy`` (embedding matrix with one column
    per word) and ``lower_case_token`` (the words, in column order).

    Parameters
    ----------
    emb : numpy.ndarray
        Query vector; it is multiplied against ``emb_mat_npy``, so its length
        must equal that matrix's number of rows.
    sim_metric : str
        ``'dot'`` for the raw dot product, ``'cosine'`` for cosine similarity.

    Returns
    -------
    list of (word, score) tuples
        One entry per vocabulary word, sorted by score in descending order.

    Raises
    ------
    ValueError
        If ``sim_metric`` is neither ``'dot'`` nor ``'cosine'`` (previously
        this surfaced as an unrelated IndexError on an empty score list).
    """
    if sim_metric not in ('dot', 'cosine'):
        raise ValueError(
            "sim_metric must be 'dot' or 'cosine', got {!r}".format(sim_metric)
        )

    scores = np.dot(emb, emb_mat_npy)
    if sim_metric == 'cosine':
        # Norms are only needed for cosine, so skip the work for 'dot'.
        query_norm = np.sqrt(np.sum(emb**2))
        column_norms = np.sqrt(np.sum(emb_mat_npy**2, axis=0))
        scores = scores / (query_norm * column_norms)

    pairs = [(lower_case_token[i], scores[i]) for i in range(len(lower_case_token))]
    return sorted(pairs, key=operator.itemgetter(1), reverse=True)

def analogy(a, b, c, d):
    """Score the analogy "a is to b as c is to d" with the token embeddings.

    The predicted vector is ``f = c - a + b``. Uses the module-level
    ``token_list_set``, ``get_embedding_dict`` and ``get_most_similar_words``.

    Parameters
    ----------
    a, b, c, d : str
        The four words of the analogy; ``d`` is the expected answer.

    Returns
    -------
    tuple or str
        ``(cos_sim, rank_of_d, top_10)`` when all four words are known:

        * ``cos_sim``   -- cosine similarity between ``d``'s embedding and ``f``.
        * ``rank_of_d`` -- 1-based position of ``d`` among all words ranked by
          cosine similarity to ``f``, after removing ``a``, ``b`` and ``c``;
          ``None`` when ``d`` is itself one of ``a``, ``b``, ``c`` and has
          therefore been removed (this used to raise IndexError).
        * ``top_10``    -- the ten best ``(word, score)`` candidates.

        The string ``'At least one word is not in the vocabulary list'`` when
        any of the four words is unknown.
    """
    if not all(word in token_list_set for word in (a, b, c, d)):
        return 'At least one word is not in the vocabulary list'

    a_emb, b_emb = get_embedding_dict[a], get_embedding_dict[b]
    c_emb, d_emb = get_embedding_dict[c], get_embedding_dict[d]
    f_emb = c_emb - a_emb + b_emb

    ## Cosine similarity of d and f
    cos_sim = np.sum(d_emb * f_emb)/np.sqrt(np.sum(d_emb**2) * np.sum(f_emb**2))

    ## Most similar candidates, with the three query words removed
    excluded = {a, b, c}
    most_sim = [pair for pair in get_most_similar_words(f_emb, 'cosine')
                if pair[0] not in excluded]

    # First position of d in the ranking; None if d was excluded above.
    rank_of_d = next(
        (rank for rank, (word, _) in enumerate(most_sim, start=1) if word == d),
        None,
    )
    top_10 = most_sim[:10]

    return (cos_sim, rank_of_d, top_10)

In [7]:
# Run every analogy question in questions-words.txt through analogy() and
# collect the ones whose four words are all in the vocabulary.
all_in_vocab = []
result = []
categories = []

# Category of the questions currently being read; lines starting with ':'
# in the file introduce a new category.
category = 'none'

with open('questions-words.txt', encoding='utf-8') as file:
    for line_no, line in enumerate(tqdm(file), start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no question.
            continue
        if line.startswith(':'):
            category = line[1:].strip()
            continue

        words = [x.lower() for x in line.split()]
        if len(words) != 4:
            raise ValueError(
                'questions-words.txt line {}: expected 4 words, got {}: {!r}'.format(
                    line_no, len(words), line
                )
            )
        a, b, c, d = words

        temp = analogy(a, b, c, d)
        # analogy() returns a message string instead of a tuple when a word is
        # out of vocabulary; such questions are left out of the results.
        if isinstance(temp, str):
            continue
        all_in_vocab.append((a, b, c, d))
        result.append(temp)
        categories.append(category)

# One row per answered question.
bart_res = pd.DataFrame()
bart_res['task'] = all_in_vocab
bart_res['cosine'] = [x[0] for x in result]
bart_res['rank'] = [x[1] for x in result]
bart_res['top_10'] = [x[2] for x in result]
bart_res['category'] = categories

19558it [34:51,  9.35it/s] 


In [8]:
# Show the results table: one row per analogy question whose four words were
# all in the vocabulary (columns: task, cosine, rank, top_10, category).
bart_res

Unnamed: 0,task,cosine,rank,top_10,category
0,"(athens, greece, baghdad, iraq)",0.642231,1,"[(iraq, 0.64223135), (iraqi, 0.55553335), (syr...",capital-common-countries
1,"(athens, greece, bangkok, thailand)",0.695670,1,"[(thailand, 0.69567), (cambodia, 0.4977392), (...",capital-common-countries
2,"(athens, greece, beijing, china)",0.661186,1,"[(china, 0.661186), (chinese, 0.47734934), (ja...",capital-common-countries
3,"(athens, greece, berlin, germany)",0.601279,1,"[(germany, 0.6012795), (france, 0.4640275), (d...",capital-common-countries
4,"(athens, greece, bern, switzerland)",0.364594,1,"[(switzerland, 0.3645942), (spain, 0.3280067),...",capital-common-countries
...,...,...,...,...,...
10996,"(write, writes, speak, speaks)",0.772114,1,"[(speaks, 0.7721138), (spoke, 0.6394584), (spe...",gram9-plural-verbs
10997,"(write, writes, talk, talks)",0.572955,2,"[(talked, 0.57785845), (talks, 0.57295513), (t...",gram9-plural-verbs
10998,"(write, writes, think, thinks)",0.704055,1,"[(thinks, 0.70405537), (thought, 0.48446873), ...",gram9-plural-verbs
10999,"(write, writes, walk, walks)",0.704394,1,"[(walks, 0.70439386), (walked, 0.6366826), (wa...",gram9-plural-verbs


In [9]:
# Save the results next to the notebook. With the default arguments the
# DataFrame index is written too, as a first column without a header name.
bart_res.to_csv("bart_res.csv")