Before running this notebook, make sure the following file exists in the same directory:
1. `questions-words.txt`, which can be downloaded from https://github.com/nicholas-leonard/word2vec/blob/master/questions-words.txt (save the raw text file, not the GitHub HTML page).

In [3]:
!pip install transformers
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [4]:
### Standard libraries
import operator
from tqdm import tqdm
import numpy as np
import pandas as pd

### For BERT
from transformers import BertModel, BertTokenizer

In [6]:
# Load the pretrained model and keep a handle on its input word-embedding
# weights, then load the tokenizer that belongs to the same checkpoint.
model = BertModel.from_pretrained("bert-base-uncased")
embedding_matrix = model.embeddings.word_embeddings.weight
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Every vocabulary token, in the tokenizer's own iteration order. The position
# of a token in this array is later used as its row index in embedding_matrix.
token_list = np.array(list(tokenizer.vocab.keys()))

## only consider words, not unused tokens or symbols
token_list_filt = token_list[1996:29612]
# Set form of the filtered vocabulary, used for membership checks.
token_list_set = set(token_list_filt)

def get_embedding(word):
    """Return the embedding of ``word`` as a NumPy array.

    ``word`` must belong to ``token_list_set``; otherwise the assertion fails.
    The row(s) of ``embedding_matrix`` are selected by the position(s) at which
    ``word`` occurs in ``token_list``, so the result keeps a leading axis
    (one row per match).
    """
    assert word in token_list_set
    is_match = token_list == word
    row_positions = np.where(is_match)[0]
    selected_rows = embedding_matrix[row_positions]
    return selected_rows.detach().numpy()

# Pre-compute the embedding of every token in the filtered vocabulary so that
# analogy() can fetch them with a plain dictionary lookup.
get_embedding_dict = {tok: get_embedding(tok) for tok in token_list_set}

# NumPy form of the embedding weights, transposed so that
# np.dot(emb, emb_mat_npy) yields one score per vocabulary token.
emb_mat_npy = embedding_matrix.detach().numpy().T

def get_most_similar_words(emb, sim_metric, emb_matrix=None, tokens=None):
    """Rank every vocabulary token by its similarity to a query embedding.

    Parameters
    ----------
    emb : array-like
        A single query vector, given either as ``(dim,)`` or ``(1, dim)``.
    sim_metric : str
        ``'dot'`` for the raw dot product, ``'cosine'`` for cosine similarity.
    emb_matrix : np.ndarray, optional
        Matrix of shape ``(dim, n_tokens)`` with one column per token.
        Defaults to the module-level ``emb_mat_npy``.
    tokens : sequence, optional
        Token belonging to each column of ``emb_matrix``.
        Defaults to the module-level ``token_list``.

    Returns
    -------
    list of (token, score)
        All tokens with their scores, sorted by descending score.

    Raises
    ------
    ValueError
        If ``emb`` is not a single vector or ``sim_metric`` is unknown.
    """
    if emb_matrix is None:
        emb_matrix = emb_mat_npy
    if tokens is None:
        tokens = token_list

    # Normalise the query to shape (1, dim) so 1-D and single-row inputs are
    # handled the same way by both metrics.
    query = np.atleast_2d(np.asarray(emb))
    if query.ndim != 2 or query.shape[0] != 1:
        raise ValueError(
            "emb must be a single vector of shape (dim,) or (1, dim); "
            "got shape {}".format(np.shape(emb))
        )

    # One score per token; drop the leading axis for BOTH metrics (the 'dot'
    # branch previously kept it, which made the per-token indexing below fail).
    scores = np.dot(query, emb_matrix)[0]

    if sim_metric == 'cosine':
        # Norms are only needed for cosine similarity.
        query_norm = np.sqrt(np.sum(query ** 2))
        column_norms = np.sqrt(np.sum(emb_matrix ** 2, axis=0))
        scores = scores / (query_norm * column_norms)
    elif sim_metric != 'dot':
        raise ValueError(
            "sim_metric must be 'dot' or 'cosine', got {!r}".format(sim_metric)
        )

    scored_tokens = {tokens[i]: scores[i] for i in range(len(tokens))}
    return sorted(scored_tokens.items(), key=operator.itemgetter(1), reverse=True)

def analogy(a, b, c, d):
    """Score the analogy "a is to b as c is to d" with vector arithmetic.

    The predicted vector is ``f = c - a + b``. The function reports how close
    ``f`` is to the embedding of ``d`` and where ``d`` ranks among all tokens
    ordered by cosine similarity to ``f`` (with ``a``, ``b`` and ``c`` removed
    from the candidates).

    Parameters
    ----------
    a, b, c, d : str
        The four words of the analogy.

    Returns
    -------
    tuple or str
        ``(cos_sim, rank_of_d, top_10)`` where ``cos_sim`` is the cosine
        similarity between ``d`` and ``f``, ``rank_of_d`` is the 1-based rank
        of ``d`` among the candidates (``None`` if ``d`` is itself one of
        ``a``, ``b``, ``c`` and therefore excluded), and ``top_10`` is the
        first ten ``(token, score)`` candidates.
        If any word is missing from ``token_list_set`` the string
        ``'At least one word is not in the vocabulary list'`` is returned.
    """
    if not all(word in token_list_set for word in (a, b, c, d)):
        # Callers detect this case through the returned string, so the
        # sentinel text is kept exactly as it was.
        return 'At least one word is not in the vocabulary list'

    a_emb, b_emb = get_embedding_dict[a], get_embedding_dict[b]
    c_emb, d_emb = get_embedding_dict[c], get_embedding_dict[d]
    f_emb = c_emb - a_emb + b_emb

    ## Cosine similarity of d and f
    cos_sim = np.sum(d_emb * f_emb)/np.sqrt(np.sum(d_emb**2) * np.sum(f_emb**2))

    ## Most similar, ignoring the three query words themselves
    excluded = {a, b, c}
    most_sim = [pair for pair in get_most_similar_words(f_emb, 'cosine')
                if pair[0] not in excluded]

    # 1-based rank of d; None when d was excluded above (d equal to a, b or c),
    # which previously surfaced as an IndexError.
    rank_of_d = next(
        (position for position, pair in enumerate(most_sim, start=1) if pair[0] == d),
        None,
    )
    top_10 = most_sim[:10]

    return (cos_sim, rank_of_d, top_10)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Run every analogy question through analogy() and collect the ones whose four
# words are all in the vocabulary.
all_in_vocab = []
result = []
categories = []

category = 'none'

with open('questions-words.txt', encoding='utf-8') as file:
    for line_no, line in enumerate(tqdm(file), start=1):
        line = line.strip()
        if not line:
            # Blank lines carry no question; skip them instead of failing
            # on the four-way unpacking below.
            continue
        if line.startswith(':'):
            # Section header of the form ": category-name".
            category = line[1:].strip()
            continue

        words = line.lower().split()
        if len(words) != 4:
            raise ValueError(
                'questions-words.txt line {}: expected 4 words, got {}: {!r}'.format(
                    line_no, len(words), line
                )
            )
        a, b, c, d = words

        temp = analogy(a, b, c, d)
        if isinstance(temp, str):
            # analogy() returns a message string (not a tuple) when a word is
            # out of vocabulary; those questions are left out of the results.
            continue
        all_in_vocab.append((a, b, c, d))
        result.append(temp)
        categories.append(category)

# Build the results table in one go: one row per evaluated question.
bert_res = pd.DataFrame({
    'task': all_in_vocab,
    'cosine': [x[0] for x in result],
    'rank': [x[1] for x in result],
    'top_10': [x[2] for x in result],
    'category': categories,
})

19558it [21:41, 15.03it/s] 


In [9]:
# Display the results table: one row per analogy question that was evaluated.
bert_res

Unnamed: 0,task,cosine,rank,top_10,category
0,"(athens, greece, baghdad, iraq)",0.613978,1,"[(iraq, 0.6139779), (mesopotamia, 0.60773236),...",capital-common-countries
1,"(athens, greece, bangkok, thailand)",0.653616,1,"[(thailand, 0.6536159), (cambodia, 0.5840745),...",capital-common-countries
2,"(athens, greece, beijing, china)",0.527557,4,"[(tianjin, 0.54582757), (nanjing, 0.53440684),...",capital-common-countries
3,"(athens, greece, berlin, germany)",0.547957,1,"[(germany, 0.5479567), (italy, 0.43626735), (s...",capital-common-countries
4,"(athens, greece, bern, switzerland)",0.504809,2,"[(##נ, 0.5089948), (switzerland, 0.5048087), (...",capital-common-countries
...,...,...,...,...,...
12149,"(write, writes, speak, speaks)",0.762034,1,"[(speaks, 0.7620337), (spoke, 0.6635216), (spe...",gram9-plural-verbs
12150,"(write, writes, talk, talks)",0.630666,1,"[(talks, 0.63066584), (talked, 0.627512), (tal...",gram9-plural-verbs
12151,"(write, writes, think, thinks)",0.691880,1,"[(thinks, 0.69188005), (thinking, 0.53577286),...",gram9-plural-verbs
12152,"(write, writes, walk, walks)",0.700617,1,"[(walks, 0.7006168), (walked, 0.6397044), (wal...",gram9-plural-verbs


In [10]:
# Save the results table as bert_res.csv (path is relative to the working directory).
bert_res.to_csv("bert_res.csv")