# First adding the boiler plate code to get the seeds and GPU and imports all setup.

In [16]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm




# enable tqdm in pandas
tqdm.pandas()

# set to True to use the gpu (if there is one available)
use_gpu = True

# select device
device = torch.device('cuda' if use_gpu and torch.cuda.is_available() else 'cpu')
print(f'device: {device.type}')

# random seed
seed = 1234

# set random seed
if seed is not None:
    print(f'random seed: {seed}')
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

device: cuda
random seed: 1234


# Load the average contextualized embeddings generated in problem 1. And the glove words.

In [17]:
# Now make sure we can load everything properly.
embed_tok_id_df = pd.read_csv('datafiles/tok_id_embeds.csv')
embed_vecs = np.load('datafiles/cont_embed_vecs.npy')

# Now to get back the original data frame.
avg_cont_embed_df = pd.DataFrame({'tok_id':embed_tok_id_df['tok_id'].to_numpy(),'avg_cont_embed':list(embed_vecs)})

In [18]:
avg_cont_embed_df.shape

(49504, 2)

In [19]:
display(avg_cont_embed_df.head())

Unnamed: 0,tok_id,avg_cont_embed
0,0,"[-0.05390735231835314, 0.08659995588741708, -0..."
1,133,"[-0.06945269355797774, 0.005067573929508234, 0..."
2,735,"[0.0278572527410288, 0.17078800192580665, -0.0..."
3,34546,"[0.0019569261815032732, 0.042429491532477666, ..."
4,16,"[0.0823113064755886, 0.21211939386916828, 0.04..."


In [21]:
glove_words_dict = {'word':[]}

with open('datafiles/glove.6B.300d-vocabulary.txt','r') as file:
    for line in file:
        glove_words_dict['word'].append(line.strip())

glove_words_df = pd.DataFrame(glove_words_dict)

glove_words_df

Unnamed: 0,word
0,the
1,","
2,.
3,of
4,to
...,...
399995,chanty
399996,kronik
399997,rolonda
399998,zsombor


# Perform tokenization on the glove words.

In [30]:
from transformers import AutoTokenizer

transformer_name = 'FacebookAI/roberta-base'
tokenizer = AutoTokenizer.from_pretrained(transformer_name, use_fast=True)
ignore_index = -100

In [23]:
def tokenize_sentance(batch):
    # First calll tokenizer.
    tokenized_sentances = tokenizer(
        batch['word'],
        truncation=True,
        #padding=True
        #return_tensors="pt"
    )

    # Now add word id's to each of the sentances.
    word_ids_lst = []
    for i in range(len(batch['word'])):
        curr_word_ids = tokenized_sentances.word_ids(batch_index=i)

        curr_word_ids = [w_id if w_id!=None else ignore_index for w_id in curr_word_ids]
        
        word_ids_lst.append(curr_word_ids)
    tokenized_sentances['word_ids'] = word_ids_lst

    return tokenized_sentances

In [24]:
from datasets import Dataset

glove_words_ds = Dataset.from_pandas(glove_words_df)

glove_words_ds = glove_words_ds.map(tokenize_sentance,batched=True)

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

In [25]:
glove_words_ds.to_pandas()

Unnamed: 0,word,input_ids,attention_mask,word_ids
0,the,"[0, 627, 2]","[1, 1, 1]","[-100, 0, -100]"
1,",","[0, 6, 2]","[1, 1, 1]","[-100, 0, -100]"
2,.,"[0, 4, 2]","[1, 1, 1]","[-100, 0, -100]"
3,of,"[0, 1116, 2]","[1, 1, 1]","[-100, 0, -100]"
4,to,"[0, 560, 2]","[1, 1, 1]","[-100, 0, -100]"
...,...,...,...,...
399995,chanty,"[0, 40805, 219, 2]","[1, 1, 1, 1]","[-100, 0, 0, -100]"
399996,kronik,"[0, 330, 2839, 967, 2]","[1, 1, 1, 1, 1]","[-100, 0, 0, 0, -100]"
399997,rolonda,"[0, 9396, 11192, 2]","[1, 1, 1, 1]","[-100, 0, 0, -100]"
399998,zsombor,"[0, 329, 29, 5223, 368, 2]","[1, 1, 1, 1, 1, 1]","[-100, 0, 0, 0, 0, -100]"


In [26]:
# Now convert back to a data frame.
glove_words_df = glove_words_ds.to_pandas()
avg_cont_emb_ids = np.sort(avg_cont_embed_df['tok_id'].to_numpy())

# Now that we have the tokenized glove sub-words and corresponding words the next thing to do is actually loop through all those words and for each gather all the information needed to compute its averaged contextualized embedding.

This is the main loop that I constructed that will actually performe the word processing.

In [27]:
from transformers import RobertaModel
mod_embeding_sz = RobertaModel.from_pretrained(transformer_name, add_pooling_layer=False).config.hidden_size
# Now we will loop through each row representing each word and for each one where the word_id
# isn't -100 I need to get the embedding for the corresponding subword tokin.

avg_word_embed = {}
missing_subword_tot = 0
missing_subw_word = {}

prog_bar = tqdm(glove_words_df.iterrows(), total=glove_words_df.shape[0],desc=f"Missing subwords have occured {missing_subword_tot} times thus far!",leave=True)

for ind,row in prog_bar:
    # First access the current word id's and input_ids.
    subword_ids = row['input_ids']
    word_ids = row['word_ids']
    word = row['word']

    for i in range(len(subword_ids)):
        curr_subword = subword_ids[i]
        curr_word = word_ids[i]
        
        
        ind_embed = np.searchsorted(avg_cont_emb_ids,curr_subword)
        is_in = ind_embed < len(avg_cont_emb_ids) and avg_cont_emb_ids[ind_embed] == curr_subword
        
        if curr_subword != -100 and is_in:
            # First get the calculated average emneding for the subword_id.
            avg_cont_embed_curr = avg_cont_embed_df.loc[avg_cont_embed_df['tok_id']==curr_subword,\
                                                        'avg_cont_embed'].iloc[0]

            # Check if sub-word is in dict.
            if word not in avg_word_embed:
                avg_word_embed[word] = {'tot_oc':0,'cont_embed_sum':np.zeros(mod_embeding_sz)}

            # Now word is in dict add it can be recorded.
            avg_word_embed[word]['tot_oc'] = avg_word_embed[word]['tot_oc'] + 1
            avg_word_embed[word]['cont_embed_sum'] = np.add(avg_word_embed[word]['cont_embed_sum'],\
                                                                     avg_cont_embed_curr)
        else:
            if word not in missing_subw_word:
                missing_subw_word[word] = []

            missing_subw_word[word].append(curr_subword)
            missing_subword_tot += 1
            prog_bar.set_description(f"Missing subwords have occured {missing_subword_tot} times thus far!")

Missing subwords have occured 0 times thus far!:   0%|          | 0/400000 [00:00<?, ?it/s]

In [28]:
print(len(list(avg_cont_embed_df['tok_id'])))
print(len(set(list(avg_cont_embed_df['tok_id']))))

49504
49504


In [33]:
missing_subword_tot

165

Ok good so the output below confirms that although some words had subwords that did not apear in the example text that we computed our average embedings on in problem one, each word had at minimum one subword associated with it that we had a vec for.

In [34]:
# I do want to verify reall qucik that every word has at least one subword contextualized embedding.
len(avg_word_embed.keys())==len(glove_words_df['word'].to_list())

True

In [35]:
avg_word_embed_final = {}

for key,value in avg_word_embed.items():
    avg_word_embed_final[key] = value['cont_embed_sum'] / value['tot_oc']


In [36]:

avg_cont_embed_lst = []

for value in avg_word_embed_final.values():
    avg_cont_embed_lst.append(np.array(value))

glove_words_df = pd.DataFrame({'word':glove_words_df['word'].to_list()})

In [37]:
print(avg_cont_embed_lst[1][3] == avg_word_embed_final[','][3])

True


In [20]:
# Now we can finally save these average glove embeddings.
twod_emb_vecs_np = np.stack(avg_cont_embed_lst)
np.save('datafiles/glove_avg_word_vecs.npy',twod_emb_vecs_np)

glove_words_df.to_csv('datafiles/glove_words_df.csv',index=False)

# Loading the averaged contextualized glove embeddings for each glove word into a pandas DF. And then defining the most similar function from scratch.

### NOTE Run this cell after loading the contextualized embeddings generated in problem 1. This cell loads the saved contextualized embedding to word mapppings.

In [38]:
# Now load the saved files to re-construct a df with all the embeddings.

avg_glove_cont_embed = pd.DataFrame({'word':pd.read_csv('datafiles/glove_words_df.csv')['word'].to_list(),\
                                     'avg_cont_embed':list(np.load('datafiles/glove_avg_word_vecs.npy'))})

avg_glove_cont_embed

Unnamed: 0,word,avg_cont_embed
0,the,"[-0.049957829023000076, -0.007305835309671829,..."
1,",","[-0.01428139524482378, 0.09616818351671312, 0...."
2,.,"[-0.05385901583476626, 0.08959653819978437, -0..."
3,of,"[-0.0032876783164984616, 0.05872345707161294, ..."
4,to,"[-0.011179784707645279, 0.05175610889715885, -..."
...,...,...
399995,chanty,"[-0.010604510937390288, 0.10740373668637104, 0..."
399996,kronik,"[-0.03847935333754696, 0.10718849114094631, 0...."
399997,rolonda,"[-0.017326816602421496, 0.08318891926132678, -..."
399998,zsombor,"[-0.005383800975908686, 0.08314268892852025, 0..."


### Making the index to key and key to index mappings that will be used in the most simiilar function.

In [39]:
key_to_index = {}
index_to_key = {}

for ind,row in avg_glove_cont_embed.iterrows():
    key_to_index[row['word']]=ind
    index_to_key[ind] = row['word']

Checking the key to index and index to key mappings.

In [40]:

n = 10

print({key: key_to_index[key] for key in list(key_to_index.keys())[0:n]})
print("\n")
print({key: index_to_key[key] for key in list(index_to_key.keys())[0:n]})


{'the': 0, ',': 1, '.': 2, 'of': 3, 'to': 4, 'and': 5, 'in': 6, 'a': 7, '"': 8, "'s": 9}


{0: 'the', 1: ',', 2: '.', 3: 'of', 4: 'to', 5: 'and', 6: 'in', 7: 'a', 8: '"', 9: "'s"}


### Now normalizing the contectualized embeddings that are required for the cosine similarity driving the most similar function.

In [41]:
# Now I need to normalize all the embedding vectors.
avg_glove_cont_embed_normed = avg_glove_cont_embed.copy()

# Now compute the normed version for each vector.
avg_glove_cont_embed_normed['avg_cont_embed'] = avg_glove_cont_embed_normed['avg_cont_embed'].apply(lambda x: x/np.linalg.norm(x))

for ind,row in avg_glove_cont_embed_normed.iterrows():
    if (1-np.linalg.norm(row['avg_cont_embed'])) >= 0.000001:
        print(f'Norm of a vecor was {np.linalg.norm(row['avg_cont_embed'])} FAILED!!!!!')
        break

### Now defining and actually using the most similar function on the inputs shown in the example emebdding notebook.

In [42]:
# Now I can define a function that uses these normed vectors.
def most_similar(word, topn=10):
    # Get all the contextualized embedding vectors.
    vectors = np.stack(avg_glove_cont_embed_normed['avg_cont_embed'].to_list())
    

    # Now actually compute the similarity scores.
    word_id = key_to_index[word]
    word_index = index_to_key.keys()

    emb = vectors[word_id]

    similarities = vectors @ emb
    

    # Sort the index of the similarities in ascending order and then reverse that to be descending.
    sim_ids_asc = similarities.argsort()
    sim_ids_desc = sim_ids_asc[::-1]

    # Remove the word that we are comparing the other words to from the list.
    mask = sim_ids_desc != word_id
    sim_ids_desc = sim_ids_desc[mask]

    top_n_sim_ids = sim_ids_desc[:topn]

    # Now take the top n most similar emebddings and make a list of tuples where the first part is
    # the word and the second part the sim score.
    top_n_sim_vals = [(index_to_key[w_id],similarities[w_id]) for w_id in top_n_sim_ids]
    return(top_n_sim_vals)
    

# The results of the desired most similar calles.

In [43]:
most_similar("cactus")

[('candel', 0.9961354396184285),
 ('cattle', 0.9959096427882971),
 ('cotton', 0.9957315960503977),
 ('ipec', 0.9956895038453432),
 ('crape', 0.9956829312293125),
 ('cowers', 0.9956722910406367),
 ('cress', 0.995654797464526),
 ('camas', 0.9956196125434017),
 ('canyon', 0.9955722332729319),
 ('casher', 0.9955600976599122)]

In [44]:
most_similar("cake")

[('cakes', 0.9970061178837591),
 ('beer', 0.9954684669775973),
 ('tree', 0.9950716231886965),
 ('hack', 0.994930802911462),
 ('bread', 0.9949149720361999),
 ('flake', 0.9948764168639996),
 ('heart', 0.9948009563419673),
 ('sticks', 0.9946944092845534),
 ('axe', 0.9946921708381746),
 ('rider', 0.9945955090281285)]

In [45]:
most_similar("angry")

[('ryang', 0.9999999999999999),
 ('riang', 0.9963455656414293),
 ('ryong', 0.9961570127608913),
 ('ryun', 0.9961293893374107),
 ('ryanggang', 0.9961174751260478),
 ('reang', 0.9959698522222888),
 ('angre', 0.9959698522222888),
 ('layang', 0.995882868618484),
 ('anyang', 0.9958617792201145),
 ('angra', 0.9957265234711375)]

In [46]:
most_similar("quickly")

[('cleanly', 0.996091148507774),
 ('closely', 0.9959891183410248),
 ('quietly', 0.9959157368849155),
 ('solidly', 0.9958704868535058),
 ('coldly', 0.9956938703785),
 ('wildly', 0.9956249683785581),
 ('smartly', 0.9955571573016364),
 ('safely', 0.9954547638571951),
 ('shortly', 0.9953964589136268),
 ('sweetly', 0.9953727042162738)]

In [47]:
most_similar("between")

[('inbetween', 0.9916833522764783),
 ('below', 0.9910037777116857),
 ('before', 0.9900203016950515),
 ('during', 0.9899937578302302),
 ('above', 0.9894070438055993),
 ('within', 0.9892429289330674),
 ('about', 0.9891306822007894),
 ('betweenness', 0.9890898100251142),
 ('outside', 0.9883810354828868),
 ('using', 0.9883311756754602)]

In [48]:
most_similar("the")

[('tuesday.the', 0.9952064044948798),
 ('it.the', 0.9950649089792694),
 ('wednesday.the', 0.9948170206480321),
 ('school.the', 0.9947773667514636),
 ('people.the', 0.9945479665300365),
 ('reported.the', 0.9944874079235709),
 ('region.the', 0.9942999119709547),
 ('here.the', 0.9942526083900078),
 ('state.the', 0.9941868098008284),
 ('area.the', 0.994139669140341)]