### Importing Necessary Dependencies and Packages

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np

import transformers
from transformers import BertTokenizer, BertModel

### Experimenting with BERT

In [4]:
def pretrained_bert_model(model_name='bert-base-uncased'):
    """Load a pretrained BERT tokenizer and encoder from the same checkpoint.

    Args:
        model_name: Checkpoint identifier forwarded to both
            ``from_pretrained`` calls. Defaults to ``'bert-base-uncased'``,
            the checkpoint this notebook was written against, so existing
            zero-argument calls behave exactly as before.

    Returns:
        tuple: ``(tokenizer, model)`` where ``tokenizer`` comes from
        ``BertTokenizer.from_pretrained`` and ``model`` from
        ``BertModel.from_pretrained``.
    """
    # Use one name for both objects so the tokenizer's vocabulary always
    # matches the model's embedding table.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    return tokenizer, model

In [5]:
def token_id_creater(tokenizer, sentence, add_special_tokens=False):
    """Convert a piece of text into a list of token ids.

    Args:
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``. When ``add_special_tokens`` is
            true it must also expose ``cls_token_id`` and ``sep_token_id``.
        sentence: Text to tokenize (a sentence or a single word).
        add_special_tokens: When true, wrap the ids as
            ``[CLS] ids [SEP]``. Defaults to ``False`` so existing callers
            keep receiving the bare ids.

    Returns:
        list: Token ids for ``sentence``; an empty list if tokenization
        produces no tokens and ``add_special_tokens`` is false.
    """
    tokens = tokenizer.tokenize(sentence)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    if add_special_tokens:
        # Opt-in: surround the sequence with the tokenizer's [CLS]/[SEP] ids.
        token_ids = [tokenizer.cls_token_id] + list(token_ids) + [tokenizer.sep_token_id]

    return token_ids

In [6]:
# Load the pretrained tokenizer and model once; later cells reuse both objects.
exp_tokenizer, exp_model = pretrained_bert_model()

exp_text = "Here is the sentence I want embeddings for using BERT."
# Split the sentence into tokens. In the recorded output, "embeddings" is broken
# into several pieces, with the continuation pieces prefixed by '##'.
exp_tokens = exp_tokenizer.tokenize(exp_text)

print(exp_tokens)

['here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', 'using', 'bert', '.']


In [7]:
# Map every token string to its integer id.
exp_tokens_ids = exp_tokenizer.convert_tokens_to_ids(exp_tokens)

print(exp_tokens_ids)
# Wrapping the id list in an outer list gives the tensor a leading dimension of
# size 1; this is the form passed to the model in the next cell.
print(torch.tensor([exp_tokens_ids]))

[2182, 2003, 1996, 6251, 1045, 2215, 7861, 8270, 4667, 2015, 2005, 2478, 14324, 1012]
tensor([[ 2182,  2003,  1996,  6251,  1045,  2215,  7861,  8270,  4667,  2015,
          2005,  2478, 14324,  1012]])


In [8]:
# Inference only: no_grad() keeps autograd from recording the forward pass, so
# the returned tensors do not hold on to a computation graph.
with torch.no_grad():
    exp_output = exp_model(torch.tensor([exp_tokens_ids]))

# exp_output[0] is the final hidden state: one vector per input token.
print(exp_output[0].shape)

# alternatively, we can also use the last_hidden_state attribute.
print(exp_output.last_hidden_state.shape)

# exp_output[1] is the pooled output: a single vector for the whole input.
print(exp_output[1].shape)

# To access the embeddings from the intermediate layers of BERT, call the model
# with output_hidden_states=True and read exp_output.hidden_states: a tuple
# holding the embedding-layer output followed by one tensor per encoder layer.

torch.Size([1, 14, 768])
torch.Size([1, 14, 768])
torch.Size([1, 768])


### Experimenting with passing two sentences into BERT at the same time

In [9]:
exp_text_2 = "Here is a sentence I want embeddings for using BERT."
exp_tokens_2 = exp_tokenizer.tokenize(exp_text_2)

exp_tokens_ids_2 = exp_tokenizer.convert_tokens_to_ids(exp_tokens_2)

# torch.tensor() can only stack the two id lists into one batch when they have
# the same length. Fail with a clear message instead of a cryptic tensor error;
# sequences of different length would need padding plus an attention mask.
assert len(exp_tokens_ids) == len(exp_tokens_ids_2), (
    "both sentences must produce the same number of tokens to be batched without padding"
)

# Inference only, so skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    exp_output_2 = exp_model(torch.tensor([exp_tokens_ids, exp_tokens_ids_2]))

# The leading dimension is the batch: one block of token vectors per sentence.
print(exp_output_2[0].shape)

# Passing multiple sentences to the BERT model as a batch returns the embeddings
# for each sentence separately, unaffected by the other sentences.
# To get embeddings for the entire text, pass the entire text as a single
# sequence instead - concatenate the sentences or their tokens.

torch.Size([2, 14, 768])


### Creating embeddings for individual words in the vocabulary

In [10]:
from vocabulary import vocab

In [1]:
# NOTE(review): the recorded output of this cell is a NameError on `vocab`, and
# its execution count (1) is lower than that of the cell importing `vocab`.
# Presumably it was last run on a fresh kernel before the import; re-run the
# notebook top to bottom to refresh the output.

# Number of entries in the imported vocabulary.
print(len(vocab))

# Inspect the entry at index 1: first its tokens, then the matching token ids.
examp = vocab[1]
print(exp_tokenizer.tokenize(examp))
examp_token_id = token_id_creater(exp_tokenizer, examp)
print(examp_token_id)

NameError: name 'vocab' is not defined

In [36]:
# Maps each vocabulary word to the pooled output BERT produces for it.
vocabulary_embeddings = {}

# exp_vocab currently covers the WHOLE vocabulary. For a quicker experiment,
# restrict it here (e.g. to the first 10000 words) before running the loop.
exp_vocab = vocab

# Inference only. Without no_grad() every stored pooled output would keep the
# autograd graph of its forward pass alive, so memory would grow with each
# processed word until the kernel runs out of it.
with torch.no_grad():
    for i, word in enumerate(exp_vocab, start=1):

        token_id_curr = token_id_creater(exp_tokenizer, word)

        # A word that yields no tokens cannot be fed to the model (an empty
        # id list does not form a valid integer input tensor), so skip it.
        if token_id_curr:
            output = exp_model(torch.tensor([token_id_curr]))

            # storing the pooled output (keeps its leading batch dimension of 1)
            vocabulary_embeddings[word] = output[1]

        if (i % 1000 == 0):
            print(f"Processed {i} words")


# print number of key-value pairs in vocabulary_embeddings
print(len(vocabulary_embeddings))

Processed 1000 words
Processed 2000 words
Processed 3000 words
Processed 4000 words
Processed 5000 words
Processed 6000 words
Processed 7000 words
Processed 8000 words
Processed 9000 words
Processed 10000 words
Processed 11000 words
Processed 12000 words
Processed 13000 words
Processed 14000 words
Processed 15000 words
Processed 16000 words
Processed 17000 words
Processed 18000 words
Processed 19000 words
Processed 20000 words
Processed 21000 words
Processed 22000 words
Processed 23000 words
Processed 24000 words
Processed 25000 words
Processed 26000 words
Processed 27000 words
Processed 28000 words
Processed 29000 words
Processed 30000 words
Processed 31000 words


: 

In [None]:
# Check the shape of one stored embedding; the recorded output shows that the
# leading batch dimension of size 1 is kept.
print(vocabulary_embeddings['air'].shape)

torch.Size([1, 768])


In [19]:
def cosine_similarity(input_embedding1, input_embedding2):
    """Return the cosine similarity of two embeddings as a Python float.

    Both tensors are scaled to unit L2 norm along their last dimension, then
    multiplied element-wise and summed over that same dimension.

    Args:
        input_embedding1: First embedding tensor.
        input_embedding2: Second embedding tensor, combined element-wise with
            the first.

    Returns:
        float: The similarity, extracted with ``.item()``. The reduced result
        must therefore hold exactly one element (e.g. inputs of shape
        ``[1, dim]`` or ``[dim]``).
    """
    unit_first = F.normalize(input_embedding1, p=2, dim=-1)
    unit_second = F.normalize(input_embedding2, p=2, dim=-1)

    # The dot product of two unit vectors is the cosine of the angle between them.
    dot_product = (unit_first * unit_second).sum(dim=-1)

    return dot_product.item()

In [20]:
# testing the cosine similarity function on the stored embeddings of two different words

print(cosine_similarity(vocabulary_embeddings['air'], vocabulary_embeddings['aah']))

0.701630711555481


In [21]:
# Sanity check: embedding the same word a second time should reproduce the
# stored vector, i.e. give a cosine similarity of 1.0.

# Inference only, so skip autograd bookkeeping for this forward pass.
with torch.no_grad():
    embedding1 = exp_model(torch.tensor([token_id_creater(exp_tokenizer, 'air')]))

# embedding1[1] is the pooled output - the same model output that was stored
# in vocabulary_embeddings.
print(cosine_similarity(embedding1[1], vocabulary_embeddings['air']))

1.0


In [22]:
# now using it to create a function that returns the most similar word from a vector as input

def most_similar_word(input_embedding, vocabulary_embeddings):
    """Find the vocabulary word whose embedding is closest to ``input_embedding``.

    Closeness is measured with ``cosine_similarity``; on ties the word that
    appears first in ``vocabulary_embeddings`` wins.

    Args:
        input_embedding: Query embedding, passed as the first argument to
            ``cosine_similarity``.
        vocabulary_embeddings: Mapping from word to its embedding.

    Returns:
        tuple: ``(word, similarity)`` for the best match, or ``(None, -1)``
        when no candidate could be selected (e.g. an empty mapping).
    """
    best_word = None
    # Start below every possible cosine value so that a candidate whose
    # similarity is exactly -1.0 can still be selected. A sentinel of -1 with
    # a strict '>' would silently reject it.
    best_similarity = float('-inf')

    for word, embedding in vocabulary_embeddings.items():
        similarity = cosine_similarity(input_embedding, embedding)
        if similarity > best_similarity:
            best_word = word
            best_similarity = similarity

    if best_word is None:
        # Preserve the historical "nothing found" return value.
        return None, -1

    return best_word, best_similarity



In [23]:
# testing most_similar_word function: the query is the pooled embedding of
# 'air', so the expected best match is 'air' itself

print(most_similar_word(embedding1[1], vocabulary_embeddings))


('air', 1.0)


In [35]:
# Shape of the per-token output for the example sentence.
print(exp_output[0].shape)

# iterate through all embeddings in the output and store their closest words in a list

# Collected as (word, similarity) tuples, one per token position.
closest_words = []

# Number of token positions the loop below visits.
print(exp_output[0].shape[1])

# NOTE(review): the query vectors here are slices of exp_output[0], whereas
# vocabulary_embeddings stores output[1] (the pooled output) for each word.
# These are different outputs of the model, which presumably explains the very
# low similarities in the recorded result - TODO confirm and compare like with like.
for i in range(exp_output[0].shape[1]):
    # Slice out position i while keeping the leading batch dimension.
    word_embedding = exp_output[0][:, i, :]
    word, similarity = most_similar_word(word_embedding, vocabulary_embeddings)
    closest_words.append((word, similarity))

print(closest_words)


torch.Size([1, 14, 768])
14
[('censure', 0.09438993781805038), ('clad', 0.0928436815738678), ('censure', 0.13513685762882233), ('beam', 0.1256704330444336), ('beam', 0.12083977460861206), ('airing', 0.07985206693410873), ('combo', 0.08452193439006805), ('combo', 0.0784529447555542), ('attract', 0.09313622862100601), ('beam', 0.10810612887144089), ('chabuks', 0.0981726348400116), ('credent', 0.06288355588912964), ('censure', 0.06640621274709702), ('bays', 0.08555959165096283)]
