In [None]:
# Problem 4

# Analyze the embeddings of the word ‘bank’ in the following 2 sentences.
# Sentence 1: I went to a bank to deposit money.
# Sentence 2: I sat near a bank of a river.
# Download the embeddings of all the words in the above 2 sentences for the following 2 models.
# • GloVe embeddings
# • BERT embeddings from the Hugging Face Transformers model hub
# For each model (GloVe and BERT), compute the Euclidean distance between the embeddings of the word ‘bank’ in the 2 sentences.
# Expected answer:
# • GloVe embeddings: Euclidean distance between the embeddings for the word ‘bank’ used in the 2 sentences = 0
# • BERT embeddings: Euclidean distance between the embeddings for the word ‘bank’ used in the 2 sentences ≠ 0

In [None]:
# GloVe Embeddings:
# Static Embeddings: GloVe generates a single, fixed vector for each word, regardless of its context.
# Result: The Euclidean distance between the embeddings of "bank" in both sentences would be 0 because GloVe uses the same embedding for each instance of "bank" without taking context into account. This static nature of GloVe embeddings means 
# it cannot differentiate between "bank" as a financial institution and "bank" as a riverbank.

# BERT Embeddings:
# Contextualized Embeddings: BERT creates unique embeddings for words based on the context in which they are used.
# Result: The Euclidean distance between the embeddings of "bank" in the two sentences would be non-zero. 
# BERT would interpret "bank" differently in each sentence, producing distinct embeddings for each context 
# (financial institution vs. riverbank).
# Expected Outcome
# GloVe: Euclidean distance between "bank" embeddings in the two sentences = 0
# BERT: Euclidean distance between "bank" embeddings in the two sentences ≠ 0

In [2]:
import numpy as np
from scipy.spatial.distance import euclidean
from transformers import BertTokenizer, BertModel
import torch

2024-11-08 21:48:37.168365: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-08 21:48:37.278144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731131317.317173   10600 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731131317.328079   10600 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-08 21:48:37.424413: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# The two sentences use "bank" in different senses
# (financial institution vs. the side of a river).
sentence1 = "I went to a bank to deposit money."
sentence2 = "I sat near a bank of a river."

# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same pretrained weights/vocabulary.
BERT_MODEL_NAME = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# .eval() returns the model itself; calling it makes inference mode explicit
# (dropout disabled), so repeated runs produce identical embeddings.
model = BertModel.from_pretrained(BERT_MODEL_NAME).eval()

# Tokenize and get embeddings for each sentence

def get_bert_embedding(sentence, word):
    """Return the contextual BERT embedding of ``word`` as used in ``sentence``.

    The sentence is encoded with the notebook-level ``tokenizer`` and run
    through the notebook-level ``model``; the vector returned is taken from
    the model's ``last_hidden_state`` at the position of the first occurrence
    of ``word``.

    Parameters
    ----------
    sentence : str
        The sentence that provides the context.
    word : str
        The word to look up. It is tokenized with the same tokenizer as the
        sentence, so casing is normalised the same way and a word that the
        tokenizer splits into several pieces is still found.

    Returns
    -------
    numpy.ndarray
        1-D embedding vector. For a word that maps to a single token this is
        exactly that token's hidden state; for a multi-piece word it is the
        mean of the hidden states of its pieces.

    Raises
    ------
    ValueError
        If ``word`` produces no tokens or does not occur in ``sentence``.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Inference only: skip building the autograd graph. Passing the full
    # encoding forwards the attention mask and segment ids together with the
    # input ids instead of relying on the model's defaults.
    with torch.no_grad():
        outputs = model(**inputs)

    # Tokenize the target word exactly like the sentence was tokenized, so
    # e.g. "Bank" matches "bank" with an uncased vocabulary instead of being
    # mapped to the unknown token.
    word_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
    if not word_ids:
        raise ValueError(f"{word!r} does not produce any tokens")

    sentence_ids = inputs['input_ids'][0].tolist()
    span = len(word_ids)

    # Locate the first position where the word's token ids occur contiguously.
    for start in range(len(sentence_ids) - span + 1):
        if sentence_ids[start:start + span] == word_ids:
            break
    else:
        raise ValueError(f"{word!r} was not found in sentence {sentence!r}")

    # Mean over the word's pieces; for a single-piece word this is the
    # token's own hidden state.
    word_embedding = outputs.last_hidden_state[0, start:start + span].mean(dim=0).numpy()

    return word_embedding

In [9]:
# Contextual BERT vectors for "bank", one per sentence (sentence1 first, then sentence2).
bert_embedding_bank_sentence1, bert_embedding_bank_sentence2 = (
    get_bert_embedding(text, "bank") for text in (sentence1, sentence2)
)

# Report how far apart the two contextual vectors are.
print("Euclidean distance between BERT embeddings: ")
bert_distance = euclidean(
    bert_embedding_bank_sentence1,
    bert_embedding_bank_sentence2,
)

# A non-zero value shows that BERT encodes the two senses of "bank" differently.
print("BERT Embeddings Euclidean Distance (Contextualized):", bert_distance)

Euclidean distance between BERT embeddings: 
BERT Embeddings Euclidean Distance (Contextualized): 14.706682205200195


In [12]:
# The GloVe embeddings analysis is described in the file: HW5_Bogdan_Tanasa_Problem4_embeddings_glove.ipynb