In [2]:
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input
input_text = "Hello, BERT!"
inputs = tokenizer(input_text, return_tensors='pt')

# Inference only: keep BOTH computations under no_grad so that no autograd
# graph is recorded for tensors we merely want to inspect.
with torch.no_grad():
    # Contextual embeddings, i.e. the output after all encoder layers.
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state

    # Output of the embedding module alone, before any encoder layer runs.
    initial_embeddings = model.embeddings(inputs['input_ids'])


In [7]:
# Show the tensor, then its Python type, then its shape -- one item per line.
print(
    initial_embeddings,
    type(initial_embeddings),
    initial_embeddings.shape,
    sep="\n",
)


tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [ 3.7386e-01, -1.5575e-02, -2.4561e-01,  ..., -3.1657e-02,
           5.5144e-01, -5.2406e-01],
         [ 4.6705e-04,  1.6225e-01, -6.4443e-02,  ...,  4.9443e-01,
           6.9413e-01,  3.6286e-01],
         [ 7.4566e-01,  2.8742e-01,  4.1331e-01,  ...,  8.7860e-01,
           3.9919e-01, -2.0667e-01],
         [ 6.4243e-01, -4.2258e-01, -4.0628e-01,  ...,  6.2612e-01,
           5.6107e-01,  5.0588e-01],
         [-3.2507e-01, -3.1879e-01, -1.1632e-01,  ..., -3.9602e-01,
           4.1120e-01, -7.7552e-02]]], grad_fn=<NativeLayerNormBackward0>)
<class 'torch.Tensor'>
torch.Size([1, 6, 768])


#### The word "bank" gets a different contextual embedding in each sentence

In [8]:
from transformers import BertTokenizer, BertModel
import torch

# Checkpoint name shared by the tokenizer and the encoder loaded below
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to get the embedding of a word from a sentence
def get_word_embedding(sentence, word):
    """Return the final-layer (contextual) embedding of ``word`` in ``sentence``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        sentence: Text to encode.
        word: Token to look up. It is converted to a single vocabulary id with
            ``convert_tokens_to_ids``, so it must be exactly one tokenizer
            token that occurs in the tokenized sentence.

    Returns:
        A NumPy array holding the ``last_hidden_state`` vector at the first
        position where the word's id occurs in the sentence.

    Raises:
        ValueError: If the word's id does not occur in the tokenized sentence.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    word_id = tokenizer.convert_tokens_to_ids(word)
    token_ids = inputs["input_ids"][0].tolist()

    # Locate the token BEFORE the forward pass so a bad word fails fast and
    # with a readable message instead of a bare "N is not in list".
    try:
        word_position = token_ids.index(word_id)
    except ValueError:
        raise ValueError(
            f"{word!r} (token id {word_id}) does not occur as a single token "
            f"in the tokenized sentence {sentence!r}"
        ) from None

    # Inference only: skip autograd bookkeeping. The result therefore does not
    # require grad and can be converted to NumPy directly.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs["last_hidden_state"][0][word_position].numpy()

# The same surface word "bank", once in a river context and once in a
# finance context
sentence1 = "I sat by the river bank."
sentence2 = "I deposited money in the bank."

embedding1, embedding2 = (
    get_word_embedding(text, "bank") for text in (sentence1, sentence2)
)

# Compare the two vectors with cosine similarity (direction only, magnitude
# ignored). Each NumPy vector is turned back into a tensor with a leading
# batch axis because cosine_similarity reduces over dim=1 by default.
similarity = torch.nn.functional.cosine_similarity(
    *(torch.tensor(vector).unsqueeze(0) for vector in (embedding1, embedding2))
)

print(f"Cosine similarity between the embeddings: {similarity.item()}")

Cosine similarity between the embeddings: 0.5257286429405212


#### Initial (pre-encoder) embeddings for the word "bank"

In [9]:
from transformers import BertTokenizer, BertModel
import torch

# One checkpoint identifier drives both the tokenizer and the encoder
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to get the initial embedding of a word from a sentence
def get_initial_word_embedding(sentence, word):
    """Return the embedding-module output for ``word`` in ``sentence``.

    This is the vector produced by ``model.embeddings`` before any encoder
    layer runs. Uses the module-level ``tokenizer`` and ``model``.

    NOTE(review): in Hugging Face's BERT the embedding module is understood to
    add position and token-type embeddings to the word embedding and then apply
    LayerNorm -- confirm against the library docs. If so, the result depends on
    the token's index in the sentence, and two sentences give (near-)identical
    vectors only when the word sits at the same index in both.

    Args:
        sentence: Text to encode.
        word: Token to look up. It must be exactly one tokenizer token that
            occurs in the tokenized sentence.

    Returns:
        A NumPy array holding the embedding-module output at the first
        position where the word's id occurs in the sentence.

    Raises:
        ValueError: If the word's id does not occur in the tokenized sentence.
    """
    inputs = tokenizer(sentence, return_tensors="pt")
    word_id = tokenizer.convert_tokens_to_ids(word)
    token_ids = inputs["input_ids"][0].tolist()

    # Replace the bare "N is not in list" from list.index with a message that
    # names the word and the sentence.
    try:
        word_position = token_ids.index(word_id)
    except ValueError:
        raise ValueError(
            f"{word!r} (token id {word_id}) does not occur as a single token "
            f"in the tokenized sentence {sentence!r}"
        ) from None

    # Extracting the initial embeddings. Inference only, so no autograd graph;
    # the result can then be converted to NumPy directly.
    with torch.no_grad():
        initial_embeddings = model.embeddings(inputs["input_ids"])

    return initial_embeddings[0][word_position].numpy()

# Pre-encoder vectors for "bank" in the river sentence and the finance sentence
sentence1 = "I sat by the river bank."
sentence2 = "I deposited money in the bank."

embedding1, embedding2 = (
    get_initial_word_embedding(text, "bank") for text in (sentence1, sentence2)
)

# Compare the two vectors with cosine similarity (direction only, magnitude
# ignored). Each NumPy vector is turned back into a tensor with a leading
# batch axis because cosine_similarity reduces over dim=1 by default.
similarity = torch.nn.functional.cosine_similarity(
    *(torch.tensor(vector).unsqueeze(0) for vector in (embedding1, embedding2))
)

print(f"Cosine similarity between the initial embeddings: {similarity.item()}")


Cosine similarity between the initial embeddings: 0.9999998807907104


#### Let us track how the similarity drifts through the layers as attention mixes in context.

In [10]:
from transformers import BertTokenizer, BertModel
import torch

# Tokenizer and encoder both come from the same pre-trained checkpoint
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenize the sentences
sentence1 = "I sat by the river bank."
sentence2 = "I deposited money in the bank."

inputs1 = tokenizer(sentence1, return_tensors="pt")
inputs2 = tokenizer(sentence2, return_tensors="pt")

# Position of the "bank" token inside each tokenized sentence
word_id = tokenizer.convert_tokens_to_ids("bank")
word_position1 = inputs1["input_ids"][0].tolist().index(word_id)
word_position2 = inputs2["input_ids"][0].tolist().index(word_id)

# Let the model return every intermediate hidden state instead of calling the
# encoder layers by hand. The layers expect the preprocessed (additive)
# attention mask that BertModel builds internally, not the tokenizer's raw 0/1
# mask; passing the raw mask only happens to be harmless for a single unpadded
# sentence. `hidden_states` holds the embedding output followed by the output
# of each encoder layer. no_grad: this is inspection only.
with torch.no_grad():
    hidden_states1 = list(model(**inputs1, output_hidden_states=True).hidden_states)
    hidden_states2 = list(model(**inputs2, output_hidden_states=True).hidden_states)

# Index 0 is the output of the embedding module (the "initial" embeddings)
initial_embeddings1 = hidden_states1[0]
initial_embeddings2 = hidden_states2[0]

# Compare the two "bank" vectors after the embeddings and after every layer
for i, (states1, states2) in enumerate(zip(hidden_states1, hidden_states2)):
    cosine_sim = torch.nn.functional.cosine_similarity(
        states1[0][word_position1].unsqueeze(0),
        states2[0][word_position2].unsqueeze(0),
    )
    if i == 0:
        print(f"Layer 0 (initial embeddings) similarity: {cosine_sim.item()}")
    else:
        print(f"Layer {i} similarity: {cosine_sim.item()}")


Layer 0 (initial embeddings) similarity: 0.9999998807907104
Layer 1 similarity: 0.7583111524581909
Layer 2 similarity: 0.6887192726135254
Layer 3 similarity: 0.6551121473312378
Layer 4 similarity: 0.5860087275505066
Layer 5 similarity: 0.5718533396720886
Layer 6 similarity: 0.5652674436569214
Layer 7 similarity: 0.5206235647201538
Layer 8 similarity: 0.49700790643692017
Layer 9 similarity: 0.4941667318344116
Layer 10 similarity: 0.5007324814796448
Layer 11 similarity: 0.5522936582565308
Layer 12 similarity: 0.5257290005683899
