In [45]:
#!pip install transformers


In [44]:
#! pip install torch

### 1. Import relevant libraries and set a random seed for reproducibility of bert model

In [19]:
#Based on https://www.geeksforgeeks.org/how-to-generate-word-embedding-using-bert/
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Set a random seed
random_seed = 42
random.seed(random_seed)

# Set a random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

### 2. Load bert tokenizer and model

In [21]:
# Load BERT tokenizer and model 'bert-base-uncased' is the most common in NLP tasks

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

### 3. Tokenize and encode sentence, output is token id and token attention

In [50]:
# Input text

text = ["GeeksforGeeks is a computer science portal!" , "This is another sentence"]    

# Tokenize and encode text using batch_encode_plus
# The function returns a dictionary containing the token IDs and attention masks
encoding = tokenizer.batch_encode_plus( text,# List of input texts
    padding=True,              # Pad to the maximum sequence length
    truncation=True,           # Truncate to the maximum sequence length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True    # Add special tokens CLS and SEP
)

input_ids = encoding['input_ids']  # Token IDs
# print input IDs
print("Input ID:", input_ids)
attention_mask = encoding['attention_mask']  # Attention mask
# print attention mask
print("Attention mask:", attention_mask)

Input ID: tensor([[  101, 29294, 22747, 21759,  4402,  5705,  2003,  1037,  3274,  2671,
          9445,   999,   102],
        [  101,  2023,  2003,  2178,  6251,   102,     0,     0,     0,     0,
             0,     0,     0]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]])


### 4. Generate word embeddings using bert model, inputs are token ids and token attentions

In [52]:
# Generate embeddings using BERT model
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    word_embeddings = outputs.last_hidden_state  # This contains the embeddings

# Output the shape of word embeddings
print("Shape of Word Embeddings:", word_embeddings.shape)

# each token has 768 sized vector embedding associated with it, 13 is the number of tokens and 1 refers to the sentence 

Shape of Word Embeddings: torch.Size([2, 13, 768])


### 5. You can visualize the tokens, corresponding embeddings and decoded text for reference

In [53]:
# Decode the token IDs back to text
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
#print decoded text
print("Decoded Text:", decoded_text)

# Tokenize the text again for reference
tokenized_text = tokenizer.tokenize(decoded_text)
#print tokenized text
print("tokenized Text:", tokenized_text)

# Encode the text
encoded_text = tokenizer.encode(text, return_tensors='pt')  # Returns a tensor
# Print encoded text
print("Encoded Text:", encoded_text)


Decoded Text: geeksforgeeks is a computer science portal!
tokenized Text: ['geek', '##sf', '##org', '##ee', '##ks', 'is', 'a', 'computer', 'science', 'portal', '!']
Encoded Text: tensor([[101, 100, 100, 102]])


In [58]:
# Print word embeddings for each token
for token, embedding in zip(tokenized_text, word_embeddings[0]):
    print("Token:", token)
   # print("Embedding:", embedding)

Token: geek
Token: ##sf
Token: ##org
Token: ##ee
Token: ##ks
Token: is
Token: a
Token: computer
Token: science
Token: portal
Token: !


### 6. Sentence level embedding for various NLP tasks

In [60]:
# Compute the average of word embeddings to get the sentence embedding
sentence_embedding = word_embeddings.mean(dim=1)  # Average pooling along the sequence length dimension

# Print the sentence embedding
print("Sentence Embedding")
#print(sentence_embedding)

# Output the shape of the sentence embedding
print("Shape of Sentence Embedding:", sentence_embedding.shape)

Sentence Embedding
Shape of Sentence Embedding: torch.Size([2, 768])


### 7. NLP task : Check the cosine similarity with another sentence

In [62]:
sentence_embedding[0]

tensor([-2.5059e-02,  2.2726e-01,  3.1609e-01,  1.8410e-01,  1.8076e-01,
        -7.8699e-01,  4.3954e-02,  5.0712e-01, -3.1514e-01, -1.4152e-01,
         1.5725e-01, -6.4614e-01, -4.3374e-02,  5.9408e-01, -3.6698e-01,
        -7.7098e-03, -2.3622e-01,  1.8715e-01, -2.4054e-01, -2.5308e-01,
        -1.3704e-01, -2.8994e-01, -7.4470e-02,  4.0289e-01,  2.3488e-01,
         5.8243e-03,  1.5201e-01, -4.3085e-02, -1.1464e-01,  7.0477e-02,
         2.7549e-01,  4.6293e-01,  8.8530e-02,  9.7691e-02, -5.5874e-01,
        -1.9001e-01, -3.6973e-01,  1.1568e-01,  1.1165e-02,  6.9461e-01,
         1.7377e-02, -3.4202e-01,  1.2402e-01,  2.3606e-01, -2.3257e-01,
        -2.6446e-01, -1.1868e-01,  1.1279e-01, -4.4310e-02, -4.6620e-02,
        -4.8528e-01,  4.1347e-01, -2.0555e-01, -5.7425e-02,  2.5642e-01,
         6.4752e-01,  8.9251e-02, -3.9436e-01, -2.4395e-01, -2.1154e-02,
         1.6391e-01,  2.9419e-01, -7.6012e-02, -2.8513e-01, -3.5262e-03,
         3.7497e-02,  7.5735e-02,  5.4062e-01, -7.4

In [75]:
# Example sentence for similarity comparison
example_sentence = "GeeksforGeeks is a technology website"

# Tokenize and encode the example sentence
example_encoding = tokenizer.batch_encode_plus(
    [example_sentence],
    padding=True,
    truncation=True,
    return_tensors='pt',
    add_special_tokens=True
)
example_input_ids = example_encoding['input_ids']
example_attention_mask = example_encoding['attention_mask']

# Generate embeddings for the example sentence
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)
    example_sentence_embedding = example_outputs.last_hidden_state.mean(dim=1)

# Compute cosine similarity between the original sentence embedding and the example sentence embedding
similarity_score = cosine_similarity(sentence_embedding[0].view(1,768), example_sentence_embedding)

# Print the similarity score
print("Cosine Similarity Score:", similarity_score[0][0])


Cosine Similarity Score: 0.90691483


array([[0.90691483]], dtype=float32)