In [1]:
from transformers import BertTokenizer

# Fetch the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sentence used for the demonstration.
text = "The quick brown fox jumps over the lazy dog."

# Encode the sentence; return_tensors='pt' asks for PyTorch tensors.
bert_inputs = bert_tokenizer(text, return_tensors='pt')

# Keep the two auxiliary tensors under their own names.
attention_mask = bert_inputs['attention_mask']
token_type_ids = bert_inputs['token_type_ids']

# Show every encoded field next to a readable label.
for label, value in (
    ("Token IDs:", bert_inputs['input_ids']),
    ("Attention Mask:", attention_mask),
    ("Token Type IDs:", token_type_ids),
):
    print(label, value)

# Map the ids of the (single) encoded sequence back to their vocabulary
# strings so the way the sentence was split can be inspected.
tokens = bert_tokenizer.convert_ids_to_tokens(bert_inputs['input_ids'][0])
print("Tokens:", tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token IDs: tensor([[  101,  1996,  4248,  2829,  4419, 14523,  2058,  1996, 13971,  3899,
          1012,   102]])
Attention Mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Token Type IDs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Tokens: ['[CLS]', 'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.', '[SEP]']


In [2]:
from transformers import BertTokenizer, BertModel
import torch

# Instantiate the tokenizer and the encoder from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Sentence to embed, encoded as PyTorch tensors.
text = "The quick brown fox jumps over the lazy dog."
inputs = tokenizer(text, return_tensors='pt')

# Forward pass with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# The last hidden state holds one vector per input token.
last_hidden_states = outputs.last_hidden_state
print("Shape of the last hidden state (embeddings):", last_hidden_states.shape)

# Walk the tokens of the first (only) sequence and report, for each one,
# the size of its vector and its first 5 components (for brevity).
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for position, token in enumerate(tokens):
    embedding = last_hidden_states[0][position]
    print(f"Token: {token}, Embedding Dimension: {embedding.shape}, Embedding (first 5 components): {embedding[:5]}...")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Shape of the last hidden state (embeddings): torch.Size([1, 12, 768])
Token: [CLS], Embedding Dimension: torch.Size([768]), Embedding (first 5 components): tensor([-0.3608,  0.2271, -0.3030, -0.1880,  0.0475])...
Token: the, Embedding Dimension: torch.Size([768]), Embedding (first 5 components): tensor([-0.3276, -0.3762, -0.5044,  0.0098,  0.9037])...
Token: quick, Embedding Dimension: torch.Size([768]), Embedding (first 5 components): tensor([-0.4000, -0.4212,  0.4903,  0.0033,  0.4567])...
Token: brown, Embedding Dimension: torch.Size([768]), Embedding (first 5 components): tensor([ 0.1209, -0.2728,  0.5550, -0.1874,  0.7759])...
Token: fox, Embedding Dimension: torch.Size([768]), Embedding (first 5 components): tensor([ 0.0323, -0.2305, -0.1756, -0.1121,  0.5692])...
Token: jumps, Embedding Dimension: torch.Size([768]), Embedding (first 5 components): tensor([ 0.2432, -0.0648,  0.3022,  0.2046,  0.7072])...
Token: over, Embedding Dimension: torch.Size([768]), Embedding (first 5 comp

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Checkpoint shared by the tokenizer and the encoder.
MODEL_NAME = 'bert-base-uncased'

# Both objects are read as module-level globals by get_sentence_embedding.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertModel.from_pretrained(MODEL_NAME)

def get_sentence_embedding(text):
    """Embed text as the mean of the model's last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array of shape ``(batch, hidden_size)``; ``batch`` is 1 when
        a single string is passed.
    """
    # truncation=True keeps over-long input within the tokenizer's maximum
    # length instead of failing inside the model; padding=True lets a list
    # of texts of different lengths be encoded as one rectangular batch
    # (a single string is left unpadded).
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    # Average only over real tokens: padded positions carry mask 0 and must
    # not dilute the mean. With no padding the mask is all ones, so this is
    # the plain mean over the token axis.
    mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    summed = (last_hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    sentence_embedding = (summed / token_counts).numpy()
    return sentence_embedding

# Candidate sentences to compare the query against.
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A fast brown fox leaps over a sleepy dog.",
    "This sentence is completely different from the others."
]

# One embedding array per candidate, kept as a list and printed as-is.
embeddings = list(map(get_sentence_embedding, texts))
print(embeddings)


# Embed the query sentence the same way.
query_text = "The quick red fox jumps over the lazy dog."
query_embedding = get_sentence_embedding(query_text)

# Stack the candidate embeddings row-wise and score the query against
# every row in a single call.
corpus_matrix = np.vstack(embeddings)
similarities = cosine_similarity(query_embedding, corpus_matrix)

# Report the query, then one similarity score per candidate sentence.
print (f"Query text: {query_text}")

for text, score in zip(texts, similarities[0]):
    print(f"Similarity with '{text}': {score}")