#### Text embeddings represent text as vectors, so we can use metrics such as cosine similarity to measure how close two embeddings are
#### However, the model produces one vector per token, whereas we want one vector per sentence

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

# Three short sentences; the first and third differ by a single word.
sentences = [
    "I took my dog for a walk.",
    "Today is going to rain.",
    "I took my cat for a walk.",
]

# Load the tokenizer and the encoder from the same checkpoint.
model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

# Pad/truncate so all sentences share one tensor; "pt" requests PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    model_output = model(**encoded_input)

print(model_output.keys(), "\n", model_output)
print(f"Token embedding shape: {model_output.last_hidden_state.shape}")
# The printed shape [3, 10, 384] reads as [num_sentences, num_tokens, embed_dim]

  from .autonotebook import tqdm as notebook_tqdm


odict_keys(['last_hidden_state', 'pooler_output']) 
 BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-3.8393e-02, -2.8853e-01,  4.0027e-01,  ...,  3.5841e-01,
          -2.9464e-01, -1.3763e-01],
         [ 1.0028e-01,  6.0142e-01,  6.0682e-01,  ..., -4.2398e-02,
          -1.3764e+00, -8.1835e-01],
         [ 3.7000e-01,  9.1386e-01, -1.4837e-02,  ...,  2.8944e-01,
          -1.0217e+00,  6.4255e-02],
         ...,
         [ 3.6804e-01, -5.6256e-01,  3.8511e-01,  ..., -5.2118e-01,
          -5.6799e-01,  1.6809e-01],
         [ 1.2226e-01,  6.1453e-01,  5.5021e-01,  ...,  5.7623e-01,
          -1.1041e+00,  5.8898e-02],
         [ 1.6646e-01,  4.3578e-01,  7.0070e-01,  ...,  2.9473e-01,
          -5.6696e-01, -2.0016e-01]],

        [[-6.2539e-02,  3.6616e-01,  6.2801e-01,  ...,  1.6939e-01,
          -2.1909e-01,  7.0204e-02],
         [-2.6119e-02,  5.0957e-01,  1.0327e+00,  ..., -8.2783e-01,
          -6.0066e-01,  7.5721e-01],
         [ 3.2615e-02,  1.46

#### A sentence vector can be created with mean pooling: for each sentence, average the embeddings of its tokens (padding positions are excluded via the attention mask)

In [2]:
import torch
import torch.nn.functional as F

# Sanity check: the mask, the token ids and the model output should agree on
# their leading (sentences, tokens) dimensions.
print(
    "attention mask shape:", encoded_input["attention_mask"].shape,
    ", tokenized input shape:", encoded_input["input_ids"].shape,
    ", model output shape:", model_output.last_hidden_state.shape,
)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sentence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state``, a tensor of shape
            (num_sentences, num_tokens, embed_dim).
        attention_mask: Tensor of shape (num_sentences, num_tokens); positions
            with value 0 are excluded from the average.

    Returns:
        Tensor of shape (num_sentences, embed_dim) with one vector per sentence.
        A sentence whose mask is all zeros yields a zero vector instead of NaN.
    """
    token_embeddings = model_output.last_hidden_state
    # Broadcast the mask over the embedding axis and cast it to the embedding
    # dtype, so the token count below is floating point and the 1e-9 floor is
    # applied as a float rather than depending on int-to-float promotion.
    mask = (
        attention_mask.unsqueeze(-1)
        .expand(token_embeddings.shape)
        .to(token_embeddings.dtype)
    )
    summed = torch.sum(token_embeddings * mask, dim=1)
    # Floor the token count to avoid dividing by zero for fully masked rows.
    token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return summed / token_counts

# Collapse the per-token vectors into one vector per sentence.
sentence_embedding = mean_pooling(model_output, encoded_input["attention_mask"])
print(f"sentence embedding shape: {sentence_embedding.shape}")

attention mask shape: torch.Size([3, 10]) , tokenized input shape: torch.Size([3, 10]) , model output shape: torch.Size([3, 10, 384])
sentence embedding shape: torch.Size([3, 384])


#### Calculate the cosine similarity

In [3]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# scikit-learn works on NumPy arrays, so convert the torch tensor first.
sentence_embedding = sentence_embedding.numpy()
# Called with a single matrix, cosine_similarity compares every row with every
# other row, so the whole score matrix comes from one call instead of a Python
# loop over rows. The cast keeps the float64 dtype the score matrix had before.
scores = cosine_similarity(sentence_embedding).astype(np.float64)
print(scores)

[[0.99999976 0.13776848 0.83245271]
 [0.13776848 0.99999994 0.1419073 ]
 [0.83245265 0.1419073  0.99999988]]


#### The same trick can be applied to measure the similarity of a query against a corpus of documents
<div><img src="image/semantic_search.png" width=400 ></div>

In [40]:
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer
import torch

# Work on the first 100 rows of the SQuAD validation split.
squad = load_dataset("squad", split="validation[:100]")
print(squad)
model_ckpt = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# Use the GPU when one is available, otherwise stay on the CPU so this cell
# still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: Text to embed; the callers in this file pass either a single
            string or a list of strings.

    Returns:
        The mean-pooled sentence embeddings as a torch tensor with one row per
        input text, located on the same device as ``model``.
    """
    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors="pt")
    # Follow the model's device instead of assuming CUDA, so inputs and weights
    # always end up on the same device.
    target_device = model.device
    encoded_input = {k: v.to(target_device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output=model_output, attention_mask=encoded_input["attention_mask"])

def _embed_context(example):
    """Return the embedding of one row's context as a NumPy vector."""
    batch = get_embeddings(example["context"]).cpu().numpy()
    # A single context comes back with a leading batch dimension of size 1;
    # index 0 drops it so each row stores a flat vector.
    return {"embeddings": batch[0]}


squad_with_embeddings = squad.map(_embed_context)
print(squad_with_embeddings)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 100
})




Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'embeddings'],
    num_rows: 100
})


In [50]:
import pandas as pd

question = "Who headlined the halftime show for Super Bowl 50?"
# A one-element list gives a query array with a leading batch axis of size 1.
question_embedding = get_embeddings([question]).cpu().detach().numpy()
print(question_embedding.shape)

# get_nearest_examples needs an index attached under the given name; build the
# FAISS index here if no earlier cell has done so.
if "embeddings" not in squad_with_embeddings.list_indexes():
    squad_with_embeddings.add_faiss_index(column="embeddings")

scores, samples = squad_with_embeddings.get_nearest_examples("embeddings", question_embedding, k=30)
# NOTE(review): with the default FAISS index the scores are presumably L2
# distances (smaller means closer) - confirm before ranking on them.
# Several SQuAD questions share one context, so repeated contexts are expected.
pd.DataFrame({"context": samples["context"], "scores": scores})

(1, 384)


Unnamed: 0,context,scores
0,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
1,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
2,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
3,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
4,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
5,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
6,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
7,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
8,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
9,"CBS broadcast Super Bowl 50 in the U.S., and c...",23.663601
