In [29]:
import torch
from transformers import AutoTokenizer, AutoModel

In [30]:
# Five example sentences (three about Europe, two about Africa) that are
# tokenized and embedded as one batch below. The strings are model input and
# are kept exactly as written.
sentences = [
    "Europe is the richest continent in the world",
    "Weather of Europe is very cold.",
    "Europe is the hub of global business, resulting trades and commerce flourish in Europe.",
    "Africa the poorest continent of all.",
    "Due to the presence of deserts, Africa's economic condition is very shubby."
]

In [31]:
# Checkpoint identifier passed to both AutoTokenizer and AutoModel below.
model_checkpoint = "sentence-transformers/all-MiniLM-L6-v2"

In [32]:
# Load the tokenizer for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [33]:
# Load the model from the same checkpoint as the tokenizer.
model = AutoModel.from_pretrained(model_checkpoint)

In [34]:
# Tokenize every sentence in a single call, with padding and truncation
# enabled, and request PyTorch tensors ("pt") as the return type.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [35]:
# Run the forward pass with gradient tracking disabled.
with torch.no_grad():
    model_output = model(**encoded_input)

In [36]:
# Per-token embeddings taken from the model's last hidden state.
token_embeddings = model_output.last_hidden_state
token_embeddings.shape

torch.Size([5, 19, 384])

In [37]:
import torch
import torch.nn.functional as f

In [38]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions kept by the attention mask.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor.
        attention_mask: Tensor that, after a trailing dimension is added, is
            expanded to the size of ``last_hidden_state``; positions with 0
            are excluded from the average.

    Returns:
        Tensor obtained by summing the masked embeddings over dimension 1 and
        dividing by the per-row mask count.
    """
    hidden_states = model_output.last_hidden_state
    # Broadcast the mask across the embedding dimension so it can be
    # multiplied element-wise with the hidden states.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the denominator so a fully masked row divides by 1e-9, not 0.
    mask_count = torch.clamp(mask.sum(1), min=1e-9)
    return masked_sum / mask_count
    

In [39]:
# Reproduce the mask expansion done inside mean_pooling to inspect its shape.
input_mask_expanded = (
    encoded_input["attention_mask"]
    .unsqueeze(-1)
    .expand(token_embeddings.shape)
)
input_mask_expanded.shape

torch.Size([5, 19, 384])

In [40]:
# Shape of the raw attention mask, before any extra dimension is added.
encoded_input["attention_mask"].shape

torch.Size([5, 19])

In [41]:
# Shape of the attention mask after unsqueeze(-1) appends a trailing dimension.
encoded_input["attention_mask"].unsqueeze(-1).shape

torch.Size([5, 19, 1])

In [42]:
# Shape of the token embeddings, the target size for the mask expansion.
token_embeddings.shape

torch.Size([5, 19, 384])

In [43]:
# Pool the per-token embeddings into one vector per sentence.
sentence_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])

In [44]:
# L2-normalize (p=2) each sentence vector along dimension 1.
sentence_embeddings = f.normalize(sentence_embeddings, p=2, dim=1)

In [45]:
# Report the size of the sentence-embedding tensor.
print(f"Shape of the sentence embeddings are :{sentence_embeddings.size()}")

Shape of the sentence embeddings are :torch.Size([5, 384])


In [46]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [47]:
# Convert the sentence embeddings to a NumPy array for scikit-learn.
# The isinstance guard keeps this cell re-runnable: once the name has been
# rebound to an ndarray, calling .detach() on it again would raise.
# .cpu() is a no-op for CPU tensors and makes the conversion device-safe.
if isinstance(sentence_embeddings, torch.Tensor):
    sentence_embeddings = sentence_embeddings.detach().cpu().numpy()

In [48]:
# Number of rows in the embedding matrix.
len(sentence_embeddings)

5

In [49]:
# Pre-allocate a square matrix with one row and one column per sentence;
# a later cell fills it with similarity values.
scores = np.zeros((len(sentence_embeddings), len(sentence_embeddings)))
scores

array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [50]:
# cosine_similarity called with a single matrix returns the full pairwise
# similarity matrix, so one call replaces a Python loop over rows.
# Assigning through scores[:] fills the pre-allocated array in place and
# keeps its existing dtype.
scores[:] = cosine_similarity(sentence_embeddings)

In [51]:
# Entry [i, j] is the cosine similarity between sentence i and sentence j.
scores

array([[1.        , 0.41478124, 0.66580451, 0.58141041, 0.39146572],
       [0.41478124, 1.00000024, 0.34884676, 0.22602443, 0.25337166],
       [0.66580451, 0.34884676, 1.        , 0.22559901, 0.22889572],
       [0.58141041, 0.22602443, 0.22559901, 1.00000012, 0.60745049],
       [0.39146575, 0.25337166, 0.22889572, 0.60745049, 0.99999988]])

In [52]:
from datasets import load_dataset

In [53]:
# Load the "validation[:100]" slice of the "squad" dataset.
squad = load_dataset("squad", split="validation[:100]")

In [54]:
def get_embeddings(text_lists):
    """Embed text with the notebook-level ``tokenizer`` and ``model``.

    The input is tokenized with padding and truncation, run through the model
    without gradient tracking, and reduced with ``mean_pooling``. The result
    is not L2-normalized.

    Args:
        text_lists: Text input forwarded unchanged to ``tokenizer``.

    Returns:
        The tensor returned by ``mean_pooling``, located on the model's device.
    """
    encoded_input = tokenizer(text_lists, padding=True, truncation=True, return_tensors="pt")
    # Move inputs to whatever device the model currently lives on instead of
    # hard-coding "cpu", so the function keeps working if the model is moved.
    encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
    with torch.no_grad():
        model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input["attention_mask"])

In [55]:
# Add an "embeddings" column computed from each example's "context" field.
# map is called without batched=True, so the lambda presumably receives one
# example at a time; [0] takes the first row of the returned embeddings.
squad_with_embeddings = squad.map(
    lambda example: {
        "embeddings": get_embeddings(example["context"]).cpu().numpy()[0]
    }
)

  0%|          | 0/100 [00:00<?, ?ex/s]

In [59]:
from datasets import faiss

In [60]:
# Build a FAISS index over the stored "embeddings" column.
squad_with_embeddings.add_faiss_index(column="embeddings")

# Embed the query with the same function used for the contexts; it is passed
# as a one-element list and the result is converted to a NumPy array.
question = "What is the capital of Bangladesh?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape


In [None]:
# Retrieve the 3 indexed examples closest to the question embedding.
# NOTE: this rebinds `scores`, which earlier held the cosine-similarity matrix.
scores, samples = squad_with_embeddings.get_nearest_examples(
    "embeddings", question_embedding, k=3
)