In [1]:
from sentence_transformers import util, SentenceTransformer

embedding_model = SentenceTransformer('all-mpnet-base-v2', device='cpu')


  from tqdm.autonotebook import tqdm, trange


In [6]:
import json
import torch
import numpy as np

with open("../data/chunks_with_embeddings.json", "r") as f:
    chunks = json.load(f)

embeddings = torch.tensor([np.array(chunk['embedding']) for chunk in chunks], dtype=torch.float32)
embeddings[0]

  embeddings = torch.tensor([np.array(chunk['embedding']) for chunk in chunks], dtype=torch.float32)


tensor([ 6.7424e-02,  9.0228e-02, -5.0955e-03, -3.1755e-02,  7.3908e-02,
         3.5198e-02, -1.9799e-02,  4.6769e-02,  5.3573e-02,  5.0123e-03,
         3.3393e-02, -1.6223e-03,  1.7608e-02,  3.6265e-02, -3.1675e-04,
        -1.0712e-02,  1.5426e-02,  2.6218e-02,  2.7766e-03,  3.6494e-02,
        -4.4411e-02,  1.8936e-02,  4.9012e-02,  1.6402e-02, -4.8578e-02,
         3.1828e-03,  2.7299e-02, -2.0475e-03, -1.2283e-02, -7.2805e-02,
         1.2045e-02,  1.0730e-02,  2.1000e-03, -8.1777e-02,  2.6783e-06,
        -1.8143e-02, -1.2080e-02,  2.4717e-02, -6.2747e-02,  7.3544e-02,
         2.2162e-02, -3.2877e-02, -1.8010e-02,  2.2295e-02,  5.6137e-02,
         1.7951e-03,  5.2593e-02, -3.3174e-03, -8.3388e-03, -1.0628e-02,
         2.3192e-03, -2.2393e-02, -1.5301e-02, -9.9305e-03,  4.6532e-02,
         3.5747e-02, -2.5476e-02,  2.6369e-02,  3.7491e-03, -3.8268e-02,
         2.5833e-02,  4.1287e-02,  2.5818e-02,  3.3297e-02, -2.5178e-02,
         4.5152e-02,  4.4900e-04, -9.9662e-02,  4.9

In [7]:
query = "macronutrients"

query_embedding = embedding_model.encode(query, convert_to_tensor=True)

dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
top_results = torch.topk(dot_scores, k=5)
top_results

torch.return_types.topk(
values=tensor([0.7376, 0.6542, 0.6380, 0.6179, 0.6140]),
indices=tensor([  43,   48,   47, 1038,   57]))

In [14]:
import textwrap

def print_chunk(text, score, wrap_length=80):
    print(f"Score: {score:.2f}")
    print(textwrap.fill(text, width=wrap_length))
    print()

In [16]:

for idx, score in zip(top_results.indices, top_results.values):
    print_chunk(chunks[idx]['chunk'], score.item())

Score: 0.74
Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called
macronutrients. There are three classes of macronutrients:  carbohydrates,
lipids, and proteins. These can be metabolically  processed into cellular
energy. The energy from macronutrients  comes from their chemical bonds. This
chemical energy is  converted into cellular energy that is then utilized to
perform work,  allowing our bodies to conduct their basic functions. A unit of
measurement of food energy is the calorie. On nutrition food labels  the amount
given for “calories” is actually equivalent to each calorie  multiplied by one
thousand. A kilocalorie (one thousand calories,  denoted with a small “c”) is
synonymous with the “Calorie” (with a  capital “C”) on nutrition food labels.
Water is also a macronutrient in  the sense that you require a large amount of
it, but unlike the other  macronutrients, it does not yield calories.
Carbohydrates  Carbohydrates are molecules composed of carbon, h

## Similarity Search

In [29]:
import torch

def cosine_similarity(vec1, vec2):
    dot_product = torch.matmul(vec1, vec2.T).squeeze()
    norm1 = torch.sqrt(torch.sum(vec1 ** 2))
    norm2 = torch.sqrt(torch.sum(vec2 ** 2, dim=1))
    return dot_product / (norm1 * norm2)

sim_score = cosine_similarity(query_embedding, embeddings)
sim_score

tensor([0.4814, 0.4655, 0.4039,  ..., 0.3932, 0.3636, 0.3879])

In [36]:
def retrieve_chunks(query, embeddings, chunks, top_k):
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    #cosine similarity
    sim_score = cosine_similarity(query_embedding, embeddings)
    top_results = torch.topk(sim_score, k=top_k)
    top_chunks = []
    for idx, score in zip(top_results.indices, top_results.values):
        chunks[idx]['score'] = score.item()
        top_chunks.append(chunks[idx])
    
    return top_chunks

top_results = retrieve_chunks("macronutrients", embeddings, chunks, 5)
top_results

[{'page_number': 5,
  'chunk': 'Macronutrients  Nutrients  that  are  needed  in  large  amounts  are  called  macronutrients. There are three classes of macronutrients:  carbohydrates, lipids, and proteins. These can be metabolically  processed into cellular energy. The energy from macronutrients  comes from their chemical bonds. This chemical energy is  converted into cellular energy that is then utilized to perform work,  allowing our bodies to conduct their basic functions. A unit of  measurement of food energy is the calorie. On nutrition food labels  the amount given for “calories” is actually equivalent to each calorie  multiplied by one thousand. A kilocalorie (one thousand calories,  denoted with a small “c”) is synonymous with the “Calorie” (with a  capital “C”) on nutrition food labels. Water is also a macronutrient in  the sense that you require a large amount of it, but unlike the other  macronutrients, it does not yield calories.  Carbohydrates  Carbohydrates are molecule