# Retrieval logic discussion

In [None]:
import os
from pathlib import Path

import polars as pl
import torch
from dotenv import find_dotenv, load_dotenv
from sentence_transformers import SentenceTransformer, util

load_dotenv(find_dotenv())

In [None]:
# Resolve the project root: two levels above this file when run as a script,
# or the parent of the working directory when run from a notebook.
if "__file__" in globals():  # inside a script
	BASE_DIR = Path(__file__).resolve().parent.parent
else:  # inside a notebook, where __file__ is not defined
	BASE_DIR = Path.cwd().parent

In [None]:
# Path to the parquet file that stores the chunked text and its embeddings
dataframe_path = BASE_DIR.joinpath("data", "text_chunk_and_embeddings_df.parquet")
# Alias kept from the original cell; despite its name it refers to the parquet file
pdf_path = dataframe_path

In [None]:
# Read the parquet file at dataframe_path back into a polars DataFrame
# for further processing in the cells below
text_chunks_and_embedding_df_load = pl.read_parquet(dataframe_path)

In [None]:
# Load and set up the embedding model; the Hugging Face access token is read
# from the HF_TOKEN environment variable
model = SentenceTransformer("google/embeddinggemma-300m", token=os.getenv("HF_TOKEN"))

### Let's write code for similarity search based on the dot product, then sort the results by similarity score

In [None]:
# Example search query used by the cells below
query = "macronutrients functions"

In [None]:
# Embed the query with normalization requested, so the dot product against the
# stored embeddings behaves like cosine similarity
# (assumes the stored embeddings were normalized as well -- TODO confirm)
query_embedding = model.encode(query, normalize_embeddings=True)

In [None]:
# Preview the first rows of the loaded DataFrame
text_chunks_and_embedding_df_load.head()

In [None]:
# Summary statistics for the loaded DataFrame, with every numeric column of the
# summary rounded to two decimals for readability
summary = text_chunks_and_embedding_df_load.describe()
numeric_cols = [
	name for name, dtype in summary.schema.items() if dtype.is_numeric()
]
summary = summary.with_columns(
	[
		pl.col(name).round(2) if name in numeric_cols else pl.col(name)
		for name in summary.columns
	]
)
summary

In [None]:
# Extract the "embedding" column as a plain Python list and report its length
embeddings = text_chunks_and_embedding_df_load["embedding"].to_list()
print(len(embeddings))

In [None]:
from time import perf_counter as timer

# Time a single dot-product scoring pass of the query against all embeddings
n_embeddings = len(text_chunks_and_embedding_df_load["embedding"])

start_time = timer()
# dot_score returns one row of scores per query; take the row for our query
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(
	f"Time to get scores on {n_embeddings} embeddings: {end_time - start_time} seconds."
)

In [None]:
# Keep the 5 highest dot-product scores together with their indices
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
# helper function for printing
import textwrap


def print_wrapped(text, wrap_length=80):
	"""Print ``text`` wrapped so that no line is longer than ``wrap_length`` characters."""
	print(textwrap.fill(text, width=wrap_length))

In [None]:
# Show the query followed by each top hit: score, wrapped chunk text and page number
separator = "\n" + "-" * 80 + "\n"
top_scores, top_indices = top_results_dot_product

print(f"Query: {query}")
print("Results: \n")
for score, idx in zip(top_scores.numpy(), top_indices.numpy()):
	row = int(idx)  # numpy integer -> plain int for DataFrame indexing
	chunk_text = text_chunks_and_embedding_df_load["sentence_chunk"][row]
	page_number = text_chunks_and_embedding_df_load["page_number"][row]
	print(f"Score: {score:.4f}")
	print("Text: \n")
	print_wrapped(chunk_text)
	print(f"Page Number: {page_number}")
	print(separator)

## The values are low here because we are using normalized vectors, which removes the magnitude component. Magnitude is not used in the cosine similarity score, but with unnormalized vectors it inflates the raw dot-product score.

In [None]:
# creating final semantic search pipeline
def retrieve_relevant_resources(
	query: str,
	embeddings: list,
	model: SentenceTransformer,
	n_resources_to_return: int,
	print_time: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
	"""Embed the query and return the scores and indices of the most similar embeddings.

	Similarity is the dot product between the normalized query embedding and
	each entry of ``embeddings``.

	Args:
	    query (str): The query to search for.
	    embeddings (list): Embeddings to search for the query in.
	    model (SentenceTransformer): The model used to embed the query.
	    n_resources_to_return (int): The maximum number of resources to return.
	        Capped at the number of available embeddings.
	    print_time (bool): Print the time taken to retrieve the resources.
	Returns:
	    tuple[torch.Tensor, torch.Tensor]: ``(scores, indices)`` as produced by
	    ``torch.topk``: the highest dot-product scores and the positions of
	    the matching entries in ``embeddings``.
	"""
	# Imported locally so the function does not rely on an alias created in
	# another notebook cell.
	from time import perf_counter

	start_time = perf_counter()
	query_embedding = model.encode(query, normalize_embeddings=True)
	# dot_score returns one row of scores per query; take the row for our query.
	dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
	# torch.topk raises if k exceeds the number of candidates, so cap it.
	k = min(n_resources_to_return, dot_scores.shape[0])
	scores, indices = torch.topk(dot_scores, k=k)
	end_time = perf_counter()
	if print_time:
		print(
			f"Time to get scores on {len(embeddings)} embeddings: {end_time - start_time} seconds."
		)
	return scores, indices

In [None]:
def print_top_results_and_scores(
	query: str,
	embeddings: list,
	pages_and_chunks: pl.DataFrame = text_chunks_and_embedding_df_load,
	n_reources_to_return: int = 5,
) -> None:
	"""Print the score, text and page number of the resources most relevant to ``query``.

	Args:
	    query (str): The query to search for.
	    embeddings (list): Embeddings to search; positions must line up with
	        the rows of ``pages_and_chunks``.
	    pages_and_chunks (pl.DataFrame): Frame providing the "sentence_chunk"
	        and "page_number" columns.
	    n_reources_to_return (int): Number of results to print. The misspelled
	        name is kept so existing keyword callers keep working.
	"""
	scores, indices = retrieve_relevant_resources(
		query=query,
		embeddings=embeddings,
		model=model,
		n_resources_to_return=n_reources_to_return,
	)
	# Look each column up once instead of on every loop iteration.
	chunk_texts = pages_and_chunks["sentence_chunk"]
	page_numbers = pages_and_chunks["page_number"]

	print(f"Query: {query}")
	print("Results: \n")
	# .tolist() yields plain Python floats/ints and, unlike .numpy(), also works
	# for tensors that live on a non-CPU device or track gradients.
	for score, idx in zip(scores.tolist(), indices.tolist()):
		print(f"Score: {score:.4f}")
		print("Text: \n")
		print_wrapped(chunk_texts[idx])
		print(f"Page Number: {page_numbers[idx]}")
		print("-" * 80)

In [None]:
# Run the retrieval pipeline for a new query and show the top-5 scores and
# their indices into `embeddings`
query = "symptoms of pellagra"
scores, indices = retrieve_relevant_resources(
	query=query, embeddings=embeddings, model=model, n_resources_to_return=5
)
scores, indices

In [None]:
# Print the matching chunks and page numbers for the current query
print_top_results_and_scores(query=query, embeddings=embeddings)