# Retrieval logic discussion

In [None]:
import os
import re
from pathlib import Path
from typing import Dict, List, Union

import polars as pl
import pymupdf
import torch
from dotenv import find_dotenv, load_dotenv
from sentence_transformers import SentenceTransformer, util
from tqdm.auto import tqdm

# Locate a .env file and load its variables into the process environment, so that
# settings read later through os.getenv (e.g. HF_TOKEN) are available.
load_dotenv(find_dotenv())

In [None]:
# Resolve the project root: two levels above this file when executed as a script,
# or the parent of the working directory in a notebook (where __file__ is undefined).
try:
	_this_file = Path(__file__)
except NameError:
	BASE_DIR = Path.cwd().parent
else:
	BASE_DIR = _this_file.resolve().parent.parent

In [None]:
# Location of the PDF that will be parsed, chunked and embedded
pdf_path = BASE_DIR.joinpath("data", "human_nutrition_text.pdf")

In [None]:
# helper function
def text_formatter(text: str) -> str:
	"""Collapse every run of whitespace into one space and trim both ends."""
	return re.sub(r"\s+", " ", text).strip()

In [None]:
def open_and_read_pdf(
	file_path: Union[str, Path], page_offset: int = 41
) -> Union[List[Dict], None]:
	"""
	Opens a pdf file, reads its content page by page, and collects statistics.
	Parameters:
	    file_path (str | Path): The path to the pdf file to be opened and read.
	    page_offset (int): Value subtracted from the zero-based page index to produce
	        the reported "page_number". Defaults to 41 (presumably the number of
	        front-matter pages in the source PDF - confirm against the document).
	Returns:
	    list[dict] | None: One dictionary per non-empty page with the keys
	        "page_number", "page_char_count", "page_word_count",
	        "page_sentence_count_raw", "page_token_count" (estimated as
	        characters / 4) and "text". Returns None when the file exists but
	        reading it fails; the error is printed.
	Raises:
	    FileNotFoundError: If file_path does not exist.
	"""
	if not Path(file_path).exists():
		raise FileNotFoundError(f"PDF file not found: {file_path}")
	try:
		# The context manager closes the document even if a page fails to parse.
		with pymupdf.open(file_path) as doc:
			pages_and_texts = []
			# total= lets the progress bar show how many pages remain
			for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
				text = page.get_text()
				if not text or not text.strip():  # Skip empty pages
					continue
				text = text_formatter(text)
				# Simple sentence splitter: break on runs of ., ! or ? and
				# count only the non-empty pieces
				sentence_count = sum(
					1 for s in re.split(r"[.!?]+", text) if s.strip()
				)
				pages_and_texts.append(
					{
						"page_number": page_number - page_offset,
						"page_char_count": len(text),
						"page_word_count": len(text.split()),
						"page_sentence_count_raw": sentence_count,
						"page_token_count": int(len(text) / 4),
						"text": text,
					}
				)
		return pages_and_texts
	except Exception as e:
		# Best-effort read: report the problem and signal failure with None
		print(f"Error reading PDF file: {e}")
		return None

In [None]:
# Parse the PDF. open_and_read_pdf returns None when reading fails, so only
# preview the first two pages when something was actually extracted.
pages_and_texts = open_and_read_pdf(file_path=pdf_path)
if pages_and_texts:
	print(pages_and_texts[:2])

In [None]:
# Build a blank English spaCy pipeline and add the "sentencizer" component,
# which is used below to split page text into sentences.
from spacy.lang.en import English

nlp = English()
nlp.add_pipe("sentencizer")

In [None]:
# Split every page into sentences with spaCy and record how many were found
for page in tqdm(pages_and_texts):
	# str() turns each spaCy sentence span into plain text
	page_sentences = [str(span) for span in nlp(page["text"]).sents]
	page["sentences"] = page_sentences
	page["page_sentence_count_spacy"] = len(page_sentences)

In [None]:
# now splitting sentences into chunks
num_sentence_chunk_size = 10


def split_sentence_list(input_list: list[str], slice_size: int) -> list[list[str]]:
	"""Break a list of sentences into consecutive groups of slice_size items.
	Args:
	    input_list (list[str]): list of sentences
	    slice_size (int): number of sentences per group
	Returns:
	    list[list[str]]: the groups in their original order; the final group
	        holds whatever is left over and may be shorter
	Example:
	    17 sentences with a slice size of 10 give two groups:
	    1) the first 10 sentences, 2) the remaining 7 sentences
	"""
	groups = []
	for start in range(0, len(input_list), slice_size):
		stop = start + slice_size
		groups.append(input_list[start:stop])
	return groups

In [None]:
# add chunks to pages_and_texts
for page in tqdm(pages_and_texts):
	page_chunks = split_sentence_list(page["sentences"], num_sentence_chunk_size)
	page["sentence_chunks"] = page_chunks
	page["num_chunks"] = len(page_chunks)

In [None]:
# splitting chunks into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
	for sentence_chunk in item["sentence_chunks"]:
		# Join the sentences into one paragraph-like string. The sentence strings
		# come from spaCy spans, whose text has no trailing whitespace, so they
		# must be joined with a space; joining with "" glued the last word of one
		# sentence to the first word of the next.
		joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
		# ".A" -> ". A" for any full-stop/capital letter combo that is still
		# glued together inside the extracted text
		joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk)

		pages_and_chunks.append(
			{
				"page_number": item["page_number"],
				"sentence_chunk": joined_sentence_chunk,
				# Stats
				"chunk_char_count": len(joined_sentence_chunk),
				"chunk_word_count": len(joined_sentence_chunk.split()),
				# rough estimate: about 4 characters per token
				"chunk_token_count": round(len(joined_sentence_chunk) / 4, 2),
			}
		)
print(f"We have {len(pages_and_chunks)} chunks now.")

In [None]:
# converting data structure to dataframe: each chunk dict becomes a row and
# each dict key becomes a column
df = pl.DataFrame(pages_and_chunks)

In [None]:
# Chunks estimated at 30 tokens or fewer carry little information, so keep only
# the longer ones and convert the result back to a list of dicts
min_token_length = 30
is_long_enough = pl.col("chunk_token_count") > min_token_length
pages_and_chunks_over_min_token_len = df.filter(is_long_enough).to_dicts()
pages_and_chunks_over_min_token_len[:2]

In [None]:
# load and setup embedding model; the Hugging Face access token is read from
# the HF_TOKEN environment variable
model = SentenceTransformer("google/embeddinggemma-300m", token=os.getenv("HF_TOKEN"))

In [None]:
# Embed every remaining chunk with the model's document encoder. The result is
# requested as a tensor and then stored on the record as a plain Python list.
for record in tqdm(pages_and_chunks_over_min_token_len):
	chunk_vector = model.encode_document(
		record["sentence_chunk"], convert_to_tensor=True
	)
	record["embedding"] = chunk_vector.tolist()

In [None]:
# Preview the first two records, which now also carry an "embedding" entry
pages_and_chunks_over_min_token_len[:2]

In [None]:
# Rebuild the dataframe from the filtered, embedded records and preview it
df = pl.DataFrame(pages_and_chunks_over_min_token_len)
df.head()

### Let's write code to compute similarity using the dot product, then sort the results by similarity score

In [None]:
# Example search query used for the similarity experiments below
query = "micronutrients functions"

In [None]:
# Embed the query with encode_query so it receives the model's query prompt,
# the counterpart of the encode_document call used for the chunks
query_embedding = model.encode_query(query, convert_to_tensor=True)

In [None]:
# Stack the stored embedding lists into one tensor and place it on the model's
# device; query embeddings produced with convert_to_tensor=True live there, and
# the dot product requires both operands on the same device.
embeddings = torch.tensor(df["embedding"].to_list()).to(model.device)
print(len(embeddings))

In [None]:
# Sanity check: number of entries in the dataframe's "embedding" column
len(df["embedding"])

In [None]:
# Time a dot-product comparison of the query against every chunk embedding
from time import perf_counter as timer

start_time = timer()
# dot_score returns one row of scores per query; [0] selects the row for our single query
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()
print(
	f"Time to get scores on {len(df['embedding'])} embeddings: {end_time - start_time} seconds."
)

In [None]:
# Keep the five highest scores together with their positions in dot_scores
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

In [None]:
# helper function for printing
import textwrap


def print_wrapped(text, wrap_length=80):
	"""Print text re-flowed so that each line is at most wrap_length characters wide."""
	print(textwrap.fill(text, width=wrap_length))

In [None]:
# Show the best matches for the query: score, chunk text and page number
print(f"Query: {query}")
print("Results:")
top_scores, top_indices = top_results_dot_product
separator = "\n" + "-" * 80 + "\n"
for score, idx in zip(top_scores, top_indices):
	row = int(idx)  # tensor index -> plain int for dataframe lookup
	print(f"Score: {score:.4f}")
	print("Text: \n")
	print_wrapped(df["sentence_chunk"][row])
	print(f"Page Number: {df['page_number'][row]}")
	print(separator)

In [None]:
# creating final semantic search pipeline
def retrieve_relevant_resources(
	query: str,
	embeddings: Union[torch.Tensor, list],
	model: SentenceTransformer,
	n_resources_to_return: int,
	print_time: bool = True,
) -> tuple[torch.Tensor, torch.Tensor]:
	"""Embeds the query and retrieves the score and index of the top n_resources_to_return most relevant resources.
	Args:
	    query (str): The query to search for.
	    embeddings (torch.Tensor | list): Embeddings to search for the query in,
	        one row per resource. A list is converted to a tensor.
	    model (SentenceTransformer): The SentenceTransformer model to use for embedding.
	    n_resources_to_return (int): The number of resources to return. Capped at
	        the number of embeddings available.
	    print_time (bool): print the time taken to retrieve the resources.
	Returns:
	    tuple[torch.Tensor, torch.Tensor]: The dot-product scores of the best
	        matches in descending order, and their row indices into embeddings.
	"""
	# Imported locally so the function does not rely on an import made in another cell
	from time import perf_counter as timer

	start_time = timer()
	# encode_query applies the model's query prompt, the counterpart of the
	# encode_document call used when the chunks were embedded
	query_embedding = model.encode_query(query, convert_to_tensor=True)
	if not isinstance(embeddings, torch.Tensor):
		embeddings = torch.tensor(embeddings)
	# Both operands of the dot product must be on one device; moving the single
	# query vector is cheaper than moving the whole embedding matrix
	query_embedding = query_embedding.to(embeddings.device)
	dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
	# torch.topk raises when k exceeds the number of scores, so cap it
	top_k = min(n_resources_to_return, dot_scores.shape[0])
	scores, indices = torch.topk(dot_scores, k=top_k)
	end_time = timer()
	if print_time:
		print(
			f"Time to get scores on {len(embeddings)} embeddings: {end_time - start_time} seconds."
		)
	return scores, indices

In [None]:
def print_top_results_and_scores(
	query: str,
	embeddings: Union[torch.Tensor, list],
	pages_and_chunks: Union[pl.DataFrame, None] = None,
	n_reources_to_return: int = 5,
):
	"""Prints the score, text and page number of the most relevant resources for a query.
	Args:
	    query (str): The query to search for.
	    embeddings (torch.Tensor | list): Embeddings to search, passed on to
	        retrieve_relevant_resources.
	    pages_and_chunks (pl.DataFrame | None): Table with "sentence_chunk" and
	        "page_number" columns whose rows line up with embeddings. When None,
	        the module-level df is used.
	    n_reources_to_return (int): How many results to print. The misspelled
	        name is kept so existing keyword callers keep working.
	Returns:
	    None
	"""
	if pages_and_chunks is None:
		# Resolve df at call time; a default of `df` in the signature would freeze
		# whatever object df was bound to when the function was defined
		pages_and_chunks = df
	scores, indices = retrieve_relevant_resources(
		query=query,
		embeddings=embeddings,
		model=model,
		n_resources_to_return=n_reources_to_return,
	)
	print(f"Query: {query}")
	print("Results: \n")
	for score, idx in zip(scores, indices):
		row = int(idx)  # tensor index -> plain int for dataframe lookup
		print(f"Score: {score:.4f}")
		print("Text: \n")
		print_wrapped(pages_and_chunks["sentence_chunk"][row])
		print(f"Page Number: {pages_and_chunks['page_number'][row]}")
		print("-" * 80)

In [None]:
# Try the retrieval function on a new query; the last line displays the raw
# scores and indices it returned
query = "symptoms of pellagra"
scores, indices = retrieve_relevant_resources(
	query=query, embeddings=embeddings, model=model, n_resources_to_return=5
)
scores, indices

In [None]:
# Print the best matches for the current query using the default settings
print_top_results_and_scores(query=query, embeddings=embeddings)