# Embedding Strategies

In [None]:
from pathlib import Path
import pymupdf
from tqdm.auto import tqdm

from experiments.rag_chunking_strategy_part2 import client

In [None]:
# Resolve the project root as the directory above the one containing this file.
# `__file__` is not defined when the code runs as notebook cells, so the
# NameError branch falls back to the parent of the current working directory.
try:  # inside a script
	BASE_DIR = Path(__file__).resolve().parent.parent
except NameError:  # inside a notebook
	BASE_DIR = Path.cwd().parent

In [None]:
# Echo the resolved root so a wrong working directory is noticed before any file access.
print(f"Project root set to: {BASE_DIR}")

In [None]:
# Location of the PDF that the rest of the notebook reads, under <project root>/data.
pdf_path = BASE_DIR / "data" / "human_nutrition_text.pdf"

In [None]:
import re
from typing import Dict, List, Union

In [None]:
def text_formatter(text: str) -> str:
	"""Collapse every run of whitespace in ``text`` to one space and trim both ends."""
	import re

	# `\s+` also matches newlines and tabs, so line breaks inside the input
	# are flattened into ordinary single spaces.
	return re.sub(r"\s+", " ", text).strip()

In [None]:
def open_and_read_pdf(
	file_path: Union[str, Path], page_offset: int = 41
) -> Union[List[Dict], None]:
	"""
	Opens a pdf file and reads its content page by page, and collects statistics.

	Pages whose extracted text is empty or whitespace-only are skipped.

	Parameters:
	    file_path (str | Path): The path to the pdf file to be opened and read.
	    page_offset (int): Value subtracted from the zero-based page index to
	        produce the reported ``page_number``. Defaults to 41 (presumably the
	        number of front-matter pages in the nutrition PDF - confirm for other files).
	Returns:
	    list[dict] | None: One dictionary per non-empty page with the keys
	        ``page_number``, ``page_char_count``, ``page_word_count``,
	        ``page_sentence_count_raw``, ``page_token_count`` and ``text``.
	        ``None`` is returned when reading the document raises an error
	        (the error is printed).
	Raises:
	    FileNotFoundError: If ``file_path`` does not exist.
	"""
	if not Path(file_path).exists():
		raise FileNotFoundError(f"PDF file not found: {file_path}")
	try:
		pages_and_texts = []
		# The context manager closes the document even when extraction fails midway.
		with pymupdf.open(file_path) as doc:
			for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
				raw_text = page.get_text()
				if not raw_text or not raw_text.strip():  # Skip empty pages
					continue
				text = text_formatter(raw_text)
				# Rough sentence count: split on runs of ., ! or ? and ignore blanks.
				sentence_count = sum(
					1 for s in re.split(r"[.!?]+", text) if s.strip()
				)
				pages_and_texts.append(
					{
						"page_number": page_number - page_offset,
						"page_char_count": len(text),
						"page_word_count": len(text.split()),
						"page_sentence_count_raw": sentence_count,
						# Heuristic estimate: about 4 characters per token.
						"page_token_count": int(len(text) / 4),
						"text": text,
					}
				)
		return pages_and_texts
	except Exception as e:
		# Deliberate best-effort: report the problem and let the caller test for None.
		print(f"Error reading PDF file: {e}")
		return None

In [None]:
# Read the PDF into one record per non-empty page. The reader returns None on
# a read error, hence the truthiness guard before peeking at the first two records.
pages_and_texts = open_and_read_pdf(file_path=pdf_path)
if pages_and_texts:
	print(pages_and_texts[:2])

In [None]:
import polars as pl

# Summary statistics for the page-level records.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
# Round only the numeric columns of the summary; every other column passes through unchanged.
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
summary = summary.with_columns(
	[
		pl.col(name).round(2) if name in numeric_cols else pl.col(name)
		for name in summary.columns
	]
)
summary

## Chunking Preparation
- The first step is to add each page's sentences as a new key-value pair in the `pages_and_texts` data structure.
- Then divide each page's sentences into chunks of at most 10 sentences. For example, a page with 17 sentences yields:
    - **chunk-1**: the first 10 sentences
    - **chunk-2**: the rest of the sentences (7)

In [None]:
from spacy.lang.en import English

# Build an English pipeline and add the "sentencizer" component; the cells
# below rely on it when they read `.sents` from processed text.
nlp = English()
nlp.add_pipe("sentencizer")

In [None]:
# Sanity check: run the pipeline on a short paragraph and list the sentences it finds.
# The paragraph contains 'Dr. Smith' and '3.14', inputs a naive split on "." would break.
test_para = """
Simple string splitting methods are unreliable for sentence segmentation. They often fail on common text elements like abbreviations. For example, 'Dr. Smith' would be incorrectly split. Decimal values such as 3.14 also cause problems. spaCy's sentencizer component solves these issues effectively. It uses a trained model to identify true sentence boundaries. This model correctly handles abbreviations and decimals. It also manages quoted speech and ellipses properly. This provides a robust foundation for further text analysis. Therefore, spaCy offers a significant advantage over basic methods."""
# Strip each detected sentence so the leading newline of the literal does not show up.
list_sentences = [sent.text.strip() for sent in nlp(test_para).sents]
print(f"# of sentences identified: {len(list_sentences)}")
# Number the sentences starting from 1 for readability.
for i, sent in enumerate(list_sentences, 1):
	print(f"{i}:- {sent}")

In [None]:
# running on all pages
for item in tqdm(pages_and_texts):
	# Segment the page text and keep every sentence as a plain string.
	item["sentences"] = [str(sentence) for sentence in nlp(item["text"]).sents]

	# Record how many sentences were found on this page.
	item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
# Summary statistics for the page records, now including the sentence data added above.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
# Round only the numeric columns of the summary; every other column passes through unchanged.
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
summary = summary.with_columns(
	[
		pl.col(name).round(2) if name in numeric_cols else pl.col(name)
		for name in summary.columns
	]
)
summary

In [None]:
import random
import pprint

# Pretty-print one randomly chosen page record.
# `random.choice` draws from the list itself, so it cannot index past the end
# the way a hard-coded inclusive upper bound does when the list is shorter.
if pages_and_texts:
	pprint.pp(random.choice(pages_and_texts))

In [None]:
# now splitting sentences into chunks
# Maximum number of sentences placed in one chunk; passed as `slice_size` below.
num_sentence_chunk_size = 10


def split_sentence_list(input_list: list[str], slice_size: int) -> list[list[str]]:
	"""Split a list of sentences into consecutive chunks of at most ``slice_size`` items.

	Args:
	    input_list (list[str]): list of sentences
	    slice_size (int): maximum number of sentences per chunk; must be positive
	Returns:
	    list[list[str]]: the chunks in their original order. Every chunk holds
	        exactly ``slice_size`` sentences except possibly the last one, which
	        holds the remainder. An empty input yields an empty list.
	Raises:
	    ValueError: if ``slice_size`` is not a positive integer
	Example:
	    An input list of 17 sentences with 10 as slice size returns two lists:
	    1) the first 10 sentences, 2) the remaining 7 sentences.
	    An input list of 25 sentences returns three lists of 10, 10 and 5 sentences.
	"""
	# Reject bad sizes explicitly: a negative step would otherwise silently return
	# no chunks at all, and a zero step fails inside range() with an unrelated message.
	if slice_size <= 0:
		raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
	return [
		input_list[start : start + slice_size]
		for start in range(0, len(input_list), slice_size)
	]

In [None]:
# add chunks to pages_and_texts
for item in tqdm(pages_and_texts):
	# Store the page's sentences grouped into fixed-size chunks ...
	item.update(
		sentence_chunks=split_sentence_list(item["sentences"], num_sentence_chunk_size)
	)
	# ... and how many chunks that produced.
	item["num_chunks"] = len(item["sentence_chunks"])

In [None]:
import random
import pprint

# Pretty-print one randomly chosen page record, now including its chunks.
# `random.choice` draws from the list itself, so it cannot index past the end
# the way a hard-coded inclusive upper bound does when the list is shorter.
if pages_and_texts:
	pprint.pp(random.choice(pages_and_texts))

In [None]:
# Summary statistics for the page records, now including the chunk counts.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
# Round only the numeric columns of the summary; every other column passes through unchanged.
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
summary = summary.with_columns(
	[
		pl.col(name).round(2) if name in numeric_cols else pl.col(name)
		for name in summary.columns
	]
)
summary

## Shifting from pages and texts to pages and chunks
- Currently each item is a page: its text (a group of sentences) plus multiple chunks, with the page as the parent item.
- We now shift to the chunk as the parent item, carrying the page information over as is. (The assumption is that the number of items should at least double, from 1179 to ~2358.)

In [None]:
# splitting chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts):
	for sentence_chunk in item["sentence_chunks"]:
		# Join the sentences with a space to make a paragraph-like structure.
		# The sentence strings appear to carry no trailing whitespace (spaCy span
		# text - confirm), so an empty separator would glue neighbours together
		# ("...done?Next", '...end."Then'), which the regex below cannot repair.
		joined_sentence_chunk = " ".join(sentence_chunk).replace("  ", " ").strip()
		joined_sentence_chunk = re.sub(
			r"\.([A-Z])", r". \1", joined_sentence_chunk
		)  # ".A" -> ". A" for any remaining full-stop/capital letter combo

		# One record per chunk: parent page, text, then size statistics.
		chunk_dict = {
			"page_number": item["page_number"],
			"sentence_chunk": joined_sentence_chunk,
			"chunk_char_count": len(joined_sentence_chunk),
			"chunk_word_count": len(joined_sentence_chunk.split()),
			# Heuristic estimate: about 4 characters per token.
			"chunk_token_count": round(len(joined_sentence_chunk) / 4, 2),
		}
		pages_and_chunks.append(chunk_dict)
print(f"We have {len(pages_and_chunks)} chunks now.")

In [None]:
# Display one randomly chosen chunk record (returned as a one-element list).
random.sample(pages_and_chunks, k=1)

In [None]:
# Summary statistics for the chunk-level records.
df = pl.DataFrame(pages_and_chunks)
summary = df.describe()
# Round only the numeric columns of the summary; every other column passes through unchanged.
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
summary = summary.with_columns(
	[
		pl.col(name).round(2) if name in numeric_cols else pl.col(name)
		for name in summary.columns
	]
)
summary

In [None]:
# Chunks with an estimated token count of 30 or less are removed to make the data a bit cleaner.
# Such chunks are usually related to footnotes, which can be seen in the sample printed below.
min_token_length = 30
sample_df = df.filter(pl.col("chunk_token_count") <= min_token_length)
# Cap the sample size at the number of matching rows: asking for more rows than
# exist (without replacement) raises instead of returning what is available.
sample_df = sample_df.sample(min(10, sample_df.height))
for row in sample_df.iter_rows(named=True):
	print(
		f"Chunk Token Count: {row['chunk_token_count']} - Chunk: {row['sentence_chunk']}"
	)

In [None]:
# As seen above, chunks at or below the 30-token estimate provide little information,
# so keep only the chunks strictly above the threshold and convert the result back
# to a list of dicts for the embedding steps.
pages_and_chunks_over_min_token_len = df.filter(
	pl.col("chunk_token_count") > min_token_length
).to_dicts()
# Peek at the first two surviving records.
pages_and_chunks_over_min_token_len[:2]

In [None]:
# Summary statistics for the chunks that passed the minimum-token filter.
df = pl.DataFrame(pages_and_chunks_over_min_token_len)
summary = df.describe()
# Round only the numeric columns of the summary; every other column passes through unchanged.
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
summary = summary.with_columns(
	[
		pl.col(name).round(2) if name in numeric_cols else pl.col(name)
		for name in summary.columns
	]
)
summary

In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Load settings from a located .env file; the cells below read HF_TOKEN and
# OPENAI_API_KEY through os.getenv.
load_dotenv(find_dotenv())

In [None]:
from sentence_transformers import SentenceTransformer

# Load the EmbeddingGemma model, passing the Hugging Face token from the environment.
model = SentenceTransformer("google/embeddinggemma-300m", token=os.getenv("HF_TOKEN"))

# Small smoke test: embed four short documents and inspect the result shape.
documents = [
	"Venus is often called Earth's twin because of its similar size and proximity.",
	"Mars, known for its reddish appearance, is often referred to as the Red Planet.",
	"Jupiter, the largest planet in our solar system, has a prominent red spot.",
	"Saturn, famous for its rings, is sometimes mistaken for the Red Planet.",
]
document_embeddings = model.encode_document(documents)
print(document_embeddings.shape)
# Pair each document string with its embedding for lookup by text.
embedding_dict = dict(zip(documents, document_embeddings))

# for document, embedding in embedding_dict.items():
#     print(f"Document: {document}")
#     print(f"Embedding: {embedding}")
#     print("---")

In [None]:
%%time
# Timing run: embed the chunks one at a time and store each vector on its
# record under "embedding" (overwritten by the later timing cells).
for item in tqdm(pages_and_chunks_over_min_token_len):
	item["embedding"] = model.encode_document(
		item["sentence_chunk"], normalize_embeddings=True
	)

In [None]:
%%time
# Timing run on CPU: move the model to the CPU, then embed chunk by chunk.
# NOTE(review): unlike the neighbouring cells this call does not pass
# normalize_embeddings=True - confirm whether that difference is intended.
model.to("cpu")
for item in tqdm(pages_and_chunks_over_min_token_len):
	item["embedding"] = model.encode_document(item["sentence_chunk"])

In [None]:
%%time
# Same per-chunk loop as the first timing cell; it runs after `model.to("cpu")`,
# so this presumably times the normalized variant on CPU - confirm.
for item in tqdm(pages_and_chunks_over_min_token_len):
	item["embedding"] = model.encode_document(
		item["sentence_chunk"], normalize_embeddings=True
	)

In [None]:
from openai import OpenAI

# OpenAI client for this notebook, authenticated with the key from the environment.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def get_embedding(text: str, model: str = "text-embedding-3-small") -> list[float]:
	"""Return the OpenAI embedding vector for a single piece of text.

	Args:
	    text (str): text to embed; newlines are replaced with spaces before sending
	    model (str): name of the OpenAI embedding model to use
	Returns:
	    list[float]: the embedding of ``text`` (first and only item of the response)
	"""
	text = text.replace("\n", " ")
	# Use the client created above; the function previously called `client`, an
	# object imported from another experiment module, leaving `openai_client` unused.
	return openai_client.embeddings.create(input=[text], model=model).data[0].embedding

In [None]:
%%time
# Timing run: one embeddings request per chunk through `get_embedding`,
# overwriting the "embedding" value stored by the earlier cells.
for item in tqdm(pages_and_chunks_over_min_token_len):
	item["embedding"] = get_embedding(item["sentence_chunk"])

In [None]:
# Rebind `model` to the Qwen3 embedding model and repeat the four-document smoke test.
model = SentenceTransformer("Qwen/Qwen3-Embedding-8B", token=os.getenv("HF_TOKEN"))
documents = [
	"Venus is often called Earth's twin because of its similar size and proximity.",
	"Mars, known for its reddish appearance, is often referred to as the Red Planet.",
	"Jupiter, the largest planet in our solar system, has a prominent red spot.",
	"Saturn, famous for its rings, is sometimes mistaken for the Red Planet.",
]
document_embeddings = model.encode_document(documents)
print(document_embeddings.shape)
# Pair each document string with its embedding for lookup by text.
embedding_dict = dict(zip(documents, document_embeddings))

In [None]:
%%time
# Switch `model` back to EmbeddingGemma and embed chunk by chunk again.
# The model load sits inside the timed cell, so its cost is part of the measurement.
model = SentenceTransformer("google/embeddinggemma-300m", token=os.getenv("HF_TOKEN"))
for item in tqdm(pages_and_chunks_over_min_token_len):
	item["embedding"] = model.encode_document(
		item["sentence_chunk"], normalize_embeddings=True
	)

In [None]:
# Collect just the chunk texts into a flat list so they can be embedded in batches below.
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [None]:
%%time
# Timing run: embed all chunk texts in one batched call (batches of 50) using the generic `encode`.
text_chunk_embeddings = model.encode(
	text_chunks, batch_size=50, normalize_embeddings=True, show_progress_bar=True
)

In [None]:
%%time
# Timing run: the same batched embedding through `encode_document`, with batches of 32.
# This overwrites the result of the previous cell.
text_chunk_embeddings = model.encode_document(
	text_chunks, batch_size=32, normalize_embeddings=True, show_progress_bar=True
)