# Embedding Strategies

In [1]:
from pathlib import Path
import pymupdf
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the project root. A script defines __file__ (root is two levels above
# the file); a notebook kernel does not (root is the parent of the working dir).
if "__file__" in globals():  # inside a script
	BASE_DIR = Path(__file__).resolve().parent.parent
else:  # inside a notebook
	BASE_DIR = Path.cwd().parent

In [3]:
# Show the resolved project root for a quick visual check.
print(f"Project root set to: {BASE_DIR}")

Project root set to: /Users/tejaspancholi/Developer/python/vizuara


In [4]:
# Path of the PDF to process, built relative to the project root.
pdf_path = BASE_DIR / "data" / "human_nutrition_text.pdf"

In [5]:
import re
from typing import Dict, List, Union

In [6]:
def text_formatter(text: str) -> str:
	"""Normalize whitespace in a piece of text.

	Every run of whitespace (spaces, tabs, newlines, ...) is collapsed into a
	single space, and leading/trailing whitespace is removed.
	"""
	import re

	whitespace_run = r"\s+"
	return re.sub(whitespace_run, " ", text).strip()

In [7]:
def open_and_read_pdf(
	file_path: Union[str, Path], page_offset: int = 41
) -> Union[List[Dict], None]:
	"""
	Opens a pdf file and reads its content page by page, and collects statistics.

	Pages whose extracted text is empty or whitespace-only are skipped.

	Parameters:
	    file_path (str | Path): The path to the pdf file to be opened and read.
	    page_offset (int): Value subtracted from the zero-based PDF page index to
	        produce the reported ``page_number``. Defaults to 41, the offset this
	        notebook has always applied (NOTE(review): it appears to align
	        ``page_number`` with the page numbers printed in the book - confirm
	        before reusing with another PDF; pass 0 for raw indices).
	Returns:
	    list[dict] | None: One dictionary per non-empty page with the keys
	        ``page_number``, ``page_char_count``, ``page_word_count``,
	        ``page_sentence_count_raw``, ``page_token_count`` and ``text``.
	        ``None`` is returned (after printing the error) if reading fails.
	Raises:
	    FileNotFoundError: If ``file_path`` does not exist.
	"""
	if not Path(file_path).exists():
		raise FileNotFoundError(f"PDF file not found: {file_path}")
	try:
		# The context manager closes the document even if extraction fails midway.
		with pymupdf.open(file_path) as doc:
			pages_and_texts = []
			# Passing the page count lets tqdm render a real progress bar.
			for page_number, page in tqdm(enumerate(doc), total=doc.page_count):
				text = page.get_text()
				if not text or not text.strip():  # Skip empty pages
					continue
				text = text_formatter(text)
				# Simple sentence splitter: break on runs of ., ! or ?
				sentences = re.split(r"[.!?]+", text)
				# Count non-empty sentences
				sentence_count = len([s for s in sentences if s.strip()])
				pages_and_texts.append(
					{
						"page_number": page_number - page_offset,
						"page_char_count": len(text),
						"page_word_count": len(text.split()),
						"page_sentence_count_raw": sentence_count,
						# Token count is approximated as character count / 4.
						"page_token_count": int(len(text) / 4),
						"text": text,
					}
				)
		return pages_and_texts
	except Exception as e:
		# Best-effort behaviour kept from the original: report and signal failure.
		print(f"Error reading PDF file: {e}")
		return None

In [8]:
# Extract every page of the PDF, then peek at the first two records.
pages_and_texts = open_and_read_pdf(file_path=pdf_path)
if pages_and_texts:  # None (read error) or an empty list prints nothing
	print(pages_and_texts[:2])

1208it [00:00, 1247.00it/s]

[{'page_number': -41, 'page_char_count': 29, 'page_word_count': 4, 'page_sentence_count_raw': 1, 'page_token_count': 7, 'text': 'Human Nutrition: 2020 Edition'}, {'page_number': -39, 'page_char_count': 308, 'page_word_count': 42, 'page_sentence_count_raw': 1, 'page_token_count': 77, 'text': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE'}]





In [9]:
import polars as pl

# Summary statistics of the per-page records; numeric columns rounded to 2 decimals.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
# with_columns replaces the named columns in place and leaves the rest untouched.
summary = summary.with_columns([pl.col(name).round(2) for name in numeric_cols])
summary

statistic,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
str,f64,f64,f64,f64,f64,str
"""count""",1179.0,1179.0,1179.0,1179.0,1179.0,"""1179"""
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",561.27,1148.06,176.2,14.81,286.62,
"""std""",348.9,529.51,83.18,9.41,132.39,
"""min""",-41.0,15.0,3.0,1.0,3.0,"""(Source: UNICEF, 1986, How to …"
"""25%""",259.0,764.0,117.0,9.0,191.0,
"""50%""",561.0,1207.0,187.0,13.0,301.0,
"""75%""",862.0,1577.0,240.0,20.0,394.0,
"""max""",1166.0,2271.0,393.0,82.0,567.0,"""• food insecure with severe hu…"


## Chunking Preparation 
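- The first step is to add each page's sentences as a new key-value pair in the `pages_and_texts` data structure.
- Then divide each page's sentences into consecutive chunks of at most 10 sentences:
    - **chunk-1**: the first 10 sentences
    - **chunk-2**: the next 10 sentences (or whatever remains), and so on until the page's sentences are used up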

In [10]:
from spacy.lang.en import English

# Blank English pipeline with only spaCy's rule-based "sentencizer" component
# added; this is used below purely to split text into sentences.
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x14c21acd0>

In [11]:
# Smoke test: check that the pipeline splits a known paragraph into sentences.
test_para = """
Simple string splitting methods are unreliable for sentence segmentation. They often fail on common text elements like abbreviations. For example, 'Dr. Smith' would be incorrectly split. Decimal values such as 3.14 also cause problems. spaCy's sentencizer component solves these issues effectively. It uses a trained model to identify true sentence boundaries. This model correctly handles abbreviations and decimals. It also manages quoted speech and ellipses properly. This provides a robust foundation for further text analysis. Therefore, spaCy offers a significant advantage over basic methods."""
sentence_spans = nlp(test_para).sents
list_sentences = [span.text.strip() for span in sentence_spans]
print(f"# of sentences identified: {len(list_sentences)}")
# Number the sentences starting from 1 for readability.
for i, sent in enumerate(list_sentences, start=1):
	print(f"{i}:- {sent}")

# of sentences identified: 10
1:- Simple string splitting methods are unreliable for sentence segmentation.
2:- They often fail on common text elements like abbreviations.
3:- For example, 'Dr. Smith' would be incorrectly split.
4:- Decimal values such as 3.14 also cause problems.
5:- spaCy's sentencizer component solves these issues effectively.
6:- It uses a trained model to identify true sentence boundaries.
7:- This model correctly handles abbreviations and decimals.
8:- It also manages quoted speech and ellipses properly.
9:- This provides a robust foundation for further text analysis.
10:- Therefore, spaCy offers a significant advantage over basic methods.


In [12]:
# Segment every page's text into sentences and record how many were found.
for item in tqdm(pages_and_texts):
	# Store plain strings rather than spaCy Span objects.
	item["sentences"] = [str(span) for span in nlp(item["text"]).sents]
	item["page_sentence_count_spacy"] = len(item["sentences"])

100%|██████████| 1179/1179 [00:00<00:00, 2831.33it/s]


In [13]:
# Summary statistics again, now including the spaCy sentence counts;
# numeric columns rounded to 2 decimals.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
# with_columns replaces the named columns in place and leaves the rest untouched.
summary = summary.with_columns([pl.col(name).round(2) for name in numeric_cols])
summary

statistic,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy
str,f64,f64,f64,f64,f64,str,f64,f64
"""count""",1179.0,1179.0,1179.0,1179.0,1179.0,"""1179""",1179.0,1179.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0""",0.0,0.0
"""mean""",561.27,1148.06,176.2,14.81,286.62,,,10.57
"""std""",348.9,529.51,83.18,9.41,132.39,,,6.16
"""min""",-41.0,15.0,3.0,1.0,3.0,"""(Source: UNICEF, 1986, How to …",,1.0
"""25%""",259.0,764.0,117.0,9.0,191.0,,,5.0
"""50%""",561.0,1207.0,187.0,13.0,301.0,,,11.0
"""75%""",862.0,1577.0,240.0,20.0,394.0,,,15.0
"""max""",1166.0,2271.0,393.0,82.0,567.0,"""• food insecure with severe hu…",,28.0


In [14]:
import random
import pprint

# Pretty-print one randomly chosen page record.
# random.choice always picks a valid element. The earlier
# random.randint(0, 1179) includes both endpoints, so it could return index 1179
# on a 1179-item list (IndexError), and it hard-coded the list length.
if pages_and_texts:
	pprint.pp(random.choice(pages_and_texts))

{'page_number': 767,
 'page_char_count': 205,
 'page_word_count': 28,
 'page_sentence_count_raw': 6,
 'page_token_count': 51,
 'text': 'An interactive or media element has been excluded from this version '
         'of the text. You can view it online here: '
         'http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=432 Pacific '
         'Based Dietary Guidelines | 767',
 'sentences': ['An interactive or media element has been excluded from this '
               'version of the text.',
               'You can view it online here: http://pressbooks.oer.hawaii.edu/ '
               'humannutrition2/?p=432 Pacific Based Dietary Guidelines | 767'],
 'page_sentence_count_spacy': 2}


In [15]:
# Sentences are split into chunks next; this is the maximum number of
# sentences placed in a single chunk.
num_sentence_chunk_size = 10


def split_sentence_list(input_list: list[str], slice_size: int) -> list[list[str]]:
	"""Split a list of sentences into consecutive slices of at most ``slice_size`` items.

	Args:
	    input_list (list[str]): list of sentences
	    slice_size (int): maximum number of sentences per slice; must be >= 1
	Returns:
	    list[list[str]]: the slices, in original order. Every slice holds
	    ``slice_size`` sentences except possibly the last one, which holds the
	    remainder. An empty input list yields an empty list.
	Raises:
	    ValueError: if ``slice_size`` is smaller than 1.
	Example:
	    An input list of 17 sentences with 10 as slice size returns two lists:
	    1) the first 10 sentences, 2) the remaining 7 sentences.
	    An input list of 25 sentences returns three lists of 10, 10 and 5.
	"""
	if slice_size < 1:
		# Without this guard a negative size makes range() empty, so every
		# sentence would be dropped silently (and 0 fails with an opaque message).
		raise ValueError(f"slice_size must be a positive integer, got {slice_size}")
	return [
		input_list[start : start + slice_size]
		for start in range(0, len(input_list), slice_size)
	]

In [16]:
# Attach the sentence chunks (and how many there are) to every page record.
for item in tqdm(pages_and_texts):
	item["sentence_chunks"] = split_sentence_list(
		item["sentences"], num_sentence_chunk_size
	)
	item["num_chunks"] = len(item["sentence_chunks"])

100%|██████████| 1179/1179 [00:00<00:00, 1415307.50it/s]


In [44]:
import random
import pprint

# Pretty-print one randomly chosen page record (now including its chunks).
# random.choice always picks a valid element. The earlier
# random.randint(0, 1179) includes both endpoints, so it could return index 1179
# on a 1179-item list (IndexError), and it hard-coded the list length.
if pages_and_texts:
	pprint.pp(random.choice(pages_and_texts))

{'page_number': 888,
 'page_char_count': 722,
 'page_word_count': 101,
 'page_sentence_count_raw': 9,
 'page_token_count': 180,
 'text': 'Learning Activities Technology Note: The second edition of the Human '
         'Nutrition Open Educational Resource (OER) textbook features '
         'interactive learning activities. These activities are available in '
         'the web-based textbook and not available in the downloadable '
         'versions (EPUB, Digital PDF, Print_PDF, or Open Document). Learning '
         'activities may be used across various mobile devices, however, for '
         'the best user experience it is strongly recommended that users '
         'complete these activities using a desktop or laptop computer and in '
         'Google Chrome. An interactive or media element has been excluded '
         'from this version of the text. You can view it online here: '
         'http://pressbooks.oer.hawaii.edu/ humannutrition2/?p=469 888 | '
         'Adolescence',
 'sen

In [17]:
# Summary statistics again, now including the per-page chunk counts;
# numeric columns rounded to 2 decimals.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
# with_columns replaces the named columns in place and leaves the rest untouched.
summary = summary.with_columns([pl.col(name).round(2) for name in numeric_cols])
summary

statistic,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
str,f64,f64,f64,f64,f64,str,f64,f64,f64,f64
"""count""",1179.0,1179.0,1179.0,1179.0,1179.0,"""1179""",1179.0,1179.0,1179.0,1179.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0
"""mean""",561.27,1148.06,176.2,14.81,286.62,,,10.57,,1.56
"""std""",348.9,529.51,83.18,9.41,132.39,,,6.16,,0.61
"""min""",-41.0,15.0,3.0,1.0,3.0,"""(Source: UNICEF, 1986, How to …",,1.0,,1.0
"""25%""",259.0,764.0,117.0,9.0,191.0,,,5.0,,1.0
"""50%""",561.0,1207.0,187.0,13.0,301.0,,,11.0,,2.0
"""75%""",862.0,1577.0,240.0,20.0,394.0,,,15.0,,2.0
"""max""",1166.0,2271.0,393.0,82.0,567.0,"""• food insecure with severe hu…",,28.0,,3.0


## shifting from pages and texts to pages and chunks
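- Currently each item is a page: its text (a group of sentences) together with the list of chunks derived from it.
- We now make the chunk the top-level item, carrying over its page number and computing per-chunk statistics. (The initial assumption was that the item count would at least double, from 1179 to ~2358; with an average of ~1.56 chunks per page, the actual count is closer to 1,840.)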

In [20]:
# Flatten the page records into one record per sentence chunk.
pages_and_chunks = []
for item in tqdm(pages_and_texts):
	for sentence_chunk in item["sentence_chunks"]:
		# Glue the chunk's sentences back into one paragraph-like string.
		joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
		# ".A" -> ". A": re-insert the space after a full stop that is directly
		# followed by a capital letter.
		# NOTE(review): boundaries after "!" / "?" or before a non-capital
		# character are not covered by this pattern.
		joined_sentence_chunk = re.sub(r"\.([A-Z])", r". \1", joined_sentence_chunk)
		pages_and_chunks.append(
			{
				"page_number": item["page_number"],
				"sentence_chunk": joined_sentence_chunk,
				# Stats
				"chunk_char_count": len(joined_sentence_chunk),
				"chunk_word_count": len(joined_sentence_chunk.split()),
				# Token count approximated as character count / 4.
				"chunk_token_count": round(len(joined_sentence_chunk) / 4, 2),
			}
		)
print(f"We have {len(pages_and_chunks)} chunks now.")

100%|██████████| 1179/1179 [00:00<00:00, 55459.31it/s]

We have 1843 chunks now.





In [21]:
# Inspect one randomly selected chunk record (returned as a one-element list).
random.sample(pages_and_chunks, k=1)

[{'page_number': 552,
  'sentence_chunk': 'holds the collagen fibers together and without sufficient levels in the body, collagen strands are weak and abnormal. (Figure 9.8 “The Role of Vitamin C in Collagen Synthesis”) Figure 9.8 The Role of Vitamin C in Collagen Synthesis Image by Allison Calabrese / CC BY 4.0 Vitamin C levels in the body are affected by the amount in the diet, which influences how much is absorbed and how much the kidney allows to be excreted, such that the higher the intake, the more vitamin C is excreted. Vitamin C is not stored in any significant amount in the body, but once it has reduced a free radical, it is very effectively regenerated and therefore it can exist in the body as a functioning antioxidant for many weeks. The classic condition associated with vitamin C deficiency is scurvy. The signs and symptoms of scurvy include skin disorders, bleeding gums, painful joints, weakness, depression, and increased susceptibility to infections. Scurvy is prevented b