# RAG chunking strategy


In [3]:
import time
import requests
from pathlib import Path
import pymupdf
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
try:  # inside a script
	BASE_DIR = Path(__file__).resolve().parent.parent
except NameError:  # inside a notebook
	BASE_DIR = Path.cwd().parent

In [5]:
print(f"Project root set to: {BASE_DIR}")

Project root set to: /Users/tejaspancholi/Developer/python/vizuara


In [6]:
pdf_path = BASE_DIR / "data" / "human_nutrition_text.pdf"

In [7]:
def download_pdf_requests(
	url: str, dest: Path, timeout: int = 30, max_retries: int = 3
) -> None:
	"""Download a PDF file from URL with progress tracking and error handling."""
	dest.parent.mkdir(parents=True, exist_ok=True)
	for attempt in range(max_retries):
		try:
			response = requests.get(url, stream=True, timeout=timeout)
			response.raise_for_status()

			content_type = response.headers.get("content-type", "").lower()
			if "pdf" not in content_type:
				raise ValueError(f"Invalid content type: {content_type}")
			total = int(response.headers.get("content-length", 0))
			with tqdm(
				total=total, unit="iB", unit_scale=True, desc="Downloading PDF"
			) as t:
				with dest.open("wb") as f:
					for chunk in response.iter_content(chunk_size=8192):
						if chunk:
							f.write(chunk)
							t.update(len(chunk))
			print(f"\nSuccessfully downloaded PDF to {dest}")
			return
		except requests.exceptions.RequestException as e:
			print(f"Download failed: {e}")
			if attempt == max_retries - 1:
				raise
			time.sleep(2**attempt)

In [8]:
if not pdf_path.is_file():
	download_pdf_requests(
		"https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf",
		pdf_path,
	)

In [9]:
def text_formatter(text: str) -> str:
	"""Performs minor text formatting."""
	import re

	cleaned_text = re.sub(
		r"\s+", " ", text
	)  # Replace multiple whitespace with single space
	cleaned_text = cleaned_text.strip()
	return cleaned_text

In [10]:
import re
from typing import Dict, List, Union
from pathlib import Path


def open_and_read_pdf(file_path: Union[str, Path]) -> Union[List[Dict], None]:
	"""
	Opens a pdf file and reads its content page by page, and collects statistics.
	Parameters:
	    file_path (str | Path): The path to the pdf file to be opened and read.
	Returns:
	    list[dict]: A list of dictionaries containing the page number, character count, word count, sentence count, token count, and extracted text for each page.
	"""
	if not Path(file_path).exists():
		raise FileNotFoundError(f"PDF file not found: {file_path}")
	try:
		doc = pymupdf.open(file_path)
		pages_and_texts = []
		for page_number, page in tqdm(enumerate(doc)):
			text = page.get_text()
			if not text or not text.strip():  # Skip empty pages
				continue
			if text and text.strip():
				text = text_formatter(text)
				sentences = re.split(r"[.!?]+", text)  # Simple sentence splitter
				sentence_count = len(
					[s for s in sentences if s.strip()]
				)  # Count non-empty sentences
				pages_and_texts.append(
					{
						"page_number": page_number - 41,
						"page_char_count": len(text),
						"page_word_count": len(text.split()),
						"page_sentence_count_raw": sentence_count,
						"page_token_count": int(len(text) / 4),
						"text": text,
					}
				)
		return pages_and_texts
	except Exception as e:
		print(f"Error reading PDF file: {e}")
		return None

In [11]:
pages_and_texts = open_and_read_pdf(file_path=pdf_path)
if pages_and_texts:
	print(pages_and_texts[:2])

1208it [00:00, 1249.78it/s]

[{'page_number': -41, 'page_char_count': 29, 'page_word_count': 4, 'page_sentence_count_raw': 1, 'page_token_count': 7, 'text': 'Human Nutrition: 2020 Edition'}, {'page_number': -39, 'page_char_count': 308, 'page_word_count': 42, 'page_sentence_count_raw': 1, 'page_token_count': 77, 'text': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE'}]





In [10]:
import random

random.sample(pages_and_texts, k=3)

[{'page_number': 222,
  'page_char_count': 1568,
  'page_word_count': 232,
  'page_sentence_count_raw': 10,
  'page_token_count': 392,
  'text': 'There is no good evidence that chronic caffeine exposure increases blood pressure chronically in people without hypertension. Some have hypothesized that caffeine elevates calcium excretion and therefore could potentially harm bones. The scientific consensus at this time is that caffeine minimally affects calcium levels and intake is not associated with any increased risk for osteoporosis or the incidence of fractures in most women. Although the effect of caffeine on calcium excretion is small, postmenopausal women with risk factors for osteoporosis may want to make sure their dietary caffeine intake is low or moderate and not excessive. The Caffeine Myth A diuretic refers to any substance that elevates the normal urine output above that of drinking water. Caffeinated beverages are commonly believed to be dehydrating due to their diuretic eff

In [11]:
import polars as pl

df = pl.DataFrame(pages_and_texts)
summary = df.describe()
numeric_cols = [c for c, t in summary.schema.items() if t.is_numeric()]
summary = summary.with_columns(
	[pl.col(c).round(2) if c in numeric_cols else pl.col(c) for c in summary.columns]
)
summary

statistic,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
str,f64,f64,f64,f64,f64,str
"""count""",1179.0,1179.0,1179.0,1179.0,1179.0,"""1179"""
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",561.27,1148.06,176.2,14.81,286.62,
"""std""",348.9,529.51,83.18,9.41,132.39,
"""min""",-41.0,15.0,3.0,1.0,3.0,"""(Source: UNICEF, 1986, How to …"
"""25%""",259.0,764.0,117.0,9.0,191.0,
"""50%""",561.0,1207.0,187.0,13.0,301.0,
"""75%""",862.0,1577.0,240.0,20.0,394.0,
"""max""",1166.0,2271.0,393.0,82.0,567.0,"""• food insecure with severe hu…"


## Method 1: Fixed size chunking

In [15]:
def chunk_text(text: str, chunk_size: int = 500) -> List[str]:
	"""Splits text into chunks of specified size with overlap.
	Args:
	    text (str): The text to be chunked.
	    chunk_size (int): The size of each chunk in words.
	Returns:
	    List[str]: A list of text chunks.
	"""
	chunks = []
	current_chunk = ""
	words = text.split()

	for word in words:
		if len(current_chunk) + len(word) + 1 <= chunk_size:
			current_chunk += word + " "
		else:
			chunks.append(current_chunk.strip())
			current_chunk = word + " "

	if current_chunk:
		chunks.append(current_chunk.strip())

	return chunks

In [16]:
def chunk_pdf_pages(pages_and_texts: list, chunk_size: int = 500) -> List[Dict]:
	"""Chunks the text of each page into smaller segments.
	Args:
	    pages_and_texts (list): List of dictionaries containing page information and text.
	    chunk_size (int): The size of each chunk in words.
	Returns:
	    List[Dict]: A list of dictionaries with chunked text and associated metadata.
	"""
	chunked_data = []

	for page in pages_and_texts:
		page_number = page["page_number"]
		text = page["text"]
		chunks = chunk_text(text, chunk_size)

		for i, chunk in enumerate(chunks):
			chunked_data.append(
				{
					"page_number": page_number,
					"chunk_index": i,
					"chunk_text": chunk,
					"chunk_word_count": len(chunk.split()),
					"chunk_char_count": len(chunk),
					"chunk_token_count": int(len(chunk) / 4),
				}
			)

	return chunked_data

In [17]:
chunked_pages = chunk_pdf_pages(pages_and_texts, chunk_size=500)

In [19]:
print(f"Total chunks created: {len(chunked_pages)}")
print(
	f"25th chunk (page {chunked_pages[24]['page_number']}): {chunked_pages[24]['chunk_text'][:200]}..."
)

Total chunks created: 3321
25th chunk (page -28): Introduction University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 515 Fat-Soluble Vitamins University of Hawai‘i at Mānoa Food Science and Human Nutritio...


In [20]:
import random
import textwrap


def _scattered_indices(n: int, k: int, jitter_frac: float = 0.08) -> list[int]:
	"""Generate k scattered indices over range n with some jitter.
	Args:
	    n (int): The total number of items.
	    k (int): The number of indices to generate.
	    jitter_frac (float): Fractional jitter to apply to each index.
	Returns:
	    list[int]: A list of k scattered indices.
	"""
	if k <= 0:
		return []
	if k == 1:
		return [random.randrange(n)]
	anchors = [int(round(i * (n - 1) / (k - 1))) for i in range(k)]
	out, seen = [], set()
	radius = max(1, int(jitter_frac * n))
	for a in anchors:
		lo, hi = max(0, a - radius), min(n - 1, a + radius)
		j = random.randint(lo, hi)
		if j not in seen:
			out.append(j)
			seen.add(j)
	while len(out) < k:
		r = random.randrange(n)
		if r not in seen:
			out.append(r)
			seen.add(r)
	return out

In [21]:
def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
	"""Draws a boxed representation of a text chunk.
	Args:
	    c (dict): A dictionary containing chunk metadata and text.
	    wrap_at (int): The maximum width for text wrapping.
	Returns:
	    str: A string representation of the boxed chunk.
	"""
	header = (
		f" Chunk p{c['page_number']} - idx {c['chunk_index']}  | "
		f"chars {c['chunk_char_count']} - words {c['chunk_word_count']} - tokens {c['chunk_token_count']}"
	)
	# wrap body text, avoid breaking long words awkwardly
	wrapped_lines = textwrap.wrap(
		c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
	)
	content_width = max([0, *map(len, wrapped_lines)])
	box_width = max(len(header), content_width + 2)  # +2 - side padding

	top = "┌" + "─" * (box_width) + "┐"
	hline = "|" + header.ljust(box_width) + "|"
	sep = "├" + "─" * (box_width) + "┤"
	body = "\n".join(
		"│ " + line.ljust(box_width - 2) + " │" for line in wrapped_lines
	) or ("|" + "".ljust(box_width - 2) + " |")
	bottom = "└" + "─" * (box_width) + "┘"
	return "\n".join([top, hline, sep, body, bottom])

In [22]:
def show_random_chunks(
	pages_and_texts: list, chunk_size: int = 500, k: int = 5, seed: int | None = 42
) -> None:
	"""Displays n random chunks from the chunked pages.
	Args:
	    pages_and_texts (list): List of tuples (page_number, text) for each page.
	    chunk_size (int): Size of each text chunk.
	    k (int): Number of random chunks to display.
	    seed (int | None): Random seed for reproducibility.
	"""
	if seed is not None:
		random.seed(seed)

	# Chunk the text from each page
	all_chunks = []
	all_chunks = chunk_pdf_pages(pages_and_texts, chunk_size)
	if not all_chunks:
		print("No chunks available to display.")
		return
	indices = _scattered_indices(len(all_chunks), k)
	print(
		f"Showing {len(indices)} scattered random chunks out of {len(all_chunks)} total chunks:\n"
	)
	for i, idx in enumerate(indices, 1):
		print(f"#{i}")
		print(_draw_boxed_chunk(all_chunks[idx]))
		print()  # extra newline between chunks

In [23]:
assert pages_and_texts is not None
show_random_chunks(pages_and_texts, chunk_size=500, k=5, seed=42)

Showing 5 scattered random chunks out of 3321 total chunks:

#1
┌──────────────────────────────────────────────────────────────────────────────────────────────────┐
| Chunk p-9 - idx 0  | chars 290 - words 49 - tokens 72                                            |
├──────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Skylar Hara Skylar Hara is an undergraduate student student in the Tropical Agriculture and the  │
│ Environment program at the University of Hawai‘i at Mānoa. She has a growing love for plants and │
│ hopes to go to graduate school to conduct research in the future. About the Contributors |       │
│ xxxiii                                                                                           │
└──────────────────────────────────────────────────────────────────────────────────────────────────┘

#2
┌─────────────────────────────────────────────────────────────────────────────────────────────────┐
| Chunk p198 - idx 0  | 

### here you might have seen some chunk are smaller than 500 even though we have mentioned as chunk size as 500, its because the processes is happening at page level and it can happen once a couple of chunk are done at the page, rest of text is smaller than 500.

## Method 2: Semantic chunking

In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk

nltk.download("punkt", quiet=True)

True

In [25]:
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

In [39]:
def semantic_chunk_text(
	text: str, similarity_threshold: float = 0.8, max_tokens: int = 500
) -> list:
	"""Splits text into semantically coherent chunks based on sentence embeddings.
	Args:
	    text (str): The text to be chunked.
	    similarity_threshold (float): Cosine similarity threshold to determine chunk boundaries.
	    max_tokens (int): Maximum number of tokens per chunk.
	Returns:
	    list: A list of semantically coherent text chunks.
	"""
	sentences = nltk.sent_tokenize(text)
	if not sentences:
		return []

	# ensure embeddings is a numpy array of shape (n_sentences, dim)
	embeddings = semantic_model.encode(sentences, convert_to_numpy=True)
	if not isinstance(embeddings, np.ndarray):
		embeddings = np.array(embeddings)

	chunks = []
	current_chunk = [sentences[0]]
	current_indices = [0]

	for i in range(1, len(sentences)):
		current_embedding = np.mean(embeddings[current_indices], axis=0)
		next_embedding = embeddings[i]
		sim = float(
			cosine_similarity(
				current_embedding.reshape(1, -1), next_embedding.reshape(1, -1)
			)[0, 0]
		)

		chunk_token_count = len(" ".join(current_chunk)) // 4

		if sim >= similarity_threshold and chunk_token_count < max_tokens:
			current_chunk.append(sentences[i])
			current_indices.append(i)
		else:
			chunks.append(" ".join(current_chunk))
			current_chunk = [sentences[i]]
			current_indices = [i]
	if current_chunk:
		chunks.append(" ".join(current_chunk))
	return chunks

In [40]:
def semantic_chunk_pdf_pages(
	pages_and_texts: list, similarity_threshold: float = 0.8, max_tokens: int = 500
) -> list[dict]:
	"""Chunks the text of each page into semantically coherent segments.
	Args:
	    pages_and_texts (list): List of dictionaries containing page information and text.
	    similarity_threshold (float): Cosine similarity threshold to determine chunk boundaries.
	    max_tokens (int): Maximum number of tokens per chunk.
	Returns:
	    list[dict]: A list of dictionaries with semantically chunked text and associated metadata.
	"""
	all_chunks = []

	for page in tqdm(pages_and_texts, desc="Semantic chunking pages"):
		page_number = page["page_number"]
		text = page["text"]
		chunks = semantic_chunk_text(text, similarity_threshold, max_tokens)

		for i, chunk in enumerate(chunks):
			all_chunks.append(
				{
					"page_number": page_number,
					"chunk_index": i,
					"chunk_text": chunk,
					"chunk_word_count": len(chunk.split()),
					"chunk_char_count": len(chunk),
					"chunk_token_count": int(len(chunk) / 4),
				}
			)

	return all_chunks

In [41]:
import nltk

nltk.download("punkt_tab")
semantic_chunk_pages = semantic_chunk_pdf_pages(
	pages_and_texts, similarity_threshold=0.75, max_tokens=500
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tejaspancholi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Semantic chunking pages: 100%|██████████| 1179/1179 [00:12<00:00, 93.12it/s]


In [42]:
print(f"Total semantic chunks created: {len(semantic_chunk_pages)}")
print(
	f"25th semantic chunk (page {semantic_chunk_pages[24]['page_number']}): {semantic_chunk_pages[24]['chunk_text'][:200]}..."
)

Total semantic chunks created: 12027
25th semantic chunk (page -29): Alcohol Introduction University of Hawai‘i at Mānoa Food Science and Human Nutrition Program and Human Nutrition Program 431 Alcohol Metabolism University of Hawai‘i at Mānoa Food Science and Human Nu...


### Number of chunks have drastically increased is because there is very limited similiarity meaning between pages or paras, leading to smaller chunk sizes

In [44]:
import random


def _scattered_indices(n: int, k: int, jitter_frac: float = 0.08) -> list[int]:
	"""Evenely spaced anchors + random jitters + indices scattered across [0,n-1]
	Args:
	    n (int): The total number of items.
	    k (int): The number of indices to generate.
	    jitter_frac (float): Fractional jitter to apply to each index.
	Returns:
	    list[int]: A list of k scattered indices.
	"""
	if k <= 0:
		return []
	if k == 1:
		return [random.randrange(n)]
	anchors = [int(round(i * (n - 1) / (k - 1))) for i in range(k)]
	out, seen = [], set()
	radius = max(1, int(jitter_frac * n))
	for a in anchors:
		lo, hi = max(0, a - radius), min(n - 1, a + radius)
		j = random.randint(lo, hi)
		if j not in seen:
			out.append(j)
			seen.add(j)
	while len(out) < k:
		r = random.randrange(n)
		if r not in seen:
			out.append(r)
			seen.add(r)
	return out

In [45]:
def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
	"""Draws a boxed representation of a text chunk.
	Args:
	    c (dict): A dictionary containing chunk metadata and text.
	    wrap_at (int): The maximum width for text wrapping.
	Returns:
	    str: A string representation of the boxed chunk.
	"""
	approx_tokens = c.get("chunk_token_count", len(c["chunk_text"]) / 4)
	header = (
		f" Chunk p{c['page_number']} - idx {c['chunk_index']}  | "
		f"chars {c['chunk_char_count']} - words {c['chunk_word_count']} - tokens {round(approx_tokens, 2)}"
	)
	# wrap body text, avoid breaking long words awkwardly
	wrapped_lines = textwrap.wrap(
		c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
	)
	content_width = max([0, *map(len, wrapped_lines)])
	box_width = max(len(header), content_width + 2)  # +2 - side padding

	top = "┌" + "─" * (box_width) + "┐"
	hline = "|" + header.ljust(box_width) + "|"
	sep = "├" + "─" * (box_width) + "┤"
	body = "\n".join(
		"│ " + line.ljust(box_width - 2) + " │" for line in wrapped_lines
	) or ("|" + "".ljust(box_width - 2) + " |")
	bottom = "└" + "─" * (box_width) + "┘"
	return "\n".join([top, hline, sep, body, bottom])

In [46]:
def show_semantic_random_chunks(
	semantic_chunked_pages: list[dict], k: int = 5, seed: int | None = 42
) -> None:
	"""Displays n random chunks from the semantic chunked pages.
	Args:
	    pages_and_texts (list): List of tuples (page_number, text) for each page.
	    k (int): Number of random chunks to display.
	    seed (int | None): Random seed for reproducibility.
	"""
	if seed is not None:
		random.seed(seed)

	n = len(semantic_chunked_pages)
	if n == 0:
		print("No semantic chunks available to display.")
		return
	idxs = _scattered_indices(n, k)
	print(
		f"Showing {len(idxs)} scattered random semantic chunks out of {n} total chunks:\n"
	)
	for i, idx in enumerate(idxs, 1):
		print(f"#{i}")
		print(_draw_boxed_chunk(semantic_chunked_pages[idx]))
		print()  # extra newline between chunks

In [47]:
assert semantic_chunk_pages is not None
show_semantic_random_chunks(semantic_chunk_pages, k=5, seed=42)

Showing 5 scattered random semantic chunks out of 12027 total chunks:

#1
┌──────────────────────────────────────────────────────────────────────────────────────────────────┐
| Chunk p56 - idx 7  | chars 197 - words 31 - tokens 49                                            |
├──────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Observing the connection between the beverage and longevity, Dr. Elie Metchnikoff began his      │
│ research on beneficial bacteria and the longevity of life that led to his book, The Prolongation │
│ of Life.                                                                                         │
└──────────────────────────────────────────────────────────────────────────────────────────────────┘

#2
┌────────────────────────────────────────────────────────┐
| Chunk p232 - idx 8  | chars 54 - words 10 - tokens 13  |
├────────────────────────────────────────────────────────┤
│ As a result, the liver rapidly conve

## Method 3 - Recursive Chunking

### How it works

- if chunk is smaller than `max_chunk_size`, its chunked as is
- if its larger, it will try to split by `\n\n` which is usually between two sections
- if that also does not work it will try to split by `\n` which is usually a para
-  if para is still too large, its split by sentence
- repeat the process recursively
- *note* the list of separator can be changed, by supplying it during method invocation. 

In [13]:
import nltk

nltk.download("punkt", quiet=True)

True

In [14]:
def recursive_chunk_text(
	text: str, max_chunk_size: int = 1000, min_chunk_size: int = 100
) -> list:
	"""Recursively chunks text into smaller segments based on size constraints.
	Tries splitting by sections, then newlines, then sentences.
	Args:
	    text (str): The text to be chunked.
	    max_chunk_size (int): Maximum size of each chunk in characters.
	    min_chunk_size (int): Minimum size of each chunk in characters.
	Returns:
	    list: A list of text chunks.
	"""

	def split_chunk(chunk: str) -> list:
		if len(chunk) <= max_chunk_size:
			return [chunk.strip()]
		# first try splitting by sections (double newlines)
		sections = chunk.split("\n\n")
		if len(sections) > 1:
			result = []
			for section in sections:
				if section.strip():
					result.extend(split_chunk(section.strip()))
			return result
		# next try splitting by single newlines
		sections = chunk.split("\n")
		if len(sections) > 1:
			result = []
			for section in sections:
				if section.strip():
					result.extend(split_chunk(section.strip()))
			return result
		# finally split by sentences
		sentences = nltk.sent_tokenize(chunk)
		chunks, current_chunk, current_size = [], [], 0
		for sentence in sentences:
			if current_size + len(sentence) > max_chunk_size:
				if current_chunk:
					chunks.append(" ".join(current_chunk).strip())
				current_chunk, current_size = [sentence], len(sentence)
			else:
				current_chunk.append(sentence)
				current_size += len(sentence)
		if current_chunk:
			chunks.append(" ".join(current_chunk).strip())
		return chunks

	return split_chunk(text)

In [15]:
def recursive_chunk_pdf_pages(
	pages_and_texts: list, max_chunk_size: int = 1000, min_chunk_size: int = 100
) -> list[dict]:
	"""Chunks the text of each page into smaller segments using recursive chunking.
	Args:
	    pages_and_texts (list): List of dictionaries containing page information and text.
	    max_chunk_size (int): Maximum size of each chunk in characters.
	    min_chunk_size (int): Minimum size of each chunk in characters.
	Returns:
	    list[dict]: A list of dictionaries with recursively chunked text and associated metadata.
	"""
	all_chunks = []

	for page in tqdm(pages_and_texts, desc="Recursive chunking pages"):
		page_number = page["page_number"]
		text = page["text"]
		chunks = recursive_chunk_text(text, max_chunk_size, min_chunk_size)

		for i, chunk in enumerate(chunks):
			all_chunks.append(
				{
					"page_number": page_number,
					"chunk_index": i,
					"chunk_text": chunk,
					"chunk_word_count": len(chunk.split()),
					"chunk_char_count": len(chunk),
					"chunk_token_count": int(len(chunk) / 4),
				}
			)

	return all_chunks

In [16]:
recursive_chunked_pages = recursive_chunk_pdf_pages(
	pages_and_texts, max_chunk_size=1000, min_chunk_size=100
)
print(f"Total recursive chunks created: {len(recursive_chunked_pages)}")
print(
	f"25th recursive chunk (page {recursive_chunked_pages[24]['page_number']}): {recursive_chunked_pages[24]['chunk_text'][:200]}..."
)

Recursive chunking pages: 100%|██████████| 1179/1179 [00:00<00:00, 14248.25it/s]

Total recursive chunks created: 1949
25th recursive chunk (page -17): Preface UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM ‘A‘ohe pau ka ‘ike i ka hālau ho‘okahi Knowledge isn’t taught in all one place This open acc...





In [None]:
import random


def _scattered_indices(n: int, k: int, jitter_frac: float = 0.08) -> list[int]:
	"""Generate k scattered indices over range n with some jitter.
	Args:
	    n (int): The total number of items.
	    k (int): The number of indices to generate.
	    jitter_frac (float): Fractional jitter to apply to each index.
	Returns:
	    list[int]: A list of k scattered indices.
	"""
	if k <= 0:
		return []
	if k == 1:
		return [random.randrange(n)]
	anchors = [int(round(i * (n - 1) / (k - 1))) for i in range(k)]
	out, seen = [], set()
	radius = max(1, int(jitter_frac * n))
	for a in anchors:
		lo, hi = max(0, a - radius), min(n - 1, a + radius)
		j = random.randint(lo, hi)
		if j not in seen:
			out.append(j)
			seen.add(j)
	while len(out) < k:
		r = random.randrange(n)
		if r not in seen:
			out.append(r)
			seen.add(r)
	return out


def _draw_boxed_chunk(c: dict, wrap_at: int = 96) -> str:
	"""Draws a boxed representation of a text chunk.
	Args:
	    c (dict): A dictionary containing chunk metadata and text.
	    wrap_at (int): The maximum width for text wrapping.
	Returns:
	    str: A string representation of the boxed chunk.
	"""
	approx_tokens = c.get("chunk_token_count", len(c["chunk_text"]) / 4)
	header = (
		f" Chunk p{c['page_number']} - idx {c['chunk_index']}  | "
		f"chars {c['chunk_char_count']} - words {c['chunk_word_count']} - tokens {round(approx_tokens, 2)}"
	)
	# wrap body text, avoid breaking long words awkwardly
	wrapped_lines = textwrap.wrap(
		c["chunk_text"], width=wrap_at, break_long_words=False, replace_whitespace=False
	)
	content_width = max([0, *map(len, wrapped_lines)])
	box_width = max(len(header), content_width + 2)  # +2 - side padding

	top = "┌" + "─" * (box_width) + "┐"
	hline = "|" + header.ljust(box_width) + "|"
	sep = "├" + "─" * (box_width) + "┤"
	body = "\n".join(
		"│ " + line.ljust(box_width - 2) + " │" for line in wrapped_lines
	) or ("|" + "".ljust(box_width - 2) + " |")
	bottom = "└" + "─" * (box_width) + "┘"
	return "\n".join([top, hline, sep, body, bottom])


def show_recursive_random_chunks(
	recursive_chunked_pages: list[dict], k: int = 5, seed: int | None = 42
) -> None:
	"""Displays n random chunks from the recursive chunked pages.
	Args:
	    pages_and_texts (list): List of tuples (page_number, text) for each page.
	    k (int): Number of random chunks to display.
	    seed (int | None): Random seed for reproducibility.
	"""
	if seed is not None:
		random.seed(seed)

	n = len(recursive_chunked_pages)
	if n == 0:
		print("No recursive chunks available to display.")
		return
	idxs = _scattered_indices(n, k)
	print(
		f"Showing {len(idxs)} scattered random recursive chunks out of {n} total chunks:\n"
	)
	for i, idx in enumerate(idxs, 1):
		print(f"#{i}")
		print(_draw_boxed_chunk(recursive_chunked_pages[idx]))
		print()  # extra newline between chunks

In [18]:
assert recursive_chunked_pages is not None and len(recursive_chunked_pages) > 0
show_recursive_random_chunks(recursive_chunked_pages, k=5, seed=42)

Showing 5 scattered random recursive chunks out of 1949 total chunks:

#1
┌──────────────────────────────────────────────────────────────────────────────────────────────────┐
| Chunk p-14 - idx 0  | chars 584 - words 89 - tokens 146                                          |
├──────────────────────────────────────────────────────────────────────────────────────────────────┤
│ Alan Titchenal Dr. Titchenal received a PhD in nutrition from the University of California at    │
│ Davis with emphasis on exercise physiology and physiological chemistry. His work at the          │
│ University of Hawai‘i at Mānoa has focused on the broad areas of nutrition and human performance │
│ and translation of nutrition science for public consumption. This has included the “Got          │
│ Nutrients?” project that provides daily messages on topics related to nutrition, fitness, and    │
│ health and the publication of over 600 articles in the Honolulu Star- Advertiser newspaper.      │
│ xxviii | About 

## Method 4: Structured based chunking

### How it works
- The function looks for `headers` such as Chapter numbers(e.g. CHAPTER 1) or section heading(i.e. 1.1 Introduction)
- Every time it finds a header it starts a new chunk till either another heading is reached or max token size limit is exceeded
- This preserves logical flow

### Engineer's decision
- Works well with documents which have clear hierarchy(chapter, section, subsection)

In [19]:
import random


# helper function to detect chapter start
def _is_chapter_start(text: str) -> bool:
	"""Detects if a line indicates the start of a new chapter or section."""
	# chapter_patterns = [
	#     r'^\s*CHAPTER\s+\d+',  # Matches "CHAPTER 1", "CHAPTER 2", etc.
	#     r'^\s*\d+(\.\d+)*\s+[A-Z][a-zA-Z\s]*',  # Matches "1. Introduction", "2.1 Background", etc.
	# ]
	# for pattern in chapter_patterns:
	#     if re.match(pattern, line):
	#         return True
	# return False
	return re.search(r"university\s+of\s+hawai", text, flags=re.IGNORECASE) is not None


def _guess_title_from_page(text: str) -> str:
	"""the previous line of 'University of Hawaii' is likely the title
	falls back to the first ~120 characters
	Args:
	    text (str): The text of the page.
	Returns:
	    str: The guessed title of the page.
	"""
	match = re.search(r"university\s+of\s+hawai", text, flags=re.IGNORECASE)
	if match:
		title = text[: match.start()].strip()
		title = re.sub(r"\s+", " ", title.strip())  # Clean up whitespace
		if 10 <= len(title) <= 180:
			return title
	# fallback to first ~120 characters
	title = re.sub(r"\s+", " ", text).strip()
	return title[:120] if title else "Untitled"


def chapter_chunk_pdf_pages(pages_and_texts: list) -> list[dict]:
	"""
	Chunks PDF pages into sections based on chapter titles.
	Args:
	    pages_and_texts (list): List of tuples (page_number, text) for each page.
	Returns:
	    list[dict]: A list of dictionaries with chapter titles and associated pages.
	"""
	if not pages_and_texts:
		return []
	chapter_starts = []
	for i, page in enumerate(pages_and_texts):
		text = page["text"]
		if _is_chapter_start(text):
			chapter_starts.append(i)
	# if nothing detected, return all pages as a single chunk
	if not chapter_starts:
		# No chapters found, return all pages as a single chunk
		all_text = " ".join(page["text"] for page in pages_and_texts).strip()
		return [
			{
				"chapter_index": 0,
				"chapter_title": _guess_title_from_page(pages_and_texts[0]["text"]),
				"start_page": pages_and_texts[0]["page_number"],
				"end_page": pages_and_texts[-1]["page_number"],
				"chunk_char_count": len(all_text),
				"chunk_word_count": len(all_text.split()),
				"chunk_token_count": int(len(all_text) / 4),
				"chunk_text": all_text,
			}
		]
	# build chapter ranges (start -> next start - 1)
	chapter_chunks = []
	for idx, start in enumerate(chapter_starts):
		end = (
			chapter_starts[idx + 1] - 1
			if idx + 1 < len(chapter_starts)
			else len(pages_and_texts) - 1
		)
		if end < start:
			continue  # skip invalid ranges
		chapter_pages = pages_and_texts[start : end + 1]
		chapter_text = " ".join(page["text"] for page in chapter_pages).strip()
		chapter_title = _guess_title_from_page(chapter_pages[0]["text"])
		chapter_chunks.append(
			{
				"chapter_index": idx,
				"chapter_title": chapter_title,
				"start_page": chapter_pages[0]["page_number"],
				"end_page": chapter_pages[-1]["page_number"],
				"chunk_char_count": len(chapter_text),
				"chunk_word_count": len(chapter_text.split()),
				"chunk_token_count": int(len(chapter_text) / 4),
				"chunk_text": chapter_text,
			}
		)
	return chapter_chunks

In [20]:
structured_chunk_pages = chapter_chunk_pdf_pages(pages_and_texts)
print(f"Total chapter chunks created: {len(structured_chunk_pages)}")
print(
	f"1st chapter chunk (pages {structured_chunk_pages[0]['start_page']} to {structured_chunk_pages[0]['end_page']}): {structured_chunk_pages[0]['chunk_text'][:200]}..."
)

Total chapter chunks created: 171
1st chapter chunk (pages -39 to -39): Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FI...
