# local RAG pipeline


In [1]:
import time
import requests
from pathlib import Path
import pymupdf
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Resolve the project root: two levels above this file when run as a script,
# or the parent of the working directory when __file__ is undefined.
if "__file__" in globals():  # inside a script
	BASE_DIR = Path(__file__).resolve().parent.parent
else:  # inside a notebook
	BASE_DIR = Path.cwd().parent

In [3]:
# Echo the resolved project root for a quick visual check.
print(f"Project root set to: {BASE_DIR}")

Project root set to: F:\repos\python\The_Architect_Guide_To_LLM_Agent


In [4]:
# Location of the source PDF inside the project's data directory.
pdf_path = BASE_DIR.joinpath("data", "human_nutrition_text.pdf")

In [5]:
def download_pdf_requests(
	url: str, dest: Path, timeout: int = 30, max_retries: int = 3
) -> None:
	"""Download a PDF file from a URL with progress tracking and retries.

	The response is streamed into a temporary ``<dest>.part`` file that is
	moved onto ``dest`` only after the download has finished, so an
	interrupted download never leaves a truncated file at ``dest``.

	Parameters:
	    url (str): Address to download the PDF from.
	    dest (Path): Final location of the file; missing parent directories are created.
	    timeout (int): Timeout in seconds passed to ``requests.get``.
	    max_retries (int): Total number of attempts; must be at least 1.
	Raises:
	    ValueError: If ``max_retries`` is less than 1, or if the response's content type does not contain "pdf" (this is not retried).
	    requests.exceptions.RequestException: If the last attempt fails.
	"""
	if max_retries < 1:
		# range(0) would skip the loop and return as if the download succeeded.
		raise ValueError(f"max_retries must be at least 1, got {max_retries}")
	dest.parent.mkdir(parents=True, exist_ok=True)
	partial = dest.with_name(dest.name + ".part")
	for attempt in range(max_retries):
		try:
			# The context manager releases the streamed connection on every path.
			with requests.get(url, stream=True, timeout=timeout) as response:
				response.raise_for_status()

				content_type = response.headers.get("content-type", "").lower()
				if "pdf" not in content_type:
					raise ValueError(f"Invalid content type: {content_type}")
				total = int(response.headers.get("content-length", 0))
				with tqdm(
					total=total, unit="iB", unit_scale=True, desc="Downloading PDF"
				) as t, partial.open("wb") as f:
					for chunk in response.iter_content(chunk_size=8192):
						if chunk:
							f.write(chunk)
							t.update(len(chunk))
			# Publish the file only once it is complete.
			partial.replace(dest)
			print(f"\nSuccessfully downloaded PDF to {dest}")
			return
		except requests.exceptions.RequestException as e:
			# Drop whatever was written before the failure.
			partial.unlink(missing_ok=True)
			print(f"Download failed: {e}")
			if attempt == max_retries - 1:
				raise
			time.sleep(2**attempt)  # exponential backoff: 1s, 2s, 4s, ...

In [6]:
# Fetch the textbook only when no local copy exists yet.
if not pdf_path.is_file():
	download_pdf_requests(
		url="https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf",
		dest=pdf_path,
	)

Downloading PDF: 26.9MiB [00:15, 1.77MiB/s]


Successfully downloaded PDF to F:\repos\python\The_Architect_Guide_To_LLM_Agent\data\human_nutrition_text.pdf





In [7]:
def text_formatter(text: str) -> str:
	"""Collapse each run of whitespace into one space and trim both ends."""
	import re

	return re.sub(r"\s+", " ", text).strip()

In [8]:
import re
from typing import Dict, List, Union
from pathlib import Path


def open_and_read_pdf(
	file_path: Union[str, Path], page_offset: int = 41
) -> Union[List[Dict], None]:
	"""
	Opens a pdf file, reads its content page by page, and collects statistics.

	Pages whose extracted text is empty or whitespace-only are skipped.

	Parameters:
	    file_path (str | Path): The path to the pdf file to be opened and read.
	    page_offset (int): Value subtracted from the zero-based page index to get the reported page number. Defaults to 41, the value that was previously hard-coded (presumably the number of front-matter pages in the target book - confirm for other PDFs).
	Returns:
	    list[dict] | None: One dictionary per non-empty page with the keys page_number, page_char_count, page_word_count, page_sentence_count_raw, page_token_count (characters / 4) and text. None if reading the file raised an error.
	Raises:
	    FileNotFoundError: If file_path does not exist.
	"""
	if not Path(file_path).exists():
		raise FileNotFoundError(f"PDF file not found: {file_path}")
	sentence_splitter = re.compile(r"[.!?]+")  # Simple sentence splitter
	try:
		pages_and_texts = []
		# The context manager closes the document even if a page fails to parse.
		with pymupdf.open(file_path) as doc:
			for page_number, page in tqdm(enumerate(doc), total=len(doc)):
				# Formatting strips the text, so whitespace-only pages become "".
				text = text_formatter(page.get_text() or "")
				if not text:  # Skip empty pages
					continue
				sentence_count = sum(
					1 for s in sentence_splitter.split(text) if s.strip()
				)  # Count non-empty sentences
				pages_and_texts.append(
					{
						"page_number": page_number - page_offset,
						"page_char_count": len(text),
						"page_word_count": len(text.split()),
						"page_sentence_count_raw": sentence_count,
						"page_token_count": int(len(text) / 4),
						"text": text,
					}
				)
		return pages_and_texts
	except Exception as e:
		# Best-effort contract kept from the original: report and return None.
		print(f"Error reading PDF file: {e}")
		return None

In [9]:
# Extract per-page text and statistics, then preview the first two records.
pages_and_texts = open_and_read_pdf(pdf_path)
if pages_and_texts:  # None or an empty list prints nothing
	print(pages_and_texts[:2])

1208it [00:00, 1802.94it/s]

[{'page_number': -41, 'page_char_count': 29, 'page_word_count': 4, 'page_sentence_count_raw': 1, 'page_token_count': 7, 'text': 'Human Nutrition: 2020 Edition'}, {'page_number': -39, 'page_char_count': 308, 'page_word_count': 42, 'page_sentence_count_raw': 1, 'page_token_count': 77, 'text': 'Human Nutrition: 2020 Edition UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN NUTRITION PROGRAM ALAN TITCHENAL, SKYLAR HARA, NOEMI ARCEO CAACBAY, WILLIAM MEINKE-LAU, YA-YUN YANG, MARIE KAINOA FIALKOWSKI REVILLA, JENNIFER DRAPER, GEMADY LANGFELDER, CHERYL GIBBY, CHYNA NICOLE CHUN, AND ALLISON CALABRESE'}]





In [None]:
import random

# open_and_read_pdf may return None or fewer than three pages; cap the sample
# size so this cell shows what is available instead of raising.
random.sample(pages_and_texts or [], k=min(3, len(pages_and_texts or [])))

In [None]:
import polars as pl

# Build a frame from the per-page records and summarise every column.
df = pl.DataFrame(pages_and_texts)
summary = df.describe()
numeric_cols = [name for name, dtype in summary.schema.items() if dtype.is_numeric()]
# Round only the numeric columns; with_columns replaces them in place and
# leaves every other column (and the column order) untouched.
summary = summary.with_columns([pl.col(name).round(2) for name in numeric_cols])
summary

## Chunking Strategy
- Fixed size Chunking

    - Chunk size is fixed
    - Typically used for social media data, where the volume is huge and unstructured and speed matters more than deep understanding

    Pros

        - Fast process with less overhead

    Cons

        - Information loss, as a chunk can start and stop at any place
        - No semantic info is captured
- Semantic chunking

    - Based on sentence similarity: sentences whose similarity is within a threshold become part of the same chunk

    Pros
        - maintains coherence
        - improves retrieved information
    
    Cons
        - High complexity and compute
        - Threshold sensitivity
        - Inconsistent chunk size

- Structural chunking
    
    - Can be combined with semantic chunking
    - If the document has a clear structure, this is the most intuitive way of chunking

    Pros
        - fast for well structured document
        - consistent and human understandable
    
    Cons
        - chunk size can be unpredictable and might become too large
        - large chunk can lead to hallucination

- Recursive Chunking

    - Exploits the document structure while making sure chunk sizes stay manageable
    - structural chunking on steroids

    Pros
        - Avoids splitting halfway, more coherent compared to fixed size chunking

    Cons
        - computational overhead
        - inconsistent chunk size

- LLM Chunking

    - Applies when every approach apart from semantic chunking has failed due to context drift (the context changes a lot within a single document)
    - The LLM understands the semantics of the complete document and chunks it based on its own logic

    Pros
        - high semantic accuracy
        - good for document with rapid context change, unstructured text
    
    Cons
        - computationally expensive
        - context window limitation
        - stochastic output



### Engineer decision thought process
- Use fixed-size chunking when simplicity and speed matter more than perfect coherence
- For structured documents or medical data, use structural chunking + recursive chunking
- For transcripts of debates, videos, or meetings, where timestamps might not be available, use semantic chunking
- If all of the above fail, use LLM chunking; **try to avoid it as much as possible** for large data.

- For example
- **Legal domain** - structured & recursive
- **Finance domain** - fixed -> semantic
- **Healthcare Data** - structured & recursive
- **Education domain** - Semantic -> LLM