
# RAG — Micro Ingest (PDF → Chunks → JSONL → Embeddings)

This notebook is a **smoke test** for our RAG pipeline's text path.
You will:
1. Verify GPU availability (optional).
2. Parse a PDF into clean text **chunks**.
3. Save chunks to **JSONL** (`data/processed/<doc_id>.jsonl`).
4. Generate **embeddings** for those chunks (GPU if available, otherwise CPU).

> Place a sample PDF in `data/raw/` before running **Step 3**.


In [1]:
import platform
import torch

print("torch:", torch.__version__, "| python:", platform.python_version())
print("cuda available:", torch.cuda.is_available())
print("torch cuda runtime:", torch.version.cuda)
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))

torch: 2.5.1 | python: 3.11.13
cuda available: True
torch cuda runtime: 12.4
device: NVIDIA GeForce RTX 4050 Laptop GPU


In [2]:
# --- Path resolution that works from repo root OR notebooks/ ---
from pathlib import Path

def find_project_root(start: Path, marker: str = "data", max_hops: int = 4) -> Path:
    """Walk upwards from `start` until a folder named `marker` exists."""
    p = start.resolve()
    for _ in range(max_hops):
        if (p / marker).exists():
            return p
        if p.parent == p:
            break
        p = p.parent
    raise FileNotFoundError(f"Could not find '{marker}/' upwards from {start}")

PROJECT_ROOT = find_project_root(Path.cwd(), marker="data")
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR  = DATA_DIR / "raw"
PROC_DIR = DATA_DIR / "processed"

print("project_root :", PROJECT_ROOT)
print("data_dir     :", DATA_DIR)
print("raw_dir      :", RAW_DIR)
print("processed_dir:", PROC_DIR)

# quick sanity check: list PDFs you’ve placed in data/raw
pdfs = sorted(RAW_DIR.glob("*.pdf"))
print(f"found {len(pdfs)} pdf(s):", [p.name for p in pdfs[:5]])

project_root : D:\Silicon Valley\Phase 1 - Aug to Sep\Projects\3. RAG Chatbot\RAG_Chatbot
data_dir     : D:\Silicon Valley\Phase 1 - Aug to Sep\Projects\3. RAG Chatbot\RAG_Chatbot\data
raw_dir      : D:\Silicon Valley\Phase 1 - Aug to Sep\Projects\3. RAG Chatbot\RAG_Chatbot\data\raw
processed_dir: D:\Silicon Valley\Phase 1 - Aug to Sep\Projects\3. RAG Chatbot\RAG_Chatbot\data\processed
found 1 pdf(s): ['sample.pdf']


In [3]:
# --- Clean text: normalize unicode + collapse whitespace ---
import re, unicodedata

_WS_RE = re.compile(r"\s+")          # collapse any whitespace run
_SOFT_HYPHEN = "\u00AD"              # invisible hyphen inserted in PDFs
_BOM = "\ufeff"                      # byte-order mark
_NBSP = "\u00A0"                     # non-breaking space

def clean_text(s: str) -> str:
    if not s:
        return ""
    # remove BOM/soft-hyphen and normalize unicode (compatibility form)
    s = s.replace(_BOM, "").replace(_SOFT_HYPHEN, "")
    s = unicodedata.normalize("NFKC", s)
    # turn NBSP into a normal space, then collapse whitespace
    s = s.replace(_NBSP, " ")
    s = _WS_RE.sub(" ", s)
    return s.strip()

In [6]:
raw = "Title\u00A0 \n\n  — Intro\u00ADduction \t\t page 1 \ufeff"
print("RAW:     ", repr(raw))
print("CLEANED: ", repr(clean_text(raw)))

RAW:      'Title\xa0 \n\n  — Intro\xadduction \t\t page 1 \ufeff'
CLEANED:  'Title — Introduction page 1'


In [7]:
# --- Chunk text into overlapping word windows ---
from typing import List

CHUNK_MAX_WORDS   = 220   # ~300-350 tokens for MiniLM-like models
CHUNK_OVERLAP     = 40    # keep continuity across boundaries
MIN_CHUNK_WORDS   = 25    # drop tiny fragments (headers, junk)

def chunk_text(text: str,
               max_words: int = CHUNK_MAX_WORDS,
               overlap: int = CHUNK_OVERLAP,
               min_words: int = MIN_CHUNK_WORDS) -> List[str]:

    words = text.split()
    if not words:
        return []
    step = max(max_words - overlap, 1)   # avoid infinite loop
    chunks, i = [], 0
    while i < len(words):
        piece = " ".join(words[i:i+max_words]).strip()
        if piece and piece.count(" ") + 1 >= min_words:
            chunks.append(piece)
        i += step
    return chunks

In [14]:
_demo = (
    "RAG combines retrieval with generation. The retriever fetches relevant passages, "
    "and the generator (LLM) answers using those passages as grounded context. "
    "Overlapping windows help preserve coherence near boundaries."
) * 5  # repeat to force multiple chunks

demo_chunks = chunk_text(_demo, max_words=30, overlap=8, min_words=10)
print("chunks:", len(demo_chunks))
for j, c in enumerate(demo_chunks[:3], 1):
    print(f"[{j}] {c[:90]}...")

chunks: 6
[1] RAG combines retrieval with generation. The retriever fetches relevant passages, and the g...
[2] windows help preserve coherence near boundaries.RAG combines retrieval with generation. Th...
[3] passages as grounded context. Overlapping windows help preserve coherence near boundaries....


In [16]:
from typing import List, Dict, Any
import fitz  # PyMuPDF


def parse_pdf_to_chunks(pdf_path: Path) -> Dict[str, Any]:
    """Read a PDF and return {'document_id', 'chunks':[...]}."""
    assert pdf_path.exists(), f"PDF not found: {pdf_path}"
    import uuid
    doc_id = str(uuid.uuid4())

    pdf = fitz.open(str(pdf_path))
    chunks: List[Dict[str, Any]] = []

    for page_idx in range(len(pdf)):
        page = pdf.load_page(page_idx)
        text = clean_text(page.get_text("text"))
        if not text:
            # likely scanned; we’ll add OCR later
            continue
        for part in chunk_text(text):
            chunks.append({
                "chunk_id": str(uuid.uuid4()),
                "document_id": doc_id,
                "page_no": page_idx + 1,
                "section_path": None,   # we’ll fill this when we add heading-aware
                "text": part,
                "tokens_est": round(len(part.split()) * 1.33),
            })
    return {"document_id": doc_id, "chunks": chunks}

In [21]:
# pick first PDF in data/raw or set PDF_PATH explicitly
pdfs = sorted(RAW_DIR.glob("*.pdf"))
assert pdfs, "No PDFs in data/raw. Add one and re-run."
PDF_PATH = pdfs[0]
record = parse_pdf_to_chunks(PDF_PATH)

print("document_id:", record["document_id"], "| total chunks:", len(record["chunks"]))
print("sample chunk ->")
print(record["chunks"][0]["text"], "...")

document_id: d697a0be-041a-4cfb-8957-353df6e320c3 | total chunks: 6
sample chunk ->
WORK EXPERIENCE  Upwork & Fiverr - Remote (Part-Time) Freelance Data Scientist - [ 09/2021 – Current ] • For Humans (Germany): Architected a fully automated Shopify analytics pipeline (Python, Pandas, Streamlit) that ingests daily sales data, calculates 15+ KPIs (CLV, CAC, repeat rate), and delivers real-time interactive dashboards—slashing report generation time by 90% and empowering data-driven product decisions. • Auto Finance Canada (Canada): Designed and deployed an automated data integrity system (Python, Google Sheets API) for over 10M customer records. The system utilized machine learning-based anomaly detection alongside rule-based validation, reducing manual data preparation by 80% while ensuring regulatory compliance. • MaineWorks & YourModernAccountant: Developed advanced Power-BI & Looker Studio dashboards (SQL , Python) to monitor workforce reintegration success metrics and financial KPIs

In [22]:
import json

def write_chunks_jsonl(record, out_dir: Path = PROC_DIR) -> Path:
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{record['document_id']}.jsonl"
    with out_path.open("w", encoding="utf-8") as f:
        for ch in record["chunks"]:
            f.write(json.dumps(ch, ensure_ascii=False) + "\n")
    return out_path

out_path = write_chunks_jsonl(record)
print("wrote:", out_path)

wrote: D:\Silicon Valley\Phase 1 - Aug to Sep\Projects\3. RAG Chatbot\RAG_Chatbot\data\processed\d697a0be-041a-4cfb-8957-353df6e320c3.jsonl
