
# RAG — Micro Ingest (PDF → Chunks → JSONL → Embeddings)

This notebook is a **smoke test** for our RAG pipeline's text path.
You will:
1. Verify GPU availability (optional).
2. Parse a PDF into clean text **chunks**.
3. Save chunks to **JSONL** (`data/processed/<doc_id>.jsonl`).
4. Generate **embeddings** for those chunks (GPU if available, otherwise CPU).

> Place a sample PDF in `data/raw/` before running **Step 3**.


In [1]:
import platform
# Environment probe: print the PyTorch and Python versions, whether CUDA is
# usable, the CUDA runtime torch reports, and the name of device 0 when CUDA
# is available. Any failure (including a missing torch install) is printed
# as a message rather than raised, so the rest of the notebook can still run.
try:
    import torch
    print("torch:", torch.__version__, "| python:", platform.python_version())
    has_cuda = torch.cuda.is_available()
    print("cuda available:", has_cuda)
    print("torch cuda runtime:", torch.version.cuda)
    if has_cuda:
        gpu_name = torch.cuda.get_device_name(0)
        print("device:", gpu_name)
except Exception as err:
    print("PyTorch not available in this environment:", err)

torch: 2.5.1 | python: 3.11.13
cuda available: True
torch cuda runtime: 12.4
device: NVIDIA GeForce RTX 4050 Laptop GPU


In [3]:
import os, re, uuid, json, time
from pathlib import Path
from typing import List, Dict, Any

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer

# Data layout, resolved relative to the current working directory:
#   DATA_DIR  - root data folder
#   RAW_DIR   - "raw" subfolder of DATA_DIR
#   PROC_DIR  - "processed" subfolder of DATA_DIR
DATA_DIR = Path("../data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Create the processed folder (and any missing parents) up front; an
# already-existing folder is not an error.
PROC_DIR.mkdir(parents=True, exist_ok=True)

def clean_text(s: str) -> str:
    """Collapse every run of whitespace to one space and trim both ends.

    A falsy input (``None`` or the empty string) yields ``""``.
    """
    raw = s if s else ""
    collapsed = re.sub(r"\s+", " ", raw)
    return collapsed.strip()

def chunk_text(text: str, max_words: int = 220, overlap: int = 40) -> List[str]:
    """Split text into overlapping windows of whitespace-separated words.

    Each chunk holds at most ``max_words`` words, and consecutive chunks
    share ``overlap`` words. Chunking stops as soon as a window reaches the
    last word, so no trailing chunk is emitted that consists only of words
    already contained in the previous chunk.

    Args:
        text: Input text; it is split on any whitespace.
        max_words: Maximum number of words per chunk. Must be at least 1.
        overlap: Number of words shared by consecutive chunks. Negative
            values are treated as 0 (so no words are skipped between
            chunks); values >= ``max_words`` fall back to advancing one
            word at a time.

    Returns:
        The chunks, each a single-space-joined string of words. Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words}")

    words = text.split()
    # Always advance by at least one word so the loop terminates, and never
    # by more than max_words so no words fall between two windows.
    step = max(max_words - max(overlap, 0), 1)

    chunks: List[str] = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_words]))
        if start + max_words >= len(words):
            # This window already covers the tail; another window would
            # contain nothing but repeated overlap words.
            break
    return chunks

def parse_pdf_to_chunks(pdf_path: Path) -> Dict[str, Any]:
    """Read a PDF and return its text as page-tagged chunks.

    Every page's text is extracted, whitespace-normalized with
    ``clean_text`` and split with ``chunk_text``. Pages that yield no text
    are skipped.

    Args:
        pdf_path: Location of the PDF file to parse.

    Returns:
        A dict with two keys:
          - ``"document_id"``: a freshly generated UUID4 string.
          - ``"chunks"``: a list of dicts, one per chunk, with the keys
            ``chunk_id`` (UUID4 string), ``document_id``, ``page_no``
            (1-based), ``section_path`` (always ``None`` here), ``text``
            and ``tokens_est``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    doc_id = str(uuid.uuid4())
    chunks: List[Dict[str, Any]] = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(str(pdf_path)) as pdf:
        for page_no, page in enumerate(pdf, start=1):
            text = clean_text(page.get_text("text"))
            if not text:
                # Scanned page (no text) — OCR will be added later
                continue
            for part in chunk_text(text):
                chunks.append({
                    "chunk_id": str(uuid.uuid4()),
                    "document_id": doc_id,
                    "page_no": page_no,
                    "section_path": None,
                    "text": part,
                    # Rough token estimate: word count scaled by 1.33.
                    "tokens_est": round(len(part.split()) * 1.33),
                })
    return {"document_id": doc_id, "chunks": chunks}

def write_chunks_jsonl(record: Dict[str, Any], out_dir: Path = PROC_DIR) -> Path:
    """Write a record's chunks to ``<out_dir>/<document_id>.jsonl``.

    The output directory is created if missing. Each entry of
    ``record["chunks"]`` becomes one UTF-8 JSON line (non-ASCII characters
    are written as-is, not escaped). An existing file is overwritten.

    Returns:
        The path of the file that was written.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir.joinpath(f"{record['document_id']}.jsonl")
    with target.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(chunk, ensure_ascii=False) + "\n"
            for chunk in record["chunks"]
        )
    return target

def embed_jsonl(jsonl_path: Path, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """Embed the ``"text"`` field of every line in a JSONL file.

    The model runs on ``"cuda"`` when torch imports and reports CUDA as
    available, otherwise on ``"cpu"``. Embeddings are requested normalized
    and as a NumPy array; the resulting shape and the device used are
    printed before the array is returned.
    """
    # Pick the device: any problem importing or querying torch means CPU.
    try:
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
    except Exception:
        device = "cpu"

    model = SentenceTransformer(model_name, device=device)

    # One JSON object per line; only its "text" value is embedded.
    with jsonl_path.open("r", encoding="utf-8") as src:
        texts = [json.loads(row)["text"] for row in src]

    vecs = model.encode(
        texts,
        batch_size=64,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    print(f"Embeddings shape: {vecs.shape} | device: {device}")
    return vecs