In [1]:
from pathlib import Path

DATA_DIR = Path("X:/ML Projects/News RAG/data/pdfs")   # <- put your folder here
ARTIFACTS = Path("./artifacts"); ARTIFACTS.mkdir(exist_ok=True)

# chunk sizes tuned for history/politics prose
FINE_CHUNK_CHARS   = 1800   # ~450 tokens
FINE_OVERLAP_CHARS = 300    # ~75 tokens
COARSE_MIN_PAGES   = 5      # fallback “chapter” size when TOC missing

EMB_MODEL_TEXT  = "BAAI/bge-large-en-v1.5"    # passages
EMB_MODEL_QUERY = "BAAI/bge-large-en-v1.5"    # queries (bge works for both)
RERANK_MODEL    = "BAAI/bge-reranker-large"   # cross-encoder for re-ranking

INDEX_DIM = 1024  # bge-large-en is 1024-d


In [2]:
import fitz, json, hashlib, re
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional

@dataclass
class DocMeta:
    doc_id: str
    title: str
    author: Optional[str]
    source_path: str
    pages: int
    creationdate: Optional[str]
    moddate: Optional[str]

@dataclass
class CoarseSeg:
    coarse_id: str
    doc_id: str
    title: str         # chapter/headline or synthetic
    page_start: int
    page_end: int
    text: str

def pdf_sha(path: str) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1<<20), b""):
            h.update(chunk)
    return h.hexdigest()[:16]

def extract_pdf(path: Path):
    doc = fitz.open(path)
    md = doc.metadata or {}
    meta = DocMeta(
        doc_id = pdf_sha(str(path)),
        title  = md.get("title") or path.stem,
        author = md.get("author"),
        source_path = str(path),
        pages  = len(doc),
        creationdate = md.get("creationDate"),
        moddate = md.get("modDate"),
    )
    # full text per page
    pages = [doc[i].get_text("text") for i in range(len(doc))]
    # try TOC
    toc = doc.get_toc(simple=True)  # list of [level, title, page]
    coarse: List[CoarseSeg] = []
    if toc:
        # convert TOC to page spans
        entries = []
        for i, (lvl, title, page1) in enumerate(toc):
            page0 = max(page1-1, 0)
            page2 = (toc[i+1][2]-1) if i+1 < len(toc) else len(doc)-1
            if page2 < page0: continue
            entries.append((title.strip(), page0, page2))
        for title, p0, p1 in entries:
            text = "\n".join(pages[p0:p1+1]).strip()
            if not text: continue
            coarse.append(CoarseSeg(
                coarse_id=f"{meta.doc_id}:c:{p0}-{p1}",
                doc_id=meta.doc_id, title=title, page_start=p0, page_end=p1, text=text
            ))
    else:
        # fallback coarse segments by fixed windows
        p = 0
        while p < len(pages):
            q = min(p+COARSE_MIN_PAGES-1, len(pages)-1)
            text = "\n".join(pages[p:q+1]).strip()
            if text:
                coarse.append(CoarseSeg(
                    coarse_id=f"{meta.doc_id}:c:{p}-{q}",
                    doc_id=meta.doc_id, title=f"Section p{p+1}-{q+1}", page_start=p, page_end=q, text=text
                ))
            p = q+1
    doc.close()
    return meta, pages, coarse


In [3]:
def split_recursive(text: str, max_chars=FINE_CHUNK_CHARS, overlap=FINE_OVERLAP_CHARS):
    seps = ["\n\n", "\n", ". "]
    chunks = [text]
    for sep in seps:
        new = []
        for c in chunks:
            if len(c) <= max_chars: new.append(c); continue
            parts = c.split(sep)
            buf = ""
            for p in parts:
                piece = (p + (sep if c.find(sep)>=0 else ""))
                if len(buf) + len(piece) > max_chars and buf:
                    new.append(buf.strip())
                    # start new with overlap tail
                    buf = buf[-overlap:] + piece
                else:
                    buf += piece
            if buf.strip():
                new.append(buf.strip())
        chunks = new
    # final hard wrap
    final=[]
    for c in chunks:
        s=0
        while s < len(c):
            final.append(c[s:s+max_chars])
            s += (max_chars - overlap)
    return [x.strip() for x in final if x.strip()]


In [4]:
from tqdm import tqdm

def build_corpora(pdf_dir: Path):
    docs_meta: Dict[str, DocMeta] = {}
    coarse_segments: List[CoarseSeg] = []
    fine_chunks = []   # dicts with metadata

    for pdf in tqdm(sorted(pdf_dir.glob("**/*.pdf"))):
        meta, pages, coarse = extract_pdf(pdf)
        docs_meta[meta.doc_id] = meta
        coarse_segments.extend(coarse)
        # fine chunks (anchored in coarse)
        for seg in coarse:
            for i, chunk in enumerate(split_recursive(seg.text)):
                fine_chunks.append({
                    "fine_id": f"{seg.coarse_id}:f:{i}",
                    "doc_id": meta.doc_id,
                    "coarse_id": seg.coarse_id,
                    "title": seg.title,
                    "page_start": seg.page_start,
                    "page_end": seg.page_end,
                    "text": chunk,
                    # room for future enrichment
                    "year": None, "author": meta.author, "source_path": meta.source_path
                })
    return docs_meta, coarse_segments, fine_chunks

docs_meta, coarse_segments, fine_chunks = build_corpora(DATA_DIR)

print(f"Docs: {len(docs_meta)} | Coarse segs: {len(coarse_segments)} | Fine chunks: {len(fine_chunks)}")


100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [01:29<00:00,  1.03s/it]

Docs: 29 | Coarse segs: 5397 | Fine chunks: 153375





In [5]:
import numpy as np
from sentence_transformers import SentenceTransformer

PASSAGE_PREFIX = "passage: "
QUERY_PREFIX   = "query: "

emb_text = SentenceTransformer(EMB_MODEL_TEXT, device="cpu")   # switch to 'cuda' if available

def embed_passages(texts: list[str]) -> np.ndarray:
    return emb_text.encode([PASSAGE_PREFIX + t for t in texts],
                           normalize_embeddings=True, batch_size=64, convert_to_numpy=True)

def embed_queries(texts: list[str]) -> np.ndarray:
    return emb_text.encode([QUERY_PREFIX + t for t in texts],
                           normalize_embeddings=True, batch_size=16, convert_to_numpy=True)


W1107 15:54:20.280000 25184 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
import faiss
from rank_bm25 import BM25Okapi
# from rapidfuzz.string_metric import levenshtein
import re

def tokenize(text):
    # simple tokenizer for BM25 (can swap to nltk/spacy later)
    return re.findall(r"[A-Za-z0-9\-]+", text.lower())

# ---- Coarse index ----
coarse_texts = [c.text for c in coarse_segments]
coarse_vecs  = embed_passages(coarse_texts).astype('float32')
index_coarse = faiss.IndexHNSWFlat(INDEX_DIM, 64)  # M=64
index_coarse.hnsw.efConstruction = 200
index_coarse.add(coarse_vecs)

bm25_coarse = BM25Okapi([tokenize(t) for t in coarse_texts])

# ---- Fine index ----
fine_texts = [c["text"] for c in fine_chunks]
fine_vecs  = embed_passages(fine_texts).astype('float32')
index_fine = faiss.IndexHNSWFlat(INDEX_DIM, 64)
index_fine.hnsw.efConstruction = 200
index_fine.add(fine_vecs)

bm25_fine = BM25Okapi([tokenize(t) for t in fine_texts])


In [12]:
import sys
!pip install rapidfuzz

