In [2]:

from pathlib import Path
import hashlib, yaml, pandas as pd, json, re
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document
import fitz  # PyMuPDF for TOC + page spans

# --- Notebook configuration -------------------------------------------------
# Folder holding the source PDFs.
# NOTE(review): this is an absolute, machine-specific location; anyone running
# the notebook elsewhere has to edit it.
DATA_DIR = Path("X:/ML Projects/News RAG/data/pdfs")

# User-maintained metadata file, resolved relative to the working directory.
META_YAML = Path("./books_meta.yaml")

# Output folder for generated artifacts; created if missing, untouched if it
# already exists.
ARTIFACTS = Path("./artifacts")
ARTIFACTS.mkdir(exist_ok=True)

def doc_id_for(path: Path)->str:
    """Return a short, content-derived identifier for the file at `path`.

    The file is streamed through SHA-256 in 1 MiB reads (so large PDFs are
    never held in memory at once) and the first 16 hex characters of the
    digest are returned.
    """
    read_size = 1 << 20
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(read_size)
            if block == b"":
                # read() returns empty bytes at end of file
                break
            digest.update(block)
    hex_digest = digest.hexdigest()
    return hex_digest[:16]

# load user-supplied metadata (title/author/publisher/…)
def load_meta_yaml(path: Path)->pd.DataFrame:
    """Read the user-supplied metadata YAML into a DataFrame.

    Returns an empty DataFrame when `path` does not exist. Otherwise the file
    is read as UTF-8, parsed with ``yaml.safe_load`` and handed to
    ``pd.DataFrame`` unchanged; no column validation is done here.
    """
    if path.exists():
        yaml_text = path.read_text(encoding="utf-8")
        parsed = yaml.safe_load(yaml_text)
        # must include at least one of filename or doc_id to match
        return pd.DataFrame(parsed)
    return pd.DataFrame()

# Module-level metadata table, loaded once when this cell runs; enrich_meta
# reads it as a global.
meta_df = load_meta_yaml(META_YAML)

def enrich_meta(basic: dict, meta: "pd.DataFrame | None" = None)->dict:
    """Merge loader metadata with the matching user-metadata row.

    A row matches when its ``filename`` equals the file name of
    ``basic["source"]`` or its ``doc_id`` equals ``basic["doc_id"]``. The first
    matching row wins and its non-missing fields override those in `basic`.

    Args:
        basic: metadata dict produced by the loader; must contain "source".
        meta: table to look up in. Defaults to the module-level ``meta_df``.

    Returns:
        A new dict; `basic` itself is not modified.
    """
    table = meta_df if meta is None else meta
    if table is None or table.empty:
        return dict(basic)

    fname = Path(basic["source"]).name
    # Start from an all-False mask and OR in only the key columns that exist,
    # so a table lacking "filename" and/or "doc_id" simply matches nothing
    # instead of indexing the frame with a scalar.
    mask = pd.Series(False, index=table.index)
    if "filename" in table.columns:
        mask |= table["filename"] == fname
    if "doc_id" in table.columns:
        mask |= table["doc_id"] == basic.get("doc_id")

    hits = table[mask]
    if hits.empty:
        return dict(basic)

    # Fields absent from this YAML entry show up as NaN/None in the frame;
    # drop them so they cannot overwrite real loader values (e.g. doc_id).
    row = hits.iloc[0].dropna().to_dict()
    return {**basic, **row}

def load_pages(path: Path)->list[Document]:
    """Load a PDF as page-level LangChain documents with normalized metadata.

    Every returned document's metadata carries ``doc_id`` (content hash of the
    file), ``source`` (the path as a string), ``page`` (loader value, 0 when
    absent) and ``title`` (loader value, else the file stem), and is then
    passed through ``enrich_meta``.
    """
    source = str(path)
    page_docs = PyMuPDFLoader(source).load()
    file_id = doc_id_for(path)
    for page_doc in page_docs:
        meta = page_doc.metadata
        # standardize metadata (same keys, same insertion order as before)
        meta["doc_id"] = file_id
        meta["source"] = source
        meta["page"] = meta.get("page", 0)
        meta["title"] = meta.get("title") or path.stem
        page_doc.metadata = enrich_meta(meta)
    return page_docs

def build_toc_coarse(path: Path, doc_id: str, fallback_step: int = 2)->list[dict]:
    """Coarse segments by TOC (chapters/sections), else fixed page groups.

    Each TOC entry becomes one segment running from its own page up to the
    page on which the next entry starts (the last entry runs to the final
    page). When the PDF has no TOC, pages are grouped `fallback_step` at a
    time.

    Args:
        path: PDF file to open with PyMuPDF.
        doc_id: identifier stored on each segment and used as the prefix of
            its ``coarse_id``.
        fallback_step: pages per segment when there is no TOC (default 2;
            values below 1 are treated as 1).

    Returns:
        List of dicts with keys ``title``, ``start_page``, ``end_page``
        (0-based, inclusive), ``doc_id`` and ``coarse_id``.
    """
    # The context manager closes the document even if get_toc() raises.
    with fitz.open(path) as pdf:
        pages = len(pdf)
        toc = pdf.get_toc(simple=True)  # [level, title, page1-based]

    last_page = max(pages - 1, 0)
    segs = []
    if toc:
        for i, (_lvl, title, p1) in enumerate(toc):
            start = min(max(p1 - 1, 0), last_page)
            next_start = (toc[i + 1][2] - 1) if i + 1 < len(toc) else last_page
            # TOC entries can be out of page order or carry no valid page
            # (e.g. -1); never let a segment end before it starts or run
            # past the document.
            end = min(max(next_start, start), last_page)
            segs.append({"title": title.strip(), "start_page": start, "end_page": end})
    else:
        step = max(int(fallback_step), 1)
        for p in range(0, pages, step):
            segs.append({
                "title": f"Section p{p+1}-{min(p+step,pages)}",
                "start_page": p,
                "end_page": min(p + step - 1, pages - 1),
            })
    for s in segs:
        s["doc_id"] = doc_id
        s["coarse_id"] = f"{doc_id}:{s['start_page']}-{s['end_page']}"
    return segs


In [4]:
import spacy, itertools
# Shared spaCy pipeline for entity extraction. The parser, tagger and
# lemmatizer components are switched off at load time; the code below only
# reads `doc.ents`.
# NOTE(review): the saved output for this cell is an ImportError
# (ModelMetaclass from pydantic.main), so this cell may not have run in the
# recorded environment. Presumably a spaCy/pydantic version mismatch; confirm
# the installed versions before relying on `nlp`.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "tagger", "lemmatizer"],
)
# Allow single texts of up to 2,000,000 characters.
nlp.max_length = 2_000_000

def extract_ner_batch(texts: list[str])->list[dict]:
    """Extract named entities for each text with the module-level `nlp`.

    Texts are streamed through ``nlp.pipe`` in batches of 32. The result has
    one dict per input text, mapping each entity label to the sorted list of
    distinct entity strings found under that label.
    """
    per_text = []
    for parsed in nlp.pipe(texts, batch_size=32):
        by_label = {}
        for ent in parsed.ents:
            label = ent.label_
            if label not in by_label:
                by_label[label] = set()
            by_label[label].add(ent.text)
        # sets dedupe; sorting makes the output deterministic
        per_text.append({label: sorted(found) for label, found in by_label.items()})
    return per_text


ImportError: cannot import name 'ModelMetaclass' from 'pydantic.main' (C:\Users\HP\AppData\Local\Programs\Python\Python311\Lib\site-packages\pydantic\main.py)