In [1]:
from pathlib import Path
import json

docs_path = Path("normalized/docs.jsonl")

# Load one JSON document per non-blank line.
# Iterate the file handle instead of read_text().splitlines(): str.splitlines()
# also splits on U+2028, U+2029, U+0085, \x1c-\x1e, \x0b and \x0c, which may
# appear unescaped inside a JSON string (the writer in this notebook uses
# ensure_ascii=False) and would cut a record in half. File iteration splits
# only on \n, \r and \r\n, which JSON always escapes inside strings.
with docs_path.open(encoding="utf-8") as f:
    docs = [json.loads(line) for line in f if line.strip()]

# Quick sanity check: number of documents and the first title.
len(docs), docs[0]["title"]


(40, 'Antibiotic Stewardship')

In [2]:
CHUNK_SIZE = 800      # default maximum number of characters per chunk
CHUNK_OVERLAP = 120   # default number of characters shared by consecutive chunks

def chunk_text(text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split ``text`` into overlapping fixed-size character windows.

    Each chunk is at most ``chunk_size`` characters long. Every chunk after the
    first starts ``overlap`` characters before the end of the previous one. The
    final chunk ends exactly at the end of ``text`` and may be shorter.

    Args:
        text: The string to split.
        chunk_size: Maximum characters per chunk; must be positive.
        overlap: Characters repeated between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Without these guards the window never advances (overlap >= chunk_size or
    # chunk_size <= 0 loops forever) or skips characters (negative overlap).
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got overlap={overlap}, chunk_size={chunk_size}"
        )

    chunks = []
    start = 0
    n = len(text)
    while start < n:
        end = min(start + chunk_size, n)
        chunks.append(text[start:end])
        if end == n:
            # The window reached the end of the text; stop so no trailing
            # chunk is emitted that is wholly contained in this one.
            break
        # end == start + chunk_size here, so end - overlap > start >= 0 and the
        # window always moves forward.
        start = end - overlap
    return chunks


In [3]:
# Flatten every document into chunk records. Each record carries the parent
# document's identifiers and a chunk_id of the form "<doc_id>::<index>", where
# index is the chunk's position within its document.
chunks = [
    {
        "chunk_id": f"{doc['doc_id']}::{idx}",
        "doc_id": doc["doc_id"],
        "title": doc["title"],
        "text": piece,
        # Document metadata extended with the source file's type and name.
        "metadata": doc["metadata"] | {"file_type": doc["file_type"], "file_name": doc["file_name"]},
    }
    for doc in docs
    for idx, piece in enumerate(chunk_text(doc["text"]))
]

# Preview: total chunk count, first chunk id, and the start of its text.
len(chunks), chunks[0]["chunk_id"], chunks[0]["text"][:200]


(82,
 'antibiotic_stewardship::0',
 'Antibiotic Stewardship: When Antibiotics Help Antibiotic Stewardship: When Antibiotics Help Fictional educational content for software testing only. Metadata created: 2025-12-29 tags: demo, rag, medic')

In [None]:
out_dir = Path("normalized")
out_path = out_dir / "chunks.jsonl"

# Serialise one chunk record per line (JSON Lines), keeping non-ASCII
# characters as-is rather than \u-escaping them.
with out_path.open("w", encoding="utf-8") as sink:
    sink.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in chunks)

# Show where the file was written, using forward slashes.
out_path.as_posix()


'normalized/chunks.jsonl'

: 