In [1]:
import os
from pathlib import Path

# Project root. Defaults to the original workstation path, but can be
# redirected without editing the notebook by setting TOY_AGENT_DIR.
PROJECT_DIR = Path(os.environ.get("TOY_AGENT_DIR", r"C:\Vijay\Toy_Agent"))
os.chdir(PROJECT_DIR)

# Sorted so the listing is stable across runs and machines; glob() itself
# is not relied on for any particular order.
sorted(Path.cwd().glob("*.ipynb"))


[WindowsPath('C:/Vijay/Toy_Agent/chunk_docs.ipynb'),
 WindowsPath('C:/Vijay/Toy_Agent/embed_load_chunk.ipynb'),
 WindowsPath('C:/Vijay/Toy_Agent/ingest_normalize.ipynb'),
 WindowsPath('C:/Vijay/Toy_Agent/langgraph_rag_agent.ipynb')]

In [2]:
from pathlib import Path

# Folder (relative to the working directory) holding the raw source documents.
DOCS_DIR = Path("docs")

# Keep regular files only (sub-directories are skipped), in sorted path order.
files = [entry for entry in DOCS_DIR.iterdir() if entry.is_file()]
files.sort()

# Quick sanity check: how many files were found, plus the first ten names.
len(files), [doc_file.name for doc_file in files[:10]]


(40,
 ['antibiotic_stewardship.html',
  'antibiotic_stewardship.json',
  'antibiotic_stewardship.pdf',
  'antibiotic_stewardship.txt',
  'antibiotic_stewardship.xml',
  'asthma_action_plan.html',
  'asthma_action_plan.json',
  'asthma_action_plan.pdf',
  'asthma_action_plan.txt',
  'asthma_action_plan.xml'])

In [3]:
import json
from pypdf import PdfReader
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

def load_txt(path: Path) -> str:
    """Return the contents of a text file decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``)
    instead of raising a decode error.
    """
    with path.open("r", encoding="utf-8", errors="ignore") as handle:
        return handle.read()

def load_pdf(path: Path) -> str:
    """Extract the text of every page of a PDF, joined with newlines.

    A page whose ``extract_text()`` result is falsy (``None`` or an empty
    string) contributes an empty string, so the number of joined segments
    always equals the number of pages.
    """
    pdf = PdfReader(str(path))
    page_texts = []
    for page in pdf.pages:
        extracted = page.extract_text()
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)

def load_json(path: Path) -> str:
    """Return the text content of a JSON document.

    If the top-level value is an object with a non-empty string under
    ``"full_text"``, that string is returned. In every other case (no such
    key, an empty or non-string value, or a top-level array/scalar) the
    whole parsed document is returned pretty-printed, so the result is
    always a ``str``.

    Raises:
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # "utf-8-sig" behaves like UTF-8 but also skips a leading BOM, which
    # json.loads() would otherwise reject.
    raw = path.read_text(encoding="utf-8-sig", errors="ignore")
    data = json.loads(raw)

    # Only objects have .get(); a top-level list or scalar falls through to
    # the stringify branch instead of raising AttributeError.
    if isinstance(data, dict):
        full_text = data.get("full_text")
        if isinstance(full_text, str) and full_text:
            return full_text

    return json.dumps(data, ensure_ascii=False, indent=2)

def load_xml(path: Path) -> str:
    """Return all text content of an XML file as one space-separated string.

    Text is collected in document order from every element, including
    "tail" text that follows a child element inside its parent (for
    example the `` puffs`` in ``<p>Take <b>two</b> puffs</p>``). Each
    fragment is stripped and whitespace-only fragments are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
    """
    root = ET.parse(path).getroot()
    # itertext() yields both .text and .tail strings; iterating elements and
    # reading only .text would silently drop the tails.
    fragments = (chunk.strip() for chunk in root.itertext())
    return " ".join(fragment for fragment in fragments if fragment)

def load_html(path: Path) -> str:
    """Return the visible text of an HTML file as a single string.

    The file is decoded as UTF-8 (undecodable bytes are dropped), parsed
    with the ``html.parser`` backend, and its text fragments are stripped
    and joined with single spaces.
    """
    markup = path.read_text(encoding="utf-8", errors="ignore")
    parsed = BeautifulSoup(markup, "html.parser")
    return parsed.get_text(separator=" ", strip=True)

def load_file(path: Path) -> str:
    """Load a document as text, choosing the loader from the file extension.

    The extension is compared case-insensitively. Supported extensions are
    ``.txt``, ``.pdf``, ``.json``, ``.xml`` and ``.html``.

    Raises:
        ValueError: if the extension is not one of the supported ones.
    """
    suffix = path.suffix.lower()

    # Select the loader first, then make a single call at the end.
    if suffix == ".txt":
        reader = load_txt
    elif suffix == ".pdf":
        reader = load_pdf
    elif suffix == ".json":
        reader = load_json
    elif suffix == ".xml":
        reader = load_xml
    elif suffix == ".html":
        reader = load_html
    else:
        raise ValueError(f"Unsupported file: {path}")

    return reader(path)


In [4]:
from pathlib import Path

# Smoke-test the loader on one sample document. The path is built from
# DOCS_DIR rather than an absolute, machine-specific location, so the cell
# works wherever the project directory lives.
test = load_file(DOCS_DIR / "antibiotic_stewardship.html")
print(test[:500])


Antibiotic Stewardship: When Antibiotics Help Antibiotic Stewardship: When Antibiotics Help Fictional educational content for software testing only. Metadata created: 2025-12-29 tags: demo, rag, medical, antibiotic license: Synthetic (generated for demo) source: generated Summary This is a fictional educational document about Antibiotic Stewardship: When Antibiotics Help. It is for software testing and does not provide medical advice. Key points Symptoms and experiences can vary from person to p


In [5]:
# Build one normalized record per source file: identifying fields derived
# from the file name, the extracted text, and the path it was read from.
normalized = []

for path in files:
    text = load_file(path)

    doc = {
        # NOTE(review): the stem is shared by every format of the same
        # document (e.g. the .html/.json/.pdf/.txt/.xml variants of
        # "antibiotic_stewardship"), so doc_id alone does not identify a
        # record - confirm consumers also key on file_type or file_name.
        "doc_id": path.stem,
        "file_name": path.name,
        # Lower-cased to agree with load_file(), which dispatches on the
        # lower-cased suffix; "REPORT.PDF" is therefore recorded as "pdf".
        "file_type": path.suffix.lower().replace(".", ""),
        # Human-readable title: underscores become spaces, words capitalised.
        "title": path.stem.replace("_", " ").title(),
        "text": text,
        "metadata": {
            "source_path": str(path)
        }
    }
    normalized.append(doc)

# Sanity check: record count, first file name, and the start of its text.
len(normalized), normalized[0]["file_name"], normalized[0]["text"][:300]


(40,
 'antibiotic_stewardship.html',
 'Antibiotic Stewardship: When Antibiotics Help Antibiotic Stewardship: When Antibiotics Help Fictional educational content for software testing only. Metadata created: 2025-12-29 tags: demo, rag, medical, antibiotic license: Synthetic (generated for demo) source: generated Summary This is a fictional')

In [6]:
import json

# Destination: normalized/docs.jsonl under the current working directory.
# The folder is created on first run and reused afterwards.
out_dir = Path("normalized")
out_dir.mkdir(exist_ok=True)
out_path = out_dir / "docs.jsonl"

# JSON Lines: one record per line, with non-ASCII characters written as-is.
with out_path.open("w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in normalized
    )

out_path.as_posix()


'normalized/docs.jsonl'