In [13]:
PDF_FILE = "WHO.pdf"   # path to the local PDF that main() passes to the extractor


In [14]:
#!/usr/bin/env python3
"""
WHO Triage Guidelines Extractor
- Parses WHO Adult Triage & Treatment PDF
- Extracts condition/symptom -> triage level -> action
- Saves structured JSON
"""

import re, json, pdfplumber
from pathlib import Path
from datetime import datetime
import hashlib

# ---------- Config ----------
# Path of the JSON file that main() writes the extracted guidelines to.
OUTPUT_FILE = "who_guidelines.json"

# ---------- Helpers ----------
def generate_id(text: str) -> str:
    """Return a short, deterministic identifier for *text*.

    The identifier is the first 12 hexadecimal characters of the MD5 digest
    of the text (encoded with the default UTF-8 codec). MD5 is used here only
    as a fingerprint, not for any security purpose.
    """
    digest = hashlib.md5(text.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:12]

# Ordered (level, compiled pattern) pairs. Order matters: the first pattern
# that matches wins when a line mentions keywords from several levels.
# Compiled once at import time instead of on every call.
_TRIAGE_PATTERNS = (
    ("Level 1", re.compile(r"\b(Red|Immediate|Resuscitation)\b", re.I)),
    ("Level 2", re.compile(r"\b(Orange|Emergency)\b", re.I)),
    # The negative lookbehind stops "Non-urgent" / "Non urgent" from being
    # read as "Urgent" (the hyphen/space creates a word boundary before the
    # "u"), which would otherwise shadow the Level 5 pattern below.
    ("Level 3", re.compile(r"\b(Yellow|(?<!non[- ])Urgent)\b", re.I)),
    ("Level 4", re.compile(r"\b(Green|Standard)\b", re.I)),
    ("Level 5", re.compile(r"\b(Blue|Non[- ]?urgent)\b", re.I)),
)


def detect_triage_level(text: str) -> str:
    """Detect the triage level named by keywords in *text*.

    Matching is case-insensitive and on whole words. Levels are checked from
    "Level 1" to "Level 5" and the first level whose keyword appears is
    returned.

    Args:
        text: A single line (or any snippet) of guideline text.

    Returns:
        "Level 1" .. "Level 5", or an empty string when no triage keyword
        is present.
    """
    for level, pattern in _TRIAGE_PATTERNS:
        if pattern.search(text):
            return level
    return ""

# ---------- Main Extraction ----------
def extract_guidelines_from_pdf(path: str):
    """Extract triage-keyword lines from a PDF as guideline records.

    Every non-blank text line of every page is passed to
    ``detect_triage_level``; lines that mention a triage keyword become one
    record each. A line whose text was already recorded (for example a
    header or footer repeated on every page) is skipped, so the ``id``
    values in the result are unique.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A list of dicts with the keys ``id``, ``text``, ``metadata``
        (``symptoms``, ``diagnosis``, ``recommendation`` left empty, plus
        the detected ``level``), ``source`` (file name of *path*) and
        ``last_updated`` (ISO timestamp of this extraction run).
    """
    guidelines = []
    seen_ids = set()

    # Loop-invariant values: computed once so every record produced by this
    # run carries the same source name and the same timestamp.
    source_name = Path(path).name
    extracted_at = datetime.now().isoformat()

    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # `or ""` guards against a falsy result for pages without text.
            text = page.extract_text() or ""

            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                triage_level = detect_triage_level(line)
                if not triage_level:
                    continue

                # The id is derived from the line text only, so identical
                # lines would otherwise yield duplicate ids/records.
                guideline_id = generate_id(line)
                if guideline_id in seen_ids:
                    continue
                seen_ids.add(guideline_id)

                guidelines.append({
                    "id": guideline_id,
                    "text": line,
                    "metadata": {
                        "symptoms": [],
                        "diagnosis": [],
                        "recommendation": [],
                        "level": triage_level,
                    },
                    "source": source_name,
                    "last_updated": extracted_at,
                })
    return guidelines

# ---------- Runner ----------
def main():
    """Run the extraction on PDF_FILE and save the result to OUTPUT_FILE.

    Prints progress messages, extracts the guideline records from the PDF,
    and writes them as indented UTF-8 JSON (non-ASCII characters are kept
    as-is rather than escaped).
    """
    print(f"[INFO] Processing {PDF_FILE}...")
    rules = extract_guidelines_from_pdf(PDF_FILE)
    print(f"[INFO] Extracted {len(rules)} rules.")

    with open(OUTPUT_FILE, "w", encoding="utf-8") as out:
        out.write(json.dumps(rules, indent=2, ensure_ascii=False))

    print(f"[INFO] Guidelines saved to {OUTPUT_FILE}")

# Run the extraction only when executed as a script (or directly in a
# notebook cell), not when this code is imported as a module.
if __name__ == "__main__":
    main()


[INFO] Processing WHO.pdf...
[INFO] Extracted 17 rules.
[INFO] Guidelines saved to who_guidelines.json
