# Extraction Inspection

Run each extractor individually, display the findings for each sheet alongside its source row count, and summarize extraction coverage per sheet to highlight gaps.

In [None]:
import sys
import json
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path
# before the `src.*` imports below are executed.
# NOTE(review): presumably the notebook is launched from a subdirectory of the
# project root, which is what makes the `src` package importable — confirm.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
from IPython.display import Markdown, display
from src.config import INPUT_DATA_FILE, SHEET_NAMES, EXTRACTIONS_DIR
from src.loader import load_emr_file
from src.extractors import extract_sheet
from src.schemas import ExtractorOutput

## Load Data

In [None]:
# Load the input file into `data_sheets`, a mapping keyed by sheet name, then
# print how many sheets were loaded followed by each sheet's row count.
data_sheets = load_emr_file(INPUT_DATA_FILE)

sheet_total = len(data_sheets)
print(f"Loaded {sheet_total} sheets from {INPUT_DATA_FILE.name}\n")
for sheet_label in data_sheets:
    row_total = len(data_sheets[sheet_label])
    print(f"  {sheet_label}: {row_total} rows")

## Run Individual Extractors

In [None]:
from src.config import SHEET_NAME_TO_PROMPT

results: dict[str, ExtractorOutput] = {}
for sheet_name, df in data_sheets.items():
    if df.empty or sheet_name not in SHEET_NAME_TO_PROMPT:
        continue
    print(f"Extracting: {sheet_name} ...")
    eo = await extract_sheet(sheet_name, df)
    results[sheet_name] = eo
    print(f"  -> {len(eo.findings)} findings extracted")

print(f"\nCompleted {len(results)} extractions.")

## Extraction Results by Sheet

In [None]:
def _md_cell(value, limit=None):
    """Format *value* so it fits in a single cell of a Markdown pipe table.

    Runs of whitespace (including newlines) are collapsed to single spaces,
    the text is cut to *limit* characters with a trailing "…" when it is
    longer, and literal "|" characters are backslash-escaped. Without this,
    a newline ends the table row early and a "|" starts a new column.

    Args:
        value: Object to render; formatted the same way an f-string would.
        limit: Maximum number of characters to keep, or None for no limit.

    Returns:
        The sanitized text for the cell.
    """
    text = " ".join(f"{value}".split())
    if limit is not None and len(text) > limit:
        text = text[:limit] + "…"
    # Escape after truncating so a cut can never land between "\" and "|".
    return text.replace("|", "\\|")


# Render one Markdown section per extracted sheet: a header with the source
# row count and the number of findings, then a table with one row per finding.
for sheet_name, eo in results.items():
    source_rows = len(data_sheets[sheet_name])
    findings_count = len(eo.findings)
    header = (
        f"### {sheet_name}\n\n"
        f"**Source rows:** {source_rows} | "
        f"**Findings extracted:** {findings_count}\n\n"
    )
    # Findings are numbered from 1; a missing datetime is shown as an em dash
    # and content is truncated to 120 characters.
    rows = [
        f"| {i} | {_md_cell(finding.datetime or '—')} "
        f"| {_md_cell(finding.category)} "
        f"| {_md_cell(finding.content, 120)} |"
        for i, finding in enumerate(eo.findings, 1)
    ]
    table = "| # | Datetime | Category | Content |\n|---|----------|----------|---------|\n" + "\n".join(rows)
    display(Markdown(header + table))
    display(Markdown("---"))

## Coverage Analysis

In [None]:
# Summarize, for every sheet listed in SHEET_NAMES, how many findings were
# extracted relative to the number of source rows, and show it as a table.
coverage_rows = []
for sheet in SHEET_NAMES:
    # A sheet absent from the loaded data counts as zero rows; a sheet with
    # no extraction result counts as zero findings.
    n_rows = len(data_sheets[sheet]) if sheet in data_sheets else 0
    n_findings = len(results[sheet].findings) if sheet in results else 0
    if n_rows > 0:
        ratio_pct = n_findings / n_rows * 100
    else:
        # Avoid dividing by zero for empty or missing sheets.
        ratio_pct = 0.0
    coverage_rows.append(
        {
            "Sheet": sheet,
            "Source Rows": n_rows,
            "Findings Extracted": n_findings,
            "Coverage %": round(ratio_pct, 1),
        }
    )

coverage_df = pd.DataFrame(coverage_rows)
display(coverage_df)

# Persist every extraction as pretty-printed UTF-8 JSON, one file per sheet.
# The file name is the sheet name lower-cased with spaces replaced by "_".
EXTRACTIONS_DIR.mkdir(parents=True, exist_ok=True)
for sheet_name, eo in results.items():
    out_path = EXTRACTIONS_DIR / f"{sheet_name.replace(' ', '_').lower()}.json"
    # mode="json" makes model_dump return only JSON-compatible values, so
    # json.dumps does not raise TypeError on fields such as datetimes or enums.
    # NOTE(review): assumes ExtractorOutput is a Pydantic v2 model — confirm.
    payload = eo.model_dump(mode="json")
    out_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"\nSaved {len(results)} extraction files to {EXTRACTIONS_DIR}")