# 40 Validate Index Outputs (Scaffold)

Stage: `04_validation`
Discipline: quality assurance and backtest-ready validation scaffolding.

Input (all paths below are relative to the repository's `JupyterNotebooks/` directory):
- `outputs/index_pipeline/30_scoring/municipio_indices_scored.csv`

Outputs:
- `outputs/index_pipeline/40_validation/validation_metrics.csv`
- `outputs/index_pipeline/40_validation/validation_summary.md`


In [None]:
# Cell 1: Setup
from pathlib import Path
import numpy as np
import pandas as pd


def find_repo_root():
    """Locate the repository root by searching upward from the working directory.

    Returns:
        The closest directory (the resolved current working directory or one
        of its ancestors) that contains an entry named ``JupyterNotebooks``.
        When no such directory is found, the resolved current working
        directory itself is returned.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    return next(
        (folder for folder in candidates if (folder / "JupyterNotebooks").exists()),
        start,
    )


# Resolve every pipeline location relative to the detected repository root.
REPO_ROOT = find_repo_root()
BASE_OUT = REPO_ROOT.joinpath("JupyterNotebooks", "outputs", "index_pipeline")
INPUT_FILE = BASE_OUT.joinpath("30_scoring", "municipio_indices_scored.csv")
OUTPUT_DIR = BASE_OUT.joinpath("40_validation")

# Create the validation output folder (and any missing parents) up front.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# This notebook consumes the scored CSV; stop immediately when it is absent.
if not INPUT_FILE.exists():
    raise FileNotFoundError(f"Missing dependency: {INPUT_FILE}")

# Load the scored index table and report how many rows were read.
idx_df = pd.read_csv(INPUT_FILE)
print(f"Rows: {len(idx_df)}")

# Use IPython's rich display when it is installed; otherwise fall back to
# plain print so the cell still runs outside a notebook.
try:
    from IPython.display import display
except ImportError:
    display = print


In [None]:
# Cell 2: Validation metrics
# Computes coverage, distribution, band-count and stress-test metrics for the
# scored index table, writes them to validation_metrics.csv, and writes a
# Markdown digest to validation_summary.md (both inside OUTPUT_DIR).

# Fail fast, before any file is written, when a column this cell reads is
# absent. Without this check a missing column would raise only after the
# metrics CSV had already been written.
required_cols = ["priority_index_conf_adj", "priority_band", "confidence_score"]
missing_cols = [col for col in required_cols if col not in idx_df.columns]
if missing_cols:
    raise KeyError(f"Scored input is missing required columns: {missing_cols}")

priority = idx_df["priority_index_conf_adj"]
# Count bands once, NaN included, so the metrics CSV and the Markdown summary
# always report the same set of bands.
band_counts = idx_df["priority_band"].value_counts(dropna=False)

metrics = []

# Coverage and null checks
metrics.append({"metric": "row_count", "value": len(idx_df)})
metrics.append({"metric": "null_rate_priority_index_conf_adj", "value": float(priority.isna().mean())})

# Distribution checks
metrics.append({"metric": "priority_min", "value": float(priority.min())})
metrics.append({"metric": "priority_max", "value": float(priority.max())})
metrics.append({"metric": "priority_mean", "value": float(priority.mean())})

# Band distribution
for band, cnt in band_counts.items():
    metrics.append({"metric": f"band_count_{band}", "value": int(cnt)})

# Missing-data stress test: treat a random ~30% of rows as missing, impute
# them with the median of the rows that were kept, and measure the mean
# absolute shift this causes in the priority index.
# A dedicated RandomState(42) yields the same draws as np.random.seed(42)
# followed by np.random.rand, without reseeding NumPy's global generator.
rng = np.random.RandomState(42)
drop_mask = rng.rand(len(idx_df)) < 0.30
stress = priority.copy()
# Impute from the kept rows only: the masked values are supposed to be
# unknown, so their own median must not be used. If every row happens to be
# masked (tiny inputs) the median is NaN and the reported MAE is NaN.
stress[drop_mask] = priority[~drop_mask].median()
stability_mae = float(np.mean(np.abs(priority - stress)))
metrics.append({"metric": "stress_test_mae_priority", "value": stability_mae})

metrics_df = pd.DataFrame(metrics)
metrics_out = OUTPUT_DIR / "validation_metrics.csv"
metrics_df.to_csv(metrics_out, index=False)

# Human-readable digest of the same figures.
summary_lines = [
    "# Index Validation Summary",
    "",
    f"- Row count: {len(idx_df)}",
    f"- Priority range: {priority.min():.2f} to {priority.max():.2f}",
    f"- Mean confidence: {idx_df['confidence_score'].mean():.2f}",
    f"- Stress-test MAE (30% missing proxy): {stability_mae:.3f}",
    "",
    "## Band Counts",
]
summary_lines.extend(f"- {band}: {cnt}" for band, cnt in band_counts.items())

summary_out = OUTPUT_DIR / "validation_summary.md"
summary_out.write_text("\n".join(summary_lines), encoding="utf-8")

print(f"Outputs:\n  {metrics_out}\n  {summary_out}")
display(metrics_df)
