
# Evaluation (XGB vs NN) — Consolidated Notebook

This single notebook replaces separate variants and provides:
- **Interactive evaluation** using `03_scored/*_with_predictions.csv` (label + `xgb_prob` + `nn_prob`).
- **Headless artifact ingestion** from `04_eval/` (summary and point-metrics CSVs, plus a link to the latest HTML report).
- Optional threshold-selection helpers (and a commented-out ipywidgets block).

> No endpoint calls happen here; this notebook is presentation/analysis only.


In [None]:

import os, io, re, glob, json
from datetime import datetime
import boto3, pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (roc_auc_score, average_precision_score, brier_score_loss,
                             roc_curve, precision_recall_curve, confusion_matrix,
                             accuracy_score, precision_score, recall_score, f1_score)

# Runtime configuration: each value is read from the environment, with the
# literal on the right used when the variable is unset.
AWS_REGION = os.environ.get("AWS_REGION", "us-east-1")
BUCKET = os.environ.get("BUCKET", "diabetes-directory")
SCORED_PREFIX = os.environ.get("SCORED_PREFIX", "03_scored")
EVAL_PREFIX = os.environ.get("EVAL_PREFIX", "04_eval")
LABEL_COL = os.environ.get("LABEL_COL", "readmitted")

# A single boto3 session and S3 client, referenced by the S3 helper functions.
SESSION = boto3.session.Session(region_name=AWS_REGION)
S3 = SESSION.client("s3")

print("Region:", AWS_REGION, "| Bucket:", BUCKET)


In [None]:

def s3_list(bucket: str, prefix: str):
    """Yield every object entry listed under ``prefix`` in ``bucket``.

    Walks all pages returned by the ``list_objects_v2`` paginator of the
    module-level ``S3`` client and yields each item of a page's ``Contents``
    list. Pages without a ``Contents`` entry contribute nothing.
    """
    pages = S3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=prefix)
    for page in pages:
        yield from page.get("Contents", [])

def s3_read_csv(bucket: str, key: str) -> pd.DataFrame:
    """Fetch ``key`` from ``bucket`` and parse it as CSV.

    The whole object body is read into memory and handed to
    ``pandas.read_csv`` with default options.
    """
    response = S3.get_object(Bucket=bucket, Key=key)
    payload = response["Body"].read()
    buffer = io.BytesIO(payload)
    return pd.read_csv(buffer)

def find_latest_with_predictions(bucket: str, scored_prefix: str):
    """Find the newest test and train ``*_with_predictions.csv`` keys.

    Lists every object under ``{scored_prefix}/``, keeps the keys whose
    lower-cased name ends with ``_with_predictions.csv``, and walks them from
    newest to oldest ``LastModified``. The first key whose name contains
    ``"test"`` becomes the test key; the first containing ``"train"`` becomes
    the train key (one key may satisfy both).

    The split is detected on the part of the key *after* the listing prefix.
    The prefix is shared by every candidate, so matching inside it would
    classify all files identically (e.g. a prefix containing ``latest`` or
    ``train``). Matching is still by substring, so a file name such as
    ``latest_...`` also counts as "test".

    Args:
        bucket: Bucket name passed to ``s3_list``.
        scored_prefix: Key prefix (without trailing slash) to search under.

    Returns:
        ``(latest_test, latest_train)``; each element is the full object key,
        or ``None`` when no candidate matched.
    """
    listing_prefix = f"{scored_prefix}/"
    candidates = [
        (obj["LastModified"], obj["Key"])
        for obj in s3_list(bucket, listing_prefix)
        if obj["Key"].lower().endswith("_with_predictions.csv")
    ]
    candidates.sort(reverse=True)  # newest first

    latest_test = None
    latest_train = None
    for _, key in candidates:
        # Ignore the shared listing prefix when deciding which split a key is.
        relative = key[len(listing_prefix):] if key.startswith(listing_prefix) else key
        lname = relative.lower()
        if latest_test is None and "test" in lname:
            latest_test = key
        if latest_train is None and "train" in lname:
            latest_train = key
        if latest_test is not None and latest_train is not None:
            break  # both found; older candidates cannot change the answer
    return latest_test, latest_train

# Auto-detect the newest scored files under SCORED_PREFIX.
# Either key is None when no matching object was found.
TEST_KEY, TRAIN_KEY = find_latest_with_predictions(BUCKET, SCORED_PREFIX)
print("Detected:")
print("  TEST_KEY :", TEST_KEY)
print("  TRAIN_KEY:", TRAIN_KEY)


In [None]:

# The test split is required by every later cell, so stop here if it is missing.
if not TEST_KEY:
    # Report the prefix that was actually searched; it may be overridden via the environment.
    raise SystemExit(
        f"Could not auto-detect a test_with_predictions.csv in {SCORED_PREFIX}/. "
        "Set TEST_KEY manually above."
    )

df_test = s3_read_csv(BUCKET, TEST_KEY)
# The train split is optional; df_train stays None when no train file was detected.
df_train = s3_read_csv(BUCKET, TRAIN_KEY) if TRAIN_KEY else None
print("Test shape:", df_test.shape, "| Train shape:", None if df_train is None else df_train.shape)


In [None]:

def coerce_label(y: pd.Series) -> pd.Series:
    """Convert a label column to integer 0/1 values.

    Numeric (and boolean) input is cast to ``int`` directly. Any other dtype
    is treated as text: values are stripped and lower-cased, then looked up in
    a fixed table (``no``/``false``/``0`` -> 0; ``yes``/``true``/``1``/``<30``/
    ``>30`` -> 1). Text not found in the table is parsed as a number, so
    strings such as ``"1.0"`` still work.

    Args:
        y: Raw label values.

    Returns:
        An integer Series with the same index as ``y``.

    Raises:
        ValueError: If any value is missing or cannot be interpreted; the
            message lists a few of the offending values.
    """
    mapping = {
        "no": 0, "0": 0, "false": 0,
        "yes": 1, "1": 1, "true": 1,
        "<30": 1, ">30": 1,
    }
    if pd.api.types.is_numeric_dtype(y):
        numeric = pd.to_numeric(y, errors="coerce")
    else:
        # Normalise text so padding and casing do not defeat the lookup.
        text = y.astype(str).str.strip().str.lower()
        mapped = pd.to_numeric(text.map(mapping), errors="coerce")
        # Fall back to plain numeric parsing for values outside the table.
        numeric = mapped.fillna(pd.to_numeric(text, errors="coerce"))

    unparsed = numeric.isna()
    if unparsed.any():
        # Fail with the actual values rather than an opaque NaN-to-int cast error.
        examples = y[unparsed].astype(str).unique()[:5].tolist()
        raise ValueError(
            f"Cannot coerce {int(unparsed.sum())} label value(s) to 0/1; examples: {examples}"
        )
    return numeric.astype(int)

# Columns every scored file must provide: the label plus both model probabilities.
needed = [LABEL_COL, "xgb_prob", "nn_prob"]
miss = [c for c in needed if c not in df_test.columns]
# Explicit raise instead of assert so the check survives `python -O`.
if miss:
    raise ValueError(f"Missing required columns in test: {miss}")

y_t = coerce_label(df_test[LABEL_COL])
# errors="coerce" turns unparseable probabilities into NaN rather than raising here.
pxgb_t = pd.to_numeric(df_test["xgb_prob"], errors="coerce")
pnn_t  = pd.to_numeric(df_test["nn_prob"], errors="coerce")
# Simple ensemble: unweighted mean of the two model probabilities.
pens_t = (pxgb_t + pnn_t) / 2.0

# Same extraction for the optional train split; the *_tr names exist only when it was loaded.
if df_train is not None:
    miss_tr = [c for c in needed if c not in df_train.columns]
    if miss_tr:
        raise ValueError(f"Missing required columns in train: {miss_tr}")
    y_tr = coerce_label(df_train[LABEL_COL])
    pxgb_tr = pd.to_numeric(df_train["xgb_prob"], errors="coerce")
    pnn_tr  = pd.to_numeric(df_train["nn_prob"], errors="coerce")
    pens_tr = (pxgb_tr + pnn_tr) / 2.0


In [None]:

def summary_metrics(y_true, p):
    """Return threshold-free metrics for scores ``p`` against ``y_true``.

    The result is a dict with keys ``"AUC"`` (``roc_auc_score``), ``"AUPRC"``
    (``average_precision_score``) and ``"Brier"`` (``brier_score_loss``),
    each converted to a plain ``float``.
    """
    scorers = (
        ("AUC", roc_auc_score),
        ("AUPRC", average_precision_score),
        ("Brier", brier_score_loss),
    )
    return {label: float(scorer(y_true, p)) for label, scorer in scorers}

def point_metrics(y_true, p, thr):
    """Return classification metrics for scores ``p`` cut at threshold ``thr``.

    A score is predicted positive when ``p >= thr``. The result holds
    accuracy, precision, recall and F1 as floats (``zero_division=0`` for the
    latter three) followed by the confusion-matrix counts TN, FP, FN, TP as
    ints.
    """
    predicted = (p >= thr).astype(int)
    matrix = confusion_matrix(y_true, predicted, labels=[0, 1])
    # Row-major flattening of the 2x2 matrix gives TN, FP, FN, TP in that order.
    tn, fp, fn, tp = (int(count) for count in matrix.ravel())

    metrics = {
        "Accuracy": float(accuracy_score(y_true, predicted)),
        "Precision": float(precision_score(y_true, predicted, zero_division=0)),
        "Recall": float(recall_score(y_true, predicted, zero_division=0)),
        "F1": float(f1_score(y_true, predicted, zero_division=0)),
    }
    metrics.update({"TN": tn, "FP": fp, "FN": fn, "TP": tp})
    return metrics

def plot_roc(y_true, p, title):
    """Plot the ROC curve of scores ``p`` against ``y_true`` in a new figure.

    The legend shows the AUC to three decimals and a dashed diagonal marks
    the chance line. The figure is shown immediately.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, p)
    auc_value = roc_auc_score(y_true, p)

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, label=f"AUC={auc_value:.3f}")
    plt.plot([0, 1], [0, 1], linestyle="--")  # chance diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.show()

def plot_pr(y_true, p, title):
    """Plot the precision-recall curve of scores ``p`` in a new figure.

    Recall is on the x-axis and precision on the y-axis; the legend shows the
    average precision to three decimals. The figure is shown immediately.
    """
    precision, recall, _ = precision_recall_curve(y_true, p)
    avg_precision = average_precision_score(y_true, p)

    plt.figure()
    plt.plot(recall, precision, label=f"AP={avg_precision:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(title)
    plt.legend(loc="lower left")
    plt.tight_layout()
    plt.show()

def plot_confusion(y_true, p, thr, title):
    """Plot the 2x2 confusion matrix of scores ``p`` cut at ``thr``.

    A score is predicted positive when ``p >= thr``. Rows are actual classes,
    columns are predicted classes, and each cell is annotated with its count.
    The threshold (two decimals) is appended to ``title``.
    """
    predicted = (p >= thr).astype(int)
    matrix = confusion_matrix(y_true, predicted, labels=[0, 1])

    plt.figure()
    plt.imshow(matrix, interpolation="nearest")
    plt.title(f"{title} (thr={thr:.2f})")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks([0, 1], [0, 1])
    plt.yticks([0, 1], [0, 1])

    # Write each count at its cell centre (x = column, y = row).
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for col in range(n_cols):
            plt.text(col, row, str(matrix[row, col]), ha="center", va="center")

    plt.tight_layout()
    plt.show()


In [None]:

# Threshold-free metrics on the test split, one row per model.
rows = [
    {"Model": name, **summary_metrics(y_t, p)}
    for name, p in [("XGB", pxgb_t), ("NN", pnn_t), ("EnsembleAvg", pens_t)]
]
summary_test = pd.DataFrame(rows).set_index("Model")
summary_test


In [None]:

THRESHOLD = 0.5  # tweak and re-run
# Thresholded metrics on the test split, one row per model.
rows = [
    {"Model": name, **point_metrics(y_t, p, THRESHOLD)}
    for name, p in [("XGB", pxgb_t), ("NN", pnn_t), ("EnsembleAvg", pens_t)]
]
pd.DataFrame(rows).set_index("Model")


In [None]:

# ROC curves for the three test-score series, in the order XGB, NN, ensemble.
for scores, title in (
    (pxgb_t, "XGB ROC — Test"),
    (pnn_t, "NN ROC — Test"),
    (pens_t, "Ensemble ROC — Test"),
):
    plot_roc(y_t, scores, title)

# Precision-recall curves, same order.
for scores, title in (
    (pxgb_t, "XGB PR — Test"),
    (pnn_t, "NN PR — Test"),
    (pens_t, "Ensemble PR — Test"),
):
    plot_pr(y_t, scores, title)

# Confusion matrices at the currently selected THRESHOLD, same order.
for scores, title in (
    (pxgb_t, "XGB Confusion — Test"),
    (pnn_t, "NN Confusion — Test"),
    (pens_t, "Ensemble Confusion — Test"),
):
    plot_confusion(y_t, scores, THRESHOLD, title)


In [None]:

# Repeat both metric tables on the train split when one was loaded.
if df_train is not None:
    rows = [
        {"Model": name, **summary_metrics(y_tr, p)}
        for name, p in [("XGB", pxgb_tr), ("NN", pnn_tr), ("EnsembleAvg", pens_tr)]
    ]
    display(pd.DataFrame(rows).set_index("Model"))
    rows = [
        {"Model": name, **point_metrics(y_tr, p, THRESHOLD)}
        for name, p in [("XGB", pxgb_tr), ("NN", pnn_tr), ("EnsembleAvg", pens_tr)]
    ]
    display(pd.DataFrame(rows).set_index("Model"))


In [None]:

def optimal_thresholds(y_true, p):
    """Return the score thresholds that maximise F1 and Youden's J.

    Args:
        y_true: Binary ground-truth labels.
        p: Predicted scores/probabilities for the positive class.

    Returns:
        A dict with ``"F1_opt"`` (threshold with the highest F1 along the
        precision-recall curve) and ``"YoudenJ_opt"`` (threshold with the
        highest ``tpr - fpr`` along the ROC curve), both as floats.
    """
    prec, rec, thr = precision_recall_curve(y_true, p)
    # prec/rec have one more entry than thr; pad thr with 1.0 so indices line up.
    thr = np.append(thr, 1.0)
    # The floor on the denominator avoids 0/0 where precision and recall are both zero.
    f1s = 2 * (prec * rec) / np.maximum(prec + rec, 1e-12)
    thr_f1 = thr[np.nanargmax(f1s)]

    fpr, tpr, thr2 = roc_curve(y_true, p)
    youden = tpr - fpr
    # scikit-learn documents thresholds[0] as a sentinel for "predict nothing"
    # (np.inf, or max(score) + 1 in older releases). Its J is 0, so it would
    # win whenever no real threshold beats chance, yielding an unusable
    # cut-off. Search the real thresholds only.
    best = 1 + int(np.nanargmax(youden[1:])) if youden.size > 1 else 0
    thr_j = thr2[best]
    return {"F1_opt": float(thr_f1), "YoudenJ_opt": float(thr_j)}

print("Optimal thresholds (TEST):")
# One line per score series; labels are padded so the dicts line up.
for label, scores in (("XGB:", pxgb_t), ("NN :", pnn_t), ("ENS:", pens_t)):
    print(label, optimal_thresholds(y_t, scores))


## Load headless artifacts from `04_eval/` (optional)

In [None]:

def find_latest_eval_tables(bucket: str, eval_prefix: str):
    """Find the newest summary CSV, point-metrics CSV and HTML report keys.

    Lists every object under ``{eval_prefix}/``, keeps ``.csv`` and ``.html``
    keys, and walks them from newest to oldest ``LastModified``. The first
    match for each of the following wins:

    * summary: a ``.csv`` whose name contains ``summary_``
    * points:  a ``.csv`` whose name contains ``point_metrics_``
    * report:  a ``.html`` whose name contains ``report_``

    Both the extension and the name pattern are checked case-insensitively,
    so every key accepted as a candidate can also be classified. Matching
    uses the part of the key after the listing prefix, since the prefix is
    shared by all candidates and must not influence the classification.

    Args:
        bucket: Bucket name passed to ``s3_list``.
        eval_prefix: Key prefix (without trailing slash) to search under.

    Returns:
        ``(latest_summary, latest_points, latest_report)``; each element is
        the full object key, or ``None`` when nothing matched.
    """
    listing_prefix = f"{eval_prefix}/"
    cand = [
        (obj["LastModified"], obj["Key"])
        for obj in s3_list(bucket, listing_prefix)
        if obj["Key"].lower().endswith((".csv", ".html"))
    ]
    cand.sort(reverse=True)  # newest first

    latest_summary = None
    latest_points = None
    latest_report = None
    for _, key in cand:
        relative = key[len(listing_prefix):] if key.startswith(listing_prefix) else key
        lname = relative.lower()
        is_csv = lname.endswith(".csv")
        if is_csv and "summary_" in lname and latest_summary is None:
            latest_summary = key
        if is_csv and "point_metrics_" in lname and latest_points is None:
            latest_points = key
        if lname.endswith(".html") and "report_" in lname and latest_report is None:
            latest_report = key
        if latest_summary and latest_points and latest_report:
            break  # all three found; older candidates cannot change the answer
    return latest_summary, latest_points, latest_report

# Locate the newest headless evaluation artifacts under EVAL_PREFIX.
# Any of the three keys may be None when no matching object exists.
SUM_KEY, POINT_KEY, REPORT_KEY = find_latest_eval_tables(BUCKET, EVAL_PREFIX)
print("Latest eval artifacts:")
print("  summary_csv :", SUM_KEY)
print("  points_csv  :", POINT_KEY)
print("  html_report :", REPORT_KEY)

# Preview whichever tables were found; df_sum / df_pts are only defined in that case.
if SUM_KEY:
    df_sum = s3_read_csv(BUCKET, SUM_KEY)
    display(df_sum.head())
if POINT_KEY:
    df_pts = s3_read_csv(BUCKET, POINT_KEY)
    display(df_pts.head())
if REPORT_KEY:
    # NOTE(review): this prints a plain, unsigned bucket URL with the key inserted verbatim;
    # presumably it only opens if the object is readable from the browser — confirm bucket access.
    print("Open in browser:", f"https://{BUCKET}.s3.amazonaws.com/{REPORT_KEY}")


In [None]:

# Optional interactive slider (uncomment if ipywidgets is installed)
# from ipywidgets import interact, FloatSlider
# def show_at_threshold(thr=0.5):
#     rows = []
#     for name, p in [("XGB", pxgb_t), ("NN", pnn_t), ("EnsembleAvg", pens_t)]:
#         rows.append({"Model": name, **point_metrics(y_t, p, thr)})
#     display(pd.DataFrame(rows).set_index("Model"))
#     plot_confusion(y_t, pxgb_t, thr, "XGB Confusion — Test")
#     plot_confusion(y_t, pnn_t,  thr, "NN Confusion — Test")
#     plot_confusion(y_t, pens_t, thr, "Ensemble Confusion — Test")
# interact(show_at_threshold, thr=FloatSlider(min=0.01, max=0.99, step=0.01, value=0.5))
