In [None]:
# ==== Day 11 · Setup & Mount (safe even if already mounted) ====
import os, json, joblib, numpy as np, pandas as pd
import matplotlib.pyplot as plt; import seaborn as sns
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, confusion_matrix)
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
import warnings
warnings.filterwarnings("ignore")

# Mount Google Drive when running in Colab (won't crash if already mounted).
# Still best-effort, but the reason for skipping/failing is reported instead of hidden.
try:
    from google.colab import drive
except ImportError:
    # Not running inside Colab: there is nothing to mount.
    print("google.colab not available -> skipping Drive mount")
else:
    try:
        drive.mount('/content/drive', force_remount=False)
    except Exception as e:
        # Keep the notebook running; the "Paths OK" check in this cell will show the effect.
        print(f"Drive mount failed -> {e}")

# Paths (keep same project layout you used before)
# Everything lives under BASE_DIR on the mounted Drive; the three sub-paths are derived from it.
BASE_DIR  = "/content/drive/MyDrive/heartriskx"
DATA_DIR  = f"{BASE_DIR}/data"          # CSV inputs are read from here
MODEL_DIR = f"{BASE_DIR}/models/final"  # *_bundle.json files are read from here
OUT_DIR   = f"{BASE_DIR}/outputs/day11" # all artifacts of this notebook are written here
# Only the output directory is created; DATA_DIR and MODEL_DIR are expected to exist already.
os.makedirs(OUT_DIR, exist_ok=True)

# Notebook-wide seed constant and plotting defaults.
RANDOM_STATE = 42
plt.rcParams["figure.figsize"] = (8,6)
sns.set(style="whitegrid")

# Informational only: prints whether DATA_DIR and MODEL_DIR exist, but does not stop if they don't.
print("Paths OK:", os.path.exists(DATA_DIR), os.path.exists(MODEL_DIR), OUT_DIR)


Mounted at /content/drive
Paths OK: True True /content/drive/MyDrive/heartriskx/outputs/day11


In [None]:
# ==== Day 11 · Load datasets just like before ====
# heart_2020: the "Yes"/"No" label becomes an integer `target`; the original label column is dropped.
heart2020 = (
    pd.read_csv(f"{DATA_DIR}/heart_2020.csv")
    .assign(target=lambda frame: frame["HeartDisease"].map({"Yes":1, "No":0}).astype(int))
    .drop(columns=["HeartDisease"])
)

# cardio_train: semicolon-separated file; its label column `cardio` is renamed to `target`.
cardio = pd.read_csv(f"{DATA_DIR}/cardio_train.csv", sep=';')
cardio = cardio.rename(columns={"cardio":"target"})

# uci_heart: read without a header row, so the column names are assigned here.
uci = pd.read_csv(f"{DATA_DIR}/uci_heart.csv", header=None)
uci.columns = ["age","sex","cp","trestbps","chol","fbs","restecg",
               "thalach","exang","oldpeak","slope","ca","thal","target"]
# Collapse the label to binary: 1 when the original value is > 0, else 0.
uci = uci.assign(target=(uci["target"] > 0).astype(int))

# Quick sanity print: shape and class balance of each frame.
for name, df in {"heart2020": heart2020, "cardio": cardio, "uci": uci}.items():
    print(name, df.shape, " target:", df["target"].value_counts().to_dict())


heart2020 (319795, 18)  target: {0: 292422, 1: 27373}
cardio (70000, 13)  target: {0: 35021, 1: 34979}
uci (303, 14)  target: {0: 164, 1: 139}


In [None]:
# ==== Day 11 · Load final pipelines (bundle json) ====
def load_bundle(prefix):
    """Load the saved pipeline and decision threshold for one dataset prefix.

    Reads ``{MODEL_DIR}/{prefix}_bundle.json``, loads the pipeline stored at the
    JSON's ``"pipeline_path"`` entry with joblib, and returns ``(pipeline, threshold)``.
    The threshold falls back to 0.5 when the JSON has no ``"threshold"`` key.

    Raises whatever ``open``/``json.load``/``joblib.load`` raise, plus ``KeyError``
    when ``"pipeline_path"`` is missing.
    """
    bundle_file = f"{MODEL_DIR}/{prefix}_bundle.json"
    with open(bundle_file, "r") as handle:
        meta = json.load(handle)
    # NOTE: joblib.load unpickles arbitrary objects — only point this at bundles you trust.
    return joblib.load(meta["pipeline_path"]), meta.get("threshold", 0.5)

# Load every bundle that is available. A failure is printed and that prefix is simply
# absent from `models`, so later cells that index models[prefix] will fail for it.
models = {}
for prefix in ("heart2020", "cardio", "uci"):
    try:
        pipe, thr = load_bundle(prefix)
    except Exception as e:
        print(f"Could not load {prefix} -> {e}")
    else:
        models[prefix] = {"pipe": pipe, "thr": thr}
        print(f"Loaded {prefix}: thr={thr}")


Loaded heart2020: thr=0.723115550317454
Loaded cardio: thr=0.38848665919874714
Loaded uci: thr=0.4891037479162372


In [None]:
# ==== Day 11 · Helpers ====
def metrics_at(y_true, proba, thr):
    """Compute threshold-dependent and ranking metrics for a binary classifier.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    proba  : array-like of positive-class scores (list, ndarray or Series).
    thr    : decision threshold; a sample is predicted positive when ``proba >= thr``.

    Returns
    -------
    dict with keys ``acc``, ``prec``, ``rec``, ``f1`` (computed on the thresholded
    predictions) and ``roc``, ``prauc`` (computed on the raw scores).
    """
    # Coerce first so plain Python lists work too (list >= float is a TypeError).
    y_true = np.asarray(y_true)
    proba = np.asarray(proba, dtype=float)
    pred = (proba >= thr).astype(int)
    return {
        "acc": accuracy_score(y_true, pred),
        # zero_division=0: report 0 instead of warning when no positives are predicted/present
        "prec": precision_score(y_true, pred, zero_division=0),
        "rec": recall_score(y_true, pred, zero_division=0),
        "f1": f1_score(y_true, pred, zero_division=0),
        "roc": roc_auc_score(y_true, proba),
        "prauc": average_precision_score(y_true, proba)
    }


In [None]:
# ==== Day 11 · Cell 4 — SHAP helpers (old-API compatible) ====
import shap
import numpy as np
import matplotlib.pyplot as plt

# For older SHAP versions (no enable_js / no new Explainers)
# We will use: shap.TreeExplainer(model, data=background, feature_perturbation="interventional")

def _get_pipe_bits(prefix):
    """Unpack a loaded bundle into (pipe, thr, prep, clf, feature_names).

    ``prep`` and ``clf`` are the pipeline steps registered under those names;
    a ValueError is raised when either one is absent. ``feature_names`` is the
    array returned by the fitted preprocessor's ``get_feature_names_out()``.
    """
    entry = models[prefix]
    pipe, thr = entry["pipe"], entry["thr"]

    steps = pipe.named_steps
    prep, clf = steps.get("prep", None), steps.get("clf", None)
    if prep is None or clf is None:
        raise ValueError(f"[{prefix}] pipeline missing 'prep' or 'clf' steps.")

    # Names come from the fitted preprocessor, so they line up with the classifier's input columns.
    feat_names = np.array(prep.get_feature_names_out())
    return pipe, thr, prep, clf, feat_names

def _prep_data(pipe, df, max_bg=400, max_explain=400, seed=42):
    """Transform df with pipeline.prep to get the matrix used by the classifier."""
    X_raw = df.drop(columns=["target"])
    prep = pipe.named_steps["prep"]
    Xt   = prep.transform(X_raw)  # do NOT fit; it's already fitted in the bundle

    n = Xt.shape[0]
    rng = np.random.RandomState(seed)
    bg_idx   = rng.choice(n, size=min(n, max_bg), replace=False)
    show_idx = rng.choice(n, size=min(n, max_explain), replace=False)

    # Convert to dense if it's sparse (for KernelExplainer compatibility if we ever need it)
    try:
        Xt = Xt.toarray()
    except Exception:
        pass

    X_bg   = Xt[bg_idx]
    X_show = Xt[show_idx]
    return X_bg, X_show

def compute_shap_old(prefix, df, max_bg=400, max_explain=400):
    """
    Compute SHAP values for the classifier of one saved bundle.

    Path 1 (preferred): ``shap.TreeExplainer(clf, data=background,
    feature_perturbation="interventional", model_output="raw")``.
    Path 2 (fallback, slower): ``shap.KernelExplainer`` on ``clf.predict_proba(...)[:, 1]``.

    NOTE: the two paths are not on the same scale. Path 1 explains the model's
    raw output (per the SHAP docs this depends on the model; for boosted
    classifiers it is typically log-odds), while path 2 explains the positive-class
    probability. The printed message tells you when the fallback was used.

    Parameters
    ----------
    prefix      : key into ``models`` ("heart2020", "cardio", "uci").
    df          : raw DataFrame including the ``target`` column.
    max_bg      : maximum number of background rows.
    max_explain : maximum number of rows to explain.

    Returns: (shap_values, X_show, feature_names)
    """
    pipe, _thr, _prep, clf, feat_names = _get_pipe_bits(prefix)
    X_bg, X_show = _prep_data(pipe, df, max_bg=max_bg, max_explain=max_explain)

    # Try tree explainer first with safe background; fallback to KernelExplainer only if needed.
    try:
        expl = shap.TreeExplainer(
            clf,
            data=X_bg,
            feature_perturbation="interventional",   # avoids leaf coverage warnings
            model_output="raw"                        # explain the raw model output, not probabilities
        )
        sv = expl.shap_values(X_show)
        if isinstance(sv, list):
            # Older SHAP: one array per class; take the positive class for binary models.
            shap_vals = sv[1] if len(sv) == 2 else sv[0]
        else:
            sv = np.asarray(sv)
            if sv.ndim == 3:
                # Newer SHAP: a single (n_samples, n_features, n_outputs) array.
                # Same selection rule as the list case above.
                shap_vals = sv[:, :, 1] if sv.shape[2] == 2 else sv[:, :, 0]
            else:
                shap_vals = sv
    except Exception as e:
        print(f"[{prefix}] TreeExplainer failed ({e}); switching to KernelExplainer (slower).")
        f = lambda z: clf.predict_proba(z)[:, 1]
        expl = shap.KernelExplainer(f, X_bg)
        shap_vals = expl.shap_values(X_show, nsamples="auto")

    return shap_vals, X_show, feat_names

def save_shap_plots(prefix, shap_vals, X_show, feature_names, out_dir, top_n=20):
    """Write the SHAP summary plots and the underlying arrays for one dataset.

    Files written into ``out_dir`` (created if missing):
      - ``{prefix}_shap_summary_dot.png``  beeswarm summary
      - ``{prefix}_shap_summary_bar.png``  mean |SHAP| bar chart
      - ``{prefix}_shap_values.npy``, ``{prefix}_X_show.npy``, ``{prefix}_feature_names.npy``

    Parameters
    ----------
    prefix        : dataset name used in titles and file names.
    shap_vals     : SHAP value matrix for the explained rows.
    X_show        : feature matrix of the explained rows.
    feature_names : names of the columns of ``X_show``.
    out_dir       : destination directory.
    top_n         : maximum number of features shown per plot.
    """
    os.makedirs(out_dir, exist_ok=True)

    saved = []
    # (file/title tag, extra summary_plot kwargs). The dot plot uses summary_plot's default type.
    for tag, extra in (("dot", {}), ("bar", {"plot_type": "bar"})):
        plt.figure(figsize=(10,6))
        shap.summary_plot(shap_vals, X_show, feature_names=feature_names,
                          show=False, max_display=top_n, **extra)
        plt.title(f"{prefix.upper()} — SHAP Summary ({tag})")
        path = os.path.join(out_dir, f"{prefix}_shap_summary_{tag}.png")
        plt.savefig(path, bbox_inches="tight")
        plt.close()
        saved.append(path)

    print(f"✅ Saved SHAP plots for {prefix}:")
    for path in saved:
        print("   •", path)

    # Also save the raw arrays to reuse later if needed
    np.save(os.path.join(out_dir, f"{prefix}_shap_values.npy"), shap_vals)
    np.save(os.path.join(out_dir, f"{prefix}_X_show.npy"),      X_show)
    # Store names as a fixed-width string array: an object array would be pickled,
    # and np.load refuses pickled data unless allow_pickle=True is passed.
    np.save(os.path.join(out_dir, f"{prefix}_feature_names.npy"),
            np.asarray(feature_names, dtype=str))


In [None]:
# ==== Day 11 · Cell 5 — Run SHAP for all three models and save plots ====

# Expect: variables heart2020, cardio, uci, OUT_DIR, models are already defined (from earlier cells)
OUT_SHAP_DIR = os.path.join(OUT_DIR, "day11_shap")
os.makedirs(OUT_SHAP_DIR, exist_ok=True)

def _explain_and_save(prefix, df, max_bg, max_explain):
    """Compute SHAP values for one dataset, print their shape, and write plots/arrays."""
    shap_vals, X_show, names = compute_shap_old(prefix, df, max_bg=max_bg, max_explain=max_explain)
    print(f"{prefix}: SHAP shape = {np.array(shap_vals).shape}, samples explained = {X_show.shape[0]}")
    save_shap_plots(prefix, shap_vals, X_show, names, OUT_SHAP_DIR, top_n=20)
    return shap_vals, X_show, names

# The result names below are reused by the dependence-plot cell, so they are kept as-is.
sv_h, Xh_show, fh = _explain_and_save("heart2020", heart2020, 400, 400)
sv_c, Xc_show, fc = _explain_and_save("cardio", cardio, 400, 400)
# UCI is a small dataset, so fewer background rows and at most 300 explained rows.
sv_u, Xu_show, fu = _explain_and_save("uci", uci, 200, min(uci.shape[0], 300))

print("\nAll SHAP artifacts saved under:", OUT_SHAP_DIR)


heart2020: SHAP shape = (400, 50), samples explained = 400
✅ Saved SHAP plots for heart2020:
   • /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/heart2020_shap_summary_dot.png
   • /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/heart2020_shap_summary_bar.png
cardio: SHAP shape = (400, 11), samples explained = 400
✅ Saved SHAP plots for cardio:
   • /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/cardio_shap_summary_dot.png
   • /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/cardio_shap_summary_bar.png
uci: SHAP shape = (300, 20), samples explained = 300
✅ Saved SHAP plots for uci:
   • /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/uci_shap_summary_dot.png
   • /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/uci_shap_summary_bar.png

All SHAP artifacts saved under: /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap


In [None]:
# ==== Day 11 · Cell 6 — SHAP dependence plots for top-k features ====
import numpy as np
import matplotlib.pyplot as plt
import shap
import os

# Dependence plots go into a sub-folder of the SHAP output directory.
# OUT_SHAP_DIR must already be defined by the SHAP run cell.
DEP_DIR = os.path.join(OUT_SHAP_DIR, "dependence")
os.makedirs(DEP_DIR, exist_ok=True)

def mean_abs_shap(shap_vals, feature_names):
    """Rank features by their mean absolute SHAP value.

    Returns a DataFrame with columns ``feature`` and ``mean_abs_shap``, sorted
    from most to least important. The original row index is kept, so it still
    gives each feature's column position in ``shap_vals``.
    """
    import pandas as pd

    magnitudes = np.abs(np.array(shap_vals)).mean(axis=0)
    ranking = pd.DataFrame({"feature": feature_names, "mean_abs_shap": magnitudes})
    return ranking.sort_values("mean_abs_shap", ascending=False)

def save_dependence_batch(prefix, shap_vals, X_show, feature_names, top_k=8):
    """Save one SHAP dependence plot per top-ranked feature.

    Features are ranked by mean |SHAP| (see ``mean_abs_shap``); the first ``top_k``
    are plotted and written to ``DEP_DIR`` as ``{prefix}_dep_{feature}.png``
    (spaces and '/' in the feature name are replaced by '_').

    Parameters
    ----------
    prefix        : dataset name used in titles and file names.
    shap_vals     : SHAP value matrix for the explained rows.
    X_show        : feature matrix of the explained rows.
    feature_names : names of the columns of ``X_show``.
    top_k         : number of features to plot.
    """
    rank = mean_abs_shap(shap_vals, feature_names)
    top_feats = rank["feature"].head(top_k).tolist()

    print(f"\n{prefix.upper()} — top {top_k} features for dependence plots:")
    for i, f in enumerate(top_feats, 1):
        print(f"  {i:02d}. {f}")

    for f in top_feats:
        # Do not pre-create a figure: shap.dependence_plot opens its own when no `ax`
        # is passed, so a plt.figure() here would stay empty, never get closed, and be
        # rendered by the notebook as "<Figure ... with 0 Axes>".
        # interaction_index=None -> no interaction colouring (single-colour scatter).
        shap.dependence_plot(
            f, shap_vals, X_show,
            feature_names=feature_names,
            interaction_index=None,
            show=False
        )
        fig = plt.gcf()              # the figure SHAP just drew on
        fig.set_size_inches(7, 5)
        out_path = os.path.join(DEP_DIR, f"{prefix}_dep_{f.replace(' ', '_').replace('/', '_')}.png")
        plt.title(f"{prefix.upper()} — Dependence: {f}")
        fig.savefig(out_path, bbox_inches="tight")
        plt.close(fig)
    print(f"✅ Saved {len(top_feats)} dependence plots to: {DEP_DIR}")

# Run for each dataset (uses arrays from Cell 5)
for _prefix, _shap_vals, _X_show, _names in [
    ("heart2020", sv_h, Xh_show, fh),
    ("cardio",    sv_c, Xc_show, fc),
    ("uci",       sv_u, Xu_show, fu),
]:
    save_dependence_batch(_prefix, _shap_vals, _X_show, _names, top_k=8)



HEART2020 — top 8 features for dependence plots:
  01. cat__GenHealth_Excellent
  02. cat__Sex_Female
  03. cat__GenHealth_Very good
  04. cat__AgeCategory_18-24
  05. cat__AgeCategory_70-74
  06. cat__Diabetic_Yes
  07. cat__Smoking_No
  08. cat__AgeCategory_80 or older
✅ Saved 8 dependence plots to: /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/dependence

CARDIO — top 8 features for dependence plots:
  01. num__ap_hi
  02. num__age
  03. num__cholesterol
  04. num__weight
  05. num__ap_lo
  06. num__active
  07. num__gluc
  08. num__height
✅ Saved 8 dependence plots to: /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/dependence

UCI — top 8 features for dependence plots:
  01. cat__ca_0.0
  02. num__cp
  03. num__sex
  04. cat__thal_3.0
  05. cat__thal_7.0
  06. num__age
  07. num__slope
  08. num__oldpeak
✅ Saved 8 dependence plots to: /content/drive/MyDrive/heartriskx/outputs/day11/day11_shap/dependence


<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

<Figure size 700x500 with 0 Axes>

In [None]:
# ==== Day 11 · Cell 7 — Global importances (model-based) ====
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt
import os

# OUT_DIR is already created in the setup cell; exist_ok=True makes this repeat harmless.
os.makedirs(OUT_DIR, exist_ok=True)

def save_feature_importance(prefix, top_k=20):
    """Export the classifier's built-in feature importances for one bundle.

    Writes the ``top_k`` largest importances to
    ``{OUT_DIR}/{prefix}_feature_importance_top{top_k}.csv`` and a matching bar
    chart ``.png``. When the classifier has no ``feature_importances_`` attribute
    a message is printed and nothing is written.
    """
    pipe = models[prefix]["pipe"]
    prep, clf = pipe.named_steps["prep"], pipe.named_steps["clf"]

    # Guard clause: nothing to export for models without built-in importances.
    if not hasattr(clf, "feature_importances_"):
        print(f"{prefix}: classifier has no .feature_importances_")
        return

    table = pd.DataFrame({
        "feature": prep.get_feature_names_out(),
        "importance": clf.feature_importances_,
    })
    fi = table.sort_values("importance", ascending=False).head(top_k)

    out_csv = f"{OUT_DIR}/{prefix}_feature_importance_top{top_k}.csv"
    fi.to_csv(out_csv, index=False)

    plt.figure(figsize=(8,6))
    sns.barplot(y="feature", x="importance", data=fi)
    plt.title(f"{prefix} — Top {top_k} feature importances")
    plt.tight_layout()
    out_png = f"{OUT_DIR}/{prefix}_feature_importance_top{top_k}.png"
    plt.savefig(out_png, bbox_inches="tight")
    plt.close()

    print(f"{prefix}: saved importances → {out_csv}")

# Export importances for every bundle that was actually loaded.
for p in ("heart2020", "cardio", "uci"):
    if p not in models:
        continue
    save_feature_importance(p, top_k=20)


heart2020: saved importances → /content/drive/MyDrive/heartriskx/outputs/day11/heart2020_feature_importance_top20.csv
cardio: saved importances → /content/drive/MyDrive/heartriskx/outputs/day11/cardio_feature_importance_top20.csv
uci: saved importances → /content/drive/MyDrive/heartriskx/outputs/day11/uci_feature_importance_top20.csv


In [None]:
# ==== Day 11 · Cell 8 — PDP/ICE on a few numeric raw features ====
from sklearn.inspection import PartialDependenceDisplay

def pdp_for(prefix, df, features, grid_resolution=20, random_state=None):
    """Save a combined PDP/ICE plot for each requested raw feature.

    The full pipeline (preprocessing + classifier) is passed to
    ``PartialDependenceDisplay.from_estimator`` with ``kind="both"``, so
    ``features`` must be raw column names of ``df``. Names that are not columns
    are skipped with a message; a failure on one feature is printed and the loop
    continues.

    Parameters
    ----------
    prefix          : key into ``models``; also used in titles and file names.
    df              : raw DataFrame including the ``target`` column.
    features        : iterable of raw column names to plot.
    grid_resolution : number of grid points per feature.
    random_state    : seed for scikit-learn's random selection of the ICE lines it
                      draws. ``None`` means "use the notebook's RANDOM_STATE", so
                      re-running the cell reproduces the same plots.

    Output: ``{OUT_DIR}/{prefix}_pdp_{feature}.png`` per plotted feature.
    """
    pipe = models[prefix]["pipe"]
    X = df.drop(columns=["target"])
    seed = RANDOM_STATE if random_state is None else random_state
    for feat in features:
        if feat not in X.columns:
            print(f"{prefix}: skip PDP for '{feat}' (not in raw columns).")
            continue
        fig = None
        try:
            fig, ax = plt.subplots(figsize=(6,4))
            PartialDependenceDisplay.from_estimator(
                pipe, X, [feat], kind="both", grid_resolution=grid_resolution,
                random_state=seed, ax=ax
            )
            plt.title(f"{prefix} — PDP/ICE: {feat}")
            out_png = f"{OUT_DIR}/{prefix}_pdp_{feat}.png"
            fig.savefig(out_png, bbox_inches="tight")
            print(f"{prefix}: PDP saved → {out_png}")
        except Exception as e:
            print(f"{prefix}: PDP failed for '{feat}' → {e}")
        finally:
            # Close on success AND on failure; otherwise a failed feature leaves its figure open.
            if fig is not None:
                plt.close(fig)

# Reasonable numeric raw features per dataset:
_pdp_plan = [
    ("heart2020", heart2020, ["BMI","SleepTime","PhysicalHealth"]),
    ("cardio",    cardio,    ["age","ap_hi","ap_lo","weight","height"]),
    ("uci",       uci,       ["age","trestbps","chol","thalach","oldpeak"]),
]
for _prefix, _frame, _cols in _pdp_plan:
    pdp_for(_prefix, _frame, features=_cols)

print("✅ PDP/ICE plots saved to:", OUT_DIR)


heart2020: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/heart2020_pdp_BMI.png
heart2020: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/heart2020_pdp_SleepTime.png
heart2020: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/heart2020_pdp_PhysicalHealth.png
cardio: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/cardio_pdp_age.png
cardio: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/cardio_pdp_ap_hi.png
cardio: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/cardio_pdp_ap_lo.png
cardio: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/cardio_pdp_weight.png
cardio: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/cardio_pdp_height.png
uci: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/uci_pdp_age.png
uci: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/uci_pdp_trestbps.png
uci: PDP saved → /content/drive/MyDrive/heartriskx/outputs/day11/uci_pdp_chol.png
uci: PDP s