In [None]:
# Mount + installs
from google.colab import drive
drive.mount('/content/drive')

!pip -q install shap imbalanced-learn xgboost lightgbm

# Paths & basics
base_path = "/content/drive/MyDrive/heartriskx/data/"
out_dir   = "/content/drive/MyDrive/heartriskx/outputs/day6"
model_dir = "/content/drive/MyDrive/heartriskx/models/day6"

import os, numpy as np, pandas as pd, matplotlib.pyplot as plt
os.makedirs(out_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

RANDOM_STATE = 42

# sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.calibration import calibration_curve
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, brier_score_loss)

# models
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# save/load
import joblib

# SHAP
import shap


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Heart2020: the "Yes"/"No" label in `HeartDisease` becomes an integer `target`
# column, after which the original label column is discarded.
heart2020 = pd.read_csv(base_path + "heart_2020.csv")
heart2020 = (
    heart2020
    .assign(target=heart2020["HeartDisease"].map({"Yes": 1, "No": 0}).astype(int))
    .drop(columns=["HeartDisease"])
)

# Cardio: semicolon-delimited file whose `cardio` label column is renamed to `target`.
cardio = pd.read_csv(base_path + "cardio_train.csv", sep=";").rename(
    columns={"cardio": "target"}
)

# UCI (Cleveland): read without a header row, so column names are assigned here.
# NOTE(review): the raw Cleveland file commonly marks missing `ca`/`thal` values
# with "?"; if that applies to this copy, those columns load as strings and are
# treated as categorical downstream — confirm against the file.
uci_column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]
uci = pd.read_csv(base_path + "uci_heart.csv", header=None)
uci.columns = uci_column_names
# Any raw label above 0 is collapsed into the positive class.
uci["target"] = uci["target"].gt(0).astype(int)

# One summary line per dataset: shape plus class counts.
for tag, frame in (("Heart2020:", heart2020), ("Cardio   :", cardio), ("UCI      :", uci)):
    print(tag, frame.shape, " target=", frame["target"].value_counts().to_dict())


Heart2020: (319795, 18)  target= {0: 292422, 1: 27373}
Cardio   : (70000, 13)  target= {0: 35021, 1: 34979}
UCI      : (303, 14)  target= {0: 164, 1: 139}


In [None]:
# Every dataset gets the same 80/20 split with a fixed seed; stratifying on the
# label keeps the class ratio the same in the train and test parts.
split_opts = dict(test_size=0.2, random_state=RANDOM_STATE)

# Heart2020
yh = heart2020['target']
Xh = heart2020.drop(columns=['target'])
Xh_tr, Xh_te, yh_tr, yh_te = train_test_split(Xh, yh, stratify=yh, **split_opts)

# Cardio (the `id` column is dropped together with the label)
yc = cardio['target']
Xc = cardio.drop(columns=['target','id'])
Xc_tr, Xc_te, yc_tr, yc_te = train_test_split(Xc, yc, stratify=yc, **split_opts)

# UCI
yu = uci['target']
Xu = uci.drop(columns=['target'])
Xu_tr, Xu_te, yu_tr, yu_te = train_test_split(Xu, yu, stratify=yu, **split_opts)


In [None]:
def make_preprocessor(X, scale_numeric=False):
    """Build an unfitted ColumnTransformer matching the dtypes found in ``X``.

    Numeric columns are standardised when ``scale_numeric`` is true and passed
    through untouched otherwise. All non-numeric columns are one-hot encoded,
    with categories unseen at fit time ignored at transform time. A column
    group that is empty gets no transformer.

    Parameters
    ----------
    X : DataFrame used only to discover which columns are numeric.
    scale_numeric : bool, default False
        Apply ``StandardScaler`` to the numeric columns.

    Returns
    -------
    ColumnTransformer
    """
    numeric = list(X.select_dtypes(include=np.number).columns)
    categorical = list(X.select_dtypes(exclude=np.number).columns)

    steps = []
    if numeric:
        numeric_step = StandardScaler() if scale_numeric else "passthrough"
        steps.append(("num", numeric_step, numeric))
    if categorical:
        steps.append(("cat", OneHotEncoder(handle_unknown="ignore"), categorical))

    return ColumnTransformer(steps)

def eval_and_report(pipeline, name, Xtr, ytr, Xte, yte, save_prefix, threshold=0.5):
    """Fit ``pipeline``, score it on the held-out split, print a summary and save it.

    Parameters
    ----------
    pipeline : estimator exposing ``fit`` and ``predict_proba``; fitted in place.
    name : str
        Label used as the prefix of the printed summary line.
    Xtr, ytr : training features and labels.
    Xte, yte : held-out features and labels used for every metric.
    save_prefix : str
        File stem; the fitted pipeline is written to
        ``{model_dir}/{save_prefix}.joblib``.
    threshold : float, default 0.5
        Positive-class probability at or above which a sample is predicted
        positive. Only the thresholded metrics (accuracy, precision, recall,
        F1) depend on it.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``,
        ``pr_auc`` and ``brier``.
    """
    pipeline.fit(Xtr, ytr)
    # Column 1 of predict_proba is the probability of the positive class.
    proba = pipeline.predict_proba(Xte)[:, 1]
    pred = (proba >= threshold).astype(int)

    metrics = {
        # Metrics computed from the thresholded predictions.
        "accuracy": accuracy_score(yte, pred),
        "precision": precision_score(yte, pred),
        "recall": recall_score(yte, pred),
        "f1": f1_score(yte, pred),
        # Metrics computed from the raw probabilities.
        "roc_auc": roc_auc_score(yte, proba),
        "pr_auc": average_precision_score(yte, proba),
        "brier": brier_score_loss(yte, proba)
    }
    print(f"{name}: Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
          f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}, "
          f"ROC-AUC={metrics['roc_auc']:.3f}, PR-AUC={metrics['pr_auc']:.3f}, "
          f"Brier={metrics['brier']:.3f}")

    # Persist the whole fitted pipeline (preprocessor + classifier) for later cells.
    joblib.dump(pipeline, f"{model_dir}/{save_prefix}.joblib")
    return metrics

# Build with Day 5 best params
# NOTE(review): LightGBM's docs say `subsample` only takes effect when
# `subsample_freq` > 0, which is not set here, so row subsampling is presumably
# inactive — confirm this matches what was tuned on Day 5.
lgbm_params_h = dict(
    random_state=RANDOM_STATE,
    class_weight='balanced',
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
)

# Numeric columns are passed through unscaled for this model.
pre_h = make_preprocessor(Xh_tr, scale_numeric=False)
best_h_lgbm = Pipeline(steps=[
    ("prep", pre_h),
    ("clf", LGBMClassifier(**lgbm_params_h)),
])

# Fits the pipeline, prints test-set metrics and saves heart2020_lgbm.joblib.
metrics_h = eval_and_report(
    best_h_lgbm, "Heart2020 LGBM (tuned)",
    Xh_tr, yh_tr, Xh_te, yh_te,
    save_prefix="heart2020_lgbm",
)


[LightGBM] [Info] Number of positive: 21898, number of negative: 233938
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 255836, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




Heart2020 LGBM (tuned): Acc=0.732, Prec=0.215, Rec=0.806, F1=0.339, ROC-AUC=0.840, PR-AUC=0.351, Brier=0.168


In [None]:
# Cardio model: same hyper-parameters as the Heart2020 model.
# NOTE(review): LightGBM's docs say `subsample` only takes effect when
# `subsample_freq` > 0, which is not set here, so row subsampling is presumably
# inactive — confirm this is intended.
lgbm_params_c = dict(
    random_state=RANDOM_STATE,
    class_weight='balanced',
    n_estimators=200,
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
)

# Numeric columns are passed through unscaled for this model.
pre_c = make_preprocessor(Xc_tr, scale_numeric=False)
best_c_lgbm = Pipeline(steps=[
    ("prep", pre_c),
    ("clf", LGBMClassifier(**lgbm_params_c)),
])

# Fits the pipeline, prints test-set metrics and saves cardio_lgbm.joblib.
metrics_c = eval_and_report(
    best_c_lgbm, "Cardio LGBM (tuned)",
    Xc_tr, yc_tr, Xc_te, yc_te,
    save_prefix="cardio_lgbm",
)


[LightGBM] [Info] Number of positive: 27983, number of negative: 28017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 714
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000




Cardio LGBM (tuned): Acc=0.735, Prec=0.754, Rec=0.697, F1=0.724, ROC-AUC=0.800, PR-AUC=0.784, Brier=0.181


In [None]:
# UCI model: lower learning rate than the other two models, and the numeric
# columns are standardised by the preprocessor.
lgbm_params_u = dict(
    random_state=RANDOM_STATE,
    class_weight='balanced',
    n_estimators=200,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=-1,
)

pre_u = make_preprocessor(Xu_tr, scale_numeric=True)
best_u_lgbm = Pipeline(steps=[
    ("prep", pre_u),
    ("clf", LGBMClassifier(**lgbm_params_u)),
])

# Fits the pipeline, prints test-set metrics and saves uci_lgbm.joblib.
metrics_u = eval_and_report(
    best_u_lgbm, "UCI LGBM (tuned)",
    Xu_tr, yu_tr, Xu_te, yu_te,
    save_prefix="uci_lgbm",
)


[LightGBM] [Info] Number of positive: 111, number of negative: 131
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 242, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
UCI LGBM (tuned): Acc=0.885, Prec=0.839, Rec=0.929, F1=0.881, ROC-AUC=0.948, PR-AUC=0.945, Brier=0.092




In [None]:
# === Step 4: SHAP Explainability ===

def _positive_class_shap(shap_out):
    """Return a samples-by-features SHAP matrix for the positive class.

    Indexing the result of ``shap_values`` with ``[1]`` is only valid when it
    is a per-class list. When it is a single 2-D array, ``[1]`` selects the
    second *row*, and ``summary_plot`` then rejects the resulting vector.
    """
    if isinstance(shap_out, list):
        return shap_out[1]          # per-class list: take the positive class
    if shap_out.ndim == 3:
        # Presumably (samples, features, classes), as newer SHAP releases
        # return for multi-output models: keep the positive-class slice.
        return shap_out[:, :, 1]
    return shap_out                 # already samples x features


# Heart2020
loaded_h = joblib.load(f"{model_dir}/heart2020_lgbm.joblib")
# The saved pipeline is already fitted, so the preprocessor is only applied
# (transform) rather than re-fitted (fit_transform).
Xh_tr_trans = loaded_h.named_steps["prep"].transform(Xh_tr)
clf_h = loaded_h.named_steps["clf"]

explainer_h = shap.TreeExplainer(clf_h)
shap_values_h = _positive_class_shap(explainer_h.shap_values(Xh_tr_trans))
shap.summary_plot(
    shap_values_h, Xh_tr_trans,
    feature_names=loaded_h.named_steps["prep"].get_feature_names_out(),
    show=False
)
plt.title("Heart2020 SHAP Summary")
plt.savefig(f"{out_dir}/heart2020_shap_summary.png", bbox_inches="tight")
plt.close()

# Cardio
loaded_c = joblib.load(f"{model_dir}/cardio_lgbm.joblib")
Xc_tr_trans = loaded_c.named_steps["prep"].transform(Xc_tr)
clf_c = loaded_c.named_steps["clf"]

explainer_c = shap.TreeExplainer(clf_c)
shap_values_c = _positive_class_shap(explainer_c.shap_values(Xc_tr_trans))
shap.summary_plot(
    shap_values_c, Xc_tr_trans,
    feature_names=loaded_c.named_steps["prep"].get_feature_names_out(),
    show=False
)
plt.title("Cardio SHAP Summary")
plt.savefig(f"{out_dir}/cardio_shap_summary.png", bbox_inches="tight")
plt.close()

# UCI
loaded_u = joblib.load(f"{model_dir}/uci_lgbm.joblib")
Xu_tr_trans = loaded_u.named_steps["prep"].transform(Xu_tr)
clf_u = loaded_u.named_steps["clf"]

explainer_u = shap.TreeExplainer(clf_u)
shap_values_u = _positive_class_shap(explainer_u.shap_values(Xu_tr_trans))
shap.summary_plot(
    shap_values_u, Xu_tr_trans,
    feature_names=loaded_u.named_steps["prep"].get_feature_names_out(),
    show=False
)
plt.title("UCI SHAP Summary")
plt.savefig(f"{out_dir}/uci_shap_summary.png", bbox_inches="tight")
plt.close()




AssertionError: Summary plots need a matrix of shap_values, not a vector.

In [None]:
# === Step 4: SHAP Explainability (fixed) ===

def get_shap_values(explainer, X):
    """Return positive-class SHAP values as a 2-D (samples x features) matrix.

    ``explainer.shap_values(X)`` can come back in several layouts depending on
    the SHAP version and the model; all of them are normalised here so the
    result can be handed straight to ``shap.summary_plot``.

    Parameters
    ----------
    explainer : object exposing ``shap_values(X)`` (e.g. ``shap.TreeExplainer``).
    X : feature matrix passed through unchanged to ``explainer.shap_values``.

    Returns
    -------
    numpy.ndarray
        * list of per-class arrays -> the element for class 1;
        * 3-D array, assumed (samples, features, classes) -> the class-1 slice;
        * 1-D array (values for a single sample) -> a one-row matrix;
        * 2-D array -> returned unchanged.
    """
    shap_vals = explainer.shap_values(X)
    # Older SHAP releases: one array per class.
    if isinstance(shap_vals, list):
        return shap_vals[1]  # positive class
    # Per-class values stacked on the last axis.
    if shap_vals.ndim == 3:
        return shap_vals[:, :, 1]
    # A flat vector holds one value per feature for a single sample, so it has
    # to become 1 row x n_features columns (not n_features rows x 1 column).
    if shap_vals.ndim == 1:
        return shap_vals.reshape(1, -1)
    return shap_vals

def save_shap_summary(pipeline, X_train, title, png_stem):
    """Save a SHAP summary plot for a fitted ``prep`` + ``clf`` pipeline.

    The pipeline's already-fitted preprocessor is applied with ``transform``
    (not ``fit_transform``), so the loaded model is explained with exactly the
    encoding it was trained on and is not modified.

    Parameters
    ----------
    pipeline : fitted Pipeline with steps named ``"prep"`` and ``"clf"``.
    X_train : raw (untransformed) feature frame to explain.
    title : str
        Title drawn on the figure.
    png_stem : str
        File stem; the figure is written to ``{out_dir}/{png_stem}.png``.

    Returns
    -------
    tuple
        ``(X_transformed, classifier, explainer, shap_values)``.
    """
    prep = pipeline.named_steps["prep"]
    clf = pipeline.named_steps["clf"]
    X_trans = prep.transform(X_train)

    explainer = shap.TreeExplainer(clf)
    shap_vals = get_shap_values(explainer, X_trans)
    # show=False keeps the figure open so it can be titled and saved below.
    shap.summary_plot(
        shap_vals, X_trans,
        feature_names=prep.get_feature_names_out(),
        show=False
    )
    plt.title(title)
    plt.savefig(f"{out_dir}/{png_stem}.png", bbox_inches="tight")
    plt.close()
    return X_trans, clf, explainer, shap_vals


# Heart2020
loaded_h = joblib.load(f"{model_dir}/heart2020_lgbm.joblib")
Xh_tr_trans, clf_h, explainer_h, shap_values_h = save_shap_summary(
    loaded_h, Xh_tr, "Heart2020 SHAP Summary", "heart2020_shap_summary"
)

# Cardio
loaded_c = joblib.load(f"{model_dir}/cardio_lgbm.joblib")
Xc_tr_trans, clf_c, explainer_c, shap_values_c = save_shap_summary(
    loaded_c, Xc_tr, "Cardio SHAP Summary", "cardio_shap_summary"
)

# UCI
loaded_u = joblib.load(f"{model_dir}/uci_lgbm.joblib")
Xu_tr_trans, clf_u, explainer_u, shap_values_u = save_shap_summary(
    loaded_u, Xu_tr, "UCI SHAP Summary", "uci_shap_summary"
)




In [None]:
from sklearn.calibration import calibration_curve

def plot_calibration(y_true, y_proba, label, filename):
    """Draw a 10-bin calibration curve and save it as a PNG in ``out_dir``.

    Parameters
    ----------
    y_true : true binary labels.
    y_proba : predicted probabilities for the positive class.
    label : str
        Legend entry for the curve; also used in the figure title.
    filename : str
        File stem; the figure is written to ``{out_dir}/{filename}.png``.

    Nothing is returned: the figure is saved, closed, and a confirmation line
    is printed.
    """
    observed_rate, mean_predicted = calibration_curve(y_true, y_proba, n_bins=10)

    fig, ax = plt.subplots()
    ax.plot(mean_predicted, observed_rate, "s-", label=label)
    # Diagonal reference: predicted probability equals observed frequency.
    ax.plot([0, 1], [0, 1], "k--", label="Perfectly Calibrated")
    ax.set_xlabel("Predicted probability")
    ax.set_ylabel("True probability")
    ax.set_title(f"Calibration Curve — {label}")
    ax.legend()

    fig.savefig(f"{out_dir}/{filename}.png", bbox_inches="tight")
    plt.close(fig)
    print(f"✅ Saved {filename}.png")

# Positive-class probabilities on each held-out test split, taken from the
# pipelines loaded in the SHAP step.
proba_h, proba_c, proba_u = (
    fitted.predict_proba(X_test)[:,1]
    for fitted, X_test in ((loaded_h, Xh_te), (loaded_c, Xc_te), (loaded_u, Xu_te))
)

# One calibration figure per dataset: Heart2020, Cardio, UCI.
for y_test, proba, curve_label, png_stem in (
    (yh_te, proba_h, "Heart2020 LGBM", "heart2020_calibration"),
    (yc_te, proba_c, "Cardio LGBM", "cardio_calibration"),
    (yu_te, proba_u, "UCI LGBM", "uci_calibration"),
):
    plot_calibration(y_test, proba, curve_label, png_stem)




✅ Saved heart2020_calibration.png




✅ Saved cardio_calibration.png




✅ Saved uci_calibration.png
