In [None]:
# Mount + basics
# Colab-only bootstrap: mounts Google Drive at /content/drive so the
# project folders referenced by DATA_DIR / OUT_DIR / FINAL_DIR are reachable.
from google.colab import drive
drive.mount('/content/drive')

import os, json, warnings, joblib
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, precision_recall_curve,
    confusion_matrix, classification_report, roc_curve
)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

# Silence all Python warnings so the cell output stays readable.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be shown either.
warnings.filterwarnings("ignore")

# Seed passed to every train/test split and estimator in this notebook.
RANDOM_STATE = 42

# Project locations on the mounted Drive.
DATA_DIR = "/content/drive/MyDrive/heartriskx/data"
OUT_DIR = "/content/drive/MyDrive/heartriskx/outputs/day7"
FINAL_DIR = "/content/drive/MyDrive/heartriskx/models/final"

# Create both output folders up front; existing folders are left alone.
for _dir in (OUT_DIR, FINAL_DIR):
    os.makedirs(_dir, exist_ok=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Heart2020: recode the Yes/No label into a 0/1 `target` column and drop the
# original label column.
heart2020 = (
    pd.read_csv(f"{DATA_DIR}/heart_2020.csv")
    .assign(target=lambda d: d["HeartDisease"].map({"Yes": 1, "No": 0}).astype(int))
    .drop(columns=["HeartDisease"])
)

# Cardio: semicolon-separated file whose label column is named `cardio`.
cardio = pd.read_csv(f"{DATA_DIR}/cardio_train.csv", sep=";").rename(
    columns={"cardio": "target"}
)

# UCI (Cleveland) - headerless file, so column names are assigned by hand.
# NOTE(review): copies of this dataset often mark missing values with "?";
# verify the dtypes of `ca` / `thal` after loading.
uci = pd.read_csv(f"{DATA_DIR}/uci_heart.csv", header=None)
uci.columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target",
]
# Collapse the label to binary: any value above 0 becomes 1.
uci = uci.assign(target=lambda d: (d["target"] > 0).astype(int))

# One summary line per dataset: shape and class counts.
for _label, _frame in (("Heart2020:", heart2020), ("Cardio   :", cardio), ("UCI      :", uci)):
    print(_label, _frame.shape, " target=", _frame["target"].value_counts().to_dict())


Heart2020: (319795, 18)  target= {0: 292422, 1: 27373}
Cardio   : (70000, 13)  target= {0: 35021, 1: 34979}
UCI      : (303, 14)  target= {0: 164, 1: 139}


In [None]:
def make_preprocessor(X, scale_numeric=False):
    """Build an unfitted ColumnTransformer matched to the dtypes found in ``X``.

    Numeric columns are standard-scaled when ``scale_numeric`` is true and
    passed through untouched otherwise. Every non-numeric column is one-hot
    encoded, with categories unseen at fit time ignored at transform time.
    A group with no columns is left out of the transformer entirely.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column dtypes decide the numeric / categorical split.
    scale_numeric : bool, default False
        Whether to apply ``StandardScaler`` to the numeric columns.

    Returns
    -------
    ColumnTransformer
        Transformer with up to two entries named ``"num"`` and ``"cat"``.
    """
    numeric = X.select_dtypes(include=np.number).columns.tolist()
    categorical = X.select_dtypes(exclude=np.number).columns.tolist()

    numeric_step = StandardScaler() if scale_numeric else "passthrough"
    candidates = [
        ("num", numeric_step, numeric),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
    ]
    # Keep only the groups that actually have columns.
    return ColumnTransformer([entry for entry in candidates if entry[2]])

def best_f1_threshold(y_true, proba):
    """Return the decision threshold that maximises F1 on the given scores.

    ``precision_recall_curve`` returns precision and recall arrays that are one
    element longer than the thresholds array: the last point is a fixed
    (precision=1, recall=0) endpoint with no threshold of its own. That point
    is excluded from the search so the chosen index is always valid for the
    thresholds array.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels.
    proba : array-like
        Scores / predicted probabilities for the positive class.

    Returns
    -------
    tuple of float
        ``(threshold, f1, precision, recall)`` at the best-F1 operating point.

    Raises
    ------
    ValueError
        From ``np.nanargmax`` if F1 is NaN at every candidate threshold.
    """
    precision, recall, thresholds = precision_recall_curve(y_true, proba)
    # Drop the trailing endpoint that has no matching threshold.
    precision, recall = precision[:-1], recall[:-1]
    # The small epsilon avoids 0/0 when precision and recall are both zero.
    f1 = 2 * precision * recall / (precision + recall + 1e-9)
    best = int(np.nanargmax(f1))
    return (
        float(thresholds[best]),
        float(f1[best]),
        float(precision[best]),
        float(recall[best]),
    )

def report_thresholded(name, y_true, proba, thr, save_prefix=None):
    """Print and return classification metrics for ``proba`` cut at ``thr``.

    Scores at or above ``thr`` are labelled positive. Accuracy, precision,
    recall and F1 use those hard labels; ROC-AUC and PR-AUC use the raw scores.
    The confusion matrix and the classification report are printed as well.

    Parameters
    ----------
    name : str
        Label used in the printed summary, plot titles and legends.
    y_true : array-like
        Binary ground-truth labels.
    proba : numpy.ndarray
        Positive-class scores.
    thr : float
        Decision threshold applied to ``proba``.
    save_prefix : str, optional
        When truthy, ROC and PR curves are written to
        ``{OUT_DIR}/{save_prefix}_roc.png`` and ``{OUT_DIR}/{save_prefix}_pr.png``.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``, ``pr_auc``.
    """
    y_pred = (proba >= thr).astype(int)

    metrics = dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision_score(y_true, y_pred),
        recall=recall_score(y_true, y_pred),
        f1=f1_score(y_true, y_pred),
        roc_auc=roc_auc_score(y_true, proba),
        pr_auc=average_precision_score(y_true, proba),
    )
    print(
        f"{name}:   Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
        f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}, "
        f"ROC-AUC={metrics['roc_auc']:.3f}, PR-AUC={metrics['pr_auc']:.3f}"
    )

    # Confusion matrix and per-class report at the chosen threshold.
    print("\nConfusion Matrix (@thr):\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report (@thr):\n", classification_report(y_true, y_pred))

    # Curves are only written when a file prefix was supplied.
    if save_prefix:
        fpr, tpr, _ = roc_curve(y_true, proba)
        roc_fig, roc_ax = plt.subplots()
        roc_ax.plot(fpr, tpr, label=name)
        roc_ax.plot([0, 1], [0, 1], "k--")  # chance diagonal
        roc_ax.set_title(f"ROC â€” {name}")
        roc_ax.set_xlabel("FPR")
        roc_ax.set_ylabel("TPR")
        roc_ax.legend()
        roc_fig.savefig(f"{OUT_DIR}/{save_prefix}_roc.png", bbox_inches="tight")
        plt.close(roc_fig)

        prec_pts, rec_pts, _ = precision_recall_curve(y_true, proba)
        pr_fig, pr_ax = plt.subplots()
        pr_ax.plot(rec_pts, prec_pts, label=name)
        pr_ax.set_title(f"PR â€” {name}")
        pr_ax.set_xlabel("Recall")
        pr_ax.set_ylabel("Precision")
        pr_ax.legend()
        pr_fig.savefig(f"{OUT_DIR}/{save_prefix}_pr.png", bbox_inches="tight")
        plt.close(pr_fig)

    return metrics


In [None]:
# Hyper-parameters used for the two large datasets (Heart2020 and Cardio).
# NOTE(review): in LightGBM, `subsample` (bagging_fraction) only takes effect when
# `subsample_freq` (bagging_freq) is > 0, and it is not set here - so row
# subsampling is presumably inactive. Confirm intent before changing the model.
_LARGE_DATA_PARAMS = dict(
    n_estimators=200, learning_rate=0.05, num_leaves=31, max_depth=-1,
    min_child_samples=20, subsample=0.85, colsample_bytree=0.85, reg_lambda=1.0,
)
# UCI uses a lower learning rate and leaves the remaining settings at library defaults.
_UCI_PARAMS = dict(n_estimators=200, learning_rate=0.03, num_leaves=31, max_depth=-1)


def _fit_and_evaluate(label, frame, save_tag, clf_params, drop_cols=("target",), scale_numeric=False):
    """Split, fit, tune the F1 threshold on TRAIN, and report on TEST for one dataset.

    Parameters
    ----------
    label : str
        Dataset name used in printed headers and report titles.
    frame : pandas.DataFrame
        Dataset containing a binary ``target`` column.
    save_tag : str
        Prefix for the plot files written by ``report_thresholded``.
    clf_params : dict
        Extra keyword arguments for ``LGBMClassifier`` (``random_state`` and
        ``class_weight="balanced"`` are always set here).
    drop_cols : sequence of str
        Columns removed from ``frame`` to form the feature matrix.
    scale_numeric : bool
        Forwarded to ``make_preprocessor``.

    Returns
    -------
    tuple
        ``(X, y, X_tr, X_te, y_tr, y_te, preprocessor, pipeline, proba_tr,
        threshold, f1, precision, recall, proba_te)``.
    """
    X = frame.drop(columns=list(drop_cols))
    y = frame["target"]
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
    )

    pre = make_preprocessor(X_tr, scale_numeric=scale_numeric)
    pipe = Pipeline(steps=[
        ("prep", pre),
        ("clf", LGBMClassifier(random_state=RANDOM_STATE, class_weight="balanced", **clf_params)),
    ])
    pipe.fit(X_tr, y_tr)

    # NOTE(review): the threshold is tuned on in-sample probabilities from the
    # same rows the model was fitted on, which tends to be optimistic; consider
    # out-of-fold probabilities or a validation split.
    proba_tr = pipe.predict_proba(X_tr)[:, 1]
    thr, f1, prec, rec = best_f1_threshold(y_tr, proba_tr)
    print(f"\n{label} â€” threshold search on TRAIN")
    print(f"Best F1 threshold={thr:.3f} (F1={f1:.3f}, Prec={prec:.3f}, Rec={rec:.3f})")

    # Held-out evaluation at the default cut-off and at the tuned one.
    proba_te = pipe.predict_proba(X_te)[:, 1]
    print(f"{label} @0.5:")
    report_thresholded(f"{label} LGBM (final @0.5)", y_te, proba_te, 0.5, save_prefix=f"{save_tag}_at05")
    print(f"{label} @bestF1:")
    report_thresholded(f"{label} LGBM (final @bestF1)", y_te, proba_te, thr, save_prefix=f"{save_tag}_atbest")

    return (X, y, X_tr, X_te, y_tr, y_te, pre, pipe, proba_tr, thr, f1, prec, rec, proba_te)


# ===== Heart2020
(Xh, yh, Xh_tr, Xh_te, yh_tr, yh_te, pre_h, pipe_h,
 proba_h_tr, thr_h, f1_h, ph_h, rh_h, proba_h_te) = _fit_and_evaluate(
    "Heart2020", heart2020, "heart2020", _LARGE_DATA_PARAMS
)

# ===== Cardio (the `id` column is dropped from the features as well)
(Xc, yc, Xc_tr, Xc_te, yc_tr, yc_te, pre_c, pipe_c,
 proba_c_tr, thr_c, f1_c, ph_c, rh_c, proba_c_te) = _fit_and_evaluate(
    "Cardio", cardio, "cardio", _LARGE_DATA_PARAMS, drop_cols=("target", "id")
)

# ===== UCI (numeric scaling on: LR-friendly, also fine for LGBM)
(Xu, yu, Xu_tr, Xu_te, yu_tr, yu_te, pre_u, pipe_u,
 proba_u_tr, thr_u, f1_u, ph_u, rh_u, proba_u_te) = _fit_and_evaluate(
    "UCI", uci, "uci", _UCI_PARAMS, scale_numeric=True
)

print("\nâœ… Step 3 done â€” metrics & plots saved to", OUT_DIR)


[LightGBM] [Info] Number of positive: 21898, number of negative: 233938
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 432
[LightGBM] [Info] Number of data points in the train set: 255836, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

Heart2020 â€” threshold search on TRAIN
Best F1 threshold=0.723 (F1=0.412, Prec=0.333, Rec=0.539)
Heart2020 @0.5:
Heart2020 LGBM (final @0.5):   Acc=0.732, Prec=0.215, Rec=0.806, F1=0.339, ROC-AUC=0.840, PR-AUC=0.351

Confusion Matrix (@thr):
 [[42380 16104]
 [ 1064  4411]]

Classification Report (@thr):
               precision    recall  f1-score   support

           0       0.98      0.72      0.83     58484
           1       0.22      0.81      0.34      5475

    accuracy                     

In [None]:
def export_bundle(prefix, pipe, thr):
    """Persist a fitted pipeline plus its decision threshold under ``FINAL_DIR``.

    Two files are written: ``{prefix}_pipeline.joblib`` (the pipeline dumped
    with joblib) and ``{prefix}_bundle.json`` (a manifest holding the pipeline
    path, the threshold as a float, and the prefix). The manifest is also
    printed.

    Parameters
    ----------
    prefix : str
        Name stem for both output files.
    pipe : object
        Fitted pipeline to serialise.
    thr : float
        Decision threshold to store alongside the model.
    """
    model_file = f"{FINAL_DIR}/{prefix}_pipeline.joblib"
    joblib.dump(pipe, model_file)

    manifest = dict(pipeline_path=model_file, threshold=float(thr), prefix=prefix)
    with open(f"{FINAL_DIR}/{prefix}_bundle.json", "w") as fh:
        fh.write(json.dumps(manifest, indent=2))

    print(f"âœ… Exported {prefix}:")
    print(manifest)

# Export one bundle per dataset, in the same order the models were trained.
for _prefix, _pipe, _thr in (
    ("heart2020", pipe_h, thr_h),
    ("cardio", pipe_c, thr_c),
    ("uci", pipe_u, thr_u),
):
    export_bundle(_prefix, _pipe, _thr)

print("\nâœ… Step 4 done â€” bundles saved to", FINAL_DIR)


âœ… Exported heart2020:
{'pipeline_path': '/content/drive/MyDrive/heartriskx/models/final/heart2020_pipeline.joblib', 'threshold': 0.723115550317454, 'prefix': 'heart2020'}
âœ… Exported cardio:
{'pipeline_path': '/content/drive/MyDrive/heartriskx/models/final/cardio_pipeline.joblib', 'threshold': 0.38848665919874714, 'prefix': 'cardio'}
âœ… Exported uci:
{'pipeline_path': '/content/drive/MyDrive/heartriskx/models/final/uci_pipeline.joblib', 'threshold': 0.4891037479162372, 'prefix': 'uci'}

âœ… Step 4 done â€” bundles saved to /content/drive/MyDrive/heartriskx/models/final


In [None]:
def load_bundle(prefix):
    """Load the pipeline and threshold recorded in ``{FINAL_DIR}/{prefix}_bundle.json``.

    Parameters
    ----------
    prefix : str
        Name stem of the bundle manifest to read.

    Returns
    -------
    tuple
        ``(pipeline, threshold)`` where ``threshold`` is a float.
    """
    manifest_path = f"{FINAL_DIR}/{prefix}_bundle.json"
    with open(manifest_path, "r") as fh:
        manifest = json.loads(fh.read())

    # joblib.load is pickle-based and can execute arbitrary code: only load
    # bundles from a trusted location.
    model = joblib.load(manifest["pipeline_path"])
    threshold = float(manifest["threshold"])
    return model, threshold

def predict_csv(prefix, csv_path, out_csv_path):
    """Score every row of a CSV with a saved bundle and write the result to disk.

    Parameters
    ----------
    prefix : str
        Bundle to load via ``load_bundle``.
    csv_path : str
        Input CSV holding the feature columns.
    out_csv_path : str
        Destination CSV; written without the index.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with ``risk_proba`` (positive-class probability) and
        ``risk_label`` (1 when the probability is at or above the bundle's
        threshold, else 0) added.
    """
    model, cutoff = load_bundle(prefix)
    records = pd.read_csv(csv_path)
    scores = model.predict_proba(records)[:, 1]

    scored = records.assign(
        risk_proba=scores,
        risk_label=(scores >= cutoff).astype(int),
    )
    scored.to_csv(out_csv_path, index=False)
    return scored

def predict_single(prefix, record_dict):
    """Score a single record (feature name -> value) with a saved bundle.

    Parameters
    ----------
    prefix : str
        Bundle to load via ``load_bundle``.
    record_dict : dict
        One row of features keyed by column name.

    Returns
    -------
    dict
        ``probability`` (float), ``label`` (1 when the probability is at or
        above the threshold, else 0) and the ``threshold`` that was applied.
    """
    model, cutoff = load_bundle(prefix)
    row = pd.DataFrame([record_dict])  # one-row frame so the pipeline sees named columns
    score = float(model.predict_proba(row)[0, 1])
    return {"probability": score, "label": int(score >= cutoff), "threshold": cutoff}

# Status message printed once the inference helper definitions in this cell have run.
print("âœ… Inference functions ready.")


âœ… Inference functions ready.


In [None]:
# Batch-inference demo: write the first 5 Cardio test rows to a temporary CSV,
# then score that file through the exported "cardio" bundle.
demo_csv = f"{OUT_DIR}/cardio_demo_input.csv"
batch_out_path = f"{OUT_DIR}/cardio_predictions.csv"

cardio_demo = Xc_te.iloc[:5].copy()
cardio_demo.to_csv(demo_csv, index=False)

batch_pred = predict_csv("cardio", demo_csv, batch_out_path)

print("ðŸ”Ž Batch prediction preview:")
display(batch_pred.head())
print("\nSaved:", batch_out_path)


ðŸ”Ž Batch prediction preview:


Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,risk_proba,risk_label
0,19386,1,155,59.5,120,85,1,1,0,0,1,0.266107,0
1,21081,1,160,59.0,130,90,1,1,0,0,1,0.602466,1
2,15129,2,175,88.0,120,80,2,1,0,0,1,0.392869,1
3,18785,2,177,62.0,120,90,1,1,0,0,1,0.268369,0
4,18171,1,167,81.0,120,80,1,1,0,0,1,0.234537,0



Saved: /content/drive/MyDrive/heartriskx/outputs/day7/cardio_predictions.csv


In [None]:
# Take one real test row as a dict for realism
one = Xc_te.iloc[0].to_dict()
single_pred = predict_single("cardio", one)
print("ðŸ”Ž Single prediction (cardio) on one test row:")
print(single_pred)


ðŸ”Ž Single prediction (cardio) on one test row:
{'probability': 0.2661067645838688, 'label': 0, 'threshold': 0.38848665919874714}
