In [None]:
# Mount Google Drive + install packages
from google.colab import drive
drive.mount('/content/drive')

!pip -q install imbalanced-learn xgboost lightgbm

# Globals
base_path = "/content/drive/MyDrive/heartriskx/data/"
RANDOM_STATE = 42

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, average_precision_score, precision_recall_curve)
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression


Mounted at /content/drive


In [None]:
# === Step 1: Reload datasets ===

# Heart2020
heart2020 = pd.read_csv(base_path + "heart_2020.csv")
# Ensure target column is numeric 0/1
if "HeartDisease" in heart2020.columns:
    heart2020["target"] = heart2020["HeartDisease"].map({"Yes": 1, "No": 0}).astype(int)
    heart2020 = heart2020.drop(columns=["HeartDisease"])
else:
    raise KeyError("Heart2020 file missing 'HeartDisease' column!")

# Cardio
cardio = pd.read_csv(base_path + "cardio_train.csv", sep=";")
if "cardio" in cardio.columns:
    cardio = cardio.rename(columns={"cardio": "target"})
else:
    raise KeyError("Cardio file missing 'cardio' column!")

# UCI (Cleveland)
uci = pd.read_csv(base_path + "uci_heart.csv")
# Sometimes it's 'num' instead of 'target'
if "target" not in uci.columns:
    if "num" in uci.columns:
        uci = uci.rename(columns={"num": "target"})
    else:
        raise KeyError("UCI file missing 'target' or 'num' column!")
# make it binary: >0 means heart disease
uci["target"] = (uci["target"] > 0).astype(int)

# Quick sanity check
print("Heart2020:", heart2020.shape, " target=", heart2020["target"].value_counts().to_dict())
print("Cardio   :", cardio.shape,    " target=", cardio["target"].value_counts().to_dict())
print("UCI      :", uci.shape,       " target=", uci["target"].value_counts().to_dict())


KeyError: "UCI file missing 'target' or 'num' column!"

In [None]:
uci = pd.read_csv(base_path + "uci_heart.csv")
print(uci.head())
print(uci.columns)


   63.0  1.0  1.0.1  145.0  233.0  1.0.2  2.0  150.0  0.0  2.3  3.0 0.0.1  \
0  67.0  1.0    4.0  160.0  286.0    0.0  2.0  108.0  1.0  1.5  2.0   3.0   
1  67.0  1.0    4.0  120.0  229.0    0.0  2.0  129.0  1.0  2.6  2.0   2.0   
2  37.0  1.0    3.0  130.0  250.0    0.0  0.0  187.0  0.0  3.5  3.0   0.0   
3  41.0  0.0    2.0  130.0  204.0    0.0  2.0  172.0  0.0  1.4  1.0   0.0   
4  56.0  1.0    2.0  120.0  236.0    0.0  0.0  178.0  0.0  0.8  1.0   0.0   

   6.0  0  
0  3.0  2  
1  7.0  1  
2  3.0  0  
3  3.0  0  
4  3.0  0  
Index(['63.0', '1.0', '1.0.1', '145.0', '233.0', '1.0.2', '2.0', '150.0',
       '0.0', '2.3', '3.0', '0.0.1', '6.0', '0'],
      dtype='object')


In [None]:
# === Step 1: Reload datasets ===

# Heart2020
heart2020 = pd.read_csv(base_path + "heart_2020.csv")
heart2020["target"] = heart2020["HeartDisease"].map({"Yes": 1, "No": 0}).astype(int)
heart2020 = heart2020.drop(columns=["HeartDisease"])

# Cardio
cardio = pd.read_csv(base_path + "cardio_train.csv", sep=";")
cardio = cardio.rename(columns={"cardio": "target"})

# UCI (Cleveland)
uci = pd.read_csv(base_path + "uci_heart.csv", header=None)

uci.columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# make target binary: >0 means heart disease
uci["target"] = (uci["target"] > 0).astype(int)

# Sanity check
print("Heart2020:", heart2020.shape, " target=", heart2020["target"].value_counts().to_dict())
print("Cardio   :", cardio.shape,    " target=", cardio["target"].value_counts().to_dict())
print("UCI      :", uci.shape,       " target=", uci["target"].value_counts().to_dict())


Heart2020: (319795, 18)  target= {0: 292422, 1: 27373}
Cardio   : (70000, 13)  target= {0: 35021, 1: 34979}
UCI      : (303, 14)  target= {0: 164, 1: 139}


In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, average_precision_score,
                             precision_recall_curve)

def make_preprocessor(df_X, scale_numeric=False):
    cat_cols = df_X.select_dtypes(include=['object','category']).columns.tolist()
    num_cols = [c for c in df_X.columns if c not in cat_cols]

    transformers = []
    if cat_cols:
        transformers.append(("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols))
    if num_cols:
        if scale_numeric:
            transformers.append(("num", StandardScaler(), num_cols))
        else:
            transformers.append(("num", "passthrough", num_cols))
    return ColumnTransformer(transformers, remainder='drop')

def report_test_metrics(name, y_true, proba, thr=0.5):
    y_pred = (proba >= thr).astype(int)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    roc = roc_auc_score(y_true, proba)
    pr  = average_precision_score(y_true, proba)
    print(f"{name}: Acc={acc:.3f}, Prec={prec:.3f}, Rec={rec:.3f}, "
          f"F1={f1:.3f}, ROC-AUC={roc:.3f}, PR-AUC={pr:.3f}")
    return {"accuracy":acc,"precision":prec,"recall":rec,"f1":f1,
            "roc_auc":roc,"pr_auc":pr}

def best_f1_threshold(y_true, proba):
    prec, rec, th = precision_recall_curve(y_true, proba)
    f1 = 2 * prec * rec / (prec + rec + 1e-9)
    i = np.argmax(f1)
    return float(th[i]), float(f1[i]), float(prec[i]), float(rec[i])


In [None]:
Xh = heart2020.drop(columns=['target'])
yh = heart2020['target']

# subset for speed
Xh_tune, _, yh_tune, _ = train_test_split(Xh, yh, train_size=0.35, stratify=yh, random_state=RANDOM_STATE)
Xh_tr, Xh_te, yh_tr, yh_te = train_test_split(Xh_tune, yh_tune, test_size=0.2, stratify=yh_tune, random_state=RANDOM_STATE)

pre_h = make_preprocessor(Xh_tr, scale_numeric=False)
lgbm = LGBMClassifier(random_state=RANDOM_STATE)

pipe = ImbPipeline(steps=[
    ("prep", pre_h),
    ("smote", SMOTE(random_state=RANDOM_STATE)),
    ("clf", lgbm)
])

param_dist = {
    "clf__n_estimators":   [200, 400, 800],
    "clf__learning_rate":  [0.03, 0.05, 0.1],
    "clf__num_leaves":     [31, 63, 95, 127],
    "clf__max_depth":      [-1, 6, 8, 10],
    "clf__min_child_samples": [20, 50, 100],
    "clf__subsample":      [0.7, 0.85, 1.0],
    "clf__colsample_bytree":[0.7, 0.85, 1.0],
    "clf__reg_lambda":     [0.0, 0.5, 1.0]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
search = RandomizedSearchCV(pipe, param_distributions=param_dist, n_iter=25, cv=cv,
                            scoring="average_precision", n_jobs=-1, verbose=1, random_state=RANDOM_STATE)
search.fit(Xh_tr, yh_tr)

print("Heart2020 Best PR-AUC (CV):", search.best_score_)
print("Heart2020 Best params:", search.best_params_)

best_h = search.best_estimator_
proba_h = best_h.predict_proba(Xh_te)[:,1]
report_test_metrics("Heart2020 LGBM (tuned @0.5)", yh_te, proba_h, 0.5)

thr, f1b, pb, rb = best_f1_threshold(yh_te, proba_h)
print(f"Best F1 threshold={thr:.3f} (F1={f1b:.3f}, Prec={pb:.3f}, Rec={rb:.3f})")
report_test_metrics("Heart2020 LGBM (tuned @bestF1)", yh_te, proba_h, thr)


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[LightGBM] [Info] Number of positive: 81877, number of negative: 81877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.120671 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 163754, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Heart2020 Best PR-AUC (CV): 0.3250154069541795
Heart2020 Best params: {'clf__subsample': 0.85, 'clf__reg_lambda': 1.0, 'clf__num_leaves': 31, 'clf__n_estimators': 200, 'clf__min_child_samples': 20, 'clf__max_depth': -1, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 0.85}




Heart2020 LGBM (tuned @0.5): Acc=0.912, Prec=0.465, Rec=0.160, F1=0.238, ROC-AUC=0.835, PR-AUC=0.325
Best F1 threshold=0.251 (F1=0.386, Prec=0.299, Rec=0.545)
Heart2020 LGBM (tuned @bestF1): Acc=0.851, Prec=0.299, Rec=0.545, F1=0.386, ROC-AUC=0.835, PR-AUC=0.325


{'accuracy': 0.8514249977664612,
 'precision': 0.2985714285714286,
 'recall': 0.5454070981210856,
 'f1': 0.3858936484490399,
 'roc_auc': np.float64(0.8353280374661018),
 'pr_auc': np.float64(0.32513570809010106)}

In [None]:
Xc = cardio.drop(columns=['target','id'])
yc = cardio['target']

Xc_tr, Xc_te, yc_tr, yc_te = train_test_split(Xc, yc, test_size=0.2, stratify=yc, random_state=RANDOM_STATE)
pre_c = make_preprocessor(Xc_tr, scale_numeric=False)

lgbm_c = LGBMClassifier(random_state=RANDOM_STATE, class_weight='balanced')

pipe_c = Pipeline(steps=[("prep", pre_c), ("clf", lgbm_c)])

param_dist_c = {
    "clf__n_estimators":   [200, 400, 800],
    "clf__learning_rate":  [0.03, 0.05, 0.1],
    "clf__num_leaves":     [31, 63, 95, 127],
    "clf__max_depth":      [-1, 6, 8, 10],
    "clf__min_child_samples": [20, 50, 100],
    "clf__subsample":      [0.7, 0.85, 1.0],
    "clf__colsample_bytree":[0.7, 0.85, 1.0],
    "clf__reg_lambda":     [0.0, 0.5, 1.0]
}

search_c = RandomizedSearchCV(pipe_c, param_distributions=param_dist_c, n_iter=20, cv=cv,
                              scoring="average_precision", n_jobs=-1, verbose=1, random_state=RANDOM_STATE)
search_c.fit(Xc_tr, yc_tr)

print("Cardio Best PR-AUC (CV):", search_c.best_score_)
print("Cardio Best params:", search_c.best_params_)

best_c = search_c.best_estimator_
proba_c = best_c.predict_proba(Xc_te)[:,1]
report_test_metrics("Cardio LGBM (tuned @0.5)", yc_te, proba_c, 0.5)

thr_c, f1b_c, pb_c, rb_c = best_f1_threshold(yc_te, proba_c)
print(f"Cardio best F1 threshold={thr_c:.3f} (F1={f1b_c:.3f}, Prec={pb_c:.3f}, Rec={rb_c:.3f})")
report_test_metrics("Cardio LGBM (tuned @bestF1)", yc_te, proba_c, thr_c)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 27983, number of negative: 28017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005345 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 714
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Cardio Best PR-AUC (CV): 0.7880304279484749
Cardio Best params: {'clf__subsample': 0.85, 'clf__reg_lambda': 1.0, 'clf__num_leaves': 31, 'clf__n_estimators': 200, 'clf__min_child_samples': 20, 'clf__max_depth': -1, 'clf__learning_rate': 0.05, 'clf__colsample_bytree': 0.85}




Cardio LGBM (tuned @0.5): Acc=0.735, Prec=0.754, Rec=0.697, F1=0.724, ROC-AUC=0.800, PR-AUC=0.784
Cardio best F1 threshold=0.350 (F1=0.742, Prec=0.663, Rec=0.842)
Cardio LGBM (tuned @bestF1): Acc=0.707, Prec=0.663, Rec=0.842, F1=0.742, ROC-AUC=0.800, PR-AUC=0.784


{'accuracy': 0.7069285714285715,
 'precision': 0.662692610505005,
 'recall': 0.8421955403087479,
 'f1': 0.7417385283565179,
 'roc_auc': np.float64(0.8003943327818229),
 'pr_auc': np.float64(0.7841865032710564)}

In [None]:
Xu = uci.drop(columns=['target'])
yu = uci['target']

Xu_tr, Xu_te, yu_tr, yu_te = train_test_split(Xu, yu, test_size=0.2, stratify=yu, random_state=RANDOM_STATE)
pre_u = make_preprocessor(Xu_tr, scale_numeric=True)  # scaling helps LogReg

# (a) Logistic Regression
logreg = LogisticRegression(max_iter=5000, class_weight='balanced', solver='lbfgs')
pipe_u_lr = Pipeline(steps=[("prep", pre_u), ("clf", logreg)])
grid_lr = {"clf__C": [0.1, 0.5, 1.0, 2.0, 5.0]}

search_lr = RandomizedSearchCV(pipe_u_lr, param_distributions=grid_lr, n_iter=5, cv=5,
                               scoring="average_precision", n_jobs=-1, verbose=1, random_state=RANDOM_STATE)
search_lr.fit(Xu_tr, yu_tr)

print("UCI LogReg Best PR-AUC (CV):", search_lr.best_score_)
print("UCI LogReg Best params:", search_lr.best_params_)

best_lr = search_lr.best_estimator_
proba_lr = best_lr.predict_proba(Xu_te)[:,1]
report_test_metrics("UCI LogReg (tuned @0.5)", yu_te, proba_lr, 0.5)

thr_lr, f1b_lr, pb_lr, rb_lr = best_f1_threshold(yu_te, proba_lr)
print(f"UCI LogReg best F1 threshold={thr_lr:.3f} (F1={f1b_lr:.3f}, Prec={pb_lr:.3f}, Rec={rb_lr:.3f})")
report_test_metrics("UCI LogReg (tuned @bestF1)", yu_te, proba_lr, thr_lr)

# (b) LightGBM
lgbm_u = LGBMClassifier(random_state=RANDOM_STATE, class_weight='balanced')
pipe_u_lgb = Pipeline(steps=[("prep", pre_u), ("clf", lgbm_u)])
grid_lgb_u = {
    "clf__n_estimators":  [200, 400, 800],
    "clf__learning_rate": [0.03, 0.05, 0.1],
    "clf__num_leaves":    [31, 63, 95],
    "clf__max_depth":     [-1, 6, 8],
}

search_lgb_u = RandomizedSearchCV(pipe_u_lgb, param_distributions=grid_lgb_u, n_iter=12, cv=5,
                                  scoring="average_precision", n_jobs=-1, verbose=1, random_state=RANDOM_STATE)
search_lgb_u.fit(Xu_tr, yu_tr)

print("UCI LGBM Best PR-AUC (CV):", search_lgb_u.best_score_)
print("UCI LGBM Best params:", search_lgb_u.best_params_)

best_lgb_u = search_lgb_u.best_estimator_
proba_lgb_u = best_lgb_u.predict_proba(Xu_te)[:,1]
report_test_metrics("UCI LGBM (tuned @0.5)", yu_te, proba_lgb_u, 0.5)

thr_lgb_u, f1b_lgb_u, pb_lgb_u, rb_lgb_u = best_f1_threshold(yu_te, proba_lgb_u)
print(f"UCI LGBM best F1 threshold={thr_lgb_u:.3f} (F1={f1b_lgb_u:.3f}, Prec={pb_lgb_u:.3f}, Rec={rb_lgb_u:.3f})")
report_test_metrics("UCI LGBM (tuned @bestF1)", yu_te, proba_lgb_u, thr_lgb_u)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
UCI LogReg Best PR-AUC (CV): 0.8884600452620347
UCI LogReg Best params: {'clf__C': 0.1}
UCI LogReg (tuned @0.5): Acc=0.852, Prec=0.806, Rec=0.893, F1=0.847, ROC-AUC=0.969, PR-AUC=0.964
UCI LogReg best F1 threshold=0.717 (F1=0.909, Prec=0.926, Rec=0.893)
UCI LogReg (tuned @bestF1): Acc=0.918, Prec=0.926, Rec=0.893, F1=0.909, ROC-AUC=0.969, PR-AUC=0.964
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[LightGBM] [Info] Number of positive: 111, number of negative: 131
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 247
[LightGBM] [Info] Number of data points in the train set: 242, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start trai



{'accuracy': 0.9016393442622951,
 'precision': 0.8666666666666667,
 'recall': 0.9285714285714286,
 'f1': 0.896551724137931,
 'roc_auc': np.float64(0.948051948051948),
 'pr_auc': np.float64(0.9445635104468247)}