In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the dataset CSVs are readable.
drive.mount('/content/drive')

# Directory that holds the dataset CSVs. Later cells build file paths with
# `base_path + "<file>.csv"`, so the trailing slash is required.
base_path = "/content/drive/MyDrive/heartriskx/data/"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier


Mounted at /content/drive


In [None]:
# Read the three dataset CSVs from the data directory in one pass; the
# generator is consumed immediately by the tuple unpacking.
heart2020, cardio, uci = (
    pd.read_csv(base_path + csv_name)
    for csv_name in (
        "heart_2020_clean.csv",
        "cardio_train_clean.csv",
        "uci_cleveland_clean.csv",
    )
)


In [None]:
# Remove the "HeartDisease" column when present so it cannot leak into the
# features; errors="ignore" makes this a no-op if the column is already gone,
# which keeps the cell safe to re-run.
heart2020 = heart2020.drop(columns=["HeartDisease"], errors="ignore")
print("Heart2020 shape after dropping leakage col:", heart2020.shape)


Heart2020 shape after dropping leakage col: (319795, 18)


In [None]:
def _score_predictions(y_true, y_pred):
    """Return accuracy, precision, recall and F1 for one set of predictions.

    ``zero_division=0`` pins the score to 0 when a metric is undefined (for
    example, the model predicts no positives) instead of emitting a warning;
    0 is also the value scikit-learn falls back to by default.
    """
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
    }


def run_models(X, y, dataset_name):
    """Train and evaluate four classifiers on a stratified 80/20 split.

    The models are a random forest, XGBoost, LightGBM and a stacking ensemble
    of those three with a logistic-regression meta-learner.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Categorical columns are one-hot encoded here.
    y : array-like
        Class labels; also used to stratify the split.
    dataset_name : str
        Label used only in the printed summary.

    Returns
    -------
    dict
        Maps "RandomForest", "XGBoost", "LightGBM" and "Stacking" (in that
        order) to a dict with "accuracy", "precision", "recall" and "f1".

    Notes
    -----
    No feature scaling is applied. All base learners are tree ensembles, and
    the stacking meta-learner is fit on the base learners' outputs rather
    than on the raw features.
    """
    # One-hot encode categoricals
    X = pd.get_dummies(X, drop_first=True)

    # Fix column names for LightGBM (remove spaces/special chars)
    X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # `use_label_encoder` is intentionally not passed to XGBoost: current
    # releases ignore it and print a "Parameters ... are not used" warning.
    # `verbose=-1` keeps LightGBM's per-fit info logs out of the cell output.
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42),
        "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
        "LightGBM": LGBMClassifier(random_state=42, verbose=-1),
        "Stacking": StackingClassifier(
            estimators=[
                ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
                ('xgb', XGBClassifier(eval_metric='logloss', random_state=42)),
                ('lgbm', LGBMClassifier(random_state=42, verbose=-1)),
            ],
            final_estimator=LogisticRegression(max_iter=1000),
            n_jobs=-1,
        ),
    }

    # Fit and score each model on the same split; dict order fixes the
    # order of the returned results and of the printed summary.
    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        results[model_name] = _score_predictions(y_test, model.predict(X_test))

    print(f"\nðŸ“Š Advanced Results for {dataset_name}:")
    for model, metrics in results.items():
        print(f"{model}: Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
              f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}")

    return results


In [None]:
# Evaluate every dataset with the same model suite. In each case the label
# column 'target' is split off from the features before calling run_models.

# Heart2020
X, y = heart2020.drop(columns=['target']), heart2020['target']
res_heart2020_adv = run_models(X=X, y=y, dataset_name="Heart2020")

# Cardio ('id' is dropped along with the label so it is not used as a feature)
X, y = cardio.drop(columns=['target', 'id']), cardio['target']
res_cardio_adv = run_models(X=X, y=y, dataset_name="Cardio")

# UCI Cleveland
X, y = uci.drop(columns=['target']), uci['target']
res_uci_adv = run_models(X=X, y=y, dataset_name="UCI Cleveland")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 21898, number of negative: 233938
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.060003 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 255836, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085594 -> initscore=-2.368661
[LightGBM] [Info] Start training from score -2.368661

ðŸ“Š Advanced Results for Heart2020:
RandomForest: Acc=0.904, Prec=0.335, Rec=0.119, F1=0.176
XGBoost: Acc=0.915, Prec=0.526, Rec=0.104, F1=0.174
LightGBM: Acc=0.917, Prec=0.598, Rec=0.082, F1=0.145
Stacking: Acc=0.915, Prec=0.523, Rec=0.146, F1=0.229


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 27983, number of negative: 28017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 714
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499696 -> initscore=-0.001214
[LightGBM] [Info] Start training from score -0.001214

ðŸ“Š Advanced Results for Cardio:
RandomForest: Acc=0.713, Prec=0.719, Rec=0.701, F1=0.710
XGBoost: Acc=0.732, Prec=0.751, Rec=0.694, F1=0.721
LightGBM: Acc=0.734, Prec=0.753, Rec=0.698, F1=0.724
Stacking: Acc=0.735, Prec=0.751, Rec=0.703, F1=0.726


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 109, number of negative: 128
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 237, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.459916 -> initscore=-0.160682
[LightGBM] [Info] Start training from score -0.160682

ðŸ“Š Advanced Results for UCI Cleveland:
RandomForest: Acc=0.867, Prec=0.885, Rec=0.821, F1=0.852
XGBoost: Acc=0.867, Prec=0.885, Rec=0.821, F1=0.852
LightGBM: Acc=0.817, Prec=0.870, Rec=0.714, F1=0.784
Stacking: Acc=0.833, Prec=0.875, Rec=0.750, F1=0.808


In [None]:
# NOTE(review): this cell repeats the Heart2020 and Cardio evaluations from
# the previous cell on the same inputs and overwrites the same result
# variables; the seeds are fixed, so the metrics come out identical.

# Heart2020 (leakage fixed)
X, y = heart2020.drop(columns=['target']), heart2020['target']
res_heart2020_adv = run_models(X=X, y=y, dataset_name="Heart2020")

# Cardio
X, y = cardio.drop(columns=['target', 'id']), cardio['target']
res_cardio_adv = run_models(X=X, y=y, dataset_name="Cardio")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 21898, number of negative: 233938
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.059442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 406
[LightGBM] [Info] Number of data points in the train set: 255836, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.085594 -> initscore=-2.368661
[LightGBM] [Info] Start training from score -2.368661

ðŸ“Š Advanced Results for Heart2020:
RandomForest: Acc=0.904, Prec=0.335, Rec=0.119, F1=0.176
XGBoost: Acc=0.915, Prec=0.526, Rec=0.104, F1=0.174
LightGBM: Acc=0.917, Prec=0.598, Rec=0.082, F1=0.145
Stacking: Acc=0.915, Prec=0.523, Rec=0.146, F1=0.229


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 27983, number of negative: 28017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 714
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499696 -> initscore=-0.001214
[LightGBM] [Info] Start training from score -0.001214

ðŸ“Š Advanced Results for Cardio:
RandomForest: Acc=0.713, Prec=0.719, Rec=0.701, F1=0.710
XGBoost: Acc=0.732, Prec=0.751, Rec=0.694, F1=0.721
LightGBM: Acc=0.734, Prec=0.753, Rec=0.698, F1=0.724
Stacking: Acc=0.735, Prec=0.751, Rec=0.703, F1=0.726
