In [None]:
from google.colab import drive
drive.mount('/content/drive')

base_path = "/content/drive/MyDrive/heartriskx/data/"

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# For imbalance handling
from imblearn.over_sampling import SMOTE


Mounted at /content/drive


In [None]:
heart2020 = pd.read_csv(base_path + "heart_2020_clean.csv")

if "HeartDisease" in heart2020.columns:
    heart2020 = heart2020.drop(columns=["HeartDisease"])  # just in case
print(heart2020['target'].value_counts())


target
0    292422
1     27373
Name: count, dtype: int64


In [None]:
def run_balanced_baselines(X, y, dataset_name):
    # One-hot encode categoricals
    X = pd.get_dummies(X, drop_first=True)
    X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Balance only training set (not test set!)
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    print(f"\nðŸ”„ {dataset_name}: Before balancing: {y_train.value_counts().to_dict()}, After balancing: {y_train_bal.value_counts().to_dict()}")

    # Scale
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_bal)
    X_test_scaled = scaler.transform(X_test)

    results = {}

    # Logistic Regression
    logreg = LogisticRegression(max_iter=1000)
    logreg.fit(X_train_scaled, y_train_bal)
    y_pred = logreg.predict(X_test_scaled)
    results["LogReg"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train_bal, y_train_bal)
    y_pred = rf.predict(X_test)
    results["RandomForest"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }

    print(f"\nðŸ“Š Balanced Baseline Results for {dataset_name}:")
    for model, metrics in results.items():
        print(f"{model}: Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
              f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}")

    return results


In [None]:
X = heart2020.drop(columns=['target'])
y = heart2020['target']
res_heart2020_balanced = run_balanced_baselines(X, y, "Heart2020 (Balanced)")



ðŸ”„ Heart2020 (Balanced): Before balancing: {0: 233938, 1: 21898}, After balancing: {0: 233938, 1: 233938}

ðŸ“Š Balanced Baseline Results for Heart2020 (Balanced):
LogReg: Acc=0.841, Prec=0.272, Rec=0.513, F1=0.355
RandomForest: Acc=0.878, Prec=0.295, Rec=0.309, F1=0.302


In [None]:
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import StackingClassifier

def run_balanced_advanced(X, y, dataset_name):
    # One-hot encode categoricals
    X = pd.get_dummies(X, drop_first=True)
    X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Balance only training set (not test set!)
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    print(f"\nðŸ”„ {dataset_name}: Before balancing: {y_train.value_counts().to_dict()}, After balancing: {y_train_bal.value_counts().to_dict()}")

    results = {}

    # Random Forest
    rf = RandomForestClassifier(n_estimators=200, random_state=42)
    rf.fit(X_train_bal, y_train_bal)
    y_pred = rf.predict(X_test)
    results["RandomForest"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }

    # XGBoost
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb.fit(X_train_bal, y_train_bal)
    y_pred = xgb.predict(X_test)
    results["XGBoost"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }

    # LightGBM
    lgbm = lgb.LGBMClassifier(random_state=42)
    lgbm.fit(X_train_bal, y_train_bal)
    y_pred = lgbm.predict(X_test)
    results["LightGBM"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }

    # Stacking (LogReg + RF + XGB + LGBM)
    stack = StackingClassifier(
        estimators=[
            ('rf', rf),
            ('xgb', xgb),
            ('lgbm', lgbm)
        ],
        final_estimator=LogisticRegression(max_iter=1000),
        passthrough=True
    )
    stack.fit(X_train_bal, y_train_bal)
    y_pred = stack.predict(X_test)
    results["Stacking"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred)
    }

    print(f"\nðŸ“Š Balanced Advanced Results for {dataset_name}:")
    for model, metrics in results.items():
        print(f"{model}: Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
              f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}")

    return results


In [None]:
X = heart2020.drop(columns=['target'])
y = heart2020['target']
res_heart2020_adv_balanced = run_balanced_advanced(X, y, "Heart2020 (Balanced)")



ðŸ”„ Heart2020 (Balanced): Before balancing: {0: 233938, 1: 21898}, After balancing: {0: 233938, 1: 233938}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 233938, number of negative: 233938
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.336436 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 467876, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 233938, number of negative: 233938
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.260971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 467876, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 187150, number of negative: 187150
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.197232 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 374300, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 187151, number of negative: 187150
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.202488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1086
[LightGBM] [Info] Number of data points in the train set: 374301, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500001 -> initsco

In [None]:
# Ensure UCI target is binary (0 = no disease, 1 = disease)
uci['target'] = (uci['target'] > 0).astype(int)

# Now run models
X = uci.drop(columns=['target'])
y = uci['target']
res_uci_cs = run_cost_sensitive_models(X, y, "UCI Cleveland (Cost-sensitive)")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 111, number of negative: 130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239
[LightGBM] [Info] Number of data points in the train set: 241, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

âš¡ Best F1 for UCI Cleveland (Cost-sensitive) (RF) = 0.933 at threshold=0.440 (Prec=0.875, Rec=1.000)

ðŸ“Š Cost-sensitive Results for UCI Cleveland (Cost-sensitive):
LogReg: Acc=0.951, Prec=0.963, Rec=0.929, F1=0.945, ROC-AUC=0.994, PR-AUC=0.992
RandomForest: Acc=0.902, Prec=0.893, Rec=0.893, F1=0.893, ROC-AUC=0.976, PR-AUC=0.971
XGBoost: Acc=0.820, Prec=0.815, Rec=0.786, F1=0.800, ROC-AUC=0.937, PR-AUC=0.920
LightGBM: Acc=0.902, Prec=0.867, 

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

def run_cost_sensitive_models(X, y, dataset_name):
    # One-hot encode categoricals if present
    X = pd.get_dummies(X, drop_first=True)
    X.columns = X.columns.str.replace('[^A-Za-z0-9_]+', '_', regex=True)

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    results = {}

    # Logistic Regression
    logreg = LogisticRegression(max_iter=1000, class_weight='balanced')
    logreg.fit(X_train, y_train)
    proba = logreg.predict_proba(X_test)[:,1]
    y_pred = logreg.predict(X_test)
    results["LogReg"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, proba),
        "pr_auc": average_precision_score(y_test, proba)
    }

    # Random Forest
    rf = RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced')
    rf.fit(X_train, y_train)
    proba = rf.predict_proba(X_test)[:,1]
    y_pred = rf.predict(X_test)
    results["RandomForest"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, proba),
        "pr_auc": average_precision_score(y_test, proba)
    }

    # XGBoost
    xgb = XGBClassifier(
        eval_metric='logloss',
        random_state=42,
        scale_pos_weight=(y_train.value_counts()[0] / y_train.value_counts()[1])
    )
    xgb.fit(X_train, y_train)
    proba = xgb.predict_proba(X_test)[:,1]
    y_pred = xgb.predict(X_test)
    results["XGBoost"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, proba),
        "pr_auc": average_precision_score(y_test, proba)
    }

    # LightGBM
    lgbm = LGBMClassifier(random_state=42, class_weight='balanced')
    lgbm.fit(X_train, y_train)
    proba = lgbm.predict_proba(X_test)[:,1]
    y_pred = lgbm.predict(X_test)
    results["LightGBM"] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, proba),
        "pr_auc": average_precision_score(y_test, proba)
    }

    # Threshold tuning for RF
    proba = rf.predict_proba(X_test)[:,1]
    prec, rec, th = precision_recall_curve(y_test, proba)
    f1 = 2 * prec*rec / (prec+rec + 1e-9)
    best = f1.argmax()
    print(f"\nâš¡ Best F1 for {dataset_name} (RF) = {f1[best]:.3f} at threshold={th[best]:.3f} "
          f"(Prec={prec[best]:.3f}, Rec={rec[best]:.3f})")

    # Print summary
    print(f"\nðŸ“Š Cost-sensitive Results for {dataset_name}:")
    for model, metrics in results.items():
        print(f"{model}: Acc={metrics['accuracy']:.3f}, Prec={metrics['precision']:.3f}, "
              f"Rec={metrics['recall']:.3f}, F1={metrics['f1']:.3f}, "
              f"ROC-AUC={metrics['roc_auc']:.3f}, PR-AUC={metrics['pr_auc']:.3f}")

    return results


In [None]:
# Reload datasets from Drive
cardio = pd.read_csv(base_path + "cardio_train.csv", sep=';')
cardio.rename(columns={'cardio': 'target'}, inplace=True)

heart2020 = pd.read_csv(base_path + "heart_2020.csv")

uci = pd.read_csv(base_path + "uci_heart.csv")   # <-- use this filename
uci.rename(columns={uci.columns[-1]: 'target'}, inplace=True)


In [None]:
# Fix UCI target to binary
uci['target'] = uci['target'].apply(lambda x: 1 if x > 0 else 0)

# Now run
X = uci.drop(columns=['target'])
y = uci['target']
res_uci_cs = run_cost_sensitive_models(X, y, "UCI Cleveland (Cost-sensitive)")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 111, number of negative: 130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239
[LightGBM] [Info] Number of data points in the train set: 241, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

âš¡ Best F1 for UCI Cleveland (Cost-sensitive) (RF) = 0.933 at threshold=0.440 (Prec=0.875, Rec=1.000)

ðŸ“Š Cost-sensitive Results for UCI Cleveland (Cost-sensitive):
LogReg: Acc=0.951, Prec=0.963, Rec=0.929, F1=0.945, ROC-AUC=0.994, PR-AUC=0.992
RandomForest: Acc=0.902, Prec=0.893, Rec=0.893, F1=0.893, ROC-AUC=0.976, PR-AUC=0.971
XGBoost: Acc=0.820, Prec=0.815, Rec=0.786, F1=0.800, ROC-AUC=0.937, PR-AUC=0.920
LightGBM: Acc=0.902, Prec=0.867, 

In [None]:
# Run cost-sensitive models on Cardio
X = cardio.drop(columns=['target', 'id'])
y = cardio['target']
res_cardio_cs = run_cost_sensitive_models(X, y, "Cardio (Cost-sensitive)")

# Run cost-sensitive models on UCI
X = uci.drop(columns=['target'])
y = uci['target']
res_uci_cs = run_cost_sensitive_models(X, y, "UCI Cleveland (Cost-sensitive)")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 27983, number of negative: 28017
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 714
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

âš¡ Best F1 for Cardio (Cost-sensitive) (RF) = 0.724 at threshold=0.330 (Prec=0.638, Rec=0.837)

ðŸ“Š Cost-sensitive Results for Cardio (Cost-sensitive):
LogReg: Acc=0.704, Prec=0.720, Rec=0.667, F1=0.693, ROC-AUC=0.763, PR-AUC=0.743
RandomForest: Acc=0.714, Prec=0.718, Rec=0.704, F1=0.711, ROC-AUC=0.772, PR-AUC=0.754
XGBoost: Acc=0.730, Prec=0.749, Rec=0.691, F1=0.718, ROC-AUC=0.794, PR-AUC=0.775
LightGBM: Acc=0.736, Prec=0.753, Rec=0.70

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[LightGBM] [Info] Number of positive: 111, number of negative: 130
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000111 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 239
[LightGBM] [Info] Number of data points in the train set: 241, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000

âš¡ Best F1 for UCI Cleveland (Cost-sensitive) (RF) = 0.933 at threshold=0.440 (Prec=0.875, Rec=1.000)

ðŸ“Š Cost-sensitive Results for UCI Cleveland (Cost-sensitive):
LogReg: Acc=0.951, Prec=0.963, Rec=0.929, F1=0.945, ROC-AUC=0.994, PR-AUC=0.992
RandomForest: Acc=0.902, Prec=0.893, Rec=0.893, F1=0.893, ROC-AUC=0.976, PR-AUC=0.971
XGBoost: Acc=0.820, Prec=0.815, Rec=0.786, F1=0.800, ROC-AUC=0.937, PR-AUC=0.920
LightGBM: Acc=0.902, Prec=0.867, 