<!-- ### **Import Libraries** -->

In [8]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
import joblib

warnings.filterwarnings("ignore")

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,roc_auc_score, classification_report, 
    confusion_matrix, ConfusionMatrixDisplay, roc_curve
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (
    StackingClassifier, GradientBoostingClassifier, 
    HistGradientBoostingClassifier
)
from sklearn.svm import SVC
from sklearn.base import BaseEstimator

from interpret.glassbox import ExplainableBoostingClassifier 
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli 
from snapml import BoostingMachineClassifier  

from tabulate import tabulate

from sklearn.pipeline import Pipeline

<!-- ### **Load & Split Data** -->

In [9]:
df = pd.read_csv('data/preprocessed_data_full_encoded_new_v3.csv')

cols_to_keep = ['age', 
                'height',
                'weight',
                'systolic', 
                'diastolic',
                'bmi',
                'map',
                'pulse_pressure',
                'gender',
                'cholesterol', 
                'gluc',
                'smoke', 
                'alco', 
                'active',
                'cardio'
                ]

df = df[cols_to_keep]

print('Sample Data', len(df)) 
display(df.head())

Sample Data 49818


Unnamed: 0,age,height,weight,systolic,diastolic,bmi,map,pulse_pressure,gender,cholesterol,gluc,smoke,alco,active,cardio
0,50,168,62.0,110,80,21.97,90.0,30,1,0,0,0,0,1,0
1,55,156,85.0,140,90,34.93,106.67,50,0,2,0,0,0,1,1
2,51,165,64.0,130,70,23.51,90.0,60,0,2,0,0,0,0,1
3,48,169,82.0,150,100,28.71,116.67,50,1,0,0,0,0,1,1
4,60,151,67.0,120,80,29.38,93.33,40,0,1,1,0,0,0,0


In [10]:
X = df.drop('cardio', axis=1)  
y = df['cardio']   

In [11]:
label_mapping = {0: 'Healthy', 1: 'Cardio Risk'}
target_names = [label_mapping[label] for label in y.unique()]

<!-- ### **Scaling Data** -->

In [12]:
numerical_features = ['age', 'height', 'weight', 'systolic', 'diastolic', 'bmi', 'map', 'pulse_pressure']
ordinal_features = ['cholesterol', 'gluc']
binary_features = ['gender', 'smoke', 'alco', 'active']

In [None]:

scaler_standard = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num_scaler', scaler_standard, numerical_features),
         ('ord_scaler', scaler_standard, ordinal_features)
    ],
    remainder='passthrough'  
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

X_preprocessed = pipeline.fit_transform(X)

X_preprocessed = pd.DataFrame(X_preprocessed, columns=numerical_features + ordinal_features + binary_features)
X_preprocessed

Unnamed: 0,age,height,weight,systolic,diastolic,bmi,map,pulse_pressure,cholesterol,gluc,gender,smoke,alco,active
0,-0.396496,0.497124,-0.936186,-1.077190,-0.16881,-1.155295,-0.655924,-1.389140,-0.575899,-0.421179,1.0,0.0,0.0,1.0
1,0.338776,-1.094188,0.908758,0.930288,1.09822,1.624188,1.092736,0.492485,2.230670,-0.421179,0.0,0.0,0.0,1.0
2,-0.249442,0.099296,-0.775756,0.261129,-1.43584,-0.825017,-0.655924,1.433297,2.230670,-0.421179,0.0,0.0,0.0,0.0
3,-0.690605,0.629733,0.668113,1.599448,2.36525,0.290208,2.141723,0.492485,-0.575899,-0.421179,1.0,0.0,0.0,1.0
4,1.074048,-1.757234,-0.535111,-0.408030,-0.16881,0.433900,-0.306612,-0.448328,0.827385,1.234620,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49813,0.485830,-0.696360,-0.134036,0.261129,1.09822,0.240880,0.742375,-0.448328,0.827385,1.234620,0.0,0.0,0.0,1.0
49814,-0.249442,-0.431141,-1.417476,2.937766,1.09822,-1.234648,2.141723,3.314921,-0.575899,-0.421179,0.0,0.0,0.0,1.0
49815,0.632885,0.099296,0.507683,1.599448,-0.16881,0.433900,0.742375,2.374109,-0.575899,-0.421179,0.0,0.0,0.0,1.0
49816,1.221102,-0.165923,-0.134036,0.595709,-0.16881,-0.055083,0.217882,0.962891,-0.575899,1.234620,0.0,0.0,0.0,0.0


<!-- ### **Train, Val, Test** -->

In [14]:
x_train, x_temp, y_train, y_temp = train_test_split(
    X_preprocessed, y, test_size=0.3, random_state=42, stratify=y
)  # Train 80%

x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=(1/3), random_state=42
)  # Val 10%, Test 10%

x_train_np = np.array(x_train)
y_train_np = np.array(y_train)
x_test_np = np.array(x_test)
y_test_np = np.array(y_test)

<!-- ### **Base Model** -->

In [44]:
def evaluate_model(model, x_train, y_train, x_test, y_test, model_name):
    print("Using standard branch (fit/predict/predict_proba)...")
    if hasattr(x_train, "values"):
        x_train = x_train.values
    if hasattr(x_test, "values"):
        x_test = x_test.values
        
    model.fit(x_train, y_train)
    predict_func = lambda x: model.predict(x)
    predict_proba_func = lambda x: model.predict_proba(x)

    y_pred_test = predict_func(x_test)
    y_proba_test = predict_proba_func(x_test)
    
    if y_proba_test.shape[1] > 1:
        y_probs_test = y_proba_test[:, 1]
    else:
        y_probs_test = y_proba_test[:, 0]
    
    y_pred_train = predict_func(x_train)
    y_proba_train = predict_proba_func(x_train)
    if y_proba_train.shape[1] > 1:
        y_probs_train = y_proba_train[:, 1]
    else:
        y_probs_train = y_proba_train[:, 0]
    
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_acc_str = f"{(test_accuracy * 100):.2f}%"
    test_auc = roc_auc_score(y_test, y_probs_test)
    test_auc_str = f"{test_auc:.4f}"
    test_report_dict = classification_report(y_test, y_pred_test, output_dict=True)
    test_precision = test_report_dict['weighted avg']['precision'] 
    test_recall    = test_report_dict['weighted avg']['recall']   
    test_f1        = test_report_dict['weighted avg']['f1-score']
    
    train_accuracy = accuracy_score(y_train, y_pred_train)
    train_acc_str = f"{(train_accuracy * 100):.2f}%"
    train_auc = roc_auc_score(y_train, y_probs_train)
    train_auc_str = f"{train_auc:.4f}"
    train_report_dict = classification_report(y_train, y_pred_train, output_dict=True)
    train_precision = train_report_dict['weighted avg']['precision']
    train_recall    = train_report_dict['weighted avg']['recall']    
    train_f1        = train_report_dict['weighted avg']['f1-score']  

    data = [
        ["Test", test_acc_str, test_auc_str],
        ["Train", train_acc_str, train_auc_str]
    ]

    headers = ["", "Accuracy", "AUC Score"]

    print(f"\n=== {model_name} ===\n")
    print(tabulate(data, headers=headers, tablefmt="grid"))
    
    print("\nOverfitting Check :")
    if train_accuracy > test_accuracy + 5 or train_auc > test_auc + 0.05:
        print("The model might be overfitting.")
    else:
        print("No significant signs of overfitting.\n")
    
    # Plot Confusion Matrix and ROC Curve
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    cm = confusion_matrix(y_test, y_pred_test)
    # If a global variable 'label_mapping' exists, use it for display labels
    display_labels = list(label_mapping.values()) if 'label_mapping' in globals() else None
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    disp.plot(ax=axes[0], cmap='viridis', colorbar=False)
    axes[0].set_title(f"{model_name} - Confusion Matrix")
    
    fpr, tpr, _ = roc_curve(y_test, y_probs_test)
    axes[1].plot(fpr, tpr, label=f"ROC Curve (AUC = {test_auc:.4f})", linewidth=2)
    axes[1].plot([0, 1], [0, 1], 'k--', label="Random Guess", linewidth=1)
    axes[1].set_title(f"{model_name} - ROC Curve")
    axes[1].legend(loc="lower right")
    axes[1].grid(alpha=0.3)
    
    plt.tight_layout()
    plt.show()
    
    return {
        'model_name': model_name,
        'train_accuracy': train_accuracy,
        'train_precision': train_precision,
        'train_recall': train_recall,
        'train_f1': train_f1,
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall,
        'test_f1': test_f1
    }


def create_summary_table(results):
    test_summary = pd.DataFrame([{
        'Algorithm': r['model_name'],
        'Accuracy':  round(r['test_accuracy'], 4),
        'Precision': round(r['test_precision'], 4),
        'Recall':    round(r['test_recall'], 4),
        'F1-Score':  round(r['test_f1'], 4)
    } for r in results])
    train_summary = pd.DataFrame([{
        'Algorithm': r['model_name'],
        'Accuracy':  round(r['train_accuracy'], 4),
        'Precision': round(r['train_precision'], 4),
        'Recall':    round(r['train_recall'], 4),
        'F1-Score':  round(r['train_f1'], 4)
    } for r in results])
    
    
    print("\nSummary Table - Test Metrics")
    print(tabulate(test_summary, headers='keys', tablefmt='grid', showindex=False))

    print("Summary Table - Training Metrics")
    print(tabulate(train_summary, headers='keys', tablefmt='grid', showindex=False))
    

<!-- ##### `SVM` -->

In [16]:
svm_model = SVC(probability=True)
svm_results = evaluate_model(svm_model, x_train, y_train, x_test, y_test, "SVM")
svm_results;

Using standard branch (fit/predict/predict_proba)...

=== SVM ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 85.95%     |      0.9338 |
+-------+------------+-------------+
| Train | 86.28%     |      0.9395 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `XGBoost` -->

In [17]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_results = evaluate_model(xgb_model, x_train, y_train, x_test, y_test, "XGBoost")
xgb_results;

Using standard branch (fit/predict/predict_proba)...

=== XGBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 85.77%     |      0.9468 |
+-------+------------+-------------+
| Train | 90.34%     |      0.9741 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `Light GBM` -->

In [18]:
best_params = {
    'n_estimators': 233,
    'learning_rate': 0.019519792757748358,
    'num_leaves': 41,
    'max_depth': 15,
    'subsample': 0.758484089588373,
    'colsample_bytree': 0.9592852139230149,
    'random_state': 494
}

lgbm_model = LGBMClassifier(**best_params, verbose=-1)
lgbm_results = evaluate_model(lgbm_model, x_train, y_train, x_test, y_test, "LightGBM")
lgbm_results;

Using standard branch (fit/predict/predict_proba)...

=== LightGBM ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 86.91%     |      0.9499 |
+-------+------------+-------------+
| Train | 87.45%     |      0.9575 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `Cat Boost` -->

In [19]:
catb_model = CatBoostClassifier(verbose=False)
catb_results = evaluate_model(catb_model, x_train, y_train, x_test, y_test, "CatBoost")
catb_results;

Using standard branch (fit/predict/predict_proba)...

=== CatBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 86.23%     |      0.9465 |
+-------+------------+-------------+
| Train | 89.15%     |      0.9676 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- #### `SnapBoost` -->

In [20]:
snb_model = BoostingMachineClassifier()
snb_results = evaluate_model(snb_model, x_train, y_train, x_test, y_test, "SnapBoost")
snb_results;

Using standard branch (fit/predict/predict_proba)...

=== SnapBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 85.63%     |      0.9429 |
+-------+------------+-------------+
| Train | 86.39%     |      0.9489 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `Explainable Boosting Machine (EBM)` -->

In [21]:
ebm_model = ExplainableBoostingClassifier(n_jobs=1)
ebm_results = evaluate_model(ebm_model, x_train, y_train, x_test, y_test, "EBM")
ebm_results;

Using standard branch (fit/predict/predict_proba)...

=== EBM ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 85.09%     |      0.9324 |
+-------+------------+-------------+
| Train | 85.65%     |      0.9385 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `NGBoost` -->

In [22]:
ngb_model = NGBClassifier(Dist=Bernoulli, verbose=False)
ngb_results = evaluate_model(ngb_model, x_train, y_train, x_test, y_test, "NGBoost")
ngb_results;

Using standard branch (fit/predict/predict_proba)...

=== NGBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 85.03%     |      0.9324 |
+-------+------------+-------------+
| Train | 84.85%     |      0.9357 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `GradientBoosting` -->

In [23]:
grb_model =  GradientBoostingClassifier(random_state=42)
grb_results = evaluate_model(grb_model, x_train, y_train, x_test, y_test, "Gradient Boosting")
grb_results;

Using standard branch (fit/predict/predict_proba)...

=== Gradient Boosting ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 84.99%     |      0.9389 |
+-------+------------+-------------+
| Train | 85.98%     |      0.9433 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- ##### `Hist GradientBoosting` -->

In [24]:
hgrb_model =  HistGradientBoostingClassifier(random_state=42)
hgrb_results = evaluate_model(hgrb_model, x_train, y_train, x_test, y_test, "Hist Gradient Boosting")
hgrb_results;

Using standard branch (fit/predict/predict_proba)...

=== Hist Gradient Boosting ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 86.61%     |      0.9495 |
+-------+------------+-------------+
| Train | 87.30%     |      0.957  |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



<!-- #### **Summary Table** -->

In [25]:
results = [ svm_results, xgb_results, lgbm_results, catb_results, 
        snb_results, ebm_results, ngb_results, 
        grb_results, hgrb_results, 
        ]

create_summary_table(results)


Summary Table - Test Metrics
+------------------------+------------+-------------+----------+------------+
| Algorithm              |   Accuracy |   Precision |   Recall |   F1-Score |
| SVM                    |     0.8595 |      0.8623 |   0.8595 |     0.8591 |
+------------------------+------------+-------------+----------+------------+
| XGBoost                |     0.8577 |      0.8585 |   0.8577 |     0.8575 |
+------------------------+------------+-------------+----------+------------+
| LightGBM               |     0.8691 |      0.8707 |   0.8691 |     0.8689 |
+------------------------+------------+-------------+----------+------------+
| CatBoost               |     0.8623 |      0.8635 |   0.8623 |     0.8621 |
+------------------------+------------+-------------+----------+------------+
| SnapBoost              |     0.8563 |      0.8583 |   0.8563 |     0.856  |
+------------------------+------------+-------------+----------+------------+
| EBM                    |     0.8

In [26]:
base_models = [
    ("lgbm", lgbm_model),
    ("catb", catb_model),
    ("hgrb", hgrb_model),
    ("snb", snb_model),
    ("xgb", xgb_model)
]

In [27]:
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=svm_model
)

In [28]:
stacking_model.fit(x_train_np, y_train_np)

In [29]:
y_pred  = stacking_model.predict(x_test_np)
y_proba = stacking_model.predict_proba(x_test_np)[:,1]

acc  = accuracy_score(y_test, y_pred)
auc  = roc_auc_score(y_test, y_proba)

print(f"Accuracy : {acc:.4f}")
print(f"AUC      : {auc:.4f}")

Accuracy : 0.8667
AUC      : 0.9295


In [None]:
# full_pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', stacking_model)
# ])

# full_pipeline.fit(x_train, y_train)
# joblib.dump(full_pipeline, 'data/my_model.pkl')

['data/my_model.pkl']

In [None]:
# import pickle 

# final_model = stacking_model

# model_filename = "data/ml_stacking_model.pkl"


# with open(model_filename, 'wb') as file:
#     pickle.dump(final_model, file)

# print(f"Model saved as {model_filename}")

Model saved as data/ml_stacking_model.pkl
