In [33]:
import pandas as pd
import numpy as np
import pickle
from tabulate import tabulate
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,roc_auc_score, classification_report, 
    confusion_matrix, ConfusionMatrixDisplay, roc_curve
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (
    StackingClassifier, GradientBoostingClassifier, 
    GradientBoostingClassifier, HistGradientBoostingClassifier,
    AdaBoostClassifier, RandomForestClassifier
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from snapml import BoostingMachineClassifier  
from xgboost import XGBClassifier
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli 

In [34]:
# Columns retained for modelling; the last entry ('cardio') is the prediction target.
cols_to_keep = [
    'age', 'height', 'weight',
    'systolic', 'diastolic',
    'bmi', 'map', 'pulse_pressure',
    'gender', 'cholesterol', 'gluc',
    'smoke', 'alco', 'active',
    'cardio',
]

# Read the CSV and immediately narrow it to the retained columns, in the order listed above.
df = pd.read_csv('data/preprocessed_data_full_encoded_new_v4.csv').loc[:, cols_to_keep]

In [35]:
# Separate the feature matrix from the target column.
X = df.drop('cardio', axis=1)
y = df['cardio']

label_mapping = {0: 'Healthy', 1: 'Cardio Risk'}
# Order the display names by sorted class label. `y.unique()` alone returns labels in
# order of first appearance, which could attach the names to the wrong classes when
# they are passed to scikit-learn reports (those expect sorted-label order).
target_names = [label_mapping[label] for label in sorted(y.unique())]

numerical_features = ['age', 'height', 'weight', 'systolic', 'diastolic', 'bmi', 'map', 'pulse_pressure']
ordinal_features = ['cholesterol', 'gluc']
binary_features = ['gender', 'smoke', 'alco', 'active']

# Standardize only the numerical columns; every other column is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse it for the test split.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Note: ColumnTransformer with remainder='passthrough' changes the column order.
# The scaled features come first, followed by the remaining (passthrough) columns.
# To recover the column names (optional), rebuild the name list in that same order.
passthrough_features = [col for col in X.columns if col not in numerical_features]
transformed_feature_names = numerical_features + passthrough_features

X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=transformed_feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=transformed_feature_names)
print("Contoh data train setelah scaling:")
print(X_train_transformed_df.head())

Contoh data train setelah scaling:
        age    height    weight  systolic  diastolic       bmi       map  \
0  0.047973  1.554091  1.702976 -0.416790  -0.183072  0.706152 -0.319255   
1  0.342487 -0.430436 -0.774215 -0.416790  -0.183072 -0.570828 -0.319255   
2  0.931516  0.363375  0.584244  0.923307  -0.183072  0.358079  0.382521   
3  0.195230 -0.695040  0.184697 -1.086838  -1.456053  0.575892 -1.371393   
4 -0.541056 -0.562738 -1.653219 -1.756886  -1.456053 -1.422859 -1.721755   

   pulse_pressure  gender  cholesterol  gluc  smoke  alco  active  
0       -0.450279     1.0          2.0   0.0    1.0   1.0     0.0  
1       -0.450279     0.0          1.0   0.0    0.0   0.0     1.0  
2        1.432884     0.0          1.0   2.0    0.0   0.0     0.0  
3       -0.450279     0.0          0.0   0.0    0.0   0.0     1.0  
4       -1.391861     0.0          0.0   0.0    0.0   1.0     1.0  


In [36]:
# NumPy copies of the unscaled train/test split (features and targets),
# for estimators that are fed plain arrays instead of DataFrames.
x_train_np, y_train_np, x_test_np, y_test_np = (
    np.array(part) for part in (X_train, y_train, X_test, y_test)
)

In [37]:
def _positive_class_scores(proba):
    """Pick the probability column used for ROC AUC.

    Column 1 is used when ``predict_proba`` returns more than one column,
    otherwise the single available column is used.
    """
    return proba[:, 1] if proba.shape[1] > 1 else proba[:, 0]


def _split_metrics(model, x, y):
    """Score an already fitted ``model`` on one data split and return its metrics."""
    y_pred = model.predict(x)
    y_scores = _positive_class_scores(model.predict_proba(x))
    weighted = classification_report(y, y_pred, output_dict=True)['weighted avg']
    return {
        'accuracy': accuracy_score(y, y_pred),
        'auc': roc_auc_score(y, y_scores),
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1': weighted['f1-score'],
        'y_pred': y_pred,
        'y_scores': y_scores,
    }


def _plot_diagnostics(model_name, y_true, y_pred, y_scores, auc_value):
    """Draw the test-split confusion matrix and ROC curve side by side."""
    _fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    cm = confusion_matrix(y_true, y_pred)

    # Use the notebook-level label names when they exist; fall back to raw labels.
    mapping = globals().get('label_mapping')
    display_labels = list(mapping.values()) if mapping is not None else None
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    disp.plot(ax=axes[0], cmap='viridis', colorbar=False)
    axes[0].set_title(f"{model_name} - Confusion Matrix")

    fpr, tpr, _ = roc_curve(y_true, y_scores)
    axes[1].plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_value:.4f})", linewidth=2)
    axes[1].plot([0, 1], [0, 1], 'k--', label="Random Guess", linewidth=1)
    axes[1].set_title(f"{model_name} - ROC Curve")
    axes[1].legend(loc="lower right")
    axes[1].grid(alpha=0.3)

    plt.tight_layout()
    plt.show()


def evaluate_model(model, x_train, y_train, x_test, y_test, model_name, show_plots=False):
    """Fit ``model`` on the training split and report train/test metrics.

    The model is (re)fitted in place, then scored on both splits. An
    accuracy/AUC table and a simple overfitting verdict are printed.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels.
    model_name : str
        Label used in the printed header and in the returned record.
    show_plots : bool, default False
        When True, also draw the test confusion matrix and ROC curve.

    Returns
    -------
    dict
        ``model_name`` plus weighted-average ``accuracy``/``precision``/
        ``recall``/``f1`` for both splits, keyed ``train_*`` and ``test_*``.
    """
    model.fit(x_train, y_train)

    test = _split_metrics(model, x_test, y_test)
    train = _split_metrics(model, x_train, y_train)

    # Both rows use the same rounding so the two percentages are comparable.
    data = [
        ["Test", f"{test['accuracy'] * 100:.2f}%", f"{test['auc']:.4f}"],
        ["Train", f"{train['accuracy'] * 100:.2f}%", f"{train['auc']:.4f}"],
    ]
    headers = ["", "Accuracy", "AUC Score"]

    print(f"\n=== {model_name} ===\n")
    print(tabulate(data, headers=headers, tablefmt="grid"))

    print("\nOverfitting Check :")
    # Accuracy and AUC both live on a 0-1 scale, so the tolerated gap is 0.05
    # (five percentage points); a gap of 5 could never be exceeded.
    gap = 0.05
    if train['accuracy'] > test['accuracy'] + gap or train['auc'] > test['auc'] + gap:
        print("The model might be overfitting.")
    else:
        print("No significant signs of overfitting.\n")

    if show_plots:
        _plot_diagnostics(model_name, y_test, test['y_pred'], test['y_scores'], test['auc'])

    return {
        'model_name': model_name,
        'train_accuracy': train['accuracy'],
        'train_precision': train['precision'],
        'train_recall': train['recall'],
        'train_f1': train['f1'],
        'test_accuracy': test['accuracy'],
        'test_precision': test['precision'],
        'test_recall': test['recall'],
        'test_f1': test['f1']
    }


def create_summary_table(results):
    """Print two grid tables (test metrics, then training metrics).

    ``results`` is an iterable of dicts as returned by ``evaluate_model``;
    each dict contributes one row per table, with values rounded to 4 decimals.
    """
    def build_frame(split):
        # Collect one record per model for the requested split ('test' or 'train').
        records = []
        for entry in results:
            records.append({
                'Algorithm': entry['model_name'],
                'Accuracy':  round(entry[split + '_accuracy'], 4),
                'Precision': round(entry[split + '_precision'], 4),
                'Recall':    round(entry[split + '_recall'], 4),
                'F1-Score':  round(entry[split + '_f1'], 4)
            })
        return pd.DataFrame(records)

    # Build both frames before printing anything, test first.
    test_summary = build_frame('test')
    train_summary = build_frame('train')

    table_options = dict(headers='keys', tablefmt='grid', showindex=False)

    print("\nSummary Table - Test Metrics")
    print(tabulate(test_summary, **table_options))

    print("Summary Table - Training Metrics")
    print(tabulate(train_summary, **table_options))

In [88]:
# Fixed LightGBM hyper-parameters (the name suggests a prior tuning run, which is not shown here).
best_params = dict(
    n_estimators=233,
    learning_rate=0.019519792757748358,
    num_leaves=41,
    max_depth=15,
    subsample=0.758484089588373,
    colsample_bytree=0.9592852139230149,
    random_state=6580,
)

lgbm_model = LGBMClassifier(verbose=-1, **best_params)

# Fit on the unscaled DataFrame split and keep the metric record for the summary table.
lgbm_results = evaluate_model(
    model=lgbm_model,
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    model_name="LightGBM",
)
lgbm_results;


=== LightGBM ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 88.32%     |      0.9618 |
+-------+------------+-------------+
| Train | 88.99%     |      0.9677 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [89]:
# Accuracy of the fitted LightGBM model on the held-out split, via the estimator's own scorer.
accuracy = lgbm_model.score(X_test, y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8832923832923832


In [96]:
# Fixed CatBoost hyper-parameters (the name suggests a prior tuning run, which is not shown here).
best_params = dict(
    iterations=680,
    learning_rate=0.010453904590859812,
    depth=9,
    l2_leaf_reg=0.10216165140187977,
    border_count=220,
    random_strength=0.017515693364790794,
    bagging_temperature=0.20045501246071826,
    random_seed=7801,
)

catb_model = CatBoostClassifier(verbose=False, **best_params)

# Fit on the unscaled DataFrame split and keep the metric record for the summary table.
catb_results = evaluate_model(
    model=catb_model,
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    model_name="CatBoost",
)
catb_results;


=== CatBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 88.05%     |      0.961  |
+-------+------------+-------------+
| Train | 90.66%     |      0.9762 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [None]:
# Accuracy of the fitted CatBoost model on the held-out split, via the estimator's own scorer.
accuracy = catb_model.score(X_test, y_test)
print(f"Accuracy: {accuracy}") 

Accuracy: 0.8804258804258804


In [40]:
# XGBoost with library defaults; only the evaluation metric is set explicitly.
# The former `use_label_encoder=False` argument is omitted: it is deprecated in
# XGBoost, and newer releases only emit an "unused parameter" warning for it.
# The target is already encoded as 0/1, so no label encoding is needed.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_results = evaluate_model(xgb_model, X_train, y_train, X_test, y_test, "XGBoost")
xgb_results;




=== XGBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 87.78%     |      0.9585 |
+-------+------------+-------------+
| Train | 91.71%     |      0.9807 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [41]:
# SnapML boosting machine with default settings, fed the NumPy feature arrays.
snb_model = BoostingMachineClassifier()
snb_results = evaluate_model(
    model=snb_model,
    x_train=x_train_np, y_train=y_train,
    x_test=x_test_np, y_test=y_test,
    model_name="SnapBoost",
)
snb_results;


=== SnapBoost ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 87.53%     |      0.9572 |
+-------+------------+-------------+
| Train | 88.05%     |      0.9612 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [42]:
# scikit-learn gradient boosting; only the seed is set, everything else is left at defaults.
grb_model = GradientBoostingClassifier(random_state=42)
grb_results = evaluate_model(
    model=grb_model,
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    model_name="Gradient Boosting",
)
grb_results;


=== Gradient Boosting ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 87.16%     |      0.9545 |
+-------+------------+-------------+
| Train | 87.53%     |      0.9575 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [None]:
# Fixed HistGradientBoosting hyper-parameters (the name suggests a prior tuning run, not shown here).
best_params = dict(
    max_iter=409,
    learning_rate=0.01947652219796748,
    max_leaf_nodes=36,
    min_samples_leaf=28,
    l2_regularization=4.313378648780895e-06,
    random_state=8404,
)

hgrb_model = HistGradientBoostingClassifier(**best_params)

# This model is fitted on the NumPy feature arrays (no column names).
hgrb_results = evaluate_model(
    model=hgrb_model,
    x_train=x_train_np, y_train=y_train,
    x_test=x_test_np, y_test=y_test,
    model_name="Hist Gradient Boosting",
)
hgrb_results;


=== Hist Gradient Boosting ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 88.28%     |      0.9615 |
+-------+------------+-------------+
| Train | 88.94%     |      0.9679 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [68]:
# Score with the NumPy test array: hgrb_model was fitted on x_train_np (no feature
# names), so passing the DataFrame X_test would trigger scikit-learn's
# "X has feature names, but ... was fitted without feature names" warning.
accuracy = hgrb_model.score(x_test_np, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8828828828828829




In [44]:
# (name, estimator) pairs handed to the stacking ensemble as its base learners.
base_models = list(zip(
    ("lgbm", "catb", "grb", "snb", "xgb"),
    (lgbm_model, catb_model, grb_model, snb_model, xgb_model),
))

In [45]:
# The HistGradientBoostingClassifier configured above is reused as the
# final (meta) estimator of the stacking ensemble.
meta_model = hgrb_model

In [46]:
# Stacking ensemble: base learners from `base_models`, `meta_model` on top.
# Per the scikit-learn docs, an integer `cv` sets the number of folds used to build the
# meta-learner's training predictions, and `passthrough=True` also feeds it the original features.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
    passthrough=True,
)

In [47]:
# Fit the ensemble on the training matrix produced by `preprocessor`
# (numerical columns standardized, remaining columns passed through).
stacking_model.fit(X_train_transformed, y_train)



In [48]:
# Predict on the transformed test matrix (same preprocessing as used for fitting).
y_pred = stacking_model.predict(X_test_transformed)

# Compute both summaries first, then print them.
accuracy, report = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)

print(f"Accuracy: {accuracy}")
print(f"\nClassification Report:\n {report}")

Accuracy: 0.8794021294021294

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.90      0.88      4892
           1       0.90      0.86      0.88      4876

    accuracy                           0.88      9768
   macro avg       0.88      0.88      0.88      9768
weighted avg       0.88      0.88      0.88      9768



In [49]:
# numerical_features = ['age', 'height', 'weight', 'systolic', 'diastolic', 'bmi', 'map', 'pulse_pressure']

# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), numerical_features)
#     ],
#     remainder='passthrough' 
# )

# pipeline = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', stacking_model)
# ])

# pipeline.fit(X_train, y_train)

# with open("pkl/ml_ensemble_model.pkl", "wb") as f:
#     pickle.dump(pipeline, f)

# print("Pipeline telah disimpan dalam file 'ml_ensemble_model.pkl'.")

In [50]:
# data_new = {
#     'age':       [30, 35, 40, 45, 50, 60, 65, 70, 55, 68],   
#     'height':    [175, 180, 170, 165, 160, 158, 155, 150, 165, 160], 
#     'weight':    [68, 75, 72, 80, 85, 90, 95, 100, 78, 82], 
#     'systolic':  [110, 115, 120, 125, 130, 140, 150, 160, 115, 135],  
#     'diastolic': [70, 75, 80, 85, 90, 95, 100, 105, 75, 88],         
#     'gender':       [0, 0, 1, 0, 1, 1, 1, 1, 0, 1], # gender: 0 encodes original value 1, 1 encodes original value 2
#     'cholesterol':  [0, 0, 1, 1, 2, 2, 2, 2, 0, 2], # cholesterol: 0 encodes original value 1, 1 encodes original value 2, 2 encodes original value 3
#     'gluc':         [0, 0, 1, 1, 1, 2, 2, 2, 0, 2], # gluc: 0 encodes original value 1, 1 encodes original value 2, 2 encodes original value 3
#     'smoke':        [0, 0, 0, 0, 1, 1, 1, 1, 0, 1],
#     'alco':         [0, 0, 0, 0, 0, 1, 1, 1, 0, 1],
#     'active':       [1, 1, 1, 1, 0, 0, 0, 0, 1, 0]
# }

# new_df = pd.DataFrame(data_new)

# valid_gender      = new_df['gender'].isin([0, 1])
# valid_cholesterol = new_df['cholesterol'].isin([0, 1, 2])
# valid_gluc        = new_df['gluc'].isin([0, 1, 2])
# valid_smoke       = new_df['smoke'].isin([0, 1])
# valid_alco        = new_df['alco'].isin([0, 1])
# valid_active      = new_df['active'].isin([0, 1])

# if not (valid_gender.all() and valid_cholesterol.all() and valid_gluc.all() and 
#         valid_smoke.all() and valid_alco.all() and valid_active.all()):
#     raise ValueError("Terdapat nilai kategori yang tidak valid pada input data.")

# new_df["bmi"] = round(new_df["weight"] / ((new_df["height"] / 100) ** 2), 2)
# new_df["map"] = round((new_df["systolic"] + 2 * new_df["diastolic"]) / 3, 2)
# new_df["pulse_pressure"] = new_df["systolic"] - new_df["diastolic"]

# expected_order = [
#     'age', 'height', 'weight', 'systolic', 'diastolic', 
#     'bmi', 'map', 'pulse_pressure',
#     'gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'
# ]
# new_df = new_df[expected_order]

# with open("pkl/ml_ensemble_model.pkl", "rb") as f:
#     pipeline = pickle.load(f)

# predictions = pipeline.predict(new_df)

# print("Input Data Baru (setelah pembuatan fitur turunan dan validasi):")
# print(new_df)
# print("\nHasil Prediksi untuk 10 contoh data:")
# print("Prediksi status cardio untuk data baru:", predictions)


In [51]:
# evaluate_model() refits the estimator it receives. Use the standardized matrices
# here so the refitted ensemble stays consistent with the earlier
# `stacking_model.fit(X_train_transformed, ...)` call and with predictions made on
# X_test_transformed; refitting on the raw NumPy arrays would silently switch the
# ensemble to unscaled inputs.
stacking_results = evaluate_model(
    stacking_model,
    X_train_transformed, y_train,
    X_test_transformed, y_test,
    "Stacking Ensemble Classifier",
)
stacking_results;




=== Stacking Ensemble Classifier ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 88.05%     |      0.9619 |
+-------+------------+-------------+
| Train | 89.14%     |      0.9695 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [52]:
# Metric records in the order they should appear in the summary tables.
results = [
    lgbm_results,
    xgb_results,
    catb_results,
    snb_results,
    grb_results,
    hgrb_results,
    stacking_results,
]

create_summary_table(results)


Summary Table - Test Metrics
+------------------------------+------------+-------------+----------+------------+
| Algorithm                    |   Accuracy |   Precision |   Recall |   F1-Score |
| LightGBM                     |     0.8829 |      0.8835 |   0.8829 |     0.8828 |
+------------------------------+------------+-------------+----------+------------+
| XGBoost                      |     0.8779 |      0.8781 |   0.8779 |     0.8778 |
+------------------------------+------------+-------------+----------+------------+
| CatBoost                     |     0.8794 |      0.8798 |   0.8794 |     0.8794 |
+------------------------------+------------+-------------+----------+------------+
| SnapBoost                    |     0.8753 |      0.8762 |   0.8753 |     0.8752 |
+------------------------------+------------+-------------+----------+------------+
| Gradient Boosting            |     0.8716 |      0.8729 |   0.8716 |     0.8715 |
+------------------------------+------------+-