In [34]:
import pandas as pd
import numpy as np
import pickle
from tabulate import tabulate
import matplotlib.pyplot as plt
from decimal import Decimal, ROUND_DOWN

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,roc_auc_score, classification_report, 
    confusion_matrix, ConfusionMatrixDisplay, roc_curve
)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import (
    StackingClassifier, GradientBoostingClassifier, 
    GradientBoostingClassifier, HistGradientBoostingClassifier,
    AdaBoostClassifier, RandomForestClassifier
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from snapml import BoostingMachineClassifier  
from xgboost import XGBClassifier
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli 

In [3]:
# Columns retained for modelling: the predictors first, then the 'cardio'
# target as the final entry.
cols_to_keep = [
    'age', 'height', 'weight',
    'systolic', 'diastolic',
    'bmi', 'map', 'pulse_pressure',
    'gender',
    'cholesterol', 'gluc',
    'smoke', 'alco', 'active',
    'cardio',
]

# Read the encoded dataset and immediately narrow it to the selected columns
# (in the order listed above).
df = pd.read_csv('data/preprocessed_data_full_encoded_new_v4.csv')[cols_to_keep]

In [None]:
# Separate the predictors from the 'cardio' target column.
X = df.drop('cardio', axis=1)
y = df['cardio']

label_mapping = {0: 'Healthy', 1: 'Cardio Risk'}
# Series.unique() yields labels in order of first appearance, which depends on
# the row order of the data. Sorting pins the names to ascending label order
# (0 -> 'Healthy', 1 -> 'Cardio Risk'), the order scikit-learn reports classes in.
target_names = [label_mapping[label] for label in sorted(y.unique())]

# Feature groups. Only `numerical_features` is consumed below; the other two
# lists are not referenced by any visible cell.
numerical_features = ['age', 'height', 'weight', 'systolic', 'diastolic', 'bmi', 'map', 'pulse_pressure']
ordinal_features = ['cholesterol', 'gluc']
binary_features = ['gender', 'smoke', 'alco', 'active']

# Standardise the numerical columns; every other column is passed through
# unchanged and placed after the scaled ones in the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'
)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse its statistics for
# the test split so no test information enters the transformation.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Rebuild the column names in the transformer's output order:
# scaled numerical columns first, then the passthrough columns.
passthrough_features = [col for col in X.columns if col not in numerical_features]
transformed_feature_names = numerical_features + passthrough_features

X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=transformed_feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=transformed_feature_names)
print("Contoh data train setelah scaling:")
print(X_train_transformed_df.head())

Contoh data train setelah scaling:
        age    height    weight  systolic  diastolic       bmi       map  \
0  0.047973  1.554091  1.702976 -0.416790  -0.183072  0.706152 -0.319255   
1  0.342487 -0.430436 -0.774215 -0.416790  -0.183072 -0.570828 -0.319255   
2  0.931516  0.363375  0.584244  0.923307  -0.183072  0.358079  0.382521   
3  0.195230 -0.695040  0.184697 -1.086838  -1.456053  0.575892 -1.371393   
4 -0.541056 -0.562738 -1.653219 -1.756886  -1.456053 -1.422859 -1.721755   

   pulse_pressure  gender  cholesterol  gluc  smoke  alco  active  
0       -0.450279     1.0          2.0   0.0    1.0   1.0     0.0  
1       -0.450279     0.0          1.0   0.0    0.0   0.0     1.0  
2        1.432884     0.0          1.0   2.0    0.0   0.0     0.0  
3       -0.450279     0.0          0.0   0.0    0.0   0.0     1.0  
4       -1.391861     0.0          0.0   0.0    0.0   1.0     1.0  


In [5]:
# NumPy copies of the (unscaled) train/test splits, for estimators that are
# fed plain arrays instead of pandas objects.
x_train_np, y_train_np, x_test_np, y_test_np = map(
    np.array, (X_train, y_train, X_test, y_test)
)

In [35]:
def _format_percent(fraction):
    """Render a 0-1 fraction as a percentage string truncated to two decimals.

    Truncation (ROUND_DOWN) is used instead of rounding so the displayed value
    never overstates the metric. Going through ``repr`` of the float avoids
    binary floating-point artefacts from multiplying by 100 before truncating.
    """
    percent = Decimal(repr(float(fraction))) * 100
    return f"{percent.quantize(Decimal('0.01'), rounding=ROUND_DOWN)}%"


def _positive_class_scores(model, x):
    """Return the score column used for AUC from ``model.predict_proba(x)``.

    Uses column 1 when the probability matrix has more than one column,
    otherwise falls back to the only column available.
    """
    proba = model.predict_proba(x)
    return proba[:, 1] if proba.shape[1] > 1 else proba[:, 0]


def _score_split(model, x, y):
    """Score an already-fitted ``model`` on one data split.

    Returns:
        tuple: ``(metrics, y_pred, y_score)`` where ``metrics`` is a dict with
        'accuracy', 'auc' and the weighted-average 'precision', 'recall' and
        'f1' taken from ``classification_report``.
    """
    y_pred = model.predict(x)
    y_score = _positive_class_scores(model, x)
    weighted = classification_report(y, y_pred, output_dict=True)['weighted avg']
    metrics = {
        'accuracy': accuracy_score(y, y_pred),
        'auc': roc_auc_score(y, y_score),
        'precision': weighted['precision'],
        'recall': weighted['recall'],
        'f1': weighted['f1-score'],
    }
    return metrics, y_pred, y_score


def evaluate_model(model, x_train, y_train, x_test, y_test, model_name,
                   acc_gap_threshold=0.05, auc_gap_threshold=0.05):
    """Fit ``model`` on the training split and report train/test metrics.

    Side effects: calls ``model.fit(x_train, y_train)`` (the estimator passed
    in is refitted) and prints an accuracy/AUC table plus an overfitting note.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and ``predict_proba``.
        x_train, y_train: Training features and labels.
        x_test, y_test: Hold-out features and labels.
        model_name: Label used in the printed header and the returned dict.
        acc_gap_threshold: Train-minus-test accuracy gap, on the 0-1 scale,
            above which the model is flagged as possibly overfitting.
        auc_gap_threshold: Same, for the train-minus-test AUC gap.

    Returns:
        dict: 'model_name' plus accuracy, weighted precision/recall/F1 and AUC
        for both splits, keyed as '<split>_<metric>' with split in
        {'train', 'test'}.
    """
    model.fit(x_train, y_train)

    test_metrics, y_pred_test, y_probs_test = _score_split(model, x_test, y_test)
    train_metrics, _, _ = _score_split(model, x_train, y_train)

    test_accuracy, test_auc = test_metrics['accuracy'], test_metrics['auc']
    train_accuracy, train_auc = train_metrics['accuracy'], train_metrics['auc']

    # Both rows go through the same formatter so train and test percentages
    # are directly comparable.
    data = [
        ["Test", _format_percent(test_accuracy), f"{test_auc:.4f}"],
        ["Train", _format_percent(train_accuracy), f"{train_auc:.4f}"]
    ]

    headers = ["", "Accuracy", "AUC Score"]

    print(f"\n=== {model_name} ===\n")
    print(tabulate(data, headers=headers, tablefmt="grid"))

    print("\nOverfitting Check :")
    # Accuracy and AUC are both fractions in [0, 1], so the gaps must be
    # compared on that scale (0.05 == 5 percentage points). A gap of 5 on the
    # accuracy is impossible and would never flag anything.
    if (train_accuracy > test_accuracy + acc_gap_threshold
            or train_auc > test_auc + auc_gap_threshold):
        print("The model might be overfitting.")
    else:
        print("No significant signs of overfitting.\n")

    # Optional diagnostics (disabled): confusion matrix and ROC curve.
    # fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    # cm = confusion_matrix(y_test, y_pred_test)

    # display_labels = list(label_mapping.values()) if 'label_mapping' in globals() else None
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    # disp.plot(ax=axes[0], cmap='viridis', colorbar=False)
    # axes[0].set_title(f"{model_name} - Confusion Matrix")

    # fpr, tpr, _ = roc_curve(y_test, y_probs_test)
    # axes[1].plot(fpr, tpr, label=f"ROC Curve (AUC = {test_auc:.4f})", linewidth=2)
    # axes[1].plot([0, 1], [0, 1], 'k--', label="Random Guess", linewidth=1)
    # axes[1].set_title(f"{model_name} - ROC Curve")
    # axes[1].legend(loc="lower right")
    # axes[1].grid(alpha=0.3)

    # plt.tight_layout()
    # plt.show()

    return {
        'model_name': model_name,
        'train_accuracy': train_accuracy,
        'train_precision': train_metrics['precision'],
        'train_recall': train_metrics['recall'],
        'train_f1': train_metrics['f1'],
        'train_auc': train_auc,
        'test_accuracy': test_accuracy,
        'test_precision': test_metrics['precision'],
        'test_recall': test_metrics['recall'],
        'test_f1': test_metrics['f1'],
        'test_auc': test_auc
    }


def create_summary_table(results):
    """Print grid tables of test and training metrics, one row per model.

    Args:
        results: Iterable of dicts as returned by ``evaluate_model``; each must
            provide 'model_name' and the train/test accuracy, precision,
            recall and f1 entries. Values are rounded to four decimals.
    """
    results = list(results)

    layout = (
        ("\nSummary Table - Test Metrics", (
            ('Accuracy', 'test_accuracy'),
            ('Precision', 'test_precision'),
            ('Recall', 'test_recall'),
            ('F1-Score', 'test_f1'),
        )),
        ("Summary Table - Training Metrics", (
            ('Accuracy', 'train_accuracy'),
            ('Precision', 'train_precision'),
            ('Recall', 'train_recall'),
            ('F1-Score', 'train_f1'),
        )),
    )

    # Build every frame before printing anything, so a missing key fails
    # before any output is produced.
    tables = []
    for title, columns in layout:
        rows = []
        for entry in results:
            row = {'Algorithm': entry['model_name']}
            for header, key in columns:
                row[header] = round(entry[key], 4)
            rows.append(row)
        tables.append((title, pd.DataFrame(rows)))

    for title, frame in tables:
        print(title)
        print(tabulate(frame, headers='keys', tablefmt='grid', showindex=False))

In [32]:
# Hyperparameters for the LightGBM classifier.
# NOTE(review): `subsample` is set but no `subsample_freq` is given; LightGBM's
# parameter docs tie row bagging to a positive bagging frequency - confirm
# whether `subsample` is meant to have an effect here.
best_params = dict(
    n_estimators=233,
    learning_rate=0.019519792757748358,
    num_leaves=41,
    max_depth=15,
    subsample=0.758484089588373,
    colsample_bytree=0.9592852139230149,
    random_state=333,
)

# verbose=-1 is passed alongside the hyperparameters above.
lgbm_model = LGBMClassifier(verbose=-1, **best_params)
lgbm_model.fit(X_train, y_train)

# Compute hold-out accuracy with the estimator's score method.
accuracy = lgbm_model.score(X_test, y_test)
print("Akurasi model:", accuracy)

Akurasi model: 0.8828828828828829


In [8]:
# Refit the same `lgbm_model` on the same training split, then compute the
# hold-out accuracy with the score method (fit returns the estimator itself).
accuracy = lgbm_model.fit(X_train, y_train).score(X_test, y_test)
print("Akurasi model:", accuracy)

Akurasi model: 0.8811425061425061


In [36]:
# Full train/test report for LightGBM. evaluate_model fits the estimator it is
# given, so `lgbm_model` is trained again on the training split here.
lgbm_results = evaluate_model(
    model=lgbm_model,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    model_name="LightGBM",
)
lgbm_results;  # trailing ';' suppresses the notebook's rich display of the dict


=== LightGBM ===

+-------+------------+-------------+
|       | Accuracy   |   AUC Score |
| Test  | 88.28%     |      0.9618 |
+-------+------------+-------------+
| Train | 88.96%     |      0.9678 |
+-------+------------+-------------+

Overfitting Check :
No significant signs of overfitting.



In [10]:
# best_acc = 0
# best_seed = None
# best_model = None

# # Try a range of random_state values
# for seed in range(1, 101):
#     params = {
#         'n_estimators': 233,
#         'learning_rate': 0.019519792757748358,
#         'num_leaves': 41,
#         'max_depth': 15,
#         'subsample': 0.758484089588373,
#         'colsample_bytree': 0.9592852139230149,
#         'random_state': seed
#     }

#     model = LGBMClassifier(**params, verbose=-1)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     acc = accuracy_score(y_test, y_pred)

#     if acc > best_acc:
#         best_acc = acc
#         best_seed = seed
#         best_model = model
    
# print(f"\n🔍 Best Accuracy: {best_acc:.5f} with random_state = {best_seed}")

In [11]:
# import optuna
# from optuna.samplers import TPESampler
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import cross_val_score, KFold
# from sklearn.metrics import accuracy_score

# SEED = 104

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 220, 250),
#         'learning_rate': trial.suggest_float('learning_rate', 0.015, 0.025),
#         'num_leaves': trial.suggest_int('num_leaves', 35, 50),
#         'max_depth': trial.suggest_int('max_depth', 13, 16),
#         'subsample': trial.suggest_float('subsample', 0.74, 0.78),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.94, 0.97),
#         'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 0.1),
#         'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 0.1),
#         'min_child_samples': trial.suggest_int('min_child_samples', 10, 40),
#         'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 0.1),
#         'random_state': SEED
#     }
    
#     model = LGBMClassifier(**params, n_jobs=-1, verbose=-1)
#     kf = KFold(n_splits=3, shuffle=True, random_state=SEED)
    
#     score = cross_val_score(model, X_train, y_train, cv=kf, scoring='accuracy', n_jobs=-1)
#     return score.mean()

# study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
# study.optimize(objective, n_trials=30, n_jobs=-1)

# best_params = study.best_trial.params
# print("Best Parameters:", best_params)

# best_lgbm = LGBMClassifier(**best_params, n_jobs=-1, verbose=-1)
# best_lgbm.fit(X_train, y_train)

# y_pred = best_lgbm.predict(X_test)
# test_accuracy = accuracy_score(y_test, y_pred)
# print("Test Accuracy after refined+regularized tuning:", test_accuracy)


In [12]:
# catb_model = CatBoostClassifier(verbose=False)

In [13]:
# xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [14]:
# snb_model = BoostingMachineClassifier()

In [15]:
# grb_model =  GradientBoostingClassifier(random_state=42)

In [16]:
# hgrb_model =  HistGradientBoostingClassifier(random_state=42)

In [17]:
# base_models = [
#     ("lgbm", lgbm_model),
#     ("catb", catb_model),
#     ("grb", grb_model),
#     ("snb", snb_model),
#     ("xgb", xgb_model)
# ]

In [18]:
# meta_model = hgrb_model

In [19]:
# stacking_model = StackingClassifier(estimators=base_models,
#                                     final_estimator=meta_model,
#                                     cv=5,
#                                     passthrough=True)

In [20]:
# stacking_model.fit(X_train_transformed, y_train)

In [21]:
# y_pred = stacking_model.predict(X_test_transformed)

# accuracy = accuracy_score(y_test, y_pred)
# report = classification_report(y_test, y_pred)

# print("Accuracy:", accuracy)
# print("\nClassification Report:\n", report)

In [22]:
# lgbm_results = evaluate_model(lgbm_model, X_train, y_train, X_test, y_test, "LightGBM")
# lgbm_results;

In [23]:
# xgb_results = evaluate_model(xgb_model, X_train, y_train, X_test, y_test, "XGBoost")
# xgb_results;

In [24]:
# catb_results = evaluate_model(catb_model, X_train, y_train, X_test, y_test, "CatBoost")
# catb_results;

In [25]:
# snb_results = evaluate_model(snb_model, x_train_np, y_train, x_test_np, y_test, "SnapBoost")
# snb_results;

In [26]:
# grb_results = evaluate_model(grb_model, X_train, y_train, X_test, y_test, "Gradient Boosting")
# grb_results;

In [27]:
# hgrb_results = evaluate_model(hgrb_model, X_train, y_train, X_test, y_test, "Hist Gradient Boosting")
# hgrb_results;

In [28]:
# stacking_results = evaluate_model(stacking_model, x_train_np, y_train, x_test_np, y_test, "Stacking Ensemble Classifier")
# stacking_results;

In [29]:
# results = [lgbm_results, xgb_results, catb_results, 
#         snb_results, grb_results, hgrb_results, stacking_results
#         ]

# create_summary_table(results)