In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier

In [9]:
df = pd.read_csv("processed_diabetes.csv")


df["Diabetes_binary"] = df["Diabetes_012"].apply(lambda x: 0 if x == 0 else 1)

X = df.drop(columns=["Diabetes_012", "Diabetes_binary"])
y = df["Diabetes_binary"]

X.columns = [str(c).replace("<", "lt").replace(">", "gt").replace("[","").replace("]","") for c in X.columns]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

reports = []
conf_matrices = []


In [10]:
# Define model
clf = XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    scale_pos_weight=len(y[y == 0]) / len(y[y == 1]),  # balance
    n_estimators=600,
    learning_rate=0.03,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.2,
    gamma=2.0,
    tree_method="hist",
    eval_metric="logloss",
    use_label_encoder=False
)

In [11]:
for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Hybrid sampling
    smote_enn = SMOTEENN(random_state=42)
    X_res, y_res = smote_enn.fit_resample(X_train, y_train)

    print("Before Hybrid:", y_train.value_counts().to_dict())
    print("After Hybrid:", y_res.value_counts().to_dict())

    clf = XGBClassifier(
        objective="binary:logistic",
        random_state=42,
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
        n_estimators=600,
        learning_rate=0.03,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.2,
        gamma=2.0,
        tree_method="hist",
        eval_metric="logloss",
    )

    clf.fit(X_res, y_res)
    y_pred = clf.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)

    conf_matrices.append(cm)
    reports.append(report)



--- Fold 1 ---
Before Hybrid: {0: 170962, 1: 31982}
After Hybrid: {1: 159894, 0: 102828}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Fold 2 ---
Before Hybrid: {0: 170962, 1: 31982}
After Hybrid: {1: 159884, 0: 103169}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Fold 3 ---
Before Hybrid: {0: 170962, 1: 31982}
After Hybrid: {1: 159914, 0: 102924}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Fold 4 ---
Before Hybrid: {0: 170963, 1: 31981}
After Hybrid: {1: 160108, 0: 102982}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Fold 5 ---
Before Hybrid: {0: 170963, 1: 31981}
After Hybrid: {1: 160046, 0: 103095}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [12]:
avg_precision_0 = np.mean([r['0']['precision'] for r in reports])
avg_recall_0 = np.mean([r['0']['recall'] for r in reports])
avg_f1_0 = np.mean([r['0']['f1-score'] for r in reports])

avg_precision_1 = np.mean([r['1']['precision'] for r in reports])
avg_recall_1 = np.mean([r['1']['recall'] for r in reports])
avg_f1_1 = np.mean([r['1']['f1-score'] for r in reports])

print("\n=== Cross-Validation Summary (5 folds) ===")
print(f"Class 0 - Precision: {avg_precision_0:.3f}, Recall: {avg_recall_0:.3f}, F1: {avg_f1_0:.3f}")
print(f"Class 1 - Precision: {avg_precision_1:.3f}, Recall: {avg_recall_1:.3f}, F1: {avg_f1_1:.3f}")


=== Cross-Validation Summary (5 folds) ===
Class 0 - Precision: 0.959, Recall: 0.619, F1: 0.752
Class 1 - Precision: 0.297, Recall: 0.859, F1: 0.441
