In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve
)
from imblearn.combine import SMOTEENN
from catboost import CatBoostClassifier

# Load the dataset and derive a binary target from "Diabetes_012":
# 0 stays 0, every other value becomes 1.
df = pd.read_csv("processed_diabetes.csv")
# Vectorised comparison instead of a per-row Python lambda. A missing value
# compares unequal to 0 and therefore maps to 1, exactly as the lambda did.
df["Diabetes_binary"] = df["Diabetes_012"].ne(0).astype("int64")

# Features are every column except the original target and the derived one.
X = df.drop(columns=["Diabetes_012", "Diabetes_binary"])
y = df["Diabetes_binary"]

# Stratified, shuffled 5-fold splitter with a fixed seed for reproducible folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold positive-class probabilities and true labels. One array is
# collected per fold and merged once after the loop, instead of extending a
# Python list element by element.
all_probs = []
all_true = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n--- Fold {fold} ---")
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Hybrid sampling (SMOTE over-sampling + ENN cleaning) is fitted on the
    # training fold only; the held-out fold is scored as-is.
    smote_enn = SMOTEENN(random_state=42)
    X_res, y_res = smote_enn.fit_resample(X_train, y_train)

    print("Before Hybrid:", y_train.value_counts().to_dict())
    print("After Hybrid:", y_res.value_counts().to_dict())

    # NOTE(review): class_weights is applied on top of the already resampled
    # training data, so positives are up-weighted twice; confirm this is
    # intended, since it shifts predicted probabilities upward.
    clf = CatBoostClassifier(
        iterations=600,
        learning_rate=0.03,
        depth=8,
        eval_metric="Logloss",
        random_seed=42,
        class_weights=[1, 3],  # weight positives a bit more
        verbose=0
    )
    clf.fit(X_res, y_res)

    # Keep only the second probability column (class 1).
    y_prob = clf.predict_proba(X_test)[:, 1]
    all_probs.append(np.asarray(y_prob))
    all_true.append(y_test.to_numpy())

# Flatten the per-fold arrays into single 1-D arrays covering every sample.
all_probs = np.concatenate(all_probs)
all_true = np.concatenate(all_true)

# Pick the decision threshold that maximises F1 on the pooled out-of-fold
# predictions, then report metrics at that threshold.
# NOTE: the threshold is tuned and evaluated on the same predictions, so the
# reported precision/recall/F1 are optimistic estimates.
precisions, recalls, thresholds = precision_recall_curve(all_true, all_probs)

# The epsilon keeps the division defined where precision + recall is 0.
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)

# precision_recall_curve returns one more precision/recall value than
# thresholds: the final point (precision=1, recall=0) has no threshold.
# Exclude it from the search so best_idx is always a valid threshold index.
best_idx = f1_scores[:-1].argmax()
best_threshold = thresholds[best_idx]

print("\n=== Optimal Threshold Found ===")
print(f"Best threshold: {best_threshold:.3f}")
print(f"Precision: {precisions[best_idx]:.3f}, Recall: {recalls[best_idx]:.3f}, F1: {f1_scores[best_idx]:.3f}")

# Binarise the pooled probabilities at the selected threshold.
y_pred = (all_probs >= best_threshold).astype(int)

print("\nConfusion Matrix:\n", confusion_matrix(all_true, y_pred))
print("\nClassification Report:\n", classification_report(all_true, y_pred, zero_division=0))



--- Fold 1 ---
Before Hybrid: {0: 170962, 1: 31982}
After Hybrid: {1: 159894, 0: 102828}

--- Fold 2 ---
Before Hybrid: {0: 170962, 1: 31982}
After Hybrid: {1: 159884, 0: 103169}

--- Fold 3 ---
Before Hybrid: {0: 170962, 1: 31982}
After Hybrid: {1: 159914, 0: 102924}

--- Fold 4 ---
Before Hybrid: {0: 170963, 1: 31981}
After Hybrid: {1: 160108, 0: 102982}

--- Fold 5 ---
Before Hybrid: {0: 170963, 1: 31981}
After Hybrid: {1: 160046, 0: 103095}

=== Optimal Threshold Found ===
Best threshold: 0.818
Precision: 0.412, Recall: 0.609, F1: 0.491

Confusion Matrix:
 [[178957  34746]
 [ 15636  24341]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.84      0.88    213703
           1       0.41      0.61      0.49     39977

    accuracy                           0.80    253680
   macro avg       0.67      0.72      0.68    253680
weighted avg       0.84      0.80      0.82    253680

