In [2]:
import cv
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_fscore_support
from imblearn.combine import SMOTEENN
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.metrics import precision_recall_curve

In [3]:
df = pd.read_csv("processed_diabetes.csv")

df["Diabetes_binary"] = df["Diabetes_012"].apply(lambda x: 0 if x == 0 else 1)

X = df.drop(columns=["Diabetes_012", "Diabetes_binary"])
y = df["Diabetes_binary"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle imbalance with hybrid sampling
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_train, y_train)

print("Before Hybrid:\n", y_train.value_counts())
print("After Hybrid:\n", y_res.value_counts())

Before Hybrid:
 Diabetes_binary
0    170962
1     31982
Name: count, dtype: int64
After Hybrid:
 Diabetes_binary
1    159652
0    103224
Name: count, dtype: int64


In [7]:


base_clf = LGBMClassifier(
    objective="binary",
    random_state=42,
    class_weight="balanced",
    boosting_type="gbdt",
    n_estimators=500,
    learning_rate=0.03,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_lambda=2.0,
    min_child_samples=30,
)

param_grid = {
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [5, 10, 15, -1],
    'min_child_samples': [10, 30, 50, 100],
    'learning_rate': [0.1, 0.05, 0.02],
    'subsample': [0.7, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'reg_lambda': [0.0, 1.0, 2.0, 5.0],
}


cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=base_clf,
    param_grid=param_grid,
    cv=cv,              # <â€” Now correctly defined
    scoring='f1',
    n_jobs=-1,
    verbose=1
)

grid.fit(X_res, y_res)
best_clf = grid.best_estimator_
print("Best Parameters:", grid.best_params_)


from lightgbm import early_stopping, log_evaluation

best_clf.fit(
    X_res, y_res,
    eval_set=[(X_test, y_test)],
    eval_metric='f1',
    callbacks=[
        early_stopping(stopping_rounds=50),
        log_evaluation(0)
    ]
)


y_pred = best_clf.predict(X_test)
y_proba = best_clf.predict_proba(X_test)[:, 1]
prec, rec, thresh = precision_recall_curve(y_test, y_proba)

# Pick threshold maximizing F1
f1_scores = 2 * (prec * rec) / (prec + rec)
best_thresh = thresh[f1_scores.argmax()]

y_pred_opt = (y_proba >= best_thresh).astype(int)
print("Optimized Threshold:", best_thresh)
print(classification_report(y_test, y_pred_opt))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[LightGBM] [Info] Number of positive: 159652, number of negative: 103224
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027198 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5676
[LightGBM] [Info] Number of data points in the train set: 262876, number of used features: 53
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Best Parameters: {'max_depth': -1, 'min_child_samples': 30, 'num_leaves': 63}
[LightGBM] [Info] Number of positive: 159652, number of negative: 103224
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024894 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[

In [8]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))

roc_auc = roc_auc_score(y_test, y_proba)
prec, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f"ROC-AUC: {roc_auc:.4f}")
print(f"Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")

Confusion Matrix:
 [[35701  7040]
 [ 3133  4862]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.84      0.88     42741
           1       0.41      0.61      0.49      7995

    accuracy                           0.80     50736
   macro avg       0.66      0.72      0.68     50736
weighted avg       0.84      0.80      0.81     50736

ROC-AUC: 0.8226
Precision: 0.4085, Recall: 0.6081, F1: 0.4887
