In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.combine import SMOTEENN
from lightgbm import LGBMClassifier

In [7]:
# Load the processed dataset from the working directory.
df = pd.read_csv("processed_diabetes.csv")

# Derive a binary target: 0 stays 0, every other value becomes 1.
# (The column name suggests codes 0/1/2 -- TODO confirm against the data dictionary.)
# A vectorised comparison replaces the per-row Python lambda; the resulting
# values are the same, it just avoids calling a Python function for every row.
df["Diabetes_binary"] = df["Diabetes_012"].ne(0).astype("int64")

# Features are every column except the original and the derived target.
X = df.drop(columns=["Diabetes_012", "Diabetes_binary"])
y = df["Diabetes_binary"]

In [8]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Handle imbalance with hybrid sampling (SMOTE over-sampling combined with
# ENN cleaning). Only the training partition is resampled; the test
# partition keeps its original class distribution.
smote_enn = SMOTEENN(random_state=42)
X_res, y_res = smote_enn.fit_resample(X_train, y_train)

# Class counts before and after resampling, for a quick sanity check.
class_counts_before = y_train.value_counts()
class_counts_after = y_res.value_counts()
print("Before Hybrid:\n", class_counts_before)
print("After Hybrid:\n", class_counts_after)

Before Hybrid:
 Diabetes_binary
0    170962
1     31982
Name: count, dtype: int64
After Hybrid:
 Diabetes_binary
1    159652
0    103224
Name: count, dtype: int64


In [9]:
# Hyper-parameters for the gradient-boosted classifier, gathered in one place
# so they are easy to inspect and tune.
lgbm_params = {
    "objective": "binary",
    "n_estimators": 300,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": -1,  # -1 = no explicit depth limit; tree size is bounded by num_leaves
    # NOTE(review): the training data has already been resampled, so this
    # re-weights the resampled classes a second time -- confirm this is intended.
    "class_weight": "balanced",
    "random_state": 42,
}

# Train on the resampled training partition.
clf = LGBMClassifier(**lgbm_params)
clf.fit(X_res, y_res)

# Predict hard class labels for the untouched hold-out partition.
y_pred = clf.predict(X_test)


[LightGBM] [Info] Number of positive: 159652, number of negative: 103224
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5676
[LightGBM] [Info] Number of data points in the train set: 262876, number of used features: 53
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [10]:
# Evaluate the hold-out predictions.
# In the confusion matrix, rows are true classes and columns are predicted classes.
conf_mat = confusion_matrix(y_test, y_pred)
# zero_division=0 reports 0 (instead of warning) for a class with no predictions.
report_text = classification_report(y_test, y_pred, zero_division=0)

print("Confusion Matrix:\n", conf_mat)
print("\nClassification Report:\n", report_text)

Confusion Matrix:
 [[35658  7083]
 [ 3124  4871]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.83      0.87     42741
           1       0.41      0.61      0.49      7995

    accuracy                           0.80     50736
   macro avg       0.66      0.72      0.68     50736
weighted avg       0.84      0.80      0.81     50736

