In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Read the preprocessed dataset from the working directory.
df = pd.read_csv("processed_diabetes.csv")

# Target is the "Diabetes_012" column; the feature matrix is every other column.
y = df["Diabetes_012"]
X = df.drop("Diabetes_012", axis=1)


In [4]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the class counts of the training partition before any resampling.
print("Before undersampling:", y_train.value_counts())

# Resample the training partition only; the test partition is left untouched
# so evaluation is done on the original class distribution.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train, y_train)
print("After undersampling:", y_res.value_counts())

# Fit a gradient boosting classifier on the resampled training data.
# `fit` returns the fitted estimator, so the chained call binds it to `clf`.
clf = GradientBoostingClassifier(random_state=42).fit(X_res, y_res)

# Predict labels for the held-out (non-resampled) test partition.
y_pred = clf.predict(X_test)

Before undersampling: Diabetes_012
0.0    170962
2.0     28277
1.0      3705
Name: count, dtype: int64
After undersampling: Diabetes_012
0.0    3705
1.0    3705
2.0    3705
Name: count, dtype: int64


In [5]:
# Compare the held-out labels with the model's predictions: compute the
# confusion matrix and the text classification report, then print both.
conf_mat = confusion_matrix(y_test, y_pred)
cls_report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", conf_mat)
print("\nClassification Report:\n", cls_report)

Confusion Matrix:
 [[26666  8717  7358]
 [  225   313   388]
 [ 1000  1789  4280]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.62      0.76     42741
         1.0       0.03      0.34      0.05       926
         2.0       0.36      0.61      0.45      7069

    accuracy                           0.62     50736
   macro avg       0.45      0.52      0.42     50736
weighted avg       0.86      0.62      0.70     50736

