In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

In [2]:
# Read the processed dataset and separate the target column from the
# remaining columns, which are all used as model inputs.
TARGET_COL = "Diabetes_012"

df = pd.read_csv("processed_diabetes.csv")

y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])  # returns a new frame; df itself is not modified

In [3]:
# One seed shared by the split, the resampler and the forest, so the cell is
# reproducible and the value only has to be changed in one place.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation. stratify=y keeps the class
# proportions of y the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# Oversample the training part only. The test part is never resampled, so
# the evaluation is done on the original class distribution.
smote = SMOTE(random_state=RANDOM_STATE)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Class counts before and after resampling, to confirm the classes were balanced.
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", y_res.value_counts())

# n_jobs=-1 lets fit/predict use all available cores. The resampled training
# set is several times larger than the original one, so a single-threaded
# fit is needlessly slow; random_state still fixes how the trees are seeded.
clf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
clf.fit(X_res, y_res)

# Predictions on the untouched hold-out rows.
y_pred = clf.predict(X_test)


Before SMOTE: Diabetes_012
0.0    170962
2.0     28277
1.0      3705
Name: count, dtype: int64
After SMOTE: Diabetes_012
0.0    170962
2.0    170962
1.0    170962
Name: count, dtype: int64


In [4]:
# Score the hold-out predictions: raw counts first, then the per-class
# precision / recall / f1 summary.
conf_mat = confusion_matrix(y_test, y_pred)  # rows: true class, columns: predicted class
print("Confusion Matrix:\n", conf_mat)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

Confusion Matrix:
 [[40803    69  1869]
 [  805     1   120]
 [ 5421    12  1636]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.95      0.91     42741
         1.0       0.01      0.00      0.00       926
         2.0       0.45      0.23      0.31      7069

    accuracy                           0.84     50736
   macro avg       0.44      0.40      0.41     50736
weighted avg       0.79      0.84      0.81     50736

