In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# ---------------------------------------------------------------------------
# Configuration: every tunable value of the experiment lives here so the cell
# can be re-run with different settings without hunting through the code.
# ---------------------------------------------------------------------------
file_path = "SMOTE - SMOTE.csv"  # Replace this with your actual file path
TARGET_COLUMN = "Diabetes"
TEST_SIZE = 0.2
RANDOM_STATE = 42  # one seed shared by the split, SMOTE and both forests
# A float strategy is the desired minority/majority ratio after resampling;
# per the imbalanced-learn docs it is only valid for binary targets.
SMOTE_SAMPLING_STRATEGY = 0.75
SMOTE_K_NEIGHBORS = 4


def fit_and_predict(X_fit, y_fit, X_eval, random_state=RANDOM_STATE):
    """Fit a RandomForestClassifier and predict labels for held-out features.

    Parameters
    ----------
    X_fit, y_fit
        Features and labels the forest is trained on.
    X_eval
        Features to predict once the forest is fitted.
    random_state
        Seed passed to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(fitted_model, predictions_for_X_eval)``.
    """
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_fit, y_fit)
    return model, model.predict(X_eval)


# Load the dataset
df = pd.read_csv(file_path)
if TARGET_COLUMN not in df.columns:
    # Fail early with the list of available columns instead of a bare
    # "not found in axis" error from DataFrame.drop.
    raise KeyError(
        f"Target column {TARGET_COLUMN!r} not found in {file_path!r}; "
        f"available columns: {list(df.columns)}"
    )

# Split features and target variable
X = df.drop(columns=[TARGET_COLUMN])
y = df[TARGET_COLUMN]

# Check class distribution before SMOTE
print("Class distribution before SMOTE:\n", y.value_counts())

# Split into training and testing sets; stratify=y keeps the class ratio of
# the full dataset in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

# SMOTE interpolates between a minority sample and its k nearest minority
# neighbours, so the training split must contain more than k_neighbors
# samples of the smallest class. Check it here for a readable error message.
minority_count = y_train.value_counts().min()
if minority_count <= SMOTE_K_NEIGHBORS:
    raise ValueError(
        f"SMOTE with k_neighbors={SMOTE_K_NEIGHBORS} needs more than "
        f"{SMOTE_K_NEIGHBORS} minority samples in the training split, "
        f"got {minority_count}"
    )

# Apply SMOTE to the training split only: X_test / y_test are never
# resampled, so both models are scored on the same untouched test data.
smote = SMOTE(
    sampling_strategy=SMOTE_SAMPLING_STRATEGY,
    k_neighbors=SMOTE_K_NEIGHBORS,
    random_state=RANDOM_STATE,
)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("\nClass distribution after SMOTE:\n", y_train_smote.value_counts())

# Train one forest on the original training data and one on the SMOTE
# training data; both predict the same held-out test features.
clf_original, y_pred_original = fit_and_predict(X_train, y_train, X_test)
clf_smote, y_pred_smote = fit_and_predict(X_train_smote, y_train_smote, X_test)

# Evaluate model performance: same report for each model, baseline first.
for report_title, y_pred in (
    ("Original Data", y_pred_original),
    ("SMOTE Applied", y_pred_smote),
):
    print(f"\nClassification Report ({report_title}):\n")
    print(classification_report(y_test, y_pred))


Class distribution before SMOTE:
 Diabetes
0    900
1    100
Name: count, dtype: int64

Class distribution after SMOTE:
 Diabetes
0    720
1    540
Name: count, dtype: int64

Classification Report (Original Data):

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       180
           1       1.00      0.80      0.89        20

    accuracy                           0.98       200
   macro avg       0.99      0.90      0.94       200
weighted avg       0.98      0.98      0.98       200


Classification Report (SMOTE Applied):

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       180
           1       0.73      0.80      0.76        20

    accuracy                           0.95       200
   macro avg       0.85      0.88      0.87       200
weighted avg       0.95      0.95      0.95       200

