In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE

# One seed shared by the split, the oversampler and both classifiers, so a
# fresh "Restart & Run All" reproduces the same numbers.
SEED = 42


def fit_and_score(X_fit, y_fit, X_eval, y_eval, seed=SEED):
    """Fit a random forest on one dataset and score it on another.

    Parameters
    ----------
    X_fit, y_fit
        Features and labels passed to ``RandomForestClassifier.fit``.
    X_eval, y_eval
        Features and labels used for prediction and scoring.
    seed
        Value passed as ``random_state`` to the classifier.

    Returns
    -------
    tuple
        ``(classifier, predictions, f1)``: the fitted classifier, its
        predictions for ``X_eval``, and ``f1_score(y_eval, predictions)``
        computed with that function's default settings.
    """
    clf = RandomForestClassifier(random_state=seed)
    clf.fit(X_fit, y_fit)
    y_pred = clf.predict(X_eval)
    return clf, y_pred, f1_score(y_eval, y_pred)


# Load the dataset
df = pd.read_csv('diabetes.csv')

# Separate features and target
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split into training and test sets (20% held out, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Baseline: train on the original training split
clf_before, y_pred_before, f1_before = fit_and_score(X_train, y_train, X_test, y_test)

# Apply SMOTE to the training split only; the test split is left untouched so
# both models are scored on the same, non-resampled data.
smote = SMOTE(random_state=SEED)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Same model configuration, trained on the resampled training data
clf_after, y_pred_after, f1_after = fit_and_score(
    X_train_smote, y_train_smote, X_test, y_test
)

# Print the results
print("F1-Score before SMOTE:", f1_before)
print("F1-Score after SMOTE: ", f1_after)


F1-Score before SMOTE: 0.6336633663366337
F1-Score after SMOTE:  0.6548672566371682
