In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("/content/diabetes.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
print(df.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [5]:
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [6]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
def evaluate_model(y_true, y_pred, y_prob):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-Score': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_prob)
    }

In [10]:
# Function to run GridSearchCV and evaluate
def run_grid_search(model, param_grid, X_train, y_train, X_test, y_test):
    grid = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    y_prob = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, 'predict_proba') else y_pred
    return grid.best_params_, evaluate_model(y_test, y_pred, y_prob)

In [11]:
# Logistic Regression on original data
print("Running Logistic Regression on original data...")
lr_model = LogisticRegression()
lr_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
lr_best_params_exp1, lr_metrics_exp1 = run_grid_search(lr_model, lr_param_grid, X_train_scaled, y_train, X_test_scaled, y_test)
print("\nLogistic Regression (Experiment 1):")
print(f"Best Parameters: {lr_best_params_exp1}")
print("Metrics:")
for metric, value in lr_metrics_exp1.items():
    print(f"{metric}: {value:.4f}")

Running Logistic Regression on original data...

Logistic Regression (Experiment 1):
Best Parameters: {'C': 0.01, 'solver': 'liblinear'}
Metrics:
Accuracy: 0.7143
Precision: 0.5962
Recall: 0.5741
F1-Score: 0.5849
ROC-AUC: 0.8083


In [12]:
# Decision Tree on original data
print("\nRunning Decision Tree on original data...")
dt_model = DecisionTreeClassifier()
dt_param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10]
}
dt_best_params_exp1, dt_metrics_exp1 = run_grid_search(dt_model, dt_param_grid, X_train_scaled, y_train, X_test_scaled, y_test)
print("\nDecision Tree (Experiment 1):")
print(f"Best Parameters: {dt_best_params_exp1}")
print("Metrics:")
for metric, value in dt_metrics_exp1.items():
    print(f"{metric}: {value:.4f}")


Running Decision Tree on original data...

Decision Tree (Experiment 1):
Best Parameters: {'max_depth': 5, 'min_samples_split': 2}
Metrics:
Accuracy: 0.7922
Precision: 0.7037
Recall: 0.7037
F1-Score: 0.7037
ROC-AUC: 0.7806


In [13]:
# Random Forest on original data
print("\nRunning Random Forest on original data...")
rf_model = RandomForestClassifier()
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10]
}
rf_best_params_exp1, rf_metrics_exp1 = run_grid_search(rf_model, rf_param_grid, X_train_scaled, y_train, X_test_scaled, y_test)
print("\nRandom Forest (Experiment 1):")
print(f"Best Parameters: {rf_best_params_exp1}")
print("Metrics:")
for metric, value in rf_metrics_exp1.items():
    print(f"{metric}: {value:.4f}")


Running Random Forest on original data...

Random Forest (Experiment 1):
Best Parameters: {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Metrics:
Accuracy: 0.7792
Precision: 0.7273
Recall: 0.5926
F1-Score: 0.6531
ROC-AUC: 0.8131


In [14]:
# Oversampling
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_scaled, y_train)

# SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Combine oversampled and SMOTE data
X_train_sampled = np.vstack([X_train_ros, X_train_smote])
y_train_sampled = np.hstack([y_train_ros, y_train_smote])

In [15]:
# Logistic Regression on sampled data
print("\nRunning Logistic Regression on sampled data...")
lr_best_params_exp2, lr_metrics_exp2 = run_grid_search(lr_model, lr_param_grid, X_train_sampled, y_train_sampled, X_test_scaled, y_test)
print("\nLogistic Regression (Experiment 2):")
print(f"Best Parameters: {lr_best_params_exp2}")
print("Metrics:")
for metric, value in lr_metrics_exp2.items():
    print(f"{metric}: {value:.4f}")


Running Logistic Regression on sampled data...

Logistic Regression (Experiment 2):
Best Parameters: {'C': 0.01, 'solver': 'liblinear'}
Metrics:
Accuracy: 0.7468
Precision: 0.6119
Recall: 0.7593
F1-Score: 0.6777
ROC-AUC: 0.8185


In [16]:
# Decision Tree on sampled data
print("\nRunning Decision Tree on sampled data...")
dt_best_params_exp2, dt_metrics_exp2 = run_grid_search(dt_model, dt_param_grid, X_train_sampled, y_train_sampled, X_test_scaled, y_test)
print("\nDecision Tree (Experiment 2):")
print(f"Best Parameters: {dt_best_params_exp2}")
print("Metrics:")
for metric, value in dt_metrics_exp2.items():
    print(f"{metric}: {value:.4f}")


Running Decision Tree on sampled data...

Decision Tree (Experiment 2):
Best Parameters: {'max_depth': None, 'min_samples_split': 2}
Metrics:
Accuracy: 0.6948
Precision: 0.5614
Recall: 0.5926
F1-Score: 0.5766
ROC-AUC: 0.6713


In [17]:
# Random Forest on sampled data
print("\nRunning Random Forest on sampled data...")
rf_best_params_exp2, rf_metrics_exp2 = run_grid_search(rf_model, rf_param_grid, X_train_sampled, y_train_sampled, X_test_scaled, y_test)
print("\nRandom Forest (Experiment 2):")
print(f"Best Parameters: {rf_best_params_exp2}")
print("Metrics:")
for metric, value in rf_metrics_exp2.items():
    print(f"{metric}: {value:.4f}")


Running Random Forest on sampled data...

Random Forest (Experiment 2):
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Metrics:
Accuracy: 0.7532
Precision: 0.6379
Recall: 0.6852
F1-Score: 0.6607
ROC-AUC: 0.8098


In [18]:
# Collect results
results_exp1 = {
    'Logistic Regression': {'Best Parameters': lr_best_params_exp1, 'Metrics': lr_metrics_exp1},
    'Decision Tree': {'Best Parameters': dt_best_params_exp1, 'Metrics': dt_metrics_exp1},
    'Random Forest': {'Best Parameters': rf_best_params_exp1, 'Metrics': rf_metrics_exp1}
}
results_exp2 = {
    'Logistic Regression': {'Best Parameters': lr_best_params_exp2, 'Metrics': lr_metrics_exp2},
    'Decision Tree': {'Best Parameters': dt_best_params_exp2, 'Metrics': dt_metrics_exp2},
    'Random Forest': {'Best Parameters': rf_best_params_exp2, 'Metrics': rf_metrics_exp2}
}

# Create dataframes for comparison
metrics_df_exp1 = pd.DataFrame({name: result['Metrics'] for name, result in results_exp1.items()})
metrics_df_exp2 = pd.DataFrame({name: result['Metrics'] for name, result in results_exp2.items()})

# Display comparison
print("\n=== Comprehensive Comparison ===")
print("\nExperiment 1 Metrics (Original Data):")
print(metrics_df_exp1)
print("\nExperiment 2 Metrics (Sampled Data):")
print(metrics_df_exp2)

# Determine best model
best_model = None
best_score = 0
best_exp = None

for exp, results in [('Experiment 1', results_exp1), ('Experiment 2', results_exp2)]:
    for name, result in results.items():
        f1 = result['Metrics']['F1-Score']
        if f1 > best_score:
            best_score = f1
            best_model = name
            best_exp = exp

print(f"\nBest Model: {best_model} from {best_exp}")
print(f"F1-Score: {best_score:.4f}")


=== Comprehensive Comparison ===

Experiment 1 Metrics (Original Data):
           Logistic Regression  Decision Tree  Random Forest
Accuracy              0.714286       0.792208       0.779221
Precision             0.596154       0.703704       0.727273
Recall                0.574074       0.703704       0.592593
F1-Score              0.584906       0.703704       0.653061
ROC-AUC               0.808333       0.780648       0.813148

Experiment 2 Metrics (Sampled Data):
           Logistic Regression  Decision Tree  Random Forest
Accuracy              0.746753       0.694805       0.753247
Precision             0.611940       0.561404       0.637931
Recall                0.759259       0.592593       0.685185
F1-Score              0.677686       0.576577       0.660714
ROC-AUC               0.818519       0.671296       0.809815

Best Model: Decision Tree from Experiment 1
F1-Score: 0.7037
