<a href="https://colab.research.google.com/github/teddierakewa/diabetes-prediction-project/blob/main/Phase_3_Model_training_and_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Load the dataset; the CSV path is relative to the notebook's working directory.
data = pd.read_csv('diabetes.csv')

# 'Outcome' is the target column; every remaining column is used as a predictor.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

# The 'liblinear' solver is chosen for better convergence on a small dataset.
# NOTE(review): this model is fitted on every row, before the train/test split
# performed in the next cell, and no later visible cell uses it -- confirm
# whether it is still needed, and do not score it on the held-out rows.
model = LogisticRegression(solver='liblinear')
model.fit(X, y)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class balance the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
splits = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = splits


In [5]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest: 100 trees, otherwise default hyperparameters, with a
# fixed seed so that the fitted forest is reproducible.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)

# Fit on the training partition only; the test partition stays unseen.
# Leaving the call as the final expression displays the fitted estimator.
rf_model.fit(X_train, y_train)


In [6]:
# Hard class predictions of the baseline forest on both partitions.
y_train_pred, y_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)

# predict_proba yields one column per class; column index 1 is kept as the
# score for the positive class (label 1) and is used for ROC-AUC later on.
test_class_probs = rf_model.predict_proba(X_test)
y_test_prob = test_class_probs[:, 1]


In [7]:
from sklearn.metrics import classification_report, roc_auc_score

# Per-class precision / recall / F1 on the training partition. Comparing this
# with the test report below shows how much the forest overfits.
print(
    "Training Classification Report",
    classification_report(y_train, y_train_pred),
    sep="\n",
)

# The same report on the held-out test partition.
print(
    "Testing Classification Report",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

# ROC-AUC is computed from the positive-class probabilities rather than the
# hard labels, so it does not depend on a decision threshold.
roc_auc = roc_auc_score(y_test, y_test_prob)
print(f"ROC-AUC Score: {roc_auc:.2f}")


Training Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       400
           1       1.00      1.00      1.00       214

    accuracy                           1.00       614
   macro avg       1.00      1.00      1.00       614
weighted avg       1.00      1.00      1.00       614

Testing Classification Report
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       100
           1       0.68      0.59      0.63        54

    accuracy                           0.76       154
   macro avg       0.74      0.72      0.73       154
weighted avg       0.75      0.76      0.76       154

ROC-AUC Score: 0.81


In [8]:
from sklearn.metrics import accuracy_score

# Overall accuracy of the baseline forest on each partition; a large gap
# between the two values indicates overfitting.
train_accuracy, test_accuracy = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

print(
    f"Training Accuracy: {train_accuracy:.2f}",
    f"Testing Accuracy: {test_accuracy:.2f}",
    sep="\n",
)


Training Accuracy: 1.00
Testing Accuracy: 0.76


In [9]:
from sklearn.model_selection import GridSearchCV

# Random-forest hyperparameter search space: 2 * 3 * 3 * 3 = 54 combinations.
# A max_depth of None leaves tree depth unrestricted.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [10]:
# Exhaustive search over param_grid with 5-fold cross-validation, run on all
# available cores. Candidates are ranked by recall, i.e. by how many of the
# positive cases are caught, rather than by overall accuracy.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
)

# The search only ever sees the training partition, so the test partition
# remains an untouched hold-out for the final evaluation.
grid_search.fit(X_train, y_train)


In [11]:
# Report the winning hyperparameter combination from the grid search.
print("Best Parameters:", grid_search.best_params_)

# Keep a handle on the estimator built with those parameters; it is the model
# evaluated on the test partition afterwards.
best_model = grid_search.best_estimator_


Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [12]:
# Evaluate the tuned forest on the held-out test partition, mirroring the
# baseline evaluation so that the two sets of numbers are directly comparable.
y_test_pred_best = best_model.predict(X_test)

# Column index 1 of predict_proba is kept as the positive-class (label 1) score.
tuned_class_probs = best_model.predict_proba(X_test)
y_test_prob_best = tuned_class_probs[:, 1]

print(
    "Tuned Model Classification Report",
    classification_report(y_test, y_test_pred_best),
    sep="\n",
)

roc_auc_best = roc_auc_score(y_test, y_test_prob_best)
print(f"Tuned Model ROC-AUC: {roc_auc_best:.2f}")


Tuned Model Classification Report
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       100
           1       0.65      0.59      0.62        54

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.72       154
weighted avg       0.74      0.75      0.74       154

Tuned Model ROC-AUC: 0.81
