<a href="https://colab.research.google.com/github/vasu31d/diabetes-ml-pipeline/blob/main/02_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Diabetes Prediction Model
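This notebook trains a simple baseline logistic-regression model without handling class imbalance, then compares it against a class-weighted model evaluated at several decision thresholds, and finally saves the chosen model, scaler, and threshold.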


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report , confusion_matrix

In [15]:
# Read the BRFSS 2015 diabetes indicators CSV from the Colab /content directory.
DATA_PATH = '/content/diabetes_binary_health_indicators_BRFSS2015.csv'
df = pd.read_csv(DATA_PATH)

In [16]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


**Splitting the data into training, validation, and test sets**

In [17]:
# Separate the label column (y) from the predictor columns (X).
target_column = 'Diabetes_binary'
y = df[target_column]
X = df.drop(columns=[target_column])

In [18]:
# First split: keep 70% of the rows for training and park the remaining 30%
# in a temporary pool that is later divided into validation and test parts.
# Stratifying on y keeps the class ratio the same on both sides of the split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)



In [19]:
# Second split: divide the 30% pool into validation and test parts.
# Sending 0.3333 of the pool to test leaves roughly 20% of all rows for
# validation and 10% for test, again stratified on the label.
holdout_split = dict(test_size=0.3333, random_state=42, stratify=y_temp)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, **holdout_split)


In [20]:
# Report (rows, columns) for each partition to confirm the split proportions.
for label, part in (("Train:", X_train), ("Validation:", X_val), ("Test:", X_test)):
    print(label, part.shape)


Train: (177576, 21)
Validation: (50738, 21)
Test: (25366, 21)


In [21]:
# Standardize the BMI column using statistics learned from the training rows
# only, so nothing about the validation/test rows leaks into the scaler.
# Only BMI is scaled; every other feature column is passed through unchanged.
#
# The unscaled values are always read from the original feature frame `X`
# and the results are written to fresh frames via `assign`. This keeps the
# cell safe to re-run (an in-place version would re-scale already-scaled
# values and refit the scaler on them) and avoids assigning into the frames
# returned by train_test_split, which can raise SettingWithCopyWarning.
assert X.index.is_unique, "row labels must be unique to look up raw BMI values"

scaler = StandardScaler()
scaler.fit(X.loc[X_train.index, ["BMI"]])

# transform() returns a single-column 2-D result; flatten it to a 1-D column.
X_train = X_train.assign(BMI=np.ravel(scaler.transform(X.loc[X_train.index, ["BMI"]])))
X_val = X_val.assign(BMI=np.ravel(scaler.transform(X.loc[X_val.index, ["BMI"]])))
X_test = X_test.assign(BMI=np.ravel(scaler.transform(X.loc[X_test.index, ["BMI"]])))


In [22]:
# Baseline: plain logistic regression with no class re-weighting.
# max_iter is raised to 1000 to give the solver more iterations to converge.
baseline_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
baseline_model

In [23]:
# Score the baseline model on the validation set.
y_val_pred = baseline_model.predict(X_val)

val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

# Each heading is followed by its result on the next line.
print("Confusion Matrix:(validation)", val_confusion, sep="\n")
print("\nclassification matrix:(validation)", val_report, sep="\n")

Confusion Matrix:(validation)
[[42692   976]
 [ 5940  1130]]

classification matrix:(validation)
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93     43668
         1.0       0.54      0.16      0.25      7070

    accuracy                           0.86     50738
   macro avg       0.71      0.57      0.59     50738
weighted avg       0.83      0.86      0.83     50738



In [24]:
# Same estimator as the baseline, but class_weight='balanced' re-weights the
# classes so the minority (diabetic) class is not drowned out during fitting.
balanced_params = {"max_iter": 1000, "class_weight": 'balanced'}
balanced_model = LogisticRegression(**balanced_params).fit(X_train, y_train)

# Score the re-weighted model on the validation set.
y_val_pred = balanced_model.predict(X_val)

val_confusion = confusion_matrix(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print("Confusion Matrix:(validation)", val_confusion, sep="\n")
print("\nclassification matrix:(validation)", val_report, sep="\n")

Confusion Matrix:(validation)
[[31717 11951]
 [ 1670  5400]]

classification matrix:(validation)
              precision    recall  f1-score   support

         0.0       0.95      0.73      0.82     43668
         1.0       0.31      0.76      0.44      7070

    accuracy                           0.73     50738
   macro avg       0.63      0.75      0.63     50738
weighted avg       0.86      0.73      0.77     50738



In [25]:
# Predicted probability of the positive class (second predict_proba column)
# for every validation row.
y_val_probs = balanced_model.predict_proba(X_val)[:, 1]

# Compare precision/recall trade-offs at a few candidate decision thresholds:
# a row is labelled positive when its probability reaches the threshold.
for threshold in (0.3, 0.4, 0.5):
    y_val_pred = (y_val_probs >= threshold).astype(int)
    threshold_report = classification_report(y_val, y_val_pred)
    print(f"Threshold: {threshold}", threshold_report, sep="\n")

Threshold: 0.3
              precision    recall  f1-score   support

         0.0       0.98      0.52      0.68     43668
         1.0       0.24      0.92      0.38      7070

    accuracy                           0.58     50738
   macro avg       0.61      0.72      0.53     50738
weighted avg       0.87      0.58      0.64     50738

Threshold: 0.4
              precision    recall  f1-score   support

         0.0       0.96      0.63      0.76     43668
         1.0       0.27      0.86      0.41      7070

    accuracy                           0.66     50738
   macro avg       0.62      0.74      0.59     50738
weighted avg       0.87      0.66      0.72     50738

Threshold: 0.5
              precision    recall  f1-score   support

         0.0       0.95      0.73      0.82     43668
         1.0       0.31      0.76      0.44      7070

    accuracy                           0.73     50738
   macro avg       0.63      0.75      0.63     50738
weighted avg       0.86      

In [26]:
import os

# Create the output directory for the exported artifacts; exist_ok makes
# this a no-op when the directory is already there.
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)


In [27]:
import pickle

# Everything needed to reproduce predictions later: the class-weighted model,
# the BMI scaler fitted on the training rows, and the decision threshold (0.4).
artifacts = {
    "../models/diabetes_model.pkl": balanced_model,
    "../models/scaler.pkl": scaler,
    "../models/threshold.pkl": 0.4,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)



In [28]:
# Confirm the three artifacts were written. os.listdir returns entries in
# arbitrary order, so sort them to keep the displayed listing reproducible.
sorted(os.listdir("../models"))


['threshold.pkl', 'scaler.pkl', 'diabetes_model.pkl']

In [29]:
from google.colab import files

# Trigger a browser download for each saved artifact (Colab-only helper).
for artifact_path in (
    "../models/diabetes_model.pkl",
    "../models/scaler.pkl",
    "../models/threshold.pkl",
):
    files.download(artifact_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>