In [12]:
# =====================================================
# Diabetes Risk Prediction - Model Training Notebook
# =====================================================

# Step 1. Import libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Define paths
# The project root can be overridden with the HEALTHCARE_PIPELINE_DIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original location is used unchanged.
BASE_DIR = os.environ.get(
    "HEALTHCARE_PIPELINE_DIR",
    r"c:\Users\Sumit\Desktop\healthcare-pipeline",
)
DATA_PATH = os.path.join(BASE_DIR, 'data', 'pima.csv')          # input CSV
MODEL_PATH = os.path.join(BASE_DIR, 'models', 'model.joblib')   # trained classifier
SCALER_PATH = os.path.join(BASE_DIR, 'models', 'scaler.joblib') # fitted scaler

# Step 2. Load Pima Indians Diabetes Dataset
# Column names are supplied explicitly; the last one ("Outcome") is the label.
columns = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
    "Outcome",
]

# The file's own header line (row 0) is skipped because `names` replaces it.
data = pd.read_csv(DATA_PATH, names=columns, skiprows=[0])

# Coerce every column to a numeric dtype; a non-numeric value raises here.
data = data.apply(pd.to_numeric)

print("Dataset shape:", data.shape)
print("\nFirst few rows of the dataset:")
print(data.head())

# Step 3. Data preprocessing
# Separate the feature columns from the "Outcome" label.
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

# Step 4. Split the dataset
# Split the RAW features before scaling: fitting the scaler on all rows would
# let the held-out rows influence the mean/variance used for training
# (train/test leakage) and make the evaluation below optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Whole feature matrix passed through the train-fitted scaler. Kept so the
# name `X_scaled` stays defined for any later cell that refers to it.
X_scaled = scaler.transform(X)

# Step 5. Train model
# Build the logistic-regression classifier (solver capped at 200 iterations)
# and fit it on the training split; `fit` returns the estimator itself.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Step 6. Evaluate
# Predict on the held-out split, then compute each metric once up front.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the results.
print("\nModel Evaluation:")
print("----------------")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)

# Step 7. Save the model and scaler
# joblib.dump does not create missing parent folders, so make sure the
# output directories exist first (a no-op when they are already present).
for artifact_path in (MODEL_PATH, SCALER_PATH):
    artifact_dir = os.path.dirname(artifact_path)
    if artifact_dir:  # a bare filename has no directory component to create
        os.makedirs(artifact_dir, exist_ok=True)

# Persist the classifier and the scaler, each to its own file.
joblib.dump(model, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)

print("\n✅ Model saved to", MODEL_PATH)
print("✅ Scaler saved to", SCALER_PATH)

Dataset shape: (768, 9)

First few rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Model Evaluation:
----------------
Accuracy: 0.7532467532467533

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66     