model_training

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
import joblib

1. Load data

In [None]:
# Read the diabetes CSV into the working DataFrame.
# The path must be a raw string: in a normal literal the '\U' in 'C:\Users'
# starts an 8-digit unicode escape and the cell fails with a SyntaxError.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so the cell runs on other machines.
DATA_PATH = r'C:\Users\User\Desktop\Sreamlit 2\data\diabetes.csv'
df = pd.read_csv(DATA_PATH)
print('Loaded dataset with shape:', df.shape)

Loaded dataset with shape: (768, 9)


2. Basic EDA (prints)

In [4]:
# Plain-text preview of the first five rows.
first_rows = df.head(5)
print(first_rows)


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [5]:
# Per-column summary statistics (count, mean, std, quartiles, min/max).
stats_table = df.describe()
print(stats_table)

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [7]:
# Count NaN entries in every column.
missing_counts = df.isna().sum()
print('Missing values per column:\n', missing_counts)

Missing values per column:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


3. Replace zeros in specific columns with NaN (commonly done for the Pima dataset)

In [8]:
# Columns in which a 0 is treated as a missing measurement.
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
# Swap every 0 in those columns for NaN in one frame-wide replacement.
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)


4. Impute missing values with median

In [9]:
# Fill the NaNs introduced above with each column's median.
# NOTE(review): the imputer is fitted on the full frame, before the train/test
# split further down, so test rows contribute to the medians.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit(df[cols_with_zero]).transform(df[cols_with_zero])
df[cols_with_zero] = imputed_values

5. Features and target

In [10]:
# Target is the 'Outcome' column; every other column is a feature.
y = df['Outcome']
X = df.drop(columns=['Outcome'])

6. Train-test split

In [13]:
# Hold out 20% of the rows for testing, stratified on the target,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


7. Scale features

In [14]:
# Learn the scaling parameters from the training split only,
# then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

8. Train models

In [16]:
# Candidate classifiers, keyed by the label used in the printed report.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    'SVC': SVC(probability=True, random_state=42),
}

# Per-model record: the fitted estimator, its mean CV accuracy and its test accuracy.
results = {}
for label, estimator in models.items():
    print(f'Training {label}...')

    # 5-fold cross-validated accuracy on the scaled training split.
    fold_scores = cross_val_score(estimator, X_train_scaled, y_train, cv=5, scoring='accuracy')
    cv_mean, cv_std = fold_scores.mean(), fold_scores.std()
    print(f'{label} CV accuracy: {cv_mean:.4f} +/- {cv_std:.4f}')

    # Fit on the whole training split, then score on the held-out test split.
    estimator.fit(X_train_scaled, y_train)
    test_preds = estimator.predict(X_test_scaled)
    test_acc = accuracy_score(y_test, test_preds)
    print(f'{label} Test accuracy: {test_acc:.4f}')

    results[label] = {'model': estimator, 'cv_mean': cv_mean, 'test_acc': test_acc}
    print(classification_report(y_test, test_preds))


Training LogisticRegression...
LogisticRegression CV accuracy: 0.7818 +/- 0.0125
LogisticRegression Test accuracy: 0.7078
              precision    recall  f1-score   support

           0       0.75      0.82      0.78       100
           1       0.60      0.50      0.55        54

    accuracy                           0.71       154
   macro avg       0.68      0.66      0.67       154
weighted avg       0.70      0.71      0.70       154

Training RandomForest...
RandomForest CV accuracy: 0.7655 +/- 0.0379
RandomForest Test accuracy: 0.7403
              precision    recall  f1-score   support

           0       0.78      0.84      0.81       100
           1       0.65      0.56      0.60        54

    accuracy                           0.74       154
   macro avg       0.71      0.70      0.70       154
weighted avg       0.73      0.74      0.73       154

Training SVC...
SVC CV accuracy: 0.7688 +/- 0.0179
SVC Test accuracy: 0.7403
              precision    recall  f1-score

9. Choose best model by test accuracy

In [17]:
# Pick the entry with the highest test accuracy (the first one wins on ties).
# NOTE(review): selecting on the test split makes the reported test accuracy
# optimistic; results also holds 'cv_mean', which could be used instead.
best_name, best_entry = max(results.items(), key=lambda item: item[1]['test_acc'])
best_model = best_entry['model']
print('Best model:', best_name, 'with test accuracy', best_entry['test_acc'])


Best model: RandomForest with test accuracy 0.7402597402597403


10. Save model and scaler

In [18]:
# Pickle the selected classifier and the fitted scaler into separate files.
# NOTE(review): the SimpleImputer fitted earlier is not saved here — confirm
# whether the consumer of these files needs it.
for artifact, filename in ((best_model, 'model.pkl'), (scaler, 'scaler.pkl')):
    with open(filename, 'wb') as f:
        pickle.dump(artifact, f)

print('Saved model to model.pkl and scaler to scaler.pkl')

Saved model to model.pkl and scaler to scaler.pkl


In [19]:
# Optional: write the per-model scores to a JSON file.
import json

# Keep only the numeric metrics, cast to plain floats for JSON; the fitted
# estimator stored under 'model' is left out.
summary = {}
for model_name, entry in results.items():
    summary[model_name] = {metric: float(entry[metric]) for metric in ('cv_mean', 'test_acc')}

with open('model_results.json', 'w') as f:
    json.dump(summary, f, indent=2)
print('Saved model_results.json')

Saved model_results.json
