## Setup

In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, recall_score
import pickle

## Import dataset

In [27]:
diabetes = pd.read_csv("diabetes.csv")

diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [28]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 770 entries, 0 to 769
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               770 non-null    int64  
 1   Glucose                   770 non-null    int64  
 2   BloodPressure             770 non-null    int64  
 3   SkinThickness             770 non-null    int64  
 4   Insulin                   770 non-null    int64  
 5   BMI                       770 non-null    float64
 6   DiabetesPedigreeFunction  770 non-null    float64
 7   Age                       770 non-null    int64  
 8   Outcome                   770 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.3 KB


**no categorical data.**

In [29]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,770.0,770.0,770.0,770.0,770.0,770.0,770.0,770.0,770.0
mean,3.838961,120.885714,69.076623,20.55974,79.618182,31.987662,0.472209,33.219481,0.350649
std,3.367409,31.974468,19.343197,15.93802,115.150062,7.876514,0.331001,11.777296,0.477483
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,10.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.244,24.0,0.0
50%,3.0,117.0,72.0,23.0,27.0,32.0,0.3745,29.0,0.0
75%,6.0,140.75,80.0,32.0,126.75,36.575,0.62675,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


**no NANs.**

## Split train/test set

In [30]:
X = diabetes.drop('Outcome', axis=1)
y = diabetes['Outcome']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    shuffle=True, 
                                                    random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(616, 8) (154, 8) (616,) (154,)


## Data Preprocessing

No imputing, encoding needed. 

Scale the data using StandardScaler. 

In [32]:
scaler = StandardScaler()
scaled_X_train = scaler.fit_transform(X_train)

In [33]:
scaled_X_train

array([[-0.57209822, -1.13306417, -3.85659956, ..., -4.08721955,
        -0.48339734, -1.04345604],
       [-1.16488628, -0.82670964,  0.00905452, ...,  1.49353211,
        -0.35468058, -1.04345604],
       [-0.27570419, -0.4590842 ,  0.11950178, ..., -0.7772565 ,
        -0.77375842, -0.53533832],
       ...,
       [ 1.79905403, -0.61226147,  0.8926326 , ...,  1.7629477 ,
         2.00712238,  0.39621085],
       [-1.16488628,  0.61315666, -3.85659956, ...,  1.35240965,
        -0.77974524, -0.36596574],
       [-1.16488628,  0.12298941,  1.4448689 , ..., -1.20062387,
        -0.6091207 , -1.04345604]])

## Train Model

In [34]:
model = RandomForestClassifier()

model.fit(scaled_X_train, y_train)

RandomForestClassifier()

## Evaluate Model

In [35]:
scaled_X_test = scaler.transform(X_test)

final_predictions = model.predict(scaled_X_test)

test_rmse = mean_squared_error(y_test, final_predictions)**(1/2)
test_score = model.score(scaled_X_test, y_test)
f1_score = f1_score(y_test, final_predictions)
recall_score = recall_score(y_test, final_predictions)

print("rmse: {}\ttest score: {}%".format(round(test_rmse,2), round(test_score*100,2)))
print("f1 score: {}\trecall score: {}%".format(round(f1_score,2), round(recall_score*100,2)))

rmse: 0.5	test score: 75.32%
f1 score: 0.63	recall score: 61.11%


## Save Model

In [36]:
# Save model

pickle.dump(model, open('diabetesregressionmodel.pkl', 'wb'))

In [37]:
# Save preprocessor

pickle.dump(scaler, open('diabetesscaler.pkl', 'wb'))