## Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score, recall_score
import pickle

## Import dataset

In [2]:
# Load the dataset from a CSV in the working directory and preview the first rows.
# NOTE(review): the saved output of this cell is a ParserError on line 769, while
# later cells report 768 rows -- re-run from a fresh kernel to confirm the file parses.
diabetes = pd.read_csv("diabetes.csv")

diabetes.head()

ParserError: Error tokenizing data. C error: Expected 9 fields in line 769, saw 17


In [84]:
# Summarize column names, dtypes and non-null counts of the loaded frame.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


No categorical data: all nine columns are numeric (`int64` or `float64`).

In [85]:
# Show per-column summary statistics (count, mean, std, min, quartiles, max).
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


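No NaNs: every column has a count of 768. Note, however, that `Glucose`, `BloodPressure`, `SkinThickness`, `Insulin` and `BMI` all have a minimum of 0, which may indicate missing measurements recorded as zero.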

## Split train/test set

In [86]:
# Separate the label column from the predictors.
y = diabetes['Outcome']
X = diabetes.drop(columns=['Outcome'])

In [87]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Print the shape of each partition as a quick sanity check.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(614, 8) (154, 8) (614,) (154,)


## Data Preprocessing

No imputation or encoding is performed: the data contains no NaNs and no categorical columns.

Scale the data using MinMaxScaler. 

In [88]:
# Fit the scaler on the training split only, then rescale that same split.
scaler = MinMaxScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)

In [89]:
# Display the scaled training features to inspect the result of the transform.
scaled_X_train

array([[0.11764706, 0.42211055, 0.        , ..., 0.        , 0.09649872,
        0.        ],
       [0.52941176, 0.56281407, 0.67213115, ..., 0.42026826, 0.51409052,
        0.48333333],
       [0.05882353, 0.69849246, 0.37704918, ..., 0.42771982, 0.24594364,
        0.01666667],
       ...,
       [0.58823529, 0.50753769, 0.70491803, ..., 0.67958271, 0.45175064,
        0.28333333],
       [0.        , 0.70854271, 0.        , ..., 0.6318927 , 0.05422716,
        0.13333333],
       [0.        , 0.6281407 , 0.78688525, ..., 0.33532042, 0.07856533,
        0.        ]])

## Train Model

In [90]:
# Train a random forest on the scaled training features.
# random_state is pinned (same seed as the train/test split) so that the fitted
# trees, the reported metrics and the pickled model are reproducible across runs.
model = RandomForestClassifier(random_state=42)

model.fit(scaled_X_train, y_train)

RandomForestClassifier()

## Evaluate Model

In [91]:
# Rescale the held-out features with the scaler fitted on the training split,
# then predict class labels for them.
scaled_X_test = scaler.transform(X_test)

final_predictions = model.predict(scaled_X_test)

# Square root of the mean squared error between true and predicted labels.
test_rmse = mean_squared_error(y_test, final_predictions)**(1/2)
test_score = model.score(scaled_X_test, y_test)
# Store the metrics under test_* names: assigning them to f1_score / recall_score
# would overwrite the imported functions and make a second run of this cell fail
# with "TypeError: ... object is not callable".
test_f1 = f1_score(y_test, final_predictions)
test_recall = recall_score(y_test, final_predictions)

print("rmse: {}\ttest score: {}%".format(round(test_rmse,2), round(test_score*100,2)))
print("f1 score: {}\trecall score: {}%".format(round(test_f1,2), round(test_recall*100,2)))

rmse: 0.5	test score: 74.68%
f1 score: 0.65	recall score: 65.45%


## Save Model

In [92]:
# Save model

# Serialize the fitted classifier to disk. The context manager closes the file
# handle (flushing the pickle) even if dumping raises.
with open('diabetesregressionmodel.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [93]:
# Save preprocessor

# Serialize the fitted scaler so the same scaling can be applied at inference time.
# The context manager closes the file handle even if dumping raises.
with open('diabetesscaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)