In [33]:
# load the "diabetes.csv" file and clean data to prepare for deep learning model
# 1. load data
# 2. clean data
# 3. impute missing values
# 4. save data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
import tensorflow as tf
warnings.filterwarnings('ignore')

# load data
df = pd.read_csv('diabetes.csv')
print(df.shape)
df.head()


(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [34]:
df.sum()

Pregnancies                  2953.000
Glucose                     92847.000
BloodPressure               53073.000
SkinThickness               15772.000
Insulin                     61286.000
BMI                         24570.300
DiabetesPedigreeFunction      362.401
Age                         25529.000
Outcome                       268.000
dtype: float64

In [35]:
# fill the 0 in glucose, blood pressure, skin thickness, insulin, bmi with NaN
df[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [36]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [37]:
df.groupby('Outcome')["BloodPressure"].transform('median')

0      74.5
1      70.0
2      74.5
3      70.0
4      74.5
       ... 
763    70.0
764    70.0
765    70.0
766    74.5
767    70.0
Name: BloodPressure, Length: 768, dtype: float64

In [38]:
# copy the dataframe
df_filled = df.copy(deep=True)

In [39]:
df["BloodPressure"].sum()

53073.0

In [40]:
# calculate the median value for each column split by the outcome, and fill the NaN with the median value of the corresponding outcome for df_filled dataframe
df_filled['Glucose'].fillna(df_filled.groupby('Outcome')['Glucose'].transform('median'), inplace=True)
# df_filled['BloodPressure'].fillna(df_filled.groupby('Outcome')['BloodPressure'].transform('median'), inplace=True)
df_filled['SkinThickness'].fillna(df_filled.groupby('Outcome')['SkinThickness'].transform('median'), inplace=True)
df_filled['Insulin'].fillna(df_filled.groupby('Outcome')['Insulin'].transform('median'), inplace=True)
df_filled['BMI'].fillna(df_filled.groupby('Outcome')['BMI'].transform('median'), inplace=True)


In [41]:
# calculate the median value for each column split by the outcome, and fill the NaN with the median value of the corresponding outcome
df['Glucose'].fillna(df.groupby('Outcome')['Glucose'].transform('median'), inplace=True)
df['BloodPressure'].fillna(df.groupby('Outcome')['BloodPressure'].transform('median'), inplace=True)
df['SkinThickness'].fillna(df.groupby('Outcome')['SkinThickness'].transform('median'), inplace=True)
df['Insulin'].fillna(df.groupby('Outcome')['Insulin'].transform('median'), inplace=True)
df['BMI'].fillna(df.groupby('Outcome')['BMI'].transform('median'), inplace=True)

In [21]:
df["BloodPressure"].sum()

55595.0

In [23]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [24]:
df.sum()

Pregnancies                   2953.000
Glucose                      93448.000
BloodPressure                55595.000
SkinThickness                22341.000
Insulin                     108867.000
BMI                          24909.800
DiabetesPedigreeFunction       362.401
Age                          25529.000
Outcome                        268.000
dtype: float64

In [13]:
# scale the data using min-max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_filled.iloc[:,:-1])
df_scaled = pd.DataFrame(df_scaled, columns = df.columns[:-1])
df_scaled['Outcome'] = df['Outcome']
df_scaled.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.670968,0.489796,0.304348,0.186899,0.314928,0.234415,0.483333,1
1,0.058824,0.264516,0.428571,0.23913,0.10637,0.171779,0.116567,0.166667,0
2,0.470588,0.896774,0.408163,0.271739,0.186899,0.104294,0.253629,0.183333,1
3,0.058824,0.290323,0.428571,0.173913,0.096154,0.202454,0.038002,0.0,0
4,0.0,0.6,0.163265,0.304348,0.185096,0.509202,0.943638,0.2,1


In [14]:
df_scaled.sum()

Pregnancies                 173.705882
Glucose                     384.877419
BloodPressure               362.051020
SkinThickness               184.402174
Insulin                     117.926683
BMI                         223.562372
DiabetesPedigreeFunction    129.161827
Age                         156.683333
Outcome                     268.000000
dtype: float64

In [15]:
# create X and Y
X = df_scaled.drop('Outcome', axis = 1)
Y = df_scaled['Outcome']
X = np.array(X)
Y = np.array(Y)

In [16]:
# train test split 1 1 8
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.1, stratify = Y, random_state = 2)
X_train, X_val, Y_train, Y_val = train_test_split(X_train,Y_train, test_size = 0.1, stratify = Y_train, random_state = 2)

In [31]:
# function to create model
from keras.models import Sequential
from keras.layers import Dense
def create_model():
    model = Sequential([
        Dense(20, activation = 'relu', input_shape = (8,)),
        Dense(5, activation = 'relu'),
        Dense(1, activation = 'sigmoid')
    ])
    optimizer = tf.keras.optimizers.legacy.RMSprop(lr=0.001)
    model.compile(loss = 'binary_crossentropy', optimizer = optimizer, metrics = ['accuracy'])
    return model

In [32]:
# train the model
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.1, stratify = Y, random_state = 2)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=100,
    mode='min',
    restore_best_weights=True)
model = create_model()
model.fit(X_train,Y_train, validation_split=.2, epochs = 1000, verbose = 2)

Epoch 1/1000
18/18 - 1s - loss: 0.6766 - accuracy: 0.6504 - val_loss: 0.6715 - val_accuracy: 0.6259 - 1s/epoch - 66ms/step
Epoch 2/1000
18/18 - 0s - loss: 0.6592 - accuracy: 0.6576 - val_loss: 0.6644 - val_accuracy: 0.6259 - 155ms/epoch - 9ms/step
Epoch 3/1000
18/18 - 0s - loss: 0.6512 - accuracy: 0.6576 - val_loss: 0.6622 - val_accuracy: 0.6259 - 156ms/epoch - 9ms/step
Epoch 4/1000
18/18 - 0s - loss: 0.6472 - accuracy: 0.6576 - val_loss: 0.6611 - val_accuracy: 0.6259 - 134ms/epoch - 7ms/step
Epoch 5/1000
18/18 - 0s - loss: 0.6448 - accuracy: 0.6576 - val_loss: 0.6612 - val_accuracy: 0.6259 - 135ms/epoch - 7ms/step
Epoch 6/1000
18/18 - 0s - loss: 0.6441 - accuracy: 0.6576 - val_loss: 0.6615 - val_accuracy: 0.6259 - 126ms/epoch - 7ms/step
Epoch 7/1000
18/18 - 0s - loss: 0.6435 - accuracy: 0.6576 - val_loss: 0.6617 - val_accuracy: 0.6259 - 145ms/epoch - 8ms/step
Epoch 8/1000
18/18 - 0s - loss: 0.6433 - accuracy: 0.6576 - val_loss: 0.6623 - val_accuracy: 0.6259 - 158ms/epoch - 9ms/step
Ep

KeyboardInterrupt: 

In [19]:
model.evaluate(X_test, Y_test)



[0.6480262279510498, 0.649350643157959]