In [1]:
# load the dataset from the csv file using pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import regularizers
from keras import optimizers
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

# load the dataset from the csv file using pandas
df = pd.read_csv('diabetes.csv')
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Replace the 0 values with NA except for the [Pregnancies, Outcome, DiabetesPedigreeFunction, Age] columns
df_copy = df.copy(deep = True)
df_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = df_copy[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.NaN)
df_copy.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [3]:
# fill in the missing data with the median value of the column
df_copy.fillna(df_copy.median(), inplace = True)


In [4]:
# scale the data using min_max except for outcome column
df_copy_scaled = df_copy.copy(deep = True)
df_copy_scaled[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']] = df_copy_scaled[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']].apply(lambda x: (x - x.min()) / (x.max() - x.min()))


In [5]:
# Split the data into training and test sets
X = df_copy_scaled.drop(columns = ['Outcome'])
y = df_copy_scaled['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=5)


In [14]:
# create a deep learning model with 3 hidden layers
model = Sequential([
    Dense(8, activation = 'relu', input_shape = (8,)),
    Dense(4, activation = 'relu'),
    Dense(2, activation = 'relu'),
    Dense(1, activation = 'sigmoid'),
])
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [15]:
# train the model
earlystoping = EarlyStopping(monitor = 'loss', mode = 'min', verbose = 1, patience = 100)
model.fit(X_train, y_train, epochs = 1000, batch_size = 32, validation_split = 0.1, verbose = 2, callbacks=[earlystoping])

Epoch 1/1000
20/20 - 1s - loss: 0.6917 - accuracy: 0.6425 - val_loss: 0.6881 - val_accuracy: 0.6857 - 1s/epoch - 60ms/step
Epoch 2/1000
20/20 - 0s - loss: 0.6891 - accuracy: 0.6425 - val_loss: 0.6850 - val_accuracy: 0.6857 - 144ms/epoch - 7ms/step
Epoch 3/1000
20/20 - 0s - loss: 0.6868 - accuracy: 0.6425 - val_loss: 0.6817 - val_accuracy: 0.6857 - 289ms/epoch - 14ms/step
Epoch 4/1000
20/20 - 0s - loss: 0.6843 - accuracy: 0.6425 - val_loss: 0.6786 - val_accuracy: 0.6857 - 242ms/epoch - 12ms/step
Epoch 5/1000
20/20 - 0s - loss: 0.6808 - accuracy: 0.6425 - val_loss: 0.6731 - val_accuracy: 0.6857 - 176ms/epoch - 9ms/step
Epoch 6/1000
20/20 - 0s - loss: 0.6737 - accuracy: 0.6425 - val_loss: 0.6608 - val_accuracy: 0.6857 - 156ms/epoch - 8ms/step
Epoch 7/1000
20/20 - 0s - loss: 0.6642 - accuracy: 0.6425 - val_loss: 0.6494 - val_accuracy: 0.6857 - 170ms/epoch - 8ms/step
Epoch 8/1000
20/20 - 0s - loss: 0.6563 - accuracy: 0.6425 - val_loss: 0.6369 - val_accuracy: 0.6857 - 209ms/epoch - 10ms/step

<keras.callbacks.History at 0x284b6df10>

In [16]:
model.evaluate(X_test, y_test, verbose = 2)

3/3 - 0s - loss: 0.3642 - accuracy: 0.8182 - 387ms/epoch - 129ms/step


[0.36420130729675293, 0.8181818127632141]

In [None]:
# now re-train by using k-fold cross validation
k = 10
num_val_samples = len(X_train) // k
num_epochs = 1000
all_scores = []
all_histories = []
all_models = []
all_val_acc_histories = []
all_val_loss_histories = []
all_acc_histories = []
all_loss_histories = []


for i in range(k):
    