# Data Preparation and Model

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Fix the seed for reproducibility.
# This seeds NumPy's global RNG only; other libraries need their own seed
# (the sklearn calls below take an explicit random_state).
SEED = 20
np.random.seed(SEED)

In [32]:
# Load the dataset from a CSV located relative to the working directory
# and preview the first rows.
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,72,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,32,1
3,89,66,23,94,28.1,0.167,21,0
4,137,40,35,168,43.1,2.288,33,1


In [33]:
# Replacing all 0 values with Null values
def replace_zero(df):
    """Return a deep copy of ``df`` with zeros turned into NaN in selected columns.

    Only Glucose, BloodPressure, SkinThickness, Insulin and BMI are touched;
    every other column, and the input frame itself, is left unchanged.
    """
    zero_means_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
    cleaned = df.copy(deep=True)
    cleaned[zero_means_missing] = cleaned[zero_means_missing].replace(0, np.nan)
    return cleaned
# Working copy with zeros marked as missing; the raw `df` stays untouched.
df_nan=replace_zero(df)

In [34]:
# Copy pasting functions from previous notebook
def find_median(frame,var):
    """Return the median of ``var`` for each ``Outcome`` class.

    Parameters
    ----------
    frame : DataFrame containing ``var`` and an ``Outcome`` column.
    var : name of the column to summarise.

    Returns
    -------
    DataFrame with columns ``Outcome`` and ``var``, one row per class,
    ordered by ``Outcome``.
    """
    # No explicit not-null filter is needed: the groupby median skips NaN by
    # default. (The previous filtered frame was assigned and then overwritten
    # without ever being used.)
    return frame[[var,'Outcome']].groupby('Outcome')[[var]].median().reset_index()

In [35]:
# Copy pasting functions from previous notebook
def replace_null(frame,var):
    """Fill nulls in ``frame[var]`` in place with the median of the row's ``Outcome`` class.

    Parameters
    ----------
    frame : DataFrame with ``var`` and an ``Outcome`` column; modified in place.
    var : name of the column to impute.

    Returns
    -------
    Number of nulls left in ``frame[var]`` after imputation. This is non-zero
    only when a class has no observed value at all, or ``Outcome`` is missing.
    """
    # Per-row median of the row's own class. Medians are matched by group
    # label rather than by position, so this works for any set of Outcome
    # values (including a frame that contains a single class).
    class_median = frame.groupby('Outcome')[var].transform('median')
    missing = frame[var].isnull()
    # to_numpy() makes the assignment positional, so it does not rely on
    # index alignment.
    frame.loc[missing, var] = class_median[missing].to_numpy()
    return frame[var].isnull().sum()

In [36]:
# Impute each column in turn and report how many nulls are left afterwards.
for column in ['Glucose', 'SkinThickness', 'Insulin', 'BMI', 'BloodPressure']:
    print(f"{replace_null(df_nan, column)} Nulls for {column}")
# Every count is 0, so all nulls have been handled.

0 Nulls for Glucose
0 Nulls for SkinThickness
0 Nulls for Insulin
0 Nulls for BMI
0 Nulls for BloodPressure


In [37]:
df_nan.isnull().sum()
# Confirmation: per-column null counts across the whole frame.
# Every column reports 0, so nothing is left to impute.

Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [38]:
# We need to scale our data for uniformity.
from sklearn.preprocessing import StandardScaler
def std_scalar(df):
    """Standardise every feature column and split off the target.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus an ``Outcome`` column.

    Returns
    -------
    (x, y) : ``x`` is a DataFrame of the StandardScaler-transformed features,
    ``y`` is the untouched ``Outcome`` Series.
    """
    features = df.drop(columns=["Outcome"])
    scaler = StandardScaler()
    # Take column names and the index from the input instead of hard-coding
    # them. This keeps x aligned with y even when df does not have a default
    # RangeIndex, and keeps working if the feature set changes.
    x = pd.DataFrame(scaler.fit_transform(features),
                     columns=features.columns,
                     index=features.index)
    y = df["Outcome"]
    return x, y


In [39]:
X,Y=std_scalar(df_nan)
X.describe()
# Scaled data looks fine: every feature has mean ~0 and std ~1.

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,1.480297e-16,-3.978299e-16,8.095376e-18,-3.469447e-18,1.31839e-16,2.451743e-16,1.931325e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-2.551447,-3.999727,-2.486187,-1.434747,-2.070186,-1.189553,-1.041549
25%,-0.7202356,-0.6934382,-0.4603073,-0.440843,-0.717659,-0.6889685,-0.7862862
50%,-0.1536274,-0.03218035,-0.1226607,-0.440843,-0.0559387,-0.3001282,-0.3608474
75%,0.6100618,0.6290775,0.3275348,0.3116039,0.6057816,0.4662269,0.6602056
max,2.539814,4.100681,7.868309,7.909072,5.041489,5.883565,4.063716


In [40]:
# Preview the target labels.
Y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [41]:
#Keeping train  size as 0.8
from sklearn.model_selection import train_test_split
# Stratified 80/20 split; the seed comes from the shared SEED constant rather than a repeated literal.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=SEED, stratify=Y)


In [42]:
# We are good to go with baseline model
# Let's first implement KNN
from sklearn.neighbors import KNeighborsClassifier
# Accuracy on the train and test splits for each k in 5..14,
# stored in order of k (index 0 corresponds to k=5).
train_scores, test_scores = [], []
for i in range(5, 15):
    neigh = KNeighborsClassifier(n_neighbors=i).fit(X_train, Y_train)
    train_scores.append(neigh.score(X_train, Y_train))
    test_scores.append(neigh.score(X_test, Y_test))

In [43]:
# The score lists start at k=5, so the list position is shifted by 5 to recover k.
best_train = max(train_scores)
print(f"Max train_scores is {best_train*100} for k = {train_scores.index(best_train)+5}")

Max train_scores is 87.62214983713355 for k = 5


In [44]:
# The score lists start at k=5, so the list position is shifted by 5 to recover k.
best_test = max(test_scores)
print(f"Max test_scores is {best_test*100} for k = {test_scores.index(best_test)+5}")
# k=9 gives the best test accuracy (~84.4%).
# NOTE(review): k is chosen using the test split itself, so this figure is
# optimistic; consider cross-validating on the training split instead.

Max test_scores is 84.4155844155844 for k = 9


In [45]:
# Lets try Logistic regression now
from sklearn.linear_model import LogisticRegression
# L2-penalised logistic regression, seeded from the shared SEED constant.
log_model = LogisticRegression(random_state=SEED, penalty='l2').fit(X_train, Y_train)
log_pred=log_model.predict(X_test)
# Accuracy on the held-out test split.
log_model.score(X_test, Y_test)

0.8116883116883117

In [46]:
# Support Vector Machines
from sklearn import svm
# SVC is constructed with no arguments, i.e. library-default hyperparameters.
svm_model = svm.SVC().fit(X_train, Y_train)
svm_pred=svm_model.predict(X_test)
svm_model.score(X_test, Y_test)
# ~87% accuracy on the held-out test split

0.8701298701298701

In [47]:
# Function to evaluate model performance
def model_perf(pred,Y_test):
    """Compare predictions with true labels element by element.

    Returns a list holding 1 where the prediction equals the label and 0
    otherwise. Pairs are formed with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    return [1 if guess == truth else 0 for guess, truth in zip(pred, Y_test)]


In [48]:
# Per-sample hit (1) / miss (0) flags for the SVM predictions on the test split.
cmp_list=model_perf(svm_pred,Y_test)

In [49]:
# The share of hits should reproduce the accuracy reported by svm_model.score.
print(f"Model Accuracy Confirmation :{cmp_list.count(1)/len(Y_test)}")

Model Accuracy Confirmation :0.8701298701298701


In [50]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# Shallow random forest (trees capped at depth 2), seeded from the shared SEED constant.
rf_model = RandomForestClassifier(max_depth=2, random_state=SEED).fit(X_train, Y_train)
rf_pred=rf_model.predict(X_test)
rf_model.score(X_test, Y_test)
# ~88% accuracy on the held-out test split


0.8766233766233766

In [51]:
import tensorflow as tf
def build_model(n_features=None):
    """Build and compile a small fully connected binary classifier.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. When omitted, it falls back to the number
        of columns of the notebook-level ``X_train``, which is what the
        original zero-argument call used.

    Returns
    -------
    A compiled ``tf.keras.Sequential`` with Dense layers of 8, 4 and 2 ReLU
    units followed by a single sigmoid output. The loss is binary
    cross-entropy, the optimizer is Adam with learning rate 0.01, and
    accuracy is tracked as a metric.
    """
    if n_features is None:
        # Default keeps the old behaviour: one input per training column.
        n_features = len(X_train.keys())

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=[n_features]),
        tf.keras.layers.Dense(4, activation='relu'),
        tf.keras.layers.Dense(2, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Seed TensorFlow's global RNG before building the model so that weight
# initialisation is repeatable across Restart & Run All. The earlier
# np.random.seed call does not cover TensorFlow.
tf.random.set_seed(SEED)
neural_model = build_model()

In [52]:
# Print layer output shapes and parameter counts (113 trainable parameters in total).
neural_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 8)                 64        
                                                                 
 dense_1 (Dense)             (None, 4)                 36        
                                                                 
 dense_2 (Dense)             (None, 2)                 10        
                                                                 
 dense_3 (Dense)             (None, 1)                 3         
                                                                 
Total params: 113
Trainable params: 113
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Keeping EPOCHs high as dataset is small.
# NOTE(review): the history shown below ends with loss ~0.17 but val_loss ~1.3,
# i.e. the network overfits long before epoch 1000. Consider fewer epochs or an
# early-stopping callback monitored on val_loss.
EPOCHS = 1000
# 10% of the training split is held out for validation (validation_split=0.1).
# Despite its name, `neural_pred` holds the training history object returned by
# fit (its .history and .epoch attributes are read below), not predictions.
neural_pred = neural_model.fit(X_train, Y_train,epochs=EPOCHS, validation_split=0.1, verbose=2)

Epoch 1/1000
18/18 - 2s - loss: 0.6484 - accuracy: 0.6141 - val_loss: 0.5589 - val_accuracy: 0.7419 - 2s/epoch - 121ms/step
Epoch 2/1000
18/18 - 0s - loss: 0.5633 - accuracy: 0.6413 - val_loss: 0.4599 - val_accuracy: 0.7419 - 113ms/epoch - 6ms/step
Epoch 3/1000
18/18 - 0s - loss: 0.5144 - accuracy: 0.6594 - val_loss: 0.4088 - val_accuracy: 0.8226 - 99ms/epoch - 6ms/step
Epoch 4/1000
18/18 - 0s - loss: 0.4860 - accuracy: 0.7699 - val_loss: 0.3854 - val_accuracy: 0.8226 - 105ms/epoch - 6ms/step
Epoch 5/1000
18/18 - 0s - loss: 0.4538 - accuracy: 0.7899 - val_loss: 0.3577 - val_accuracy: 0.8226 - 314ms/epoch - 17ms/step
Epoch 6/1000
18/18 - 0s - loss: 0.4188 - accuracy: 0.8043 - val_loss: 0.3285 - val_accuracy: 0.8387 - 106ms/epoch - 6ms/step
Epoch 7/1000
18/18 - 0s - loss: 0.4056 - accuracy: 0.8134 - val_loss: 0.3286 - val_accuracy: 0.8548 - 162ms/epoch - 9ms/step
Epoch 8/1000
18/18 - 1s - loss: 0.3802 - accuracy: 0.8261 - val_loss: 0.3185 - val_accuracy: 0.8548 - 532ms/epoch - 30ms/step


In [54]:
# Let's measure final performance: per-epoch metrics as a DataFrame, last five epochs shown.
hist = pd.DataFrame(neural_pred.history)
hist['epoch'] = neural_pred.epoch
hist.tail()
# ~92% accuracy on train vs ~84% on validation at the last epoch;
# val_loss (~1.3) far above train loss (~0.17) points to overfitting.

Unnamed: 0,loss,accuracy,val_loss,val_accuracy,epoch
995,0.163623,0.92029,1.333646,0.83871,995
996,0.165886,0.925725,1.281605,0.83871,996
997,0.165449,0.922101,1.391277,0.822581,997
998,0.171109,0.914855,1.278865,0.83871,998
999,0.176323,0.92029,1.335926,0.83871,999


In [55]:
# Raw sigmoid outputs of the network for the test split (thresholded in the next cell).
neural_test=neural_model.predict(X_test)

In [56]:
# Threshold the sigmoid outputs at 0.5 to get hard labels: strictly greater
# than 0.5 becomes 1, everything else 0. Vectorised instead of a Python loop;
# the result is still a flat list of plain ints.
neural_test_converted = (np.asarray(neural_test).ravel() > 0.5).astype(int).tolist()

In [57]:
# Per-sample hit (1) / miss (0) flags for the neural network on the test split.
# This reuses, and therefore overwrites, the cmp_list computed earlier for the SVM.
cmp_list=model_perf(neural_test_converted,Y_test)

In [58]:
# Share of correct predictions on the test split, as a percentage.
print(f"Test Accuracy :{cmp_list.count(1)/len(Y_test)*100} %")
# ~83% test accuracy.

Test Accuracy :83.11688311688312 %


In [59]:
import pickle
# Let's dump our SVM model. The context manager guarantees the file handle is
# flushed and closed, even if pickling raises.
# NOTE(review): the StandardScaler fitted inside std_scalar is a local that is
# never returned or saved, so whoever loads this model cannot reproduce the
# feature scaling. Consider persisting the scaler alongside the model.
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(svm_model, model_file)