In [1]:
# MLP for Pima Indians Dataset Serialize to JSON and HDF5

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.models import model_from_json
import numpy
import os
# fix random seed for reproducibility
numpy.random.seed(7)

# load pima indians dataset

import matplotlib.pyplot as plt
import pandas as pd
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
df = pd.read_csv('pima-indians-diabetes.data.csv', names=col_names)
# # load pima indians dataset
# dataset = numpy.loadtxt("pima-indians-diabetes.data.csv", delimiter=",")

# selection of relevant features
label = df['label']
df.drop('label', axis=1, inplace=True)

X, Y = df, label
df.describe()

# normalized data
df_norm = (df - df.mean()) / (df.max() - df.min())
df_norm.describe()
# X = df[['glucose','bmi']]

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,-1.561251e-17,-2.312965e-18,3.035766e-18,1.098658e-17,-2.8912060000000002e-18,3.472338e-16,2.688821e-16,3.729655e-17
std,0.1982105,0.1606664,0.1586542,0.1611335,0.1362222,0.1174987,0.1414725,0.1960039
min,-0.2261795,-0.6075102,-0.5664383,-0.207439,-0.09432563,-0.4767895,-0.1681795,-0.2040148
25%,-0.167356,-0.1100228,-0.05824155,-0.207439,-0.09432563,-0.0699341,-0.09740662,-0.1540148
50%,-0.04970895,-0.01957051,0.02372567,0.02488426,-0.05827362,0.0001106092,-0.04243224,-0.07068142
75%,0.1267616,0.09726366,0.08929944,0.1157934,0.05608809,0.06866501,0.06591533,0.1293186
max,0.7738205,0.3924898,0.4335617,0.792561,0.9056744,0.5232105,0.8318205,0.7959852


In [2]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=66)


In [4]:
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn import model_selection
# random forest model creation
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
# predictions
rfc_predict = rfc.predict(X_test)

In [5]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
rfc_cv_score = cross_val_score(rfc, X, Y, cv=10, scoring='roc_auc')

In [15]:
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[139  18]
 [ 33  41]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.81      0.89      0.84       157
          1       0.69      0.55      0.62        74

avg / total       0.77      0.78      0.77       231



=== All AUC Scores ===
[0.78333333 0.83       0.82407407 0.73185185 0.81222222 0.86814815
 0.87148148 0.90444444 0.81076923 0.84846154]


=== Mean AUC Score ===
('Mean AUC Score - Random Forest: ', 0.8284786324786324)


In [17]:
'''
We’ll use RandomizedSearchCV from sklearn to optimize our hyperparamaters. 
Koehrsen uses a full grid of hyperparameters in his article, but I found that this could take a very substantial time to run in practice.
I decided to focus on 3 hyperparameters: n_estimators, max_features, and max_depth.
'''
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# number of features at every split
max_features = ['auto', 'sqrt']

# max depth
max_depth = [int(x) for x in np.linspace(100, 500, num = 11)]
max_depth.append(None)
# create random grid
random_grid = {
 'n_estimators': n_estimators,
 'max_features': max_features,
 'max_depth': max_depth
 }
# Random search of parameters
rfc_random = RandomizedSearchCV(estimator = rfc, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the model
rfc_random.fit(X_train, y_train)
# print results
print(rfc_random.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=1000, max_features=auto, max_depth=140 .............
[CV] n_estimators=1400, max_features=auto, max_depth=100 .............
[CV] n_estimators=1000, max_features=auto, max_depth=140 .............
[CV] n_estimators=1000, max_features=auto, max_depth=140 .............
[CV] n_estimators=1400, max_features=auto, max_depth=100 .............
[CV] n_estimators=800, max_features=sqrt, max_depth=260 ..............
[CV] n_estimators=1400, max_features=auto, max_depth=100 .............
[CV] n_estimators=800, max_features=sqrt, max_depth=260 ..............
[CV]  n_estimators=800, max_features=sqrt, max_depth=260, total=   2.5s
[CV] n_estimators=800, max_features=sqrt, max_depth=260 ..............
[CV]  n_estimators=800, max_features=sqrt, max_depth=260, total=   2.5s
[CV] n_estimators=2000, max_features=auto, max_depth=300 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=140, total=   3.1s
[CV] n_est

[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   13.7s


[CV]  n_estimators=1400, max_features=auto, max_depth=260, total=   4.8s
[CV] n_estimators=1400, max_features=auto, max_depth=500 .............
[CV]  n_estimators=800, max_features=auto, max_depth=380, total=   2.7s
[CV] n_estimators=1400, max_features=auto, max_depth=500 .............
[CV]  n_estimators=2000, max_features=auto, max_depth=100, total=   6.9s
[CV] n_estimators=1400, max_features=auto, max_depth=500 .............
[CV]  n_estimators=2000, max_features=auto, max_depth=100, total=   6.8s
[CV] n_estimators=1000, max_features=sqrt, max_depth=300 .............
[CV]  n_estimators=2000, max_features=auto, max_depth=100, total=   6.7s
[CV] n_estimators=1000, max_features=sqrt, max_depth=300 .............
[CV]  n_estimators=1200, max_features=auto, max_depth=180, total=   4.2s
[CV] n_estimators=1000, max_features=sqrt, max_depth=300 .............
[CV]  n_estimators=1200, max_features=auto, max_depth=180, total=   4.1s
[CV] n_estimators=1800, max_features=auto, max_depth=420 .......

[CV] n_estimators=1400, max_features=sqrt, max_depth=260 .............
[CV]  n_estimators=1400, max_features=sqrt, max_depth=420, total=   5.3s
[CV] n_estimators=1400, max_features=sqrt, max_depth=260 .............
[CV]  n_estimators=800, max_features=sqrt, max_depth=380, total=   3.0s
[CV] n_estimators=1400, max_features=sqrt, max_depth=260 .............
[CV]  n_estimators=800, max_features=sqrt, max_depth=380, total=   2.8s
[CV] n_estimators=1800, max_features=auto, max_depth=None ............
[CV]  n_estimators=800, max_features=sqrt, max_depth=380, total=   3.1s
[CV] n_estimators=1800, max_features=auto, max_depth=None ............
[CV]  n_estimators=1600, max_features=sqrt, max_depth=None, total=   5.6s
[CV] n_estimators=1800, max_features=auto, max_depth=None ............
[CV]  n_estimators=1600, max_features=sqrt, max_depth=None, total=   5.6s
[CV] n_estimators=1200, max_features=auto, max_depth=140 .............
[CV]  n_estimators=1600, max_features=sqrt, max_depth=None, total=

[CV] n_estimators=200, max_features=auto, max_depth=220 ..............
[CV]  n_estimators=1400, max_features=auto, max_depth=220, total=   4.9s
[CV] n_estimators=200, max_features=auto, max_depth=220 ..............
[CV]  n_estimators=200, max_features=auto, max_depth=220, total=   0.7s
[CV] n_estimators=200, max_features=auto, max_depth=220 ..............
[CV]  n_estimators=200, max_features=auto, max_depth=220, total=   0.7s
[CV] n_estimators=1000, max_features=auto, max_depth=500 .............
[CV]  n_estimators=200, max_features=auto, max_depth=220, total=   0.7s
[CV] n_estimators=1000, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1800, max_features=sqrt, max_depth=140, total=   6.0s
[CV] n_estimators=1000, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1800, max_features=sqrt, max_depth=140, total=   6.1s
[CV] n_estimators=1400, max_features=auto, max_depth=460 .............


[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.3min


[CV]  n_estimators=1800, max_features=sqrt, max_depth=140, total=   6.1s
[CV] n_estimators=1400, max_features=auto, max_depth=460 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=500, total=   3.5s
[CV] n_estimators=1400, max_features=auto, max_depth=460 .............
[CV]  n_estimators=2000, max_features=sqrt, max_depth=340, total=   6.9s
[CV] n_estimators=1600, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=500, total=   3.6s
[CV] n_estimators=1600, max_features=auto, max_depth=500 .............
[CV]  n_estimators=1000, max_features=auto, max_depth=500, total=   3.5s
[CV] n_estimators=1600, max_features=auto, max_depth=500 .............
[CV]  n_estimators=2000, max_features=sqrt, max_depth=340, total=   7.1s
[CV] n_estimators=800, max_features=sqrt, max_depth=460 ..............
[CV]  n_estimators=2000, max_features=sqrt, max_depth=340, total=   7.3s
[CV] n_estimators=800, max_features=sqrt, max_depth=460 .......

[CV]  n_estimators=1800, max_features=auto, max_depth=300, total=   6.2s
[CV] n_estimators=1600, max_features=sqrt, max_depth=420 .............
[CV]  n_estimators=1000, max_features=sqrt, max_depth=460, total=   3.4s
[CV] n_estimators=1600, max_features=sqrt, max_depth=420 .............
[CV]  n_estimators=1000, max_features=sqrt, max_depth=460, total=   3.6s
[CV] n_estimators=400, max_features=sqrt, max_depth=300 ..............
[CV]  n_estimators=1000, max_features=sqrt, max_depth=460, total=   3.4s
[CV] n_estimators=400, max_features=sqrt, max_depth=300 ..............
[CV]  n_estimators=400, max_features=sqrt, max_depth=300, total=   1.4s
[CV]  n_estimators=1600, max_features=sqrt, max_depth=300, total=   5.8s
[CV] n_estimators=2000, max_features=auto, max_depth=140 .............
[CV] n_estimators=400, max_features=sqrt, max_depth=300 ..............
[CV]  n_estimators=1600, max_features=sqrt, max_depth=300, total=   5.4s
[CV] n_estimators=2000, max_features=auto, max_depth=140 .......

[CV] n_estimators=600, max_features=auto, max_depth=180 ..............
[CV]  n_estimators=1200, max_features=sqrt, max_depth=140, total=   4.3s
[CV] n_estimators=600, max_features=auto, max_depth=180 ..............
[CV]  n_estimators=600, max_features=auto, max_depth=180, total=   2.1s
[CV] n_estimators=800, max_features=auto, max_depth=None .............
[CV]  n_estimators=1200, max_features=sqrt, max_depth=140, total=   4.3s
[CV] n_estimators=800, max_features=auto, max_depth=None .............
[CV]  n_estimators=600, max_features=auto, max_depth=180, total=   2.1s
[CV] n_estimators=800, max_features=auto, max_depth=None .............
[CV]  n_estimators=1200, max_features=sqrt, max_depth=140, total=   4.4s
[CV] n_estimators=1800, max_features=auto, max_depth=380 .............
[CV]  n_estimators=600, max_features=auto, max_depth=180, total=   2.1s
[CV] n_estimators=1800, max_features=auto, max_depth=380 .............
[CV]  n_estimators=800, max_features=auto, max_depth=None, total=   

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.7min finished


{'n_estimators': 1400, 'max_features': 'sqrt', 'max_depth': 180}


In [20]:
rfc = RandomForestClassifier(n_estimators=1400, max_depth=180, max_features='sqrt')
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
rfc_cv_score = cross_val_score(rfc, X, Y, cv=10, scoring='roc_auc')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, rfc_predict))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict))
print('\n')
print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')
print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", rfc_cv_score.mean())

=== Confusion Matrix ===
[[140  17]
 [ 34  40]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.80      0.89      0.85       157
          1       0.70      0.54      0.61        74

avg / total       0.77      0.78      0.77       231



=== All AUC Scores ===
[0.77777778 0.83518519 0.83592593 0.73037037 0.81814815 0.85777778
 0.87185185 0.90777778 0.805      0.86384615]


=== Mean AUC Score ===
('Mean AUC Score - Random Forest: ', 0.8303660968660969)
