In [70]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Data from the Pima Indian Diabetes dataset.
https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [71]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [72]:
# replace zeros or NaN with averages for the following columns:
# Glucose, BloodPressure, SkinThickness, BMI, Insulin

for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']:
    df[col] = df[col].replace(0, np.NaN)
    mean = df[col].mean()
    df[col] = df[col].replace(np.NaN, mean)
    
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [73]:
# Split the data into train and test using the first 8 columns as features

# First 8 columns
X = df.iloc[:, 0:8]

# Last column
y = df.iloc[:, 8]

# Generate the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size = 0.2)

# Normalize the features
X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.fit_transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# Decision Tree Classifiers with different tree depths tested.

In [74]:
test_means = []
test_stdev = []
training_means = []
tree_depths = list(range(3, 16))

for tree_depth in tree_depths:
    tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth = tree_depth)
    scores = cross_validate(estimator = tree_clf, 
                            X = X, 
                            y = y, 
                            cv = 10, 
                            return_train_score = True)
    print('Tree depth = {}, test accuracy: {:.3f}, 95% CI: ({:.3f}, {:.3f})'.format(
        tree_depth, 
        np.mean(scores['test_score']), 
        np.mean(scores['test_score']) - 2 * np.std(scores['test_score']), 
        np.mean(scores['test_score']) + 2 * np.std(scores['test_score'])))
    print('\t    Training accuracy: {:.3f}'.format(np.mean(scores['train_score'])))
    test_means.append(np.mean(scores['test_score']))
    test_stdev.append(np.std(scores['test_score']))
    training_means.append(np.mean(scores['train_score']))
    

Tree depth = 3, test accuracy: 0.737, 95% CI: (0.661, 0.813)
	    Training accuracy: 0.773
Tree depth = 4, test accuracy: 0.731, 95% CI: (0.617, 0.844)
	    Training accuracy: 0.786
Tree depth = 5, test accuracy: 0.734, 95% CI: (0.645, 0.824)
	    Training accuracy: 0.826
Tree depth = 6, test accuracy: 0.746, 95% CI: (0.629, 0.863)
	    Training accuracy: 0.849
Tree depth = 7, test accuracy: 0.738, 95% CI: (0.656, 0.820)
	    Training accuracy: 0.869
Tree depth = 8, test accuracy: 0.728, 95% CI: (0.595, 0.860)
	    Training accuracy: 0.903
Tree depth = 9, test accuracy: 0.710, 95% CI: (0.604, 0.815)
	    Training accuracy: 0.928
Tree depth = 10, test accuracy: 0.716, 95% CI: (0.602, 0.830)
	    Training accuracy: 0.947
Tree depth = 11, test accuracy: 0.727, 95% CI: (0.634, 0.819)
	    Training accuracy: 0.964
Tree depth = 12, test accuracy: 0.706, 95% CI: (0.619, 0.793)
	    Training accuracy: 0.973
Tree depth = 13, test accuracy: 0.707, 95% CI: (0.611, 0.803)
	    Training accuracy: 0

# Random forest models with variations of the number of estimators.

In [75]:
test_means = []
test_stdev = []
training_means = []
estimators = list(range(85, 151, 5))

for estimator in estimators:
    forest_model = RandomForestClassifier(n_estimators = estimator, random_state = 1)
    scores = cross_validate(estimator = forest_model, 
                            X = X, 
                            y = y, 
                            cv = 10, 
                            return_train_score = True)
    print('Estimators = {}, test accuracy: {:.3f}, 95% CI: ({:.3f}, {:.3f})'.format(
        estimator, 
        np.mean(scores['test_score']), 
        np.mean(scores['test_score']) - 2 * np.std(scores['test_score']), 
        np.mean(scores['test_score']) + 2 * np.std(scores['test_score'])))
    print('\t    Training accuracy: {:.3f}'.format(np.mean(scores['train_score'])))
    test_means.append(np.mean(scores['test_score']))
    test_stdev.append(np.std(scores['test_score']))
    training_means.append(np.mean(scores['train_score']))
    

Estimators = 85, test accuracy: 0.764, 95% CI: (0.670, 0.858)
	    Training accuracy: 1.000
Estimators = 90, test accuracy: 0.762, 95% CI: (0.673, 0.850)
	    Training accuracy: 1.000
Estimators = 95, test accuracy: 0.763, 95% CI: (0.661, 0.865)
	    Training accuracy: 1.000
Estimators = 100, test accuracy: 0.763, 95% CI: (0.652, 0.874)
	    Training accuracy: 1.000
Estimators = 105, test accuracy: 0.760, 95% CI: (0.649, 0.872)
	    Training accuracy: 1.000
Estimators = 110, test accuracy: 0.760, 95% CI: (0.658, 0.862)
	    Training accuracy: 1.000
Estimators = 115, test accuracy: 0.758, 95% CI: (0.653, 0.863)
	    Training accuracy: 1.000
Estimators = 120, test accuracy: 0.758, 95% CI: (0.652, 0.864)
	    Training accuracy: 1.000
Estimators = 125, test accuracy: 0.756, 95% CI: (0.643, 0.870)
	    Training accuracy: 1.000
Estimators = 130, test accuracy: 0.754, 95% CI: (0.648, 0.859)
	    Training accuracy: 1.000
Estimators = 135, test accuracy: 0.760, 95% CI: (0.653, 0.868)
	    Train

# K-Nearest Neighbors with different K's.

In [77]:
test_means = []
test_stdev = []
training_means = []
k_neighbors = list(range(10, 15))

# k=32 for.746, k=12 for .746
for k in k_neighbors:
    knn = KNeighborsClassifier(n_neighbors=k, p=2, metric='euclidean')
    scores = cross_validate(estimator = knn, 
                            X = X, 
                            y = y, 
                            cv = 10, 
                            return_train_score = True)
    print('K_Neighbors = {}, test accuracy: {:.3f}, 95% CI: ({:.3f}, {:.3f})'.format(
        k, 
        np.mean(scores['test_score']), 
        np.mean(scores['test_score']) - 2 * np.std(scores['test_score']), 
        np.mean(scores['test_score']) + 2 * np.std(scores['test_score'])))
    print('\t    Training accuracy: {:.3f}\n'.format(np.mean(scores['train_score'])))
    test_means.append(np.mean(scores['test_score']))
    test_stdev.append(np.std(scores['test_score']))
    training_means.append(np.mean(scores['train_score']))

K_Neighbors = 10, test accuracy: 0.738, 95% CI: (0.645, 0.832)
	    Training accuracy: 0.778

K_Neighbors = 11, test accuracy: 0.737, 95% CI: (0.645, 0.830)
	    Training accuracy: 0.776

K_Neighbors = 12, test accuracy: 0.746, 95% CI: (0.668, 0.824)
	    Training accuracy: 0.779

K_Neighbors = 13, test accuracy: 0.741, 95% CI: (0.647, 0.835)
	    Training accuracy: 0.776

K_Neighbors = 14, test accuracy: 0.742, 95% CI: (0.653, 0.832)
	    Training accuracy: 0.774



# Multi-layer perceptron, variable number of neurons in the hidden layer.

In [78]:
test_means = []
test_stdev = []
training_means = []
hidden_layer_sizes = list(range(4, 11))

# k=32 for.746, k=12 for .746
for nodes in hidden_layer_sizes:
    mlp = MLPClassifier(solver='lbfgs', alpha = 1e-5,
                       hidden_layer_sizes=(nodes,), random_state=1)
    scores = cross_validate(estimator = mlp, 
                            X = X, 
                            y = y, 
                            cv = 10, 
                            return_train_score = True)
    print('Nodes in Hidden Layer = {}, test accuracy: {:.3f}, 95% CI: ({:.3f}, {:.3f})'.format(
        nodes, 
        np.mean(scores['test_score']), 
        np.mean(scores['test_score']) - 2 * np.std(scores['test_score']), 
        np.mean(scores['test_score']) + 2 * np.std(scores['test_score'])))
    print('\t    Training accuracy: {:.3f}\n'.format(np.mean(scores['train_score'])))
    test_means.append(np.mean(scores['test_score']))
    test_stdev.append(np.std(scores['test_score']))
    training_means.append(np.mean(scores['train_score']))

Nodes in Hidden Layer = 4, test accuracy: 0.651, 95% CI: (0.644, 0.658)
	    Training accuracy: 0.651

Nodes in Hidden Layer = 5, test accuracy: 0.744, 95% CI: (0.688, 0.799)
	    Training accuracy: 0.755

Nodes in Hidden Layer = 6, test accuracy: 0.736, 95% CI: (0.655, 0.816)
	    Training accuracy: 0.745

Nodes in Hidden Layer = 7, test accuracy: 0.754, 95% CI: (0.677, 0.831)
	    Training accuracy: 0.751

Nodes in Hidden Layer = 8, test accuracy: 0.714, 95% CI: (0.632, 0.795)
	    Training accuracy: 0.723

Nodes in Hidden Layer = 9, test accuracy: 0.755, 95% CI: (0.678, 0.832)
	    Training accuracy: 0.756

Nodes in Hidden Layer = 10, test accuracy: 0.651, 95% CI: (0.644, 0.658)
	    Training accuracy: 0.651



# Naive Guassian Bayes model

In [80]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train)
scores = cross_validate(estimator = gnb, 
                            X = X, 
                            y = y, 
                            cv = 10, 
                            return_train_score = True)
print('Gaussian Naive Bayes {}, test accuracy: {:.3f}, 95% CI: ({:.3f}, {:.3f})'.format(
    '', 
    np.mean(scores['test_score']), 
    np.mean(scores['test_score']) - 2 * np.std(scores['test_score']), 
    np.mean(scores['test_score']) + 2 * np.std(scores['test_score'])))
print('\t    Training accuracy: {:.3f}\n'.format(np.mean(scores['train_score'])))
test_means.append(np.mean(scores['test_score']))
test_stdev.append(np.std(scores['test_score']))
training_means.append(np.mean(scores['train_score']))

Gaussian Naive Bayes , test accuracy: 0.750, 95% CI: (0.682, 0.818)
	    Training accuracy: 0.752

