# Example Model - Simple Neural Network - MLPClassifier
To get the ball rolling for others, I'll show a simple neural network model built on the balanced 50-50 dataset.
* MLPClassifier Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html

## 1. Get the data

In [3]:
#load in packages
import os
import time
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import confusion_matrix, make_scorer

#example models
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import EasyEnsembleClassifier


#seed passed to train_test_split and to the MLPClassifier models below so runs are repeatable
random_state=7

In [7]:
#Load the balanced 50-50 dataset.
#The target column Diabetes_binary is 0 for no diabetes and 1 for prediabetes or diabetes,
#with an equal number of rows in each of the two classes.
csv_path = r'/home/exam1/Desktop/1/diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
df = pd.read_csv(csv_path)

In [3]:
#let pandas display up to 500 columns so no column is hidden in the preview
pd.options.display.max_columns = 500
#show the first rows of dataframe df
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [8]:
#(number of rows, number of columns) of df
df.shape

(70692, 22)

## 2. Model building with no feature selection

In [9]:
#feature variables: every column of df except the target Diabetes_binary
X = df.drop(columns=['Diabetes_binary'])

#target variable: the Diabetes_binary column
y = df['Diabetes_binary']

* I'll show the cross_validate method on X and y, but you can use the usual train_test_split() below as well to fit your models.

In [10]:
#hold out 20% of the rows as the test set; the seed keeps the split repeatable
split = train_test_split(X, y, test_size=0.2, random_state=random_state)
x_train, x_test, y_train, y_test = split
#report the shape of each of the four pieces, in the order they were returned
shapes = [part.shape for part in split]
print('Dimensions: \n x_train:{} \n x_test{} \n y_train{} \n y_test{}'.format(*shapes))

Dimensions: 
 x_train:(56553, 21) 
 x_test(14139, 21) 
 y_train(56553,) 
 y_test(14139,)


In [11]:
#True negative, false positive, false negative and true positive counts for a binary 0/1 target.
#Each function takes the true labels and the predicted labels and returns one cell of the
#confusion matrix (rows = true class, columns = predicted class).
#labels=[0, 1] pins the matrix to 2x2; without it the matrix shrinks to 1x1 when only one
#class is present in y_true and y_pred, and the indexing below would raise an IndexError.
def tn(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 0]
def fp(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[0, 1]
def fn(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 0]
def tp(y_true, y_pred): return confusion_matrix(y_true, y_pred, labels=[0, 1])[1, 1]

In [12]:
#Scorers handed to cross_validate.
#First the metrics that scikit-learn knows by name ('recall' is the same thing as sensitivity) ...
scorers = {
    'Accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'Sensitivity': 'recall',
    'precision': 'precision',
}
#... then the raw confusion-matrix counts, each wrapped with make_scorer so it can be used as a scorer
scorers.update(
    tp=make_scorer(tp),
    tn=make_scorer(tn),
    fp=make_scorer(fp),
    fn=make_scorer(fn),
)

### 2.1 Cross validation Version doesn't need the train-test split you commonly use

In [13]:
#change this name here to change the print name
classifier_name = 'Simple Neural Network: MLPClassifier'

start_ts = time.time()
#try swapping out the classifier for a different one or changing the parameters
clf = MLPClassifier(activation='logistic', solver='adam', alpha=0.0001, max_iter=1000, hidden_layer_sizes=(10,), random_state=random_state)
#5-fold cross validation on the full X and y; each scorer in `scorers` is read back below as scores['test_<name>']
scores = cross_validate(clf, X, y, scoring=scorers, cv=5)

#confusion-matrix counts averaged over the folds
mean_tp = scores['test_tp'].mean()
mean_tn = scores['test_tn'].mean()
mean_fp = scores['test_fp'].mean()
mean_fn = scores['test_fn'].mean()

#Percentages with one decimal. Scale to percent BEFORE rounding: round(x, 3)*100 leaves
#floating point noise in the printout (e.g. 79.60000000000001 instead of 79.6).
Sensitivity = round(100 * mean_tp / (mean_tp + mean_fn), 1)   #TP/(TP+FN) also recall
Specificity = round(100 * mean_tn / (mean_tn + mean_fp), 1)   #TN/(TN+FP)
PPV = round(100 * mean_tp / (mean_tp + mean_fp), 1)           #TP/(TP+FP) also precision
NPV = round(100 * mean_tn / (mean_tn + mean_fn), 1)           #TN/(TN+FN)

#Mean and 2x standard deviation across the folds.
#The numbers are formatted inside the f-string (not with a trailing % operator) so that a '%'
#character in classifier_name cannot break the formatting.
#The roc_auc, recall and precision scorers used here only work with binary classes, not multiclass.
scores_Acc = scores['test_Accuracy']
print(f"{classifier_name} Acc: {scores_Acc.mean():.2f} (+/- {scores_Acc.std() * 2:.2f})")
scores_AUC = scores['test_roc_auc']
print(f"{classifier_name} AUC: {scores_AUC.mean():.2f} (+/- {scores_AUC.std() * 2:.2f})")
scores_sensitivity = scores['test_Sensitivity']
print(f"{classifier_name} Recall: {scores_sensitivity.mean():.2f} (+/- {scores_sensitivity.std() * 2:.2f})")
scores_precision = scores['test_precision']
print(f"{classifier_name} Precision: {scores_precision.mean():.2f} (+/- {scores_precision.std() * 2:.2f})")
print(f"{classifier_name} Sensitivity = ", Sensitivity, "%")
print(f"{classifier_name} Specificity = ", Specificity, "%")
print(f"{classifier_name} PPV = ", PPV, "%")
print(f"{classifier_name} NPV = ", NPV, "%")

#wall-clock seconds for building, cross validating and reporting the model
print("Runtime:", time.time()-start_ts)

Simple Neural Network: MLPClassifier Acc: 0.75 (+/- 0.01)
Simple Neural Network: MLPClassifier AUC: 0.83 (+/- 0.01)
Simple Neural Network: MLPClassifier Recall: 0.80 (+/- 0.04)
Simple Neural Network: MLPClassifier Precision: 0.73 (+/- 0.01)
Simple Neural Network: MLPClassifier Sensitivity =  79.60000000000001 %
Simple Neural Network: MLPClassifier Specificity =  70.8 %
Simple Neural Network: MLPClassifier PPV =  73.2 %
Simple Neural Network: MLPClassifier NPV =  77.60000000000001 %
Runtime: 69.06988883018494


So 75% accuracy is not terrible. It's a good start. What do the other metrics tell us?
* **AUC is Area Under the Curve**
 * The Area Under the Curve (AUC) is the measure of the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve. The higher the AUC, the better the performance of the model at distinguishing between the positive and negative classes.
 * 0.83 is a reasonably good score here. It means that if we pick one positive and one negative individual at random, the model gives the positive individual the higher score about 83% of the time.
* **Sensitivity/Recall**
 * Sensitivity refers to a test's ability to designate an individual with disease as positive. Sensitivity is the ability of a test to correctly classify an individual as ′diseased′.
 * Recall is another word often used to refer to this. Used often in context of precision/recall.
 * 0.80 (+/- 0.04) is decent. So about 4/5 of the time the model correctly classifies an individual who has the disease as diseased. Not good for a clinical setting but good considering the features are all non-invasive measurements.
* **Specificity**
 * Specificity is a test or model's ability to designate an individual who does not have a disease as negative. 
 * So our model at 70.8% is actually a little worse at designating individuals who don't have the disease as negative than it is at correctly flagging those who do have the disease. Not a bad trade-off for public health awareness with an online tool that could drive individuals to see doctors.
* **Precision/PPV**
 * The positive predictive value (PPV) tells you how likely it is for someone who tests positive (screen positive) to actually have the disease (true positive). In other words PPV tells you how sure you can be, when you have a positive result, that the person actually has the disease. Thus you know how good the test is at discriminating persons with disease from those without disease.
 * 73% isn't great, but it's better than randomly guessing. About 3/4 of the people the model predicts as having diabetes actually have it. Keep in mind that PPV depends on how common the disease is, so this figure applies to the balanced 50-50 dataset rather than to the general population. Not clinically good but may raise awareness. **Could also be that there are prediabetics that have not received a diagnosis yet in the dataset!**
* **NPV**
 * Negative Predictive Value (NPV) represents the probability that a person does not have a disease or condition, given a negative test result. So, NPV is the proportion of individuals with negative test results who are correctly identified or diagnosed.
 * 77.6% is decent considering there may be prediabetics in the group and **part of the benefit of such a model is to alert people to their diabetes risk when they have not yet been diagnosed as diabetic or prediabetic!**


## 3. Model building with feature selection

Let's rebuild the same model with 8 features research indicates are highly correlated with diabetes risk. This is manual feature selection. It was not done automatically using any sort of programmatic method:
* Selected: 'HighBP', 'BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income'

In [14]:
#target variable is still Diabetes_binary
y_feat = df['Diabetes_binary']

#feature variables: only the 8 manually selected columns (the target is not among them)
X_feat = df.loc[:, ['HighBP', 'BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']]

In [15]:
#Show the first rows of X_feat to check that only the selected feature columns remain
X_feat.head()

Unnamed: 0,HighBP,BMI,GenHlth,MentHlth,PhysHlth,Age,Education,Income
0,1.0,26.0,3.0,5.0,30.0,4.0,6.0,8.0
1,1.0,26.0,3.0,0.0,0.0,12.0,6.0,8.0
2,0.0,26.0,1.0,0.0,10.0,13.0,6.0,8.0
3,1.0,28.0,3.0,0.0,3.0,11.0,6.0,8.0
4,0.0,29.0,2.0,0.0,0.0,8.0,5.0,8.0


In [16]:
#change this name here to change the print name
classifier_name = 'Simple Neural Network: MLPClassifier w/ Feature Selection:'

start_ts = time.time()
#Same classifier settings as before; only the data changed: X -> X_feat and y -> y_feat
clf = MLPClassifier(activation='logistic', solver='adam', alpha=0.0001, max_iter=1000, hidden_layer_sizes=(10,), random_state=random_state)
#5-fold cross validation; each scorer in `scorers` is read back below as scores['test_<name>']
scores = cross_validate(clf, X_feat, y_feat, scoring=scorers, cv=5)

#confusion-matrix counts averaged over the folds
mean_tp = scores['test_tp'].mean()
mean_tn = scores['test_tn'].mean()
mean_fp = scores['test_fp'].mean()
mean_fn = scores['test_fn'].mean()

#Percentages with one decimal. Scale to percent BEFORE rounding: round(x, 3)*100 can leave
#floating point noise in the printout (e.g. 79.60000000000001 instead of 79.6).
Sensitivity = round(100 * mean_tp / (mean_tp + mean_fn), 1)   #TP/(TP+FN) also recall
Specificity = round(100 * mean_tn / (mean_tn + mean_fp), 1)   #TN/(TN+FP)
PPV = round(100 * mean_tp / (mean_tp + mean_fp), 1)           #TP/(TP+FP) also precision
NPV = round(100 * mean_tn / (mean_tn + mean_fn), 1)           #TN/(TN+FN)

#Mean and 2x standard deviation across the folds.
#The numbers are formatted inside the f-string (not with a trailing % operator) so that a '%'
#character in classifier_name cannot break the formatting.
#The roc_auc, recall and precision scorers used here only work with binary classes, not multiclass.
scores_Acc = scores['test_Accuracy']
print(f"{classifier_name} Acc: {scores_Acc.mean():.2f} (+/- {scores_Acc.std() * 2:.2f})")
scores_AUC = scores['test_roc_auc']
print(f"{classifier_name} AUC: {scores_AUC.mean():.2f} (+/- {scores_AUC.std() * 2:.2f})")
scores_sensitivity = scores['test_Sensitivity']
print(f"{classifier_name} Recall: {scores_sensitivity.mean():.2f} (+/- {scores_sensitivity.std() * 2:.2f})")
scores_precision = scores['test_precision']
print(f"{classifier_name} Precision: {scores_precision.mean():.2f} (+/- {scores_precision.std() * 2:.2f})")
print(f"{classifier_name} Sensitivity = ", Sensitivity, "%")
print(f"{classifier_name} Specificity = ", Specificity, "%")
print(f"{classifier_name} PPV = ", PPV, "%")
print(f"{classifier_name} NPV = ", NPV, "%")

#wall-clock seconds for building, cross validating and reporting the model
print("Runtime:", time.time()-start_ts)

Simple Neural Network: MLPClassifier w/ Feature Selection: Acc: 0.74 (+/- 0.01)
Simple Neural Network: MLPClassifier w/ Feature Selection: AUC: 0.82 (+/- 0.01)
Simple Neural Network: MLPClassifier w/ Feature Selection: Recall: 0.78 (+/- 0.06)
Simple Neural Network: MLPClassifier w/ Feature Selection: Precision: 0.73 (+/- 0.02)
Simple Neural Network: MLPClassifier w/ Feature Selection: Sensitivity =  78.0 %
Simple Neural Network: MLPClassifier w/ Feature Selection: Specificity =  70.5 %
Simple Neural Network: MLPClassifier w/ Feature Selection: PPV =  72.5 %
Simple Neural Network: MLPClassifier w/ Feature Selection: NPV =  76.2 %
Runtime: 42.025094747543335


Cool! We got about the same metrics but used fewer features. This is more efficient! Try other feature selection methods and different models to improve the predictive power of this data. 