In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

## Model Selection

In [2]:
# importing classification models
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# importing method to perform cross validation
from sklearn.model_selection import GridSearchCV #, RandomizedSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [3]:
data = pd.read_csv("diabetes_clean.csv")
# take a 1% sample of the dataset
df_frac = data.sample(frac =.01) 
print(df_frac.shape)
df_frac.head(5)

(2537, 22)


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
78489,1.0,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,0.0,0.0,1.0,0.0,12.0,2.0,2.0
51552,0.0,1.0,0.0,1.0,33.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,4.0,20.0,15.0,0.0,1.0,8.0,4.0,3.0
233076,0.0,0.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,9.0,5.0,6.0
228190,1.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,1.0,1.0,1.0,13.0,5.0,8.0
128639,0.0,1.0,1.0,1.0,25.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,5.0,0.0,0.0,12.0,5.0,5.0


In [11]:
# IVs
x = df_frac.iloc[:,1:21] 
# DVs
y = df_frac.iloc[:,0]
# train/ test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, stratify = y, random_state = 42)

In [12]:
# store the models
model_list = []
#  Logistic Regression model
model_list.append(('LR', LogisticRegression(max_iter = 1000)))
#  KNN model
model_list.append(('KNN', KNeighborsClassifier()))
#  SVM model
model_list.append(('SVM', SVC()))
#  Naive Bayes model
model_list.append(('NaiveBayes', GaussianNB()))
#  Random Forest
model_list.append(('RF', RandomForestClassifier()))

In [14]:
# store results
eval_score = []
model_name = []
# cross validation on each model
for name, model_detail in model_list:
    kfold = KFold(n_splits = 10)
    # training data
    cv_results = cross_val_score(model_detail, x_train, y_train, cv = kfold)
    # append results
    eval_score.append(cv_results)
    # append names
    model_name.append(name)

# results as a dataframe
rs = pd.DataFrame(eval_score, index = ['LR','KNN', 'SVM', 'NB', 'RF'])
print("The 10 cross validation results are: \n")
# transpose
rs = pd.DataFrame(rs.T)
rs

The 10 cross validation results are: 



Unnamed: 0,LR,KNN,SVM,NB,RF
0,0.865169,0.859551,0.882022,0.359551,0.865169
1,0.820225,0.814607,0.820225,0.438202,0.837079
2,0.870787,0.820225,0.853933,0.38764,0.848315
3,0.859551,0.837079,0.865169,0.438202,0.870787
4,0.825843,0.808989,0.837079,0.359551,0.831461
5,0.836158,0.824859,0.830508,0.412429,0.824859
6,0.864407,0.847458,0.853107,0.40678,0.853107
7,0.824859,0.80226,0.824859,0.468927,0.80791
8,0.836158,0.779661,0.830508,0.429379,0.80791
9,0.819209,0.785311,0.80226,0.40678,0.79096


In [15]:
# get the mean result
rs.describe()

Unnamed: 0,LR,KNN,SVM,NB,RF
count,10.0,10.0,10.0,10.0,10.0
mean,0.842236,0.818,0.839967,0.410744,0.833755
std,0.020526,0.02558,0.023567,0.03504,0.026262
min,0.819209,0.779661,0.80226,0.359551,0.79096
25%,0.825105,0.803942,0.826271,0.392425,0.812147
50%,0.836158,0.817416,0.833794,0.409605,0.83427
75%,0.863193,0.834024,0.853726,0.435996,0.851909
max,0.870787,0.859551,0.882022,0.468927,0.870787


## Model

1. Random Forest
2. SVM
3. Logistic Regression

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

### Datasets

In [5]:
df = pd.read_csv('diabetes_important.csv')
df.head(5)

Unnamed: 0,Diabetes_binary,BMI,Age,MentHlth,GenHlth,Education,HighBP
0,0.0,0.002725,9.0,18.0,5.0,4.0,1.0
1,0.0,0.001703,7.0,0.0,3.0,6.0,0.0
2,0.0,0.001908,9.0,30.0,5.0,4.0,1.0
3,0.0,0.00184,11.0,0.0,2.0,3.0,1.0
4,0.0,0.001635,11.0,3.0,2.0,5.0,1.0


In [6]:
def data_split(df):
    iv = df.drop('Diabetes_binary', axis = 1)
    dv = df['Diabetes_binary']
    iv_train, iv_test, dv_train, dv_test = train_test_split(iv, dv, test_size = 0.3, stratify=dv, random_state = 42)
    return iv_train, iv_test, dv_train, dv_test

In [7]:
# train/test datasets

iv_train_1, iv_test_1, dv_train_1, dv_test_1 = data_split(df)

In [8]:
# check 
print(iv_train_1.shape)
iv_train_1.head(5)

(177576, 6)


Unnamed: 0,BMI,Age,MentHlth,GenHlth,Education,HighBP
52059,0.00218,11.0,0.0,2.0,6.0,0.0
68134,0.001635,6.0,0.0,2.0,5.0,0.0
95597,0.00184,10.0,0.0,2.0,6.0,1.0
186284,0.00184,5.0,0.0,2.0,6.0,0.0
110275,0.00218,11.0,0.0,1.0,6.0,1.0


### Random Forest

In [9]:
def random_forest(iv_train, iv_test, dv_train, dv_test, N):
    rf = RandomForestClassifier(n_estimators = N, random_state = 42)
    random_forest_fit=rf.fit(iv_train, dv_train)
    pred = rf.predict(iv_test)
    return random_forest_fit, pred

In [10]:
# fitting random forest model with varying number of trees: 300 to 1000.
rand_for_fit1, rf_pred_1 = random_forest(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 500)

rand_for_fit2, rf_pred_2 = random_forest(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 700)

rand_for_fit3, rf_pred_3 = random_forest(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 1000)

rand_for_fit4, rf_pred_4 = random_forest(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 300)

In [11]:
# defining evaluation function

def eval(dv_test, pred):
    # Precision without averaging
    precision = precision_score(dv_test, pred, average=None)
    # Round the precision values
    precision = [round(p, 4) for p in precision]
    
    # Recall without averaging
    recall = recall_score(dv_test, pred, average=None)
    # Round the recall values
    recall = [round(r, 4) for r in recall]
    
    # Accuracy without averaging
    accuracy = round(accuracy_score(dv_test, pred), 4)
    
    print('Precision Scores for each class:', precision)
    print('Recall Scores for each class:', recall)
    print('Accuracy Score:', accuracy)
    
    # Confusion matrix
    cmtx = pd.DataFrame(
        confusion_matrix(dv_test, pred, labels=[1, 0]), 
        index=['dv_test: positive', 'dv_test: negative'], 
        columns=['pred: positive', 'pred: negative']
    )
    print(cmtx)


In [24]:
eval(dv_test_1,rf_pred_1)
print()
eval(dv_test_1,rf_pred_2)
print()
eval(dv_test_1,rf_pred_3)
print()
eval(dv_test_1,rf_pred_4)

Precision Scores for each class: [0.8649, 0.4372]
Recall Scores for each class: [0.9504, 0.2062]
Accuracy Score: 0.8331
                   pred: positive  pred: negative
dv_test: positive            2473            9520
dv_test: negative            3183           60928

Precision Scores for each class: [0.8648, 0.4371]
Recall Scores for each class: [0.9504, 0.2059]
Accuracy Score: 0.8331
                   pred: positive  pred: negative
dv_test: positive            2469            9524
dv_test: negative            3180           60931

Precision Scores for each class: [0.8646, 0.4361]
Recall Scores for each class: [0.9506, 0.204]
Accuracy Score: 0.833
                   pred: positive  pred: negative
dv_test: positive            2447            9546
dv_test: negative            3164           60947

Precision Scores for each class: [0.8649, 0.4375]
Recall Scores for each class: [0.9503, 0.2065]
Accuracy Score: 0.8331
                   pred: positive  pred: negative
dv_test: positive  

The model performing the best included 300 estimators. The important class of positive diabetes are correctly classified only 20.7% of the time, while negative diabetes prognosis is more commonly predicted correctly. The false positives are less important than false negatives as the treatment should be given to the patients with diabetes as fast as possible. Further the cutoff value of the chosen model will be evaluated.

In [12]:
def random_forest_co(iv_train, iv_test, dv_train, dv_test, N, cutoff):
    rf = RandomForestClassifier(n_estimators = N, random_state = 42)
    random_forest_fit=rf.fit(iv_train, dv_train)
    pred_probab=rf.predict_proba(iv_test)
    pred_probabs = (pred_probab[:, 1] >= cutoff).astype(int)
    return random_forest_fit, pred_probabs

In [13]:
# fitting random forest model with varying cut-off value to improve positive predictions:

rand_for_fit1_cf, rf_pred_cf_1 = random_forest_co(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 300, 0.3)

rand_for_fit2_cf, rf_pred_cf_2 = random_forest_co(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 300, 0.2)

rand_for_fit3_cf, rf_pred_cf_3 = random_forest_co(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 300, 0.1)

rand_for_fit4_cf, rf_pred_cf_4 = random_forest_co(iv_train_1, iv_test_1, dv_train_1, dv_test_1, 300, 0.05)

In [27]:
eval(dv_test_1,rf_pred_cf_1)
print()
eval(dv_test_1,rf_pred_cf_2)
print()
eval(dv_test_1,rf_pred_cf_3)
print()
eval(dv_test_1,rf_pred_cf_4)

Precision Scores for each class: [0.8928, 0.367]
Recall Scores for each class: [0.8543, 0.4516]
Accuracy Score: 0.7908
                   pred: positive  pred: negative
dv_test: positive            5416            6577
dv_test: negative            9343           54768

Precision Scores for each class: [0.9137, 0.3279]
Recall Scores for each class: [0.7645, 0.614]
Accuracy Score: 0.7408
                   pred: positive  pred: negative
dv_test: positive            7364            4629
dv_test: negative           15097           49014

Precision Scores for each class: [0.9358, 0.2717]
Recall Scores for each class: [0.611, 0.7759]
Accuracy Score: 0.637
                   pred: positive  pred: negative
dv_test: positive            9305            2688
dv_test: negative           24941           39170

Precision Scores for each class: [0.9474, 0.2385]
Recall Scores for each class: [0.4897, 0.8546]
Accuracy Score: 0.5472
                   pred: positive  pred: negative
dv_test: positive    

An estimated 44.7% of people globally are unaware of their diabetes. This number could potentially be minimised with models 2 to 4. However, the models do provide a high number of false positives,hurting the accuracy of the overall model. For the important class classification, the model nr. 4 was chosen. The percentage of false positives is very high, however the prediction of diabetes has improved and model can be further improved through addition of more data about diabetes.

### SVM

In [28]:
def support_vector(iv_train, iv_test, dv_train, dv_test,C):
    svm = SVC(kernel = 'rbf', C = C) # test with varying C values.
    svm_fit=svm.fit(iv_train, dv_train)
    pred = svm.predict(iv_test)
    return svm_fit, pred

In [None]:
# fitting support vector machines model with different C parameters, kernel "rfb"
fit_1_svm, svm_pred_1 = support_vector(iv_train_1, iv_test_1, dv_train_1, dv_test_1,10)

In [14]:
fit_2_svm, svm_pred_2 = support_vector(iv_train_1, iv_test_1, dv_train_1, dv_test_1,30)

In [13]:
fit_3_svm, svm_pred_3 = support_vector(iv_train_1, iv_test_1, dv_train_1, dv_test_1,5)

In [None]:
eval(dv_test_1,svm_pred_1)
print()
eval(dv_test_1,svm_pred_2)
print()
eval(dv_test_1,svm_pred_3)

Models performed in a similar fassion, the decision boundry assigns all values to the majority class.

### LR

In [14]:
def lr(iv_train, iv_test, dv_train, dv_test):
    lr = LogisticRegression(solver = 'lbfgs', random_state = 42)
    the_fit= lr.fit(iv_train, dv_train)
    pred = lr.predict(iv_test)
    return the_fit, pred

In [15]:
# fitting logistic regression model
lr_fit_1, lr_pred_1 = lr(iv_train_1, iv_test_1, dv_train_1, dv_test_1)

### Model Evaluation

Metrics used:
1. Precision
2. Recall
3. Confusion matrix
4. Accuracy

In [45]:
print('Random Forest: ')
eval(dv_test_1, rf_pred_cf_4)
print() # line break
print('SVM, C=5: ')
eval(dv_test_1, svm_pred_3)
print() # line break
print('LR with original dataset: ')
eval(dv_test_1, lr_pred_1)
print()

Random Forest: 
Precision Scores for each class: [0.9474, 0.2386]
Recall Scores for each class: [0.4899, 0.8547]
Accuracy Score: 0.5474
                   pred: positive  pred: negative
dv_test: positive           10250            1743
dv_test: negative           32703           31408

SVM, C=5: 
Precision Scores for each class: [0.8424, 0.0]
Recall Scores for each class: [1.0, 0.0]
Accuracy Score: 0.8424
                   pred: positive  pred: negative
dv_test: positive               0           11993
dv_test: negative               0           64111

LR with original dataset: 
Precision Scores for each class: [0.854, 0.467]
Recall Scores for each class: [0.9772, 0.1066]
Accuracy Score: 0.84
                   pred: positive  pred: negative
dv_test: positive            1279           10714
dv_test: negative            1460           62651




- Overall, the accuracy scores of the models have no significant difference before tuning.
- SVM tends to predict all classes to negatives. This indicates that the creation of decision boundry is a complex problem. More datapoints could improve the performance of predicting diabetes.
- It is of utmost importance that diabetes is classified correctly. As recall indicates out of all truly positive outcomes, how many were correctly predicted, this is the decision factor. The tuned random forest model has the best recall with 85% and therefore is chosen.

Testing chosen Random Forest model on new data from 2021 CDC Survey. (Preprocessed in CDC_dataset jupyter notebook)

In [16]:
cdc_2021=pd.read_csv('2021_CDC_test.csv')

In [17]:
# splitting data
iv = cdc_2021.drop('Diabetes_binary', axis = 1)
dv = cdc_2021['Diabetes_binary']

In [18]:
logistic_Reg_test=lr_fit_1.predict(iv)

In [19]:
probs=rand_for_fit4_cf.predict_proba(iv)
random_forest_test=(probs[:, 1] >= 0.05).astype(int)

In [20]:
print()
print("Random Forest Result")
eval(dv, random_forest_test)
print()
print("Logistic Regression Result")
eval(dv, logistic_Reg_test)


Random Forest Result
Precision Scores for each class: [0.918, 0.2733]
Recall Scores for each class: [0.6304, 0.7116]
Accuracy Score: 0.6437
                   pred: positive  pred: negative
dv_test: positive           43941           17809
dv_test: negative          116827          199280

Logistic Regression Result
Precision Scores for each class: [0.8462, 0.4619]
Recall Scores for each class: [0.9799, 0.0882]
Accuracy Score: 0.8342
                   pred: positive  pred: negative
dv_test: positive            5446           56304
dv_test: negative            6344          309763


The Random Forest Model performs best, with a lower accuracy of 64%, but higher recall of 71%.