In [26]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('diabetes.csv')
df.head(5)

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,0,1,26,0,0,0,1,0,...,1,0,3,5,30,0,1,4,6,8
1,0,1,1,1,26,1,1,0,0,1,...,1,0,3,0,0,0,1,12,6,8
2,0,0,0,1,26,0,0,0,1,1,...,1,0,1,0,10,0,1,13,6,8
3,0,1,1,1,28,1,0,0,1,1,...,1,0,3,0,3,0,1,11,6,8
4,0,0,0,1,29,1,0,0,1,1,...,1,0,2,0,0,0,0,8,5,8


In [5]:
# rename
df = df.rename(columns={"Diabetes_binary": "D"})

# transform on another dataframe to compare results
df_fe = df
# Normalize numeric columns
df_fe[['BMI', 'MentHlth', 'PhysHlth']] = StandardScaler().fit_transform(df_fe[['BMI', 'MentHlth', 'PhysHlth']])
df_fe

Unnamed: 0,D,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,0,1,-0.542176,0,0,0,1,0,...,1,0,3,0.153020,2.404008,0,1,4,6,8
1,0,1,1,1,-0.542176,1,1,0,0,1,...,1,0,3,-0.460058,-0.577451,0,1,12,6,8
2,0,0,0,1,-0.542176,0,0,0,1,1,...,1,0,1,-0.460058,0.416369,0,1,13,6,8
3,0,1,1,1,-0.261036,1,0,0,1,1,...,1,0,3,-0.460058,-0.279305,0,1,11,6,8
4,0,0,0,1,-0.120466,1,0,0,1,1,...,1,0,2,-0.460058,-0.577451,0,0,8,5,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,1,0,1,1,1.004092,0,0,0,0,0,...,1,0,4,-0.460058,-0.577451,0,0,6,4,1
70688,1,0,1,1,-0.120466,1,0,1,0,1,...,1,0,2,-0.460058,-0.577451,1,1,10,3,6
70689,1,1,1,1,-0.682745,0,0,1,0,1,...,1,0,5,1.379176,-0.577451,1,0,13,6,4
70690,1,1,1,1,-1.666734,0,0,0,0,0,...,1,0,4,-0.460058,-0.577451,1,0,11,2,4


## Model

1. Random Forest
2. SVM
3. Logistic Regression

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

### Datasets

In [5]:
def data_split(df):
    # rename target column
    df = df.rename(columns={"Diabetes_binary": "D"})
    iv = df.drop('D', axis = 1)
    dv = df['D']
    iv_train, iv_test, dv_train, dv_test = train_test_split(iv, dv, test_size = 0.3, random_state = 42)
    return iv_train, iv_test, dv_train, dv_test

In [14]:
# minimized dataset
df_min = pd.read_csv('diabetes_important.csv')
# currated_dataset
df_curr = pd.read_csv('currated_dataset.csv')

In [15]:
# train test datasets

# original dataset (1)
iv_train_1, iv_test_1, dv_train_1, dv_test_1 = data_split(df)

# minimized dataset (2)
iv_train_2, iv_test_2, dv_train_2, dv_test_2 = data_split(df_min)

# currated dataset (3)
iv_train_3, iv_test_3, dv_train_3, dv_test_3 = data_split(df_curr)

In [18]:
# check 
print(iv_train_2.shape)
iv_train_2.head(5)

(49484, 5)


Unnamed: 0,HighBP,GenHlth,BMI,Age,HighChol
14533,1,3,18,13,1
57060,1,4,34,10,1
26147,1,3,34,10,1
24376,0,2,32,4,0
49271,1,2,37,7,0


### Random Forest

In [24]:
def random_forest(iv_train, iv_test, dv_train, dv_test):
    rf = RandomForestClassifier(n_estimators = 500, random_state = 42)
    rf.fit(iv_train, dv_train)
    pred = rf.predict(iv_test)
    return pred

In [25]:
# original dataset (1)
rf_pred_1 = random_forest(iv_train_1, iv_test_1, dv_train_1, dv_test_1)

# dataset 2
rf_pred_2 = random_forest(iv_train_2, iv_test_2, dv_train_2, dv_test_2)

# dataset 3
rf_pred_3 = random_forest(iv_train_3, iv_test_3, dv_train_3, dv_test_3)

### SVM

In [30]:
def support_vector(iv_train, iv_test, dv_train, dv_test):
    svm = SVC(kernel = 'rbf', C = 100.0) # increase C because there are outliers
    svm.fit(iv_train, dv_train)
    pred = svm.predict(iv_test)
    return pred

In [31]:
# dataset 1
svm_pred_1 = support_vector(iv_train_1, iv_test_1, dv_train_1, dv_test_1)

# dataset 2
svm_pred_2 = support_vector(iv_train_2, iv_test_2, dv_train_2, dv_test_2)

# dataset 3
svm_pred_3 = support_vector(iv_train_3, iv_test_3, dv_train_3, dv_test_3)

### LR

In [32]:
def lr(iv_train, iv_test, dv_train, dv_test):
    lr = LogisticRegression(solver = 'lbfgs', max_iter = 1000, random_state = 42)
    lr.fit(iv_train, dv_train)
    pred = lr.predict(iv_test)
    return pred

In [33]:
# dataset 1
lr_pred_1 = lr(iv_train_1, iv_test_1, dv_train_1, dv_test_1)

# dataset 2
lr_pred_2 = lr(iv_train_2, iv_test_2, dv_train_2, dv_test_2)

# dataset 3
lr_pred_3 = lr(iv_train_3, iv_test_3, dv_train_3, dv_test_3)

### Model Evaluation

Metrics used:
1. Precision
2. Recall
3. Confusion matrix
4. Accuracy

In [65]:
def eval(dv_test, pred):
    precision = round(precision_score(dv_test, pred, average = 'weighted'), 4)
    recall = round(recall_score(dv_test, pred, average = 'weighted'), 4)
    accuracy = round(accuracy_score(dv_test, pred), 4)
    print('Precision Score: ', precision, '\nRecall Score: ', recall, '\nAccuracy Score:', accuracy)
    cmtx = pd.DataFrame(
        confusion_matrix(dv_test, pred, labels = [1,0]), 
        index = ['dv_test: positive', 'dv_test: negative'], 
        columns = ['pred: positive', 'pred: negative']
    )
    print(cmtx)

In [66]:
# dataset 1
print('Random Forest with original dataset: ')
eval(dv_test_1, rf_pred_1)
print() # line break
print('SVM with original dataset: ')
eval(dv_test_1, svm_pred_1)
print() # line break
print('LR with original dataset: ')
eval(dv_test_1, lr_pred_1)
print()

Random Forest with original dataset: 
Precision Score:  0.7419 
Recall Score:  0.7402 
Accuracy Score: 0.7402
                   pred: positive  pred: negative
dv_test: positive            8296            2311
dv_test: negative            3198            7403

SVM with original dataset: 
Precision Score:  0.7608 
Recall Score:  0.7555 
Accuracy Score: 0.7555
                   pred: positive  pred: negative
dv_test: positive            8772            1835
dv_test: negative            3350            7251

LR with original dataset: 
Precision Score:  0.7492 
Recall Score:  0.7487 
Accuracy Score: 0.7487
                   pred: positive  pred: negative
dv_test: positive            8184            2423
dv_test: negative            2906            7695



In [67]:
# dataset 2
print('Random Forest with minimized dataset: ')
eval(dv_test_2, rf_pred_2)
print()
print('SVM with minimized dataset: ')
eval(dv_test_2, svm_pred_2)
print()
print('LR with minimized dataset: ')
eval(dv_test_2, lr_pred_2)
print()

Random Forest with minimized dataset: 
Precision Score:  0.7276 
Recall Score:  0.726 
Accuracy Score: 0.726
                   pred: positive  pred: negative
dv_test: positive            8142            2465
dv_test: negative            3345            7256

SVM with minimized dataset: 
Precision Score:  0.7547 
Recall Score:  0.7468 
Accuracy Score: 0.7468
                   pred: positive  pred: negative
dv_test: positive            8852            1755
dv_test: negative            3614            6987

LR with minimized dataset: 
Precision Score:  0.7424 
Recall Score:  0.7417 
Accuracy Score: 0.7417
                   pred: positive  pred: negative
dv_test: positive            8146            2461
dv_test: negative            3017            7584



In [68]:
# dataset 3
print('Random Forest with currated dataset: ')
eval(dv_test_3, rf_pred_3)
print()
print('SVM with currated dataset: ')
eval(dv_test_3, svm_pred_3)
print()
print('LR with currated dataset: ')
eval(dv_test_3, lr_pred_3)

Random Forest with currated dataset: 
Precision Score:  0.7143 
Recall Score:  0.7137 
Accuracy Score: 0.7137
                   pred: positive  pred: negative
dv_test: positive            7849            2758
dv_test: negative            3314            7287

SVM with currated dataset: 
Precision Score:  0.7545 
Recall Score:  0.7474 
Accuracy Score: 0.7474
                   pred: positive  pred: negative
dv_test: positive            8817            1790
dv_test: negative            3567            7034

LR with currated dataset: 
Precision Score:  0.742 
Recall Score:  0.7415 
Accuracy Score: 0.7415
                   pred: positive  pred: negative
dv_test: positive            8123            2484
dv_test: negative            2999            7602


- Overall, the accuracy scores of the models have no significant difference.
- As the model will be used to dianogse diabetes, it is better to have a high precision score. Of all the models tested above, SVM consistenly returns the highest precision score. Specifically, SVM with the original dataset has the highest precision score of 0.7608.