In [117]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import numpy as np

## Preprocess Data I

In [118]:
df = pd.read_csv("./data/diabetes_binary_health_indicators_BRFSS2015.csv")
df = pd.DataFrame(df)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [119]:
df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [120]:
# converting data to integer
for col in df.columns:
    df[col] = df[col].astype("int")
    
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [121]:
#checking the data type of the columns
display(df.dtypes)

Diabetes_binary         int32
HighBP                  int32
HighChol                int32
CholCheck               int32
BMI                     int32
Smoker                  int32
Stroke                  int32
HeartDiseaseorAttack    int32
PhysActivity            int32
Fruits                  int32
Veggies                 int32
HvyAlcoholConsump       int32
AnyHealthcare           int32
NoDocbcCost             int32
GenHlth                 int32
MentHlth                int32
PhysHlth                int32
DiffWalk                int32
Sex                     int32
Age                     int32
Education               int32
Income                  int32
dtype: object

In [122]:
#checking balance of diabetic vs not diabetic 
display(df["Diabetes_binary"].value_counts())

#significant imbalance

Diabetes_binary
0    218334
1     35346
Name: count, dtype: int64

In [123]:
#splitting the data for later use

X = df.copy()
X = X.drop(columns="Diabetes_binary")
y = df["Diabetes_binary"]


## VIF Evaluation
### - Evaluation of Initial Features

In [124]:
# Using Variance Inflation Factor to assess usefulness of each column of the original df

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF dataframe for original df
vif_df = pd.DataFrame()
vif_df["Features"] = X.columns 
  
# calculating VIF for each feature 
vif_df["Calculated VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))] 
  
print(vif_df)

# Following columns appear to distort the data: CholCheck, BMI, Veggies, AnyHealthcare, GenHlth, Age, Education, Income


                Features  Calculated VIF
0                 HighBP        2.299761
1               HighChol        2.029660
2              CholCheck       23.187436
3                    BMI       18.149913
4                 Smoker        1.933042
5                 Stroke        1.126777
6   HeartDiseaseorAttack        1.289820
7           PhysActivity        4.645314
8                 Fruits        3.032775
9                Veggies        5.826886
10     HvyAlcoholConsump        1.083523
11         AnyHealthcare       20.839710
12           NoDocbcCost        1.215834
13               GenHlth       10.740162
14              MentHlth        1.463103
15              PhysHlth        1.999556
16              DiffWalk        1.838551
17                   Sex        1.910708
18                   Age        9.886830
19             Education       29.507416
20                Income       14.156118


In [125]:
#created df where less useful columns are dropped

df_drop_cols = df.drop(columns=[ "CholCheck", "BMI", "Veggies", "AnyHealthcare", "GenHlth", "Age", "Education", "Income"])
df_drop_cols.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Diabetes_binary       253680 non-null  int32
 1   HighBP                253680 non-null  int32
 2   HighChol              253680 non-null  int32
 3   Smoker                253680 non-null  int32
 4   Stroke                253680 non-null  int32
 5   HeartDiseaseorAttack  253680 non-null  int32
 6   PhysActivity          253680 non-null  int32
 7   Fruits                253680 non-null  int32
 8   HvyAlcoholConsump     253680 non-null  int32
 9   NoDocbcCost           253680 non-null  int32
 10  MentHlth              253680 non-null  int32
 11  PhysHlth              253680 non-null  int32
 12  DiffWalk              253680 non-null  int32
 13  Sex                   253680 non-null  int32
dtypes: int32(14)
memory usage: 13.5 MB


## Creating Test Data
### - Test Train for Initial DF and Updated DF

In [126]:
# Creating test and train data for initial df
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 10)
X_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
147485,1,0,1,30,0,0,0,0,0,1,...,1,0,3,5,0,0,0,7,5,5
155828,1,1,1,29,0,0,0,1,0,0,...,1,0,2,0,0,0,0,7,5,8
167688,0,0,1,23,0,0,0,1,0,1,...,1,0,2,0,25,0,0,9,6,8
43443,0,0,1,23,0,0,0,1,1,1,...,1,0,1,0,0,0,0,7,6,8
107094,1,1,1,34,0,0,0,1,0,0,...,1,0,3,0,0,0,0,8,4,6


In [127]:
# Updated X to contain only useful features
X = df_drop_cols.copy()

In [128]:
# Creating test and train data for updated df
X_train2, X_test2, y_train2, y_test2 = train_test_split(X,y, random_state = 10)
X_train2.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,HvyAlcoholConsump,NoDocbcCost,MentHlth,PhysHlth,DiffWalk,Sex
147485,0,1,0,0,0,0,0,0,0,0,5,0,0,0
155828,0,1,1,0,0,0,1,0,0,0,0,0,0,0
167688,0,0,0,0,0,0,1,0,0,0,0,25,0,0
43443,0,0,0,0,0,0,1,1,0,0,0,0,0,0
107094,0,1,1,0,0,0,1,0,0,0,0,0,0,0


### - Scaling Training Data

In [129]:
#scaling original features
scaler = StandardScaler().fit(X_train)
X_train_scaled_orig_array = scaler.transform(X_train)
X_test_scaled_orig_array = scaler.transform(X_test)

#scaling updated features (less useful columns removed)
scaler = StandardScaler().fit(X_train2)
X_train2_scaled_upd_array = scaler.transform(X_train2)

#updating X train, X test and X train2(less features) from array to df for both

X_train_scaled_orig = pd.DataFrame(X_train_scaled_orig_array, columns = X_train.columns)
display(X_train_scaled_orig.head())

X_test_scaled_upd = pd.DataFrame(X_test_scaled_orig_array, columns = X_train.columns)
display(X_test_scaled_upd.head())

X_train2_scaled_upd = pd.DataFrame(X_train2_scaled_upd_array, columns = X_train2.columns)
display(X_train2_scaled_upd.head())


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.154676,-0.858429,0.196424,0.245045,-0.890018,-0.205234,-0.321197,-1.764702,-1.315001,0.481339,...,0.226199,-0.301905,0.45799,0.244789,-0.48613,-0.449885,-0.887839,-0.336148,-0.052925,-0.510599
1,1.154676,1.164919,0.196424,0.093554,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-2.077537,...,0.226199,-0.301905,-0.478879,-0.429758,-0.48613,-0.449885,-0.887839,-0.336148,-0.052925,0.938109
2,-0.866044,-0.858429,0.196424,-0.815396,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,0.481339,...,0.226199,-0.301905,-0.478879,-0.429758,2.379837,-0.449885,-0.887839,0.318691,0.962658,0.938109
3,-0.866044,-0.858429,0.196424,-0.815396,-0.890018,-0.205234,-0.321197,0.566668,0.760455,0.481339,...,0.226199,-0.301905,-1.415747,-0.429758,-0.48613,-0.449885,-0.887839,-0.336148,0.962658,0.938109
4,1.154676,1.164919,0.196424,0.851011,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-2.077537,...,0.226199,-0.301905,0.45799,-0.429758,-0.48613,-0.449885,-0.887839,-0.008728,-1.068507,-0.027696


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.154676,1.164919,0.196424,-0.360921,-0.890018,4.872497,3.113353,-1.764702,-1.315001,0.481339,...,0.226199,-0.301905,2.331728,-0.429758,2.953031,-0.449885,1.12633,1.30095,-1.068507,-0.027696
1,1.154676,1.164919,0.196424,-0.512413,1.123572,4.872497,-0.321197,0.566668,-1.315001,0.481339,...,0.226199,-0.301905,0.45799,2.94298,-0.48613,-0.449885,-0.887839,0.646111,0.962658,0.938109
2,-0.866044,-0.858429,0.196424,0.548028,-0.890018,-0.205234,-0.321197,0.566668,0.760455,0.481339,...,0.226199,-0.301905,-1.415747,-0.429758,-0.48613,-0.449885,-0.887839,0.646111,0.962658,0.938109
3,-0.866044,-0.858429,0.196424,-1.118379,-0.890018,-0.205234,-0.321197,0.566668,0.760455,0.481339,...,0.226199,-0.301905,-1.415747,0.379699,-0.48613,-0.449885,1.12633,0.973531,0.962658,-0.510599
4,-0.866044,-0.858429,0.196424,0.548028,1.123572,-0.205234,-0.321197,0.566668,0.760455,-2.077537,...,0.226199,-0.301905,-1.415747,-0.429758,0.087064,-0.449885,1.12633,0.646111,-0.052925,-0.027696


Unnamed: 0,Diabetes_binary,HighBP,HighChol,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,HvyAlcoholConsump,NoDocbcCost,MentHlth,PhysHlth,DiffWalk,Sex
0,-0.400753,1.154676,-0.858429,-0.890018,-0.205234,-0.321197,-1.764702,-1.315001,-0.244389,-0.301905,0.244789,-0.48613,-0.449885,-0.887839
1,-0.400753,1.154676,1.164919,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-0.244389,-0.301905,-0.429758,-0.48613,-0.449885,-0.887839
2,-0.400753,-0.866044,-0.858429,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-0.244389,-0.301905,-0.429758,2.379837,-0.449885,-0.887839
3,-0.400753,-0.866044,-0.858429,-0.890018,-0.205234,-0.321197,0.566668,0.760455,-0.244389,-0.301905,-0.429758,-0.48613,-0.449885,-0.887839
4,-0.400753,1.154676,1.164919,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-0.244389,-0.301905,-0.429758,-0.48613,-0.449885,-0.887839


### - VIF for Scaled Features (for Training Data)

In [130]:
#VIF for Scaled data
# calculating VIF for scaled trained orig features 
vif_df["Calculated Scaled VIF"] = [variance_inflation_factor(X_train_scaled_orig.values, i) for i in range(len(X_train_scaled_orig.columns))]

# calculating VIF for scaled trained updated features
vif_scaled_df = pd.DataFrame()
vif_scaled_df["Scaled Trained Features"] = X_train2_scaled_upd.columns
vif_scaled_df["Scaled Trained VIF"] = [variance_inflation_factor(X_train2_scaled_upd.values, i) for i in range(len(X_train2_scaled_upd.columns))] 
  
print(vif_df, end="\n\n\n")
print(vif_scaled_df)

                Features  Calculated VIF  Calculated Scaled VIF
0                 HighBP        2.299761               1.330071
1               HighChol        2.029660               1.172964
2              CholCheck       23.187436               1.032577
3                    BMI       18.149913               1.141016
4                 Smoker        1.933042               1.093238
5                 Stroke        1.126777               1.082002
6   HeartDiseaseorAttack        1.289820               1.170161
7           PhysActivity        4.645314               1.155657
8                 Fruits        3.032775               1.111643
9                Veggies        5.826886               1.112206
10     HvyAlcoholConsump        1.083523               1.024527
11         AnyHealthcare       20.839710               1.113788
12           NoDocbcCost        1.215834               1.143751
13               GenHlth       10.740162               1.792179
14              MentHlth        1.463103

## Initial Modeling - Logistic Regression

###  - Logistic Regression on Raw Data & Score (Baseline Model)

In [131]:
#using logistic regression model first as a baseline
# original raw data

classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train, y_train)

# score for training data (evaluates how well the model performs on the training data)
print(f"Training Data Accuracy Score: {classifier.score(X_train, y_train)}")

#score for test data (evaluates model's ability to make predictions on unseen data)
print(f"Test Data Accuracy Score: {classifier.score(X_test, y_test)}")

Training Data Accuracy Score: 0.8644591611479029
Test Data Accuracy Score: 0.8611794386628824


In [132]:
#Generating predictions using the test data
y_pred = classifier.predict(X_test)

#Checking balanced accuracy due to the significant imbalance of the data
print(f"Balanced Accuracy Score (score is more useful given data imbalance): {balanced_accuracy_score(y_test, y_pred)}")

#not great, barely better than random

Balanced Accuracy Score (score is more useful given data imbalance): 0.565982136923413


###  - Logistic Regression on Scaled Data & Score (Baseline Model)

In [133]:
#using logistic regression model on scaled X_train_scaled_orig and X_train2_scaled_upd

classifier.fit(X_train_scaled_orig, y_train)
y_pred_scaled = classifier.predict(X_test_scaled_upd)
                                  
# checking balanced accuracy for scaled data
print(f"Balanced Accuracy Score (score is more useful given data imbalance): {balanced_accuracy_score(y_test, y_pred_scaled)}")

#not great, barely better than random

Balanced Accuracy Score (score is more useful given data imbalance): 0.5661114079330035


---

## Preprocess Data II - Undersampling Data

Due to significant imbalance and score, I don't think it is worth attempting RandomForest model unless the data's balance is improved. Next I undersampled the data. I selected undersampling rather than oversampling due to the larger balance (163932) being over 6x larger than the smaller balance (26328). The smaller balance is plenty and may predict better than attempting to predict using a significant amount of synthetic data compared to the actual data available.

### - RandomUnderSampler Technique

In [134]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=63)

# Fitting the training data
X_rus_resampled, y_rus_resampled = rus.fit_resample(X_train, y_train)

# Fitting the scaled training data
X_rus_resampled_scaled, y_rus_resampled_scaled = rus.fit_resample(X_train_scaled_orig, y_train)

display(y_rus_resampled.value_counts())
display(y_train.value_counts())

Diabetes_binary
0    26328
1    26328
Name: count, dtype: int64

Diabetes_binary
0    163932
1     26328
Name: count, dtype: int64

### - EditedNearestNeighbors Technique

In [135]:
from imblearn.under_sampling import EditedNearestNeighbours

# Instantiate instance
enn = EditedNearestNeighbours(n_neighbors = 6, sampling_strategy = "auto")

# Fitting the training data
X_enn_resampled, y_enn_resampled = enn.fit_resample(X_train, y_train)

# # Fitting the scaled training data
# X_enn_resampled_scaled, y_enn_resampled_scaled = enn.fit_resample(X_train_scaled_orig, y_train)

display(y_enn_resampled.value_counts())
display(y_train.value_counts())

Diabetes_binary
0    98690
1    26328
Name: count, dtype: int64

Diabetes_binary
0    163932
1     26328
Name: count, dtype: int64

### - ClusterCentroids Technique (LONG PROCESSING TIME)

In [136]:
from imblearn.under_sampling import ClusterCentroids

# Instantiate instance
cc = ClusterCentroids(random_state=63)

# Fitting the training data
X_cc_resampled, y_cc_resampled = cc.fit_resample(X_train, y_train)

# # Fitting the scaled training data
# X_cc_resampled_scaled, y_cc_resampled_scaled = cc.fit_resample(X_train_scaled_orig, y_train)

display(y_cc_resampled.value_counts())
display(y_train.value_counts())



Diabetes_binary
0    26328
1    26328
Name: count, dtype: int64

Diabetes_binary
0    163932
1     26328
Name: count, dtype: int64

## Modeling - RandomForest

### - RF for original data (imbalanced) + metrics

In [137]:
#randomforest for original data

rf = RandomForestClassifier(n_estimators= 10, min_samples_leaf= 96, max_leaf_nodes= 92, max_depth= 38)
rf.fit(X_train, y_train)
y_predict = rf.predict(X_test)

#get the importance of each feature
imp_features = rf.feature_importances_
for i, f in enumerate(imp_features):
    col_name = X_train.columns
    feature_name = col_name[i]
    print(f"Feature: [{i}] {feature_name}, Score: {f:.3f}")

Feature: [0] HighBP, Score: 0.324
Feature: [1] HighChol, Score: 0.106
Feature: [2] CholCheck, Score: 0.001
Feature: [3] BMI, Score: 0.133
Feature: [4] Smoker, Score: 0.001
Feature: [5] Stroke, Score: 0.006
Feature: [6] HeartDiseaseorAttack, Score: 0.057
Feature: [7] PhysActivity, Score: 0.004
Feature: [8] Fruits, Score: 0.001
Feature: [9] Veggies, Score: 0.001
Feature: [10] HvyAlcoholConsump, Score: 0.005
Feature: [11] AnyHealthcare, Score: 0.000
Feature: [12] NoDocbcCost, Score: 0.000
Feature: [13] GenHlth, Score: 0.228
Feature: [14] MentHlth, Score: 0.003
Feature: [15] PhysHlth, Score: 0.015
Feature: [16] DiffWalk, Score: 0.037
Feature: [17] Sex, Score: 0.003
Feature: [18] Age, Score: 0.043
Feature: [19] Education, Score: 0.011
Feature: [20] Income, Score: 0.023


In [138]:
print(f"Classification Report for Original Data")
print(classification_report(y_test, y_predict))

print(f"Confusion Matrix")
print(f"{confusion_matrix(y_test, y_predict)}\n")
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_predict)}\n")

print(f"Predict Values with Probability")
pred_probas = rf.predict_proba(X_test)
print(pred_probas)

print(f"\n Predicted Class 1 Values with Probability")
pred_probas_class1 = [prob[1] for prob in pred_probas]
print(pred_probas_class1[0:5])

print(f"\n roc_auc score: {roc_auc_score(y_test, pred_probas_class1)}")

Classification Report for Original Data
              precision    recall  f1-score   support

           0       0.87      0.99      0.93     54402
           1       0.64      0.07      0.12      9018

    accuracy                           0.86     63420
   macro avg       0.75      0.53      0.52     63420
weighted avg       0.83      0.86      0.81     63420

Confusion Matrix
[[54069   333]
 [ 8431   587]]

Balanced Accuracy Score: 0.5294854698284521

Predict Values with Probability
[[0.68886961 0.31113039]
 [0.80749989 0.19250011]
 [0.93525929 0.06474071]
 ...
 [0.64962479 0.35037521]
 [0.90173929 0.09826071]
 [0.88957256 0.11042744]]

 Predicted Class 1 Values with Probability
[0.31113039473573356, 0.19250010917402088, 0.06474070915847982, 0.037447459792887675, 0.09160199930926288]

 roc_auc score: 0.8164242286925563


### - RF + feature importance for randomundersampler dataset

In [139]:
#randomforest for data that has been undersampled using RandomUnderSampler

rf = RandomForestClassifier(n_estimators= 10, min_samples_leaf= 96, max_leaf_nodes= 92, max_depth= 38)
rf.fit(X_rus_resampled, y_rus_resampled)
y_predict = rf.predict(X_test)

#get the importance of each feature
imp_features = rf.feature_importances_
for i, f in enumerate(imp_features):
    col_name = X_train.columns
    feature_name = col_name[i]
    print(f"Feature: [{i}] {feature_name}, Score: {f:.3f}")

Feature: [0] HighBP, Score: 0.194
Feature: [1] HighChol, Score: 0.140
Feature: [2] CholCheck, Score: 0.003
Feature: [3] BMI, Score: 0.130
Feature: [4] Smoker, Score: 0.001
Feature: [5] Stroke, Score: 0.002
Feature: [6] HeartDiseaseorAttack, Score: 0.019
Feature: [7] PhysActivity, Score: 0.004
Feature: [8] Fruits, Score: 0.001
Feature: [9] Veggies, Score: 0.001
Feature: [10] HvyAlcoholConsump, Score: 0.005
Feature: [11] AnyHealthcare, Score: 0.000
Feature: [12] NoDocbcCost, Score: 0.000
Feature: [13] GenHlth, Score: 0.242
Feature: [14] MentHlth, Score: 0.003
Feature: [15] PhysHlth, Score: 0.050
Feature: [16] DiffWalk, Score: 0.075
Feature: [17] Sex, Score: 0.006
Feature: [18] Age, Score: 0.078
Feature: [19] Education, Score: 0.014
Feature: [20] Income, Score: 0.033


In [140]:
print(f"Classification Report for Original Data")
print(classification_report(y_test, y_predict))

print(f"Confusion Matrix")
print(f"{confusion_matrix(y_test, y_predict)}\n")
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_predict)}")
print(f"Score: {balanced_accuracy_score(y_test, y_predict)}")

print(f"Predict Values with Probability")
pred_probas = rf.predict_proba(X_test)
print(pred_probas)

print(f"\n Predicted Class 1 Values with Probability")
pred_probas_class1 = [prob[1] for prob in pred_probas]
print(pred_probas_class1[0:5])

print(f"\n roc_auc score: {roc_auc_score(y_test, pred_probas_class1)}")

Classification Report for Original Data
              precision    recall  f1-score   support

           0       0.95      0.71      0.81     54402
           1       0.31      0.78      0.44      9018

    accuracy                           0.72     63420
   macro avg       0.63      0.74      0.63     63420
weighted avg       0.86      0.72      0.76     63420

Confusion Matrix
[[38525 15877]
 [ 2025  6993]]

Balanced Accuracy Score: 0.7418016068887922
Score: 0.7418016068887922
Predict Values with Probability
[[0.225215   0.774785  ]
 [0.51760147 0.48239853]
 [0.76591177 0.23408823]
 ...
 [0.248656   0.751344  ]
 [0.61685739 0.38314261]
 [0.55253832 0.44746168]]

 Predicted Class 1 Values with Probability
[0.7747849996544309, 0.4823985307990982, 0.23408823477653043, 0.16791635451646303, 0.3760236768318121]

 roc_auc score: 0.8186573405399291


### - RF + feature importance for enn dataset (improved data + somewhat undersampled)

In [141]:
#randomforest for data that has been undersampled using editednearestneighbor

#original: rf = RandomForestClassifier(random_state = 32, n_estimators = 100)
#updated based on best_params that i used in a cell down below
rf = RandomForestClassifier(n_estimators= 10, min_samples_leaf= 96, max_leaf_nodes= 92, max_depth= 38)
rf.fit(X_enn_resampled, y_enn_resampled)
y_predict = rf.predict(X_test)

#get the importance of each feature
imp_features = rf.feature_importances_
for i, f in enumerate(imp_features):
    col_name = X_train.columns
    feature_name = col_name[i]
    print(f"Feature: [{i}] {feature_name}, Score: {f:.3f}")

Feature: [0] HighBP, Score: 0.181
Feature: [1] HighChol, Score: 0.117
Feature: [2] CholCheck, Score: 0.001
Feature: [3] BMI, Score: 0.206
Feature: [4] Smoker, Score: 0.000
Feature: [5] Stroke, Score: 0.001
Feature: [6] HeartDiseaseorAttack, Score: 0.024
Feature: [7] PhysActivity, Score: 0.003
Feature: [8] Fruits, Score: 0.000
Feature: [9] Veggies, Score: 0.001
Feature: [10] HvyAlcoholConsump, Score: 0.003
Feature: [11] AnyHealthcare, Score: 0.000
Feature: [12] NoDocbcCost, Score: 0.000
Feature: [13] GenHlth, Score: 0.202
Feature: [14] MentHlth, Score: 0.002
Feature: [15] PhysHlth, Score: 0.053
Feature: [16] DiffWalk, Score: 0.083
Feature: [17] Sex, Score: 0.002
Feature: [18] Age, Score: 0.096
Feature: [19] Education, Score: 0.006
Feature: [20] Income, Score: 0.020


In [142]:
print(f"Classification Report for ENN data")
print(classification_report(y_test, y_predict))

print(f"Confusion Matrix")
print(f"{confusion_matrix(y_test, y_predict)}\n")
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_predict)}")

print(f"Predict Values with Probability")
pred_probas = rf.predict_proba(X_test)
print(pred_probas)

print(f"\n Predicted Class 1 Values with Probability")
pred_probas_class1 = [prob[1] for prob in pred_probas]
print(pred_probas_class1[0:5])

print(f"\n roc_auc score: {roc_auc_score(y_test, pred_probas_class1)}")

Classification Report for ENN data
              precision    recall  f1-score   support

           0       0.93      0.83      0.88     54402
           1       0.37      0.60      0.46      9018

    accuracy                           0.80     63420
   macro avg       0.65      0.72      0.67     63420
weighted avg       0.85      0.80      0.82     63420

Confusion Matrix
[[45158  9244]
 [ 3563  5455]]

Balanced Accuracy Score: 0.7174905424864644
Predict Values with Probability
[[0.26589874 0.73410126]
 [0.6561142  0.3438858 ]
 [0.77830204 0.22169796]
 ...
 [0.15774228 0.84225772]
 [0.8797303  0.1202697 ]
 [0.77976576 0.22023424]]

 Predicted Class 1 Values with Probability
[0.7341012556257401, 0.34388579748495807, 0.2216979552900285, 0.06549276941427004, 0.45954847235976254]

 roc_auc score: 0.8168541220643974


### RF + ClusterCentroids undersampled dataset 

In [143]:
#randomforest for data that has been undersampled using ClusterCentroid data

rf = RandomForestClassifier(n_estimators= 10, min_samples_leaf= 96, max_leaf_nodes= 92, max_depth= 38)
rf.fit(X_cc_resampled, y_cc_resampled)
y_predict = rf.predict(X_test)


In [144]:
print(f"Classification Report for ClusterCentroid Undersampled Data")
print(classification_report(y_test, y_predict))

print(f"Confusion Matrix")
print(f"{confusion_matrix(y_test, y_predict)}\n")
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_predict)}")

print(f"Predict Values with Probability")
pred_probas = rf.predict_proba(X_test)
print(pred_probas)

print(f"\n Predicted Class 1 Values with Probability")
pred_probas_class1 = [prob[1] for prob in pred_probas]
print(pred_probas_class1[0:5])

print(f"\n roc_auc score: {roc_auc_score(y_test, pred_probas_class1)}")

Classification Report for ClusterCentroid Undersampled Data
              precision    recall  f1-score   support

           0       0.95      0.40      0.56     54402
           1       0.19      0.88      0.32      9018

    accuracy                           0.47     63420
   macro avg       0.57      0.64      0.44     63420
weighted avg       0.84      0.47      0.53     63420

Confusion Matrix
[[21650 32752]
 [ 1089  7929]]

Balanced Accuracy Score: 0.6386024135692441
Predict Values with Probability
[[0.15689742 0.84310258]
 [0.34288509 0.65711491]
 [0.54304415 0.45695585]
 ...
 [0.06057665 0.93942335]
 [0.62737334 0.37262666]
 [0.16376005 0.83623995]]

 Predicted Class 1 Values with Probability
[0.8431025797218232, 0.6571149054210936, 0.45695584975902265, 0.3492140472235331, 0.5565380661081328]

 roc_auc score: 0.6983311143236853


---
# RF + Randomized Search CV + New Hypertuning and Re-scoring.

   Removing the following as they consistently show they don't add much value for predicting y according to feature impotance_: 
    [2] CholCheck, [5] Stroke,  [10] HvyAlcoholConsump, [11] AnyHealthcare, Score: 0.005, [12] NoDocbcCost, Score: 0.008
   
   Also, I have a significant quantity of false positives and false negatives. I'll try randomizedsearch against randomized forest to see if I can improve the rf predictions. I will use the parameters retroactively in all the rf modeling I used in relevent cells above.

In [145]:
#creating new test train data for data excluding CholCheck, Stroke, HvyAlcoholConsump, AnyHealthcare, NoDocbcCost 
df2 = df.drop(columns=[ "CholCheck", "Stroke", "HvyAlcoholConsump", "AnyHealthcare", "NoDocbcCost"])
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 17 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Diabetes_binary       253680 non-null  int32
 1   HighBP                253680 non-null  int32
 2   HighChol              253680 non-null  int32
 3   BMI                   253680 non-null  int32
 4   Smoker                253680 non-null  int32
 5   HeartDiseaseorAttack  253680 non-null  int32
 6   PhysActivity          253680 non-null  int32
 7   Fruits                253680 non-null  int32
 8   Veggies               253680 non-null  int32
 9   GenHlth               253680 non-null  int32
 10  MentHlth              253680 non-null  int32
 11  PhysHlth              253680 non-null  int32
 12  DiffWalk              253680 non-null  int32
 13  Sex                   253680 non-null  int32
 14  Age                   253680 non-null  int32
 15  Education             253680 non-n

In [146]:
# Creating test and train data for df2
X = df2.copy()
X = X.drop(columns="Diabetes_binary")
y = df2["Diabetes_binary"]

X_train_df2, X_test_df2, y_train_df2, y_test_df2 = train_test_split(X,y, random_state = 10)
X_train_df2.head()

Unnamed: 0,HighBP,HighChol,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
147485,1,0,30,0,0,0,0,1,3,5,0,0,0,7,5,5
155828,1,1,29,0,0,1,0,0,2,0,0,0,0,7,5,8
167688,0,0,23,0,0,1,0,1,2,0,25,0,0,9,6,8
43443,0,0,23,0,0,1,1,1,1,0,0,0,0,7,6,8
107094,1,1,34,0,0,1,0,0,3,0,0,0,0,8,4,6


In [147]:
#creating parameter grid for randomized search estimator use
param_grid = {
    'max_depth': np.arange(2,100,2),
    'n_estimators': [10, 50, 60],
    'min_samples_leaf': np.arange(1, 101),
    'max_leaf_nodes': np.arange(2,100,2),
}
param_grid


{'max_depth': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34,
        36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68,
        70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98]),
 'n_estimators': [10, 50, 60],
 'min_samples_leaf': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100]),
 'max_leaf_nodes': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34,
        36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58

In [148]:
from sklearn.model_selection import RandomizedSearchCV
random_rf = RandomizedSearchCV(rf, param_grid, cv=3, random_state = 10)
random_rf.fit(X_train_df2, y_train_df2)

In [149]:
# List the best parameters for this dataset
best_params = random_rf.best_params_
print(best_params)

{'n_estimators': 10, 'min_samples_leaf': 96, 'max_leaf_nodes': 92, 'max_depth': 38}


In [150]:
y_predict = random_rf.predict(X_test_df2)

In [151]:
print(f"Classification Report for RandomizedSearchCV for Random Forest model Data")
print(classification_report(y_test_df2, y_predict))

print(f"Confusion Matrix")
print(f"{confusion_matrix(y_test_df2, y_predict)}\n")
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_predict)}")

print(f"Predict Values with Probability")
pred_probas = random_rf.predict_proba(X_test_df2)
print(pred_probas)

print(f"\n Predicted Class 1 Values with Probability")
pred_probas_class1 = [prob[1] for prob in pred_probas]
print(pred_probas_class1[0:5])

print(f"\n roc_auc score: {roc_auc_score(y_test_df2, pred_probas_class1)}")

Classification Report for RandomizedSearchCV for Random Forest model Data
              precision    recall  f1-score   support

           0       0.87      0.99      0.93     54402
           1       0.62      0.08      0.14      9018

    accuracy                           0.86     63420
   macro avg       0.74      0.54      0.53     63420
weighted avg       0.83      0.86      0.81     63420

Confusion Matrix
[[53965   437]
 [ 8302   716]]

Balanced Accuracy Score: 0.5356819845597336
Predict Values with Probability
[[0.65208408 0.34791592]
 [0.8146048  0.1853952 ]
 [0.9388287  0.0611713 ]
 ...
 [0.63158504 0.36841496]
 [0.87477269 0.12522731]
 [0.85701604 0.14298396]]

 Predicted Class 1 Values with Probability
[0.34791592363907486, 0.1853951959688368, 0.061171299755639785, 0.033459297300954094, 0.094347692485297]

 roc_auc score: 0.8180181298453137


### RandomForest w EditedNearestNeighbors (Undersampled) data 

In [152]:
# Instantiate ENN instance

enn = EditedNearestNeighbours(sampling_strategy = "all")

# Fitting the training data
X_enn_resampled, y_enn_resampled = enn.fit_resample(X_train, y_train)

display(y_enn_resampled.value_counts())
display(y_train.value_counts())

Diabetes_binary
0    120949
1      1194
Name: count, dtype: int64

Diabetes_binary
0    163932
1     26328
Name: count, dtype: int64

In [153]:
# rf modeling against understampled data using bes_param parameters
rf = RandomForestClassifier(n_estimators= 10, min_samples_leaf= 96, max_leaf_nodes= 92, max_depth= 38)
rf.fit(X_enn_resampled, y_enn_resampled)
y_predict = rf.predict(X_test)

In [155]:
print(f"Classification Report for RandomForest on Undersampled Data")
print(classification_report(y_test, y_predict))

print(f"Confusion Matrix")
print(f"{confusion_matrix(y_test, y_predict)}\n")
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(f"Balanced Accuracy Score: {balanced_accuracy_score(y_test, y_predict)}")

print(f"Predict Values with Probability")
pred_probas = random_rf.predict_proba(X_test_df2)
print(pred_probas)

print(f"\n Predicted Class 1 Values with Probability")
pred_probas_class1 = [prob[1] for prob in pred_probas]
print(pred_probas_class1[0:5])

print(f"\n roc_auc score: {roc_auc_score(y_test, pred_probas_class1)}")

Classification Report for RandomForest on Undersampled Data
              precision    recall  f1-score   support

           0       0.86      1.00      0.92     54402
           1       0.83      0.00      0.00      9018

    accuracy                           0.86     63420
   macro avg       0.85      0.50      0.46     63420
weighted avg       0.85      0.86      0.79     63420

Confusion Matrix
[[54400     2]
 [ 9008    10]]

Balanced Accuracy Score: 0.5005360649850868
Predict Values with Probability
[[0.65208408 0.34791592]
 [0.8146048  0.1853952 ]
 [0.9388287  0.0611713 ]
 ...
 [0.63158504 0.36841496]
 [0.87477269 0.12522731]
 [0.85701604 0.14298396]]

 Predicted Class 1 Values with Probability
[0.34791592363907486, 0.1853951959688368, 0.061171299755639785, 0.033459297300954094, 0.094347692485297]

 roc_auc score: 0.8180181298453137
