In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

## Preprocess Data I

In [10]:
df = pd.read_csv("./data/diabetes_binary_health_indicators_BRFSS2015.csv")
df = pd.DataFrame(df)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [11]:
df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [12]:
# converting data to integer
for col in df.columns:
    df[col] = df[col].astype("int")
    
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [13]:
#checking the data type of the columns
display(df.dtypes)

Diabetes_binary         int32
HighBP                  int32
HighChol                int32
CholCheck               int32
BMI                     int32
Smoker                  int32
Stroke                  int32
HeartDiseaseorAttack    int32
PhysActivity            int32
Fruits                  int32
Veggies                 int32
HvyAlcoholConsump       int32
AnyHealthcare           int32
NoDocbcCost             int32
GenHlth                 int32
MentHlth                int32
PhysHlth                int32
DiffWalk                int32
Sex                     int32
Age                     int32
Education               int32
Income                  int32
dtype: object

In [14]:
#checking balance of diabetic vs not diabetic 
display(df["Diabetes_binary"].value_counts())

#significant imbalance

Diabetes_binary
0    218334
1     35346
Name: count, dtype: int64

In [15]:
#splitting the data for later use

X = df.copy()
X = X.drop(columns="Diabetes_binary")
y = df["Diabetes_binary"]


## VIF Evaluation
### - Evaluation of Initial Features

In [16]:
# Using Variance Inflation Factor to assess usefulness of each column of the original df

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF dataframe for original df
vif_df = pd.DataFrame()
vif_df["Features"] = X.columns 
  
# calculating VIF for each feature 
vif_df["Calculated VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))] 
  
print(vif_df)

# Following columns appear to distort the data: CholCheck, BMI, Veggies, AnyHealthcare, GenHlth, Age, Education, Income


                Features  Calculated VIF
0                 HighBP        2.299761
1               HighChol        2.029660
2              CholCheck       23.187436
3                    BMI       18.149913
4                 Smoker        1.933042
5                 Stroke        1.126777
6   HeartDiseaseorAttack        1.289820
7           PhysActivity        4.645314
8                 Fruits        3.032775
9                Veggies        5.826886
10     HvyAlcoholConsump        1.083523
11         AnyHealthcare       20.839710
12           NoDocbcCost        1.215834
13               GenHlth       10.740162
14              MentHlth        1.463103
15              PhysHlth        1.999556
16              DiffWalk        1.838551
17                   Sex        1.910708
18                   Age        9.886830
19             Education       29.507416
20                Income       14.156118


In [17]:
#created df where less useful columns are dropped

df_drop_cols = df.drop(columns=[ "CholCheck", "BMI", "Veggies", "AnyHealthcare", "GenHlth", "Age", "Education", "Income"])
df_drop_cols.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype
---  ------                --------------   -----
 0   Diabetes_binary       253680 non-null  int32
 1   HighBP                253680 non-null  int32
 2   HighChol              253680 non-null  int32
 3   Smoker                253680 non-null  int32
 4   Stroke                253680 non-null  int32
 5   HeartDiseaseorAttack  253680 non-null  int32
 6   PhysActivity          253680 non-null  int32
 7   Fruits                253680 non-null  int32
 8   HvyAlcoholConsump     253680 non-null  int32
 9   NoDocbcCost           253680 non-null  int32
 10  MentHlth              253680 non-null  int32
 11  PhysHlth              253680 non-null  int32
 12  DiffWalk              253680 non-null  int32
 13  Sex                   253680 non-null  int32
dtypes: int32(14)
memory usage: 13.5 MB


## Creating Test Data
### - Test Train for Initial DF and Updated DF

In [18]:
# Creating test and train data for initial df
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 10)
X_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
147485,1,0,1,30,0,0,0,0,0,1,...,1,0,3,5,0,0,0,7,5,5
155828,1,1,1,29,0,0,0,1,0,0,...,1,0,2,0,0,0,0,7,5,8
167688,0,0,1,23,0,0,0,1,0,1,...,1,0,2,0,25,0,0,9,6,8
43443,0,0,1,23,0,0,0,1,1,1,...,1,0,1,0,0,0,0,7,6,8
107094,1,1,1,34,0,0,0,1,0,0,...,1,0,3,0,0,0,0,8,4,6


In [19]:
# Updated X to contain only useful features
X = df_drop_cols.copy()

In [20]:
# Creating test and train data for updated df
X_train2, X_test2, y_train2, y_test2 = train_test_split(X,y, random_state = 10)
X_train2.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,HvyAlcoholConsump,NoDocbcCost,MentHlth,PhysHlth,DiffWalk,Sex
147485,0,1,0,0,0,0,0,0,0,0,5,0,0,0
155828,0,1,1,0,0,0,1,0,0,0,0,0,0,0
167688,0,0,0,0,0,0,1,0,0,0,0,25,0,0
43443,0,0,0,0,0,0,1,1,0,0,0,0,0,0
107094,0,1,1,0,0,0,1,0,0,0,0,0,0,0


### - Scaling Training Data

In [33]:
#scaling original features
scaler = StandardScaler().fit(X_train)
X_train_scaled_orig_array = scaler.transform(X_train)
X_test_scaled_orig_array = scaler.transform(X_test)

#scaling updated features (less useful columns removed)
scaler = StandardScaler().fit(X_train2)
X_train2_scaled_upd_array = scaler.transform(X_train2)

#updating X train, X test and X train2(less features) from array to df for both

X_train_scaled_orig = pd.DataFrame(X_train_scaled_orig_array, columns = X_train.columns)
display(X_train_scaled_orig.head())

X_test_scaled_upd = pd.DataFrame(X_test_scaled_orig_array, columns = X_train.columns)
display(X_test_scaled_upd.head())

X_train2_scaled_upd = pd.DataFrame(X_train2_scaled_upd_array, columns = X_train2.columns)
display(X_train2_scaled_upd.head())


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.154676,-0.858429,0.196424,0.245045,-0.890018,-0.205234,-0.321197,-1.764702,-1.315001,0.481339,...,0.226199,-0.301905,0.45799,0.244789,-0.48613,-0.449885,-0.887839,-0.336148,-0.052925,-0.510599
1,1.154676,1.164919,0.196424,0.093554,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-2.077537,...,0.226199,-0.301905,-0.478879,-0.429758,-0.48613,-0.449885,-0.887839,-0.336148,-0.052925,0.938109
2,-0.866044,-0.858429,0.196424,-0.815396,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,0.481339,...,0.226199,-0.301905,-0.478879,-0.429758,2.379837,-0.449885,-0.887839,0.318691,0.962658,0.938109
3,-0.866044,-0.858429,0.196424,-0.815396,-0.890018,-0.205234,-0.321197,0.566668,0.760455,0.481339,...,0.226199,-0.301905,-1.415747,-0.429758,-0.48613,-0.449885,-0.887839,-0.336148,0.962658,0.938109
4,1.154676,1.164919,0.196424,0.851011,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-2.077537,...,0.226199,-0.301905,0.45799,-0.429758,-0.48613,-0.449885,-0.887839,-0.008728,-1.068507,-0.027696


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.154676,1.164919,0.196424,-0.360921,-0.890018,4.872497,3.113353,-1.764702,-1.315001,0.481339,...,0.226199,-0.301905,2.331728,-0.429758,2.953031,-0.449885,1.12633,1.30095,-1.068507,-0.027696
1,1.154676,1.164919,0.196424,-0.512413,1.123572,4.872497,-0.321197,0.566668,-1.315001,0.481339,...,0.226199,-0.301905,0.45799,2.94298,-0.48613,-0.449885,-0.887839,0.646111,0.962658,0.938109
2,-0.866044,-0.858429,0.196424,0.548028,-0.890018,-0.205234,-0.321197,0.566668,0.760455,0.481339,...,0.226199,-0.301905,-1.415747,-0.429758,-0.48613,-0.449885,-0.887839,0.646111,0.962658,0.938109
3,-0.866044,-0.858429,0.196424,-1.118379,-0.890018,-0.205234,-0.321197,0.566668,0.760455,0.481339,...,0.226199,-0.301905,-1.415747,0.379699,-0.48613,-0.449885,1.12633,0.973531,0.962658,-0.510599
4,-0.866044,-0.858429,0.196424,0.548028,1.123572,-0.205234,-0.321197,0.566668,0.760455,-2.077537,...,0.226199,-0.301905,-1.415747,-0.429758,0.087064,-0.449885,1.12633,0.646111,-0.052925,-0.027696


Unnamed: 0,Diabetes_binary,HighBP,HighChol,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,HvyAlcoholConsump,NoDocbcCost,MentHlth,PhysHlth,DiffWalk,Sex
0,-0.400753,1.154676,-0.858429,-0.890018,-0.205234,-0.321197,-1.764702,-1.315001,-0.244389,-0.301905,0.244789,-0.48613,-0.449885,-0.887839
1,-0.400753,1.154676,1.164919,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-0.244389,-0.301905,-0.429758,-0.48613,-0.449885,-0.887839
2,-0.400753,-0.866044,-0.858429,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-0.244389,-0.301905,-0.429758,2.379837,-0.449885,-0.887839
3,-0.400753,-0.866044,-0.858429,-0.890018,-0.205234,-0.321197,0.566668,0.760455,-0.244389,-0.301905,-0.429758,-0.48613,-0.449885,-0.887839
4,-0.400753,1.154676,1.164919,-0.890018,-0.205234,-0.321197,0.566668,-1.315001,-0.244389,-0.301905,-0.429758,-0.48613,-0.449885,-0.887839


### - VIF for Scaled Features (for Training Data)

In [27]:
#VIF for Scaled data
# calculating VIF for scaled trained orig features 
vif_df["Calculated Scaled VIF"] = [variance_inflation_factor(X_train_scaled_orig.values, i) for i in range(len(X_train_scaled_orig.columns))]

# calculating VIF for scaled trained updated features
vif_scaled_df = pd.DataFrame()
vif_scaled_df["Scaled Trained Features"] = X_train2_scaled_upd.columns
vif_scaled_df["Scaled Trained VIF"] = [variance_inflation_factor(X_train2_scaled_upd.values, i) for i in range(len(X_train2_scaled_upd.columns))] 
  
print(vif_df, end="\n\n\n")
print(vif_scaled_df)

                Features  Calculated VIF  Calculated Scaled VIF
0                 HighBP        2.299761               1.330071
1               HighChol        2.029660               1.172964
2              CholCheck       23.187436               1.032577
3                    BMI       18.149913               1.141016
4                 Smoker        1.933042               1.093238
5                 Stroke        1.126777               1.082002
6   HeartDiseaseorAttack        1.289820               1.170161
7           PhysActivity        4.645314               1.155657
8                 Fruits        3.032775               1.111643
9                Veggies        5.826886               1.112206
10     HvyAlcoholConsump        1.083523               1.024527
11         AnyHealthcare       20.839710               1.113788
12           NoDocbcCost        1.215834               1.143751
13               GenHlth       10.740162               1.792179
14              MentHlth        1.463103

## Initial Modeling - Logistic Regression

###  - Logistic Regression on Raw Data & Score (Baseline Model)

In [30]:
#using logistic regression model first as a baseline
# original raw data

classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train, y_train)

# score for training data (evaluates how well the model performs on the training data)
print(f"Training Data Accuracy Score: {classifier.score(X_train, y_train)}")

#score for test data (evaluates model's ability to make predictions on unseen data)
print(f"Test Data Accuracy Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8644591611479029
Test Data Score: 0.8611794386628824


In [32]:
#Generating predictions using the test data
y_pred = classifier.predict(X_test)

#Checking balanced accuracy due to the significant imbalance of the data
print(f"Balanced Accuracy Score (score is more useful given data imbalance): {balanced_accuracy_score(y_test, y_pred)}")

#not great, barely better than random

Balanced Accuracy Score (score is more useful given data imbalance): 0.565982136923413


###  - Logistic Regression on Scaled Data & Score (Baseline Model)

In [35]:
#using logistic regression model on scaled X_train_scaled_orig and X_train2_scaled_upd

classifier.fit(X_train_scaled_orig, y_train)
y_pred_scaled = classifier.predict(X_test_scaled_upd)
                                  
# checking balanced accuracy for scaled data
print(f"Balanced Accuracy Score (score is more useful given data imbalance): {balanced_accuracy_score(y_test, y_pred_scaled)}")

#not great, barely better than random

Balanced Accuracy Score (score is more useful given data imbalance): 0.5661114079330035


---

## Preprocess Data II - Undersampling Data

     Due to significant imbalance and score, I don't think it is worth attempting RandomForest model unless the data's balance is improved. Next step undersamples the data. I selected undersampling rather than oversampling due to the larger balance (163932) being over 6x larger than the smaller balance (26328). The smaller balance is plenty and may predict better than attempting to predict using a significant amount of synthetic data compared to the actual data available.

### - RandomUnderSampler Technique

In [40]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=63)

# Fitting the training data
X_rus_resampled, y_rus_resampled = rus.fit_resample(X_train, y_train)

# Fitting the scaled training data
X_rus_resampled_scaled, y_rus_resampled_scaled = rus.fit_resample(X_train_scaled_orig, y_train)

display(y_rus_resampled.value_counts())
display(y_train.value_counts())

Diabetes_binary
0    26328
1    26328
Name: count, dtype: int64

Diabetes_binary
0    163932
1     26328
Name: count, dtype: int64

### - ClusterCentroids Technique

In [None]:
from imblearn.under_sampling import ClusterCentroids

# Instantiate instance
enn = EditedNearestNeighbours(n_neighbors = 6, sampling_strategy = "auto")

# Fitting the training data
X_enn_resampled, y_enn_resampled = enn.fit_resample(X_train, y_train)

# Fitting the scaled training data
X_enn_resampled_scaled, y_enn_resampled_scaled = enn.fit_resample(X_train_scaled_orig, y_train)

display(y_enn_resampled.value_counts())
display(y_train.value_counts())

### - EditedNearestNeighbors Technique

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

# Instantiate instance
 = ClusterCentroids(random_state=63, n_init = "auto")

# Fitting the training data
X_cc_resampled, y_cc_resampled = cc.fit_resample(X_train, y_train)

# Fitting the scaled training data
X_cc_resampled_scaled, y_cc_resampled_scaled = cc.fit_resample(X_train_scaled_orig, y_train)

display(y_cc_resampled.value_counts())
display(y_train.value_counts())

## Modeling - RandomForest \*\*\*LEFT OFF\*\*\*

In [None]:
#reviewing the randomforest model's accuracy using classification report
# raw_model = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train, y_train)
# y_predict_rfc_raw_test = raw_model.predict(X_test)
# print(classification_report(y_test, y_predict_rfc_raw_test))

In [None]:
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

#print(confusion_matrix(y_test, y_predict_rfc_raw_test))

In [None]:
# display(balanced_accuracy_score(y_test, y_predict_rfc_raw_test))

# Ignore below - Scratch Paper 

In [None]:
# Oversampling data using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

smt = SMOTE(random_state = 42)
X_train_sm, y_train_sm = smt.fit_resample(X_train,y_train)
X_test_sm, y_test_sm = smt.fit_resample(X_test,y_test)

#chatgpt suggested Counter to review the scale
print(Counter(y_train_sm))
print(Counter(y_test_sm))

#checked the original prediction from randomforest model)
print(Counter(y_predict_rfc_raw_test))


In [None]:
#creating train test data for the smote values

X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_train_sm, y_train_sm, random_state = 15)
X_train_smote2, X_test_smote2, y_train_smote2, y_test_smote2 = train_test_split(X_test_sm, y_test_sm, random_state = 15)

smt_model = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train_smote, y_train_smote)
smt_model2 = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train_smote2, y_train_smote2)

y_predicted_smt = smt_model.predict(X_test_smote)
y_predicted_smt2 = smt_model2.predict(X_test_smote2)


# for the train data
print(classification_report(y_train_smote, y_predicted_smt))

# for the test data
print(classification_report(y_train_smote2, y_predicted_smt2))


In [None]:
display(balanced_accuracy_score(y_train_smote, y_predicted_smt))
display(balanced_accuracy_score(y_train_smote2, y_predicted_smt2))

In [None]:
#attempting to use randomized search estimator - creating the required parameters first)
param_grid = {
    'n_neighbors': np.arange(1,20,2),
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(1, 500)
}
param_grid

In [None]:
# Create the randomized search estimator
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(random_tuned_model, param_grid, random_state=0, verbose=3)