In [1]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
df = pd.read_csv('./data/diabetes_binary_health_indicators_BRFSS2015.csv')

df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [4]:
def calc_vif(X:pd.DataFrame):
    """
    Calculates the VIF scores for a feature DataFrame.
    """
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i)\
                  for i in range(X.shape[1])]
    vif.sort_values('VIF', ascending=True, inplace=True)
    return(vif)

In [5]:
def vif_removal_priority(X:pd.DataFrame, threshold:int):
    """
    Iteratively drops the feature with highest VIF until all
    scores fall below a given threshold.

    Returns the list of features that should be dropped, paired
    with the calculated VIF.

    :param X: Pandas DataFrame containing the features.
    :param threshold: The VIF threshold over which features should be dropped.
    """
    X_tmp = X.copy()
    drop_list = []
    vif = calc_vif(X_tmp)
    while vif['VIF'].max() >= threshold:
        to_remove = vif.iloc[-1]['variables']
        score = vif.iloc[-1]['VIF']
        drop_list.append([to_remove, score])
        X_tmp.drop(to_remove, inplace=True, axis='columns')
        vif = calc_vif(X_tmp)
    return drop_list

vif = vif_removal_priority(df, 10)



In [6]:
vif

[['Education', 29.584451146273683],
 ['CholCheck', 22.245439651302462],
 ['AnyHealthcare', 18.1501738000634],
 ['BMI', 14.7838897768036]]

In [7]:
target = df['Diabetes_binary']
target_names = ['negative', 'positive']
data = df.drop(['Diabetes_binary','Education','CholCheck','AnyHealthcare','BMI'], axis=1)
data.head()

Unnamed: 0,HighBP,HighChol,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Income
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,3.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,1.0
2,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,8.0
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,6.0
4,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,4.0


In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [9]:
y_train.value_counts()

Diabetes_binary
0.0    163677
1.0     26583
Name: count, dtype: int64

In [10]:
untuned_model = KNeighborsClassifier()
grid_tuned_model = KNeighborsClassifier()
random_tuned_model = KNeighborsClassifier()


In [11]:
## Train a model without tuning

untuned_model.fit(X_train, y_train)
untuned_y_pred = untuned_model.predict(X_test)
print(classification_report(y_test, untuned_y_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.88      0.95      0.91     54657
    positive       0.37      0.17      0.24      8763

    accuracy                           0.84     63420
   macro avg       0.62      0.56      0.57     63420
weighted avg       0.81      0.84      0.82     63420



In [12]:
# Create the grid search estimator along with a parameter object containing the values to adjust.
# Try adjusting n_neighbors with values of 1 through 19. Adjust leaf_size by using 10, 50, 100, and 500.
# Include both uniform and distance options for weights.
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 50, 100, 500]
}
grid_clf = GridSearchCV(grid_tuned_model, param_grid, verbose=3)

In [13]:
# Fit the model by using the grid search estimator.
# This will take the KNN model and try each combination of parameters.
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.799 total time=   4.2s
[CV 2/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.799 total time=   4.0s
[CV 3/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.798 total time=   4.1s
[CV 4/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.799 total time=   4.0s
[CV 5/5] END leaf_size=10, n_neighbors=1, weights=uniform;, score=0.798 total time=   4.2s
[CV 1/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.799 total time=   2.5s
[CV 2/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.799 total time=   2.6s
[CV 3/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.798 total time=   2.6s
[CV 4/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.799 total time=   2.6s
[CV 5/5] END leaf_size=10, n_neighbors=1, weights=distance;, score=0.798 total time=   2.6s
[CV 1/5] END leaf_size=

In [14]:
# List the best parameters for this dataset
print(grid_clf.best_params_)

{'leaf_size': 10, 'n_neighbors': 19, 'weights': 'uniform'}


In [15]:
# Print the classification report for the best model
grid_y_pred = grid_clf.predict(X_test)
print(classification_report(y_test, grid_y_pred,
                            target_names=target_names))

              precision    recall  f1-score   support

    negative       0.87      0.98      0.92     54657
    positive       0.45      0.08      0.13      8763

    accuracy                           0.86     63420
   macro avg       0.66      0.53      0.53     63420
weighted avg       0.81      0.86      0.81     63420



In [16]:
# Create the parameter object for the randomized search estimator.
# Try adjusting n_neighbors with values of 1 through 19. 
# Adjust leaf_size by using a range from 1 to 500.
# Include both uniform and distance options for weights.
param_grid = {
    'n_neighbors': np.arange(1,20,2),
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(1, 500)
}
param_grid

{'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
 'weights': ['uniform', 'distance'],
 'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        1

In [17]:
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(random_tuned_model, param_grid, random_state=0, verbose=3)

In [18]:
# Fit the model by using the randomized search estimator.
random_clf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.856 total time=   4.0s
[CV 2/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.855 total time=   4.1s
[CV 3/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.854 total time=   4.0s
[CV 4/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.856 total time=   4.2s
[CV 5/5] END leaf_size=137, n_neighbors=13, weights=uniform;, score=0.855 total time=   4.1s
[CV 1/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.840 total time=   2.5s
[CV 2/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.839 total time=   2.7s
[CV 3/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.839 total time=   2.6s
[CV 4/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.839 total time=   2.7s
[CV 5/5] END leaf_size=493, n_neighbors=5, weights=distance;, score=0.838 total time=   2.7s
[CV 1/5] 

In [19]:
y_pred = random_clf.predict(X_test)
print(balanced_accuracy_score(y_test, y_pred))

0.5366169658456422


In [20]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components = 15)
pca_model.fit(X_train)

X_train_pca = pd.DataFrame(pca_model.transform(X_train))
X_test_pca = pd.DataFrame(pca_model.transform(X_test))
X_train_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,-3.743177,-1.667831,-1.567135,-2.017032,0.265452,-0.612122,-0.339380,0.242338,-0.157481,0.644471,0.130309,0.031599,-0.045841,-0.035294,0.009049
1,0.181198,8.184763,0.882886,2.171702,0.528920,-0.445378,-0.301639,0.371047,-0.077347,-1.097567,0.565980,-0.253671,0.629115,-0.244015,-0.206847
2,-5.300837,-0.538191,-0.908287,-1.470785,-2.155466,0.746233,-0.172056,-0.174137,-0.152091,0.182574,-0.516064,-0.320252,-0.267313,-0.247184,-0.022699
3,-3.849846,2.217838,0.986353,-1.583121,-0.004760,-0.135085,-0.381227,-0.400749,-0.815440,0.533584,0.099711,0.116507,-0.015582,-0.032689,-0.063626
4,-2.769048,3.890519,0.792724,-1.666356,-0.123028,0.080940,-0.894538,-0.237924,-0.820946,-0.199403,-0.204874,0.128362,-0.097650,-0.070859,-0.070482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190255,2.975799,12.515899,1.157784,4.875465,1.990520,0.357062,-0.370824,-0.264392,-0.364352,-0.758633,-0.327573,0.915031,-0.067068,0.022979,0.758004
190256,-5.263480,-0.722291,-2.961815,-0.928307,-0.959877,0.890377,-0.192031,-0.101320,-0.074340,0.168763,-0.458757,-0.322908,-0.196665,-0.201217,-0.016721
190257,-2.919774,-2.101674,-0.509513,-1.916832,0.266932,-0.589136,-0.398428,0.244464,-0.178885,0.626429,0.129022,0.035944,-0.044633,-0.025230,-0.006822
190258,-5.352816,-0.094375,2.922768,0.863209,0.228053,-0.657302,0.185327,0.216744,-0.182035,-0.308506,0.761144,-0.211833,-0.087520,0.103647,0.909933


In [21]:
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 50, 100, 500]
}
random_knn = RandomizedSearchCV(KNeighborsClassifier(), param_grid, verbose=3)

random_knn.fit(X_train_pca, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END leaf_size=10, n_neighbors=17, weights=distance;, score=0.851 total time=   5.5s
[CV 2/5] END leaf_size=10, n_neighbors=17, weights=distance;, score=0.850 total time=   5.4s
[CV 3/5] END leaf_size=10, n_neighbors=17, weights=distance;, score=0.850 total time=   5.3s
[CV 4/5] END leaf_size=10, n_neighbors=17, weights=distance;, score=0.849 total time=   5.3s
[CV 5/5] END leaf_size=10, n_neighbors=17, weights=distance;, score=0.849 total time=   5.3s
[CV 1/5] END leaf_size=500, n_neighbors=11, weights=distance;, score=0.848 total time=   7.7s
[CV 2/5] END leaf_size=500, n_neighbors=11, weights=distance;, score=0.847 total time=   7.6s
[CV 3/5] END leaf_size=500, n_neighbors=11, weights=distance;, score=0.847 total time=   7.7s
[CV 4/5] END leaf_size=500, n_neighbors=11, weights=distance;, score=0.846 total time=   7.8s
[CV 5/5] END leaf_size=500, n_neighbors=11, weights=distance;, score=0.846 total time=   7.5s
[CV 

In [22]:
y_pred = random_knn.predict(X_test_pca)
print(balanced_accuracy_score(y_test, y_pred))

0.5374632225267764


In [23]:
# Import RandomUnderSampler from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=1)

In [24]:
# Fit the training data to the random undersampler model
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

In [25]:
# Count distinct values for the resampled target data
y_undersampled.value_counts()

Diabetes_binary
0.0    26583
1.0    26583
Name: count, dtype: int64

In [26]:
model_undersampled = KNeighborsClassifier()
model_undersampled.fit(X_undersampled, y_undersampled)

In [27]:
# Predict labels for oversampled testing features
y_pred_undersampled = model_undersampled.predict(X_test)

In [28]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))

Classification Report - Original Data
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     54657
         1.0       0.44      0.09      0.15      8763

    accuracy                           0.86     63420
   macro avg       0.66      0.54      0.54     63420
weighted avg       0.81      0.86      0.82     63420

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

         0.0       0.91      0.81      0.86     54657
         1.0       0.29      0.50      0.37      8763

    accuracy                           0.77     63420
   macro avg       0.60      0.65      0.61     63420
weighted avg       0.82      0.77      0.79     63420



In [29]:
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Instantiate a RandomOversampler instance
ros = RandomOverSampler(random_state=1)

In [30]:
# Fit the training data to the `RandomOverSampler` model
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

In [31]:
# Count distinct values
y_oversampled.value_counts()

Diabetes_binary
0.0    163677
1.0    163677
Name: count, dtype: int64

In [32]:
# Instantiate a new RandomForestClassier model
model_oversampled = KNeighborsClassifier()

# Fit the oversampled data the new model
model_oversampled.fit(X_oversampled, y_oversampled)

In [33]:
# Predict labels for oversampled testing features
y_pred_oversampled = model_oversampled.predict(X_test)

In [34]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled))
print("---------")
print(f"Classification Report - Oversampled Data")
print(classification_report(y_test, y_pred_oversampled))

Classification Report - Original Data
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     54657
         1.0       0.44      0.09      0.15      8763

    accuracy                           0.86     63420
   macro avg       0.66      0.54      0.54     63420
weighted avg       0.81      0.86      0.82     63420

---------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

         0.0       0.91      0.81      0.86     54657
         1.0       0.29      0.50      0.37      8763

    accuracy                           0.77     63420
   macro avg       0.60      0.65      0.61     63420
weighted avg       0.82      0.77      0.79     63420

---------
Classification Report - Oversampled Data
              precision    recall  f1-score   support

         0.0       0.89      0.86      0.88     54657
         1.0       0.30      0.36      0.32      8763

    accuracy                           0.

In [35]:
from imblearn.under_sampling import ClusterCentroids

# Instantiate a ClusterCentroids instance
cc_sampler = ClusterCentroids(random_state=1)

In [36]:
X_resampled, y_resampled = cc_sampler.fit_resample(X_train, y_train)



In [37]:
# Count distinct values for the resampled target data
y_resampled.value_counts()

Diabetes_binary
0.0    26583
1.0    26583
Name: count, dtype: int64

In [38]:
# Instantiate a new RandomForestClassier model
cc_model = KNeighborsClassifier()

# Fit the resampled data the new model
cc_model.fit(X_resampled, y_resampled)

In [40]:
# Predict labels for resampled testing features
cc_y_pred = cc_model.predict(X_test)

In [41]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Resampled Data - CentroidClusters")
print(classification_report(y_test, cc_y_pred))

Classification Report - Original Data
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     54657
         1.0       0.44      0.09      0.15      8763

    accuracy                           0.86     63420
   macro avg       0.66      0.54      0.54     63420
weighted avg       0.81      0.86      0.82     63420

---------
Classification Report - Resampled Data - CentroidClusters
              precision    recall  f1-score   support

         0.0       0.90      0.58      0.70     54657
         1.0       0.18      0.59      0.28      8763

    accuracy                           0.58     63420
   macro avg       0.54      0.58      0.49     63420
weighted avg       0.80      0.58      0.64     63420



In [42]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE instance 
# Set the sampling_strategy parameter equal to auto
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

In [43]:
# Fit the training data to the smote_sampler model
X_resampled, y_resampled = smote_sampler.fit_resample(X_train, y_train)

In [44]:
# Count distinct values for the resampled target data
y_resampled.value_counts()

Diabetes_binary
0.0    163677
1.0    163677
Name: count, dtype: int64

In [45]:
# Instantiate a new RandomForestClassier model 
smote_model = KNeighborsClassifier()

# Fit the resampled data to the new model
smote_model.fit(X_resampled, y_resampled)

In [46]:
# Predict labels for resampled testing features
smote_y_pred = smote_model.predict(X_test)

In [47]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTE")
print(classification_report(y_test, smote_y_pred))

Classification Report - Original Data
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     54657
         1.0       0.44      0.09      0.15      8763

    accuracy                           0.86     63420
   macro avg       0.66      0.54      0.54     63420
weighted avg       0.81      0.86      0.82     63420

---------
Classification Report - Resampled Data - SMOTE
              precision    recall  f1-score   support

         0.0       0.91      0.76      0.83     54657
         1.0       0.27      0.54      0.36      8763

    accuracy                           0.73     63420
   macro avg       0.59      0.65      0.59     63420
weighted avg       0.82      0.73      0.77     63420



In [48]:
# Import SMOTEEN from imblearn
from imblearn.combine import SMOTEENN

# Instantiate the SMOTEENN instance
smote_enn = SMOTEENN(random_state=1)

In [49]:
# Fit the model to the training data
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [51]:
# Instantiate a new RandomForestClassier model
smoteenn_model = KNeighborsClassifier()

# Fit the resampled data the new model
smoteenn_model.fit(X_resampled, y_resampled)

In [52]:
# Predict labels for resampled testing features
smoteenn_y_pred = smoteenn_model.predict(X_test)

In [53]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred))
print("---------")
print(f"Classification Report - Resampled Data - SMOTEENN")
print(classification_report(y_test, smoteenn_y_pred))

Classification Report - Original Data
              precision    recall  f1-score   support

         0.0       0.87      0.98      0.92     54657
         1.0       0.44      0.09      0.15      8763

    accuracy                           0.86     63420
   macro avg       0.66      0.54      0.54     63420
weighted avg       0.81      0.86      0.82     63420

---------
Classification Report - Resampled Data - SMOTEENN
              precision    recall  f1-score   support

         0.0       0.93      0.67      0.78     54657
         1.0       0.25      0.69      0.37      8763

    accuracy                           0.67     63420
   macro avg       0.59      0.68      0.57     63420
weighted avg       0.84      0.67      0.72     63420

