In [60]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [61]:
# Reading in data
csv_path = "../data/diabetes_binary_health_indicators_BRFSS2015.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [62]:
data.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [63]:
data["Diabetes_binary"].value_counts()

Diabetes_binary
0.0    218334
1.0     35346
Name: count, dtype: int64

### Using `vif_removal_priority()` from `utils.py` to return a list of features that should be dropped

In [64]:
import utils as fu

fu.vif_removal_priority(X=data, threshold=10)

[['Education', 29.584451146273683],
 ['CholCheck', 22.245439651302405],
 ['AnyHealthcare', 18.1501738000634],
 ['BMI', 14.7838897768036]]

In [97]:
X = data.drop(columns=["Diabetes_binary", "Education", "CholCheck", "AnyHealthcare", "BMI"])
y = data["Diabetes_binary"]

In [98]:
X.columns

Index(['HighBP', 'HighChol', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
       'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'NoDocbcCost',
       'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Income'],
      dtype='object')

In [99]:
# Split data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [100]:
# Review the distinct values from y
y_train.value_counts()

Diabetes_binary
0.0    163677
1.0     26583
Name: count, dtype: int64

### Base `RandomForestClassifier()` model

In [101]:
# Instantiate a RandomForestClassifier instance
model = RandomForestClassifier()

# Fit the traning data to the model
model.fit(X_train, y_train)

# Predict labels for original scaled testing features
y_pred = model.predict(X_test)

### Using `RandomUnderSampler()` for imbalanced data

In [102]:
# Instantiate a RandomUnderSampler instance
rus = RandomUnderSampler(random_state=42)

# Fit the training data to the random undersampler model
X_undersampled, y_undersampled = rus.fit_resample(X_train, y_train)

# Count distinct values for the resampled target data
y_undersampled.value_counts()

Diabetes_binary
0.0    26583
1.0    26583
Name: count, dtype: int64

### Making a new `RandomForestClassifier()` model with undersampled data

In [103]:
# Instantiate a RandomForestClassifier() model
model_undersampled = RandomForestClassifier()

# Fit the undersampled data the new model
model_undersampled.fit(X_undersampled, y_undersampled)

# Predict labels for oversampled testing features
y_pred_undersampled = model_undersampled.predict(X_test)

### Using `RandomOverSampler()` for imbalanced data

In [104]:
# Import RandomOverSampler from imblearn
from imblearn.over_sampling import RandomOverSampler

# Instantiate a RandomOversampler instance
ros = RandomOverSampler(random_state=42)

# Fit the training data to the `RandomOverSampler` model
X_oversampled, y_oversampled = ros.fit_resample(X_train, y_train)

# Count distinct values
y_oversampled.value_counts()

Diabetes_binary
0.0    163677
1.0    163677
Name: count, dtype: int64

### Making a new `RandomForestClassifier()` model with oversampled data

In [105]:
# Instantiate a new RandomForestClassier model
model_oversampled = RandomForestClassifier()

# Fit the oversampled data the new model
model_oversampled.fit(X_oversampled, y_oversampled)

# Predict labels for oversampled testing features
y_pred_oversampled = model_oversampled.predict(X_test)

In [106]:
# Print classification reports
print(f"Classification Report - Original Data")
print(classification_report(y_test, y_pred, labels=[1,0]))
print("------------------------------------------------------")
print(f"Classification Report - Undersampled Data")
print(classification_report(y_test, y_pred_undersampled, labels=[1,0]))
print("------------------------------------------------------")
print(f"Classification Report - Oversampled Data")
print(classification_report(y_test, y_pred_oversampled, labels=[1,0]))

Classification Report - Original Data
              precision    recall  f1-score   support

           1       0.39      0.17      0.24      8763
           0       0.88      0.96      0.92     54657

    accuracy                           0.85     63420
   macro avg       0.63      0.56      0.58     63420
weighted avg       0.81      0.85      0.82     63420

------------------------------------------------------
Classification Report - Undersampled Data
              precision    recall  f1-score   support

           1       0.27      0.75      0.39      8763
           0       0.94      0.67      0.78     54657

    accuracy                           0.68     63420
   macro avg       0.61      0.71      0.59     63420
weighted avg       0.85      0.68      0.73     63420

------------------------------------------------------
Classification Report - Oversampled Data
              precision    recall  f1-score   support

           1       0.30      0.36      0.33      8763
      

### Converting the dataset into an optimized data structure called Dmatrix 

In [107]:
# Import XGBoost
import xgboost as xgb


# Define data_dmatrix
data_dmatrix = xgb.DMatrix(data=X,label=y)

### Instantiating an `XGBoost()` classifier

In [108]:
# Import XGBClassifier
from xgboost import XGBClassifier

# Declare parameters
params = {
            'objective':'binary:logistic',
            'max_depth': 4,
            'alpha': 10,
            'learning_rate': .8,
            'n_estimators':200
        }
            
              
# Instantiate the classifier 
xgb_clf = XGBClassifier(**params)

# Fit the classifier to the training data
xgb_clf.fit(X_train, y_train)

# Make predictions on test data
clf_pred = xgb_clf.predict(X_test)

# Check accuracy score

print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))

XGBoost model accuracy score: 0.8487


In [109]:
print(f"Classification Report - XGBoost")
print(classification_report(y_test, clf_pred, labels=[1,0]))

Classification Report - XGBoost
              precision    recall  f1-score   support

           1       0.52      0.12      0.20      8763
           0       0.87      0.98      0.93     54657

    accuracy                           0.86     63420
   macro avg       0.70      0.55      0.56     63420
weighted avg       0.82      0.86      0.82     63420



In [110]:
# Instantiate the classifier 
xgb_clf_over = XGBClassifier(**params)

# Fit the classifier to the training data
xgb_clf_over.fit(X_oversampled, y_oversampled)

# Make predictions on test data
clf_pred_over = xgb_clf_over.predict(X_test)

print(f"Classification Report - XGBoost Oversampled")
print(classification_report(y_test, clf_pred_over, labels=[1,0]))


Classification Report - XGBoost Oversampled
              precision    recall  f1-score   support

           1       0.29      0.78      0.42      8763
           0       0.95      0.70      0.80     54657

    accuracy                           0.71     63420
   macro avg       0.62      0.74      0.61     63420
weighted avg       0.86      0.71      0.75     63420



In [111]:
# Instantiate the classifier 
xgb_clf_under = XGBClassifier(**params)

# Fit the classifier to the training data
xgb_clf_under.fit(X_undersampled, y_undersampled)

# Make predictions on test data
clf_pred_under = xgb_clf_under.predict(X_test)

print(f"Classification Report - XGBoost Undersampled")
print(classification_report(y_test, clf_pred_under, labels=[1,0]))

Classification Report - XGBoost Undersampled
              precision    recall  f1-score   support

           1       0.29      0.79      0.42      8763
           0       0.95      0.68      0.79     54657

    accuracy                           0.70     63420
   macro avg       0.62      0.74      0.61     63420
weighted avg       0.86      0.70      0.74     63420



### Using `GridSearchCV()` to find best set of hyperparameters
### XGBoost GridSearch Oversampled

In [114]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [1, 0.1, 0.01],
    'subsample': [0.5, 0.7, 1],
    'n_estimators': [350, 450, 550]
}

# Create the XGBoost model object
xgb_over_grid = xgb.XGBClassifier()

# Create the GridSearchCV object
grid_search_over = GridSearchCV(xgb_over_grid, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search_over.fit(X_oversampled, y_oversampled)

# Make predictions on test data
grid_pred_over = grid_search_over.predict(X_test)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search_over.best_params_)
print("Best score: ", grid_search_over.best_score_)
print("------------------------------------------------------")
print(f"Classification Report - XGBoost GridSearch Oversampled")
print(classification_report(y_test, grid_pred_over, labels=[1,0]))

KeyboardInterrupt: 

In [113]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.metrics import classification_report

# Define the hyperparameter grid
param_distributions = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1],
    'n_estimators': [100, 250, 400]
}

# Create the XGBoost model object
xgb_model = xgb.XGBClassifier()

# Create the RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=xgb_model, 
    param_distributions=param_distributions, 
    n_iter=100,  # Number of parameter settings that are sampled
    cv=5, 
    scoring='accuracy',
    random_state=42  # Ensuring reproducibility
)

# Fit the RandomizedSearchCV object to the training data
random_search.fit(X_oversampled, y_oversampled)

# Make predictions on test data
predictions = random_search.predict(X_test)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)
print("------------------------------------------------------")
print(f"Classification Report - XGBoost RandomSearch Oversampled")
print(classification_report(y_test, predictions, labels=[1, 0]))



Best set of hyperparameters:  {'subsample': 0.7, 'n_estimators': 400, 'max_depth': 7, 'learning_rate': 0.1}
Best score:  0.7799049357279475
------------------------------------------------------
Classification Report - XGBoost RandomSearch Oversampled
              precision    recall  f1-score   support

           1       0.30      0.73      0.42      8763
           0       0.94      0.73      0.82     54657

    accuracy                           0.73     63420
   macro avg       0.62      0.73      0.62     63420
weighted avg       0.85      0.73      0.77     63420



### XGBoost GridSearch Undersampled

In [95]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'subsample': [0.5, 0.7, 1],
    'n_estimators': [100, 250, 400]
}

# Create the XGBoost model object
xgb_under_grid = xgb.XGBClassifier()

# Create the GridSearchCV object
grid_search_under = GridSearchCV(xgb_under_grid, param_grid, cv=5, scoring='accuracy')

# Fit the GridSearchCV object to the training data
grid_search_under.fit(X_undersampled, y_undersampled)

# Make predictions on test data
grid_pred_under = grid_search_under.predict(X_test)

# Print the best set of hyperparameters and the corresponding score
print("Best set of hyperparameters: ", grid_search_under.best_params_)
print("Best score: ", grid_search_under.best_score_)
print("------------------------------------------------------")
print(f"Classification Report - XGBoost GridSearch Undersampled")
print(classification_report(y_test, grid_pred_under, labels=[1,0]))

Best set of hyperparameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 400, 'subsample': 1}
Best score:  0.7358274629529846
------------------------------------------------------
Classification Report - XGBoost GridSearch Undersampled
              precision    recall  f1-score   support

           1       0.29      0.81      0.42      8763
           0       0.96      0.68      0.79     54657

    accuracy                           0.70     63420
   macro avg       0.62      0.74      0.61     63420
weighted avg       0.86      0.70      0.74     63420

