In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

## Preprocess Data I

In [None]:
# Load the BRFSS 2015 diabetes health-indicator data.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
df = pd.read_csv("./data/diabetes_binary_health_indicators_BRFSS2015.csv")
df.head()

In [None]:
# dropna() returns a new frame instead of modifying df in place, so the result
# has to be assigned back; otherwise rows with missing values are silently kept.
df = df.dropna()
df.info()

In [None]:
# Cast every column to integer with a single frame-wide conversion
df = df.astype("int")

df.head()

In [None]:
# Confirm the dtype of each column after the integer conversion
column_dtypes = df.dtypes
display(column_dtypes)

In [None]:
# Class balance of the target: diabetic vs. not diabetic
target_counts = df["Diabetes_binary"].value_counts()
display(target_counts)

# significant imbalance

In [None]:
# Separate the target (y) from the predictors (X) for later use.
# drop() returns a new frame, so df itself is left untouched.
y = df["Diabetes_binary"]
X = df.drop(columns="Diabetes_binary")


## VIF Evaluation
### - Evaluation of Initial Features

In [None]:
# Variance Inflation Factor (VIF) for every column of the original feature set

from statsmodels.stats.outliers_influence import variance_inflation_factor

# NOTE(review): variance_inflation_factor does not add an intercept column itself;
# confirm whether a constant should be included for these unscaled features.
feature_matrix = X.values

# One row per feature: its name and its VIF against all the other features
vif_df = pd.DataFrame(
    {
        "Features": X.columns,
        "Calculated VIF": [
            variance_inflation_factor(feature_matrix, col_idx)
            for col_idx in range(X.shape[1])
        ],
    }
)

print(vif_df)

# Following columns appear to distort the data: CholCheck, BMI, Veggies, AnyHealthcare, GenHlth, Age, Education, Income


In [None]:
# Copy of df without the columns judged less useful in the VIF review.
# The target column (Diabetes_binary) is not in this list, so it remains in the frame.
less_useful_columns = [
    "CholCheck",
    "BMI",
    "Veggies",
    "AnyHealthcare",
    "GenHlth",
    "Age",
    "Education",
    "Income",
]
df_drop_cols = df.drop(columns=less_useful_columns)
df_drop_cols.info()



## Creating Test Data
### - Test Train for Initial DF and Updated DF

In [None]:
# Train/test split for the initial (full) feature set; fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10,
)
X_train.head()

In [None]:
# Updated X to contain only useful features.
# df_drop_cols was built from df and still holds the target column, so it is
# dropped here; otherwise Diabetes_binary would be used to predict itself
# (target leakage) in every model trained on X_train2.
X = df_drop_cols.drop(columns="Diabetes_binary")

In [None]:
# Train/test split for the updated (reduced) feature set; same seed as the initial split
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X,
    y,
    random_state=10,
)
X_train2.head()

### - Scaling Training Data

In [None]:
# Scaling original features: the scaler is fitted on the training split only and
# then applied to both splits, so no test-set statistics reach the model.
scaler = StandardScaler().fit(X_train)
X_train_scaled_orig_array = scaler.transform(X_train)
X_test_scaled_orig_array = scaler.transform(X_test)

# Scaling updated features (less useful columns removed)
scaler = StandardScaler().fit(X_train2)
X_train2_scaled_upd_array = scaler.transform(X_train2)

# Convert the scaled arrays back to DataFrames.
# The source frame's row index is passed through so each scaled frame stays
# aligned with its labels (y_train / y_test / y_train2); without it the frames
# get a fresh 0..n-1 index that no longer matches the shuffled label index.
X_train_scaled_orig = pd.DataFrame(
    X_train_scaled_orig_array, columns=X_train.columns, index=X_train.index
)
display(X_train_scaled_orig.head())

# NOTE: despite the "_upd" suffix, this frame holds the scaled ORIGINAL test
# features (it is built from X_test). The name is kept because later cells use it.
X_test_scaled_upd = pd.DataFrame(
    X_test_scaled_orig_array, columns=X_test.columns, index=X_test.index
)
display(X_test_scaled_upd.head())

X_train2_scaled_upd = pd.DataFrame(
    X_train2_scaled_upd_array, columns=X_train2.columns, index=X_train2.index
)
display(X_train2_scaled_upd.head())


### - VIF for Scaled Features (for Training Data)

In [None]:
# VIF for the scaled training data

def _vif_by_column(frame):
    """Return one VIF value per column of ``frame``, in column order."""
    values = frame.values
    return [variance_inflation_factor(values, col_idx) for col_idx in range(frame.shape[1])]


# Scaled original features: stored as an extra column next to the unscaled VIFs
vif_df["Calculated Scaled VIF"] = _vif_by_column(X_train_scaled_orig)

# Scaled updated features: collected in their own frame
vif_scaled_df = pd.DataFrame(
    {
        "Scaled Trained Features": X_train2_scaled_upd.columns,
        "Scaled Trained VIF": _vif_by_column(X_train2_scaled_upd),
    }
)

print(vif_df, end="\n\n\n")
print(vif_scaled_df)

## Initial Modeling - Logistic Regression

###  - Logistic Regression on Raw Data & Score (Baseline Model)

In [None]:
# Baseline model: logistic regression fitted on the original, unscaled features
classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train, y_train)

# Accuracy on the same data the model was fitted on
train_accuracy = classifier.score(X_train, y_train)
print(f"Training Data Accuracy Score: {train_accuracy}")

# Accuracy on the held-out test split (data not seen during fitting)
test_accuracy = classifier.score(X_test, y_test)
print(f"Test Data Accuracy Score: {test_accuracy}")

In [None]:
# Predict the held-out test split with the baseline model
y_pred = classifier.predict(X_test)

# Balanced accuracy is reported because the target classes are heavily imbalanced
raw_balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score (score is more useful given data imbalance): {raw_balanced_accuracy}")

#not great, barely better than random

###  - Logistic Regression on Scaled Data & Score (Baseline Model)

In [None]:
# Logistic regression on the scaled original features (X_train_scaled_orig).
# A separate estimator is fitted here so the raw-data baseline stored in
# `classifier` is not overwritten; re-running the raw-data prediction cell after
# this one therefore still uses the model that was fitted on the raw data.
classifier_scaled = LogisticRegression(max_iter=500)
classifier_scaled.fit(X_train_scaled_orig, y_train)

# X_test_scaled_upd holds the scaled original test features (it is built from X_test)
y_pred_scaled = classifier_scaled.predict(X_test_scaled_upd)

# checking balanced accuracy for scaled data
print(f"Balanced Accuracy Score (score is more useful given data imbalance): {balanced_accuracy_score(y_test, y_pred_scaled)}")

#not great, barely better than random

---

## Preprocess Data II - Undersampling Data

Due to the significant class imbalance and the low balanced accuracy scores above, I don't think it is worth attempting a RandomForest model until the data's balance is improved. The next step undersamples the data. I selected undersampling rather than oversampling because the majority class (163932) is over 6x larger than the minority class (26328). The minority class already has plenty of samples, and a model trained on it may predict better than one trained on a significant amount of synthetic data compared to the actual data available.

### - RandomUnderSampler Technique

In [None]:
# RandomUnderSampler comes from imblearn
from imblearn.under_sampling import RandomUnderSampler

# Sampler with a fixed seed so the resampling is repeatable
rus = RandomUnderSampler(random_state=63)

# Resample the unscaled training split
X_rus_resampled, y_rus_resampled = rus.fit_resample(X_train, y_train)

# Resample the scaled training split
X_rus_resampled_scaled, y_rus_resampled_scaled = rus.fit_resample(X_train_scaled_orig, y_train)

# Class counts after resampling, then before resampling
for labels in (y_rus_resampled, y_train):
    display(labels.value_counts())

### - ClusterCentroids Technique

In [None]:
from sklearn.cluster import KMeans
from imblearn.under_sampling import ClusterCentroids

# Instantiate instance.
# ClusterCentroids has no `n_init` argument of its own (passing it raises a
# TypeError); `n_init` is a KMeans setting, so it is set on the KMeans estimator
# that ClusterCentroids uses for clustering.
# NOTE(review): this fits one cluster per retained majority-class sample, which
# is likely to be slow on a training set of this size - confirm runtime is acceptable.
cc = ClusterCentroids(
    estimator=KMeans(n_init="auto", random_state=63),
    random_state=63,
)

# Fitting the training data
X_cc_resampled, y_cc_resampled = cc.fit_resample(X_train, y_train)

# Fitting the scaled training data
X_cc_resampled_scaled, y_cc_resampled_scaled = cc.fit_resample(X_train_scaled_orig, y_train)

# Class counts after resampling, then before resampling
display(y_cc_resampled.value_counts())
display(y_train.value_counts())

### - EditedNearestNeighbours Technique

In [None]:
from imblearn.under_sampling import EditedNearestNeighbours

# Sampler configured with 6 neighbours and the "auto" sampling strategy
enn = EditedNearestNeighbours(sampling_strategy="auto", n_neighbors=6)

# Resample the unscaled training split
X_enn_resampled, y_enn_resampled = enn.fit_resample(X_train, y_train)

# Resample the scaled training split
X_enn_resampled_scaled, y_enn_resampled_scaled = enn.fit_resample(X_train_scaled_orig, y_train)

# Class counts after resampling, then before resampling
for labels in (y_enn_resampled, y_train):
    display(labels.value_counts())

## Modeling - RandomForest \*\*\*LEFT OFF\*\*\*

In [None]:
#reviewing the randomforest model's accuracy using classification report
# raw_model = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train, y_train)
# y_predict_rfc_raw_test = raw_model.predict(X_test)
# print(classification_report(y_test, y_predict_rfc_raw_test))

In [None]:
#using confusion_matrix
# true neg  | false pos
# false neg | true pos

#print(confusion_matrix(y_test, y_predict_rfc_raw_test))

In [None]:
# display(balanced_accuracy_score(y_test, y_predict_rfc_raw_test))

# Ignore below - Scratch Paper 

In [None]:
# Oversampling data using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

smt = SMOTE(random_state = 42)
X_train_sm, y_train_sm = smt.fit_resample(X_train,y_train)
# NOTE(review): this also oversamples the test split, so anything scored on
# X_test_sm / y_test_sm is measured partly on synthetic rows - confirm intended.
X_test_sm, y_test_sm = smt.fit_resample(X_test,y_test)

#chatgpt suggested Counter to review the scale
print(Counter(y_train_sm))
print(Counter(y_test_sm))

# Class counts of the original RandomForest predictions, for comparison.
# y_predict_rfc_raw_test is only created by the RandomForest cell that is
# currently commented out, so it is printed only when that cell has been run;
# referencing it unconditionally raises a NameError on a fresh kernel.
if "y_predict_rfc_raw_test" in globals():
    print(Counter(y_predict_rfc_raw_test))


In [None]:
# Creating train/test data from the SMOTE-resampled values.
# NOTE(review): splitting after SMOTE lets synthetic rows derived from the same
# neighbours land on both sides of the split, which may inflate the scores below.

X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_train_sm, y_train_sm, random_state = 15)
X_train_smote2, X_test_smote2, y_train_smote2, y_test_smote2 = train_test_split(X_test_sm, y_test_sm, random_state = 15)

smt_model = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train_smote, y_train_smote)
smt_model2 = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train_smote2, y_train_smote2)

y_predicted_smt = smt_model.predict(X_test_smote)
y_predicted_smt2 = smt_model2.predict(X_test_smote2)


# The predictions were made on the held-out halves (X_test_smote / X_test_smote2),
# so they must be compared with the matching y_test_* labels. Comparing against
# y_train_* uses the wrong rows and fails because the lengths differ.

# model built from the resampled original training split
print(classification_report(y_test_smote, y_predicted_smt))

# model built from the resampled original test split
print(classification_report(y_test_smote2, y_predicted_smt2))


In [None]:
# y_predicted_smt / y_predicted_smt2 are predictions for X_test_smote / X_test_smote2,
# so they are scored against the matching y_test_* labels (the y_train_* labels
# belong to different rows and have a different length).
display(balanced_accuracy_score(y_test_smote, y_predicted_smt))
display(balanced_accuracy_score(y_test_smote2, y_predicted_smt2))

In [None]:
# Search space for the randomized search estimator.
# numpy is imported here because this notebook does not import it anywhere else,
# so `np` would otherwise be undefined on a fresh kernel.
import numpy as np

param_grid = {
    'n_neighbors': np.arange(1,20,2),      # odd values 1..19
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(1, 500)         # 1..499
}
param_grid

In [None]:
# Create the randomized search estimator
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier

# random_tuned_model was never defined in this notebook. The keys of param_grid
# (n_neighbors, weights, leaf_size) are KNeighborsClassifier hyperparameters, so
# that estimator is used as the model to tune.
# NOTE(review): confirm KNeighborsClassifier is the intended model.
random_tuned_model = KNeighborsClassifier()
random_clf = RandomizedSearchCV(random_tuned_model, param_grid, random_state=0, verbose=3)