In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("./data/diabetes_binary_health_indicators_BRFSS2015.csv")
df = pd.DataFrame(df)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [4]:
# converting data to integer
for col in df.columns:
    df[col] = df[col].astype("int")
    
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,1,1,1,40,1,0,0,0,0,...,1,0,5,18,15,1,0,9,4,3
1,0,0,0,0,25,1,0,0,1,0,...,0,1,3,0,0,0,0,7,6,1
2,0,1,1,1,28,0,0,0,0,1,...,1,1,5,30,30,1,0,9,4,8
3,0,1,0,1,27,0,0,0,1,1,...,1,0,2,0,0,0,0,11,3,6
4,0,1,1,1,24,0,0,0,1,1,...,1,0,2,3,0,0,0,11,5,4


In [5]:
#checking the data type of the columns
display(df.dtypes)

Diabetes_binary         int32
HighBP                  int32
HighChol                int32
CholCheck               int32
BMI                     int32
Smoker                  int32
Stroke                  int32
HeartDiseaseorAttack    int32
PhysActivity            int32
Fruits                  int32
Veggies                 int32
HvyAlcoholConsump       int32
AnyHealthcare           int32
NoDocbcCost             int32
GenHlth                 int32
MentHlth                int32
PhysHlth                int32
DiffWalk                int32
Sex                     int32
Age                     int32
Education               int32
Income                  int32
dtype: object

In [6]:
#checking balance of diabetic vs not diabetic 
display(df["Diabetes_binary"].value_counts())

#significant imbalance

Diabetes_binary
0    218334
1     35346
Name: count, dtype: int64

In [7]:
#splitting the data for train test purpose

X = df.copy()
X = X.drop(columns="Diabetes_binary")
y = df["Diabetes_binary"]

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 10)

In [8]:
X_train.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
147485,1,0,1,30,0,0,0,0,0,1,...,1,0,3,5,0,0,0,7,5,5
155828,1,1,1,29,0,0,0,1,0,0,...,1,0,2,0,0,0,0,7,5,8
167688,0,0,1,23,0,0,0,1,0,1,...,1,0,2,0,25,0,0,9,6,8
43443,0,0,1,23,0,0,0,1,1,1,...,1,0,1,0,0,0,0,7,6,8
107094,1,1,1,34,0,0,0,1,0,0,...,1,0,3,0,0,0,0,8,4,6


In [9]:
#using logistic regression model first as a baseline

classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train, y_train)
display(classifier.score(X_train, y_train))
classifier.score(X_test, y_test)


0.8644591611479029

0.8611794386628824

In [22]:
#Checking balanced accuracy on the test set
y_pred = classifier.predict(X_test)
display(balanced_accuracy_score(y_test, y_pred))

#not great, barely better than random

0.565982136923413

In [21]:
#reviewing the randomforest model's accuracy using classification report
raw_model = RandomForestClassifier(random_state = 32, n_estimators = 100).fit(X_train, y_train)
y_rand_raw = raw_model.predict(X_test)
print(classification_report(y_test, y_rand_raw))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92     54402
           1       0.50      0.17      0.26      9018

    accuracy                           0.86     63420
   macro avg       0.69      0.57      0.59     63420
weighted avg       0.82      0.86      0.83     63420



In [24]:
#using confusion_matrix
# true neg  | false posi
# false neg | true pos

print(confusion_matrix(y_test, y_rand_raw))

[[52840  1562]
 [ 7441  1577]]


In [None]:
# using the SVC model

# model = SVC()
# model.fit(X_train, y_train)


In [None]:
# model.score(X_train, y_train)