In [96]:
import pandas as pd
import numpy as np
import sklearn

In [115]:
df = pd.read_csv("./Data-engineering/MLS-data-eng/archive/diabetes_binary_health_indicators_BRFSS2015.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [116]:
df.isna().sum()

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [117]:
df.isna().any(axis=1)

0         False
1         False
2         False
3         False
4         False
          ...  
253675    False
253676    False
253677    False
253678    False
253679    False
Length: 253680, dtype: bool

In [118]:
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [119]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression

train, test = train_test_split(df,
                               test_size=0.3,
                               stratify=df["Diabetes_binary"],
                               random_state=999)
len(train), len(test)

(177576, 76104)

In [120]:
train["Diabetes_binary"].value_counts()

Diabetes_binary
0.0    152834
1.0     24742
Name: count, dtype: int64

In [121]:
test["Diabetes_binary"].value_counts()

Diabetes_binary
0.0    65500
1.0    10604
Name: count, dtype: int64

In [122]:
152834/24742, 65500/10604 ## stratified sample method

(6.177107752000647, 6.176914371935119)

In [123]:
## Logistic Model with all feature in DataFrame
feature = train.drop("Diabetes_binary", axis=1).columns
diabetes = "Diabetes_binary"

x_train = train[feature]
y_train = train[diabetes]

x_test = test[feature]
y_test = test[diabetes]

model = LogisticRegression(solver='lbfgs', max_iter=500)
model.fit(x_train, y_train)

importance = permutation_importance(model, x_test, y_test)
importance = pd.Series(importance.importances_mean, index=feature)
importance.sort_values(ascending=False)
importance

HighBP                  0.001127
HighChol                0.000880
CholCheck               0.000058
BMI                     0.006746
Smoker                  0.000039
Stroke                 -0.000016
HeartDiseaseorAttack   -0.000202
PhysActivity           -0.000171
Fruits                 -0.000055
Veggies                -0.000029
HvyAlcoholConsump       0.000381
AnyHealthcare           0.000021
NoDocbcCost            -0.000058
GenHlth                 0.004767
MentHlth                0.000184
PhysHlth                0.000473
DiffWalk               -0.000200
Sex                     0.000289
Age                     0.000365
Education              -0.000218
Income                 -0.000426
dtype: float64

In [124]:
model.score(x_test, y_test)

0.8626484810259645

In [125]:
## Logistic Model without some features (minus feature importance) 
feature2 = train.drop(["Diabetes_binary","Stroke", "HeartDiseaseorAttack", "PhysActivity",
                       "Veggies", "AnyHealthcare", "NoDocbcCost",
                       "MentHlth", "DiffWalk", "Sex", "Age", "Education", "Income",
                       "Fruits", "Smoker"], axis=1).columns

x2_train = train[feature2]
y2_train = train[diabetes]

x2_test = test[feature2]
y2_test = test[diabetes]

model2 = LogisticRegression(solver='lbfgs', max_iter=500)
model2.fit(x2_train, y2_train)

importance2 = permutation_importance(model2, x2_test, y2_test)
importance2 = pd.Series(importance2.importances_mean, index=feature2)
importance.sort_values(ascending=False)
importance2

HighBP               0.002252
HighChol             0.002003
CholCheck            0.000307
BMI                  0.004780
HvyAlcoholConsump    0.000481
GenHlth              0.007739
PhysHlth             0.000586
dtype: float64

In [126]:
model2.score(x2_test, y2_test)

0.8634500157678966