In [6]:
import pandas as pd
import numpy as np
import sklearn

In [8]:
df = pd.read_csv("../Data-engineering/MLS-data-eng/archive/diabetes_binary_health_indicators_BRFSS2015.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253680 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       253680 non-null  float64
 1   HighBP                253680 non-null  float64
 2   HighChol              253680 non-null  float64
 3   CholCheck             253680 non-null  float64
 4   BMI                   253680 non-null  float64
 5   Smoker                253680 non-null  float64
 6   Stroke                253680 non-null  float64
 7   HeartDiseaseorAttack  253680 non-null  float64
 8   PhysActivity          253680 non-null  float64
 9   Fruits                253680 non-null  float64
 10  Veggies               253680 non-null  float64
 11  HvyAlcoholConsump     253680 non-null  float64
 12  AnyHealthcare         253680 non-null  float64
 13  NoDocbcCost           253680 non-null  float64
 14  GenHlth               253680 non-null  float64
 15  

In [9]:
df.isna().sum()

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [10]:
df.isna().any(axis=1)

0         False
1         False
2         False
3         False
4         False
          ...  
253675    False
253676    False
253677    False
253678    False
253679    False
Length: 253680, dtype: bool

In [24]:
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 229474 entries, 0 to 253679
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_binary       229474 non-null  float64
 1   HighBP                229474 non-null  float64
 2   HighChol              229474 non-null  float64
 3   CholCheck             229474 non-null  float64
 4   BMI                   229474 non-null  float64
 5   Smoker                229474 non-null  float64
 6   Stroke                229474 non-null  float64
 7   HeartDiseaseorAttack  229474 non-null  float64
 8   PhysActivity          229474 non-null  float64
 9   Fruits                229474 non-null  float64
 10  Veggies               229474 non-null  float64
 11  HvyAlcoholConsump     229474 non-null  float64
 12  AnyHealthcare         229474 non-null  float64
 13  NoDocbcCost           229474 non-null  float64
 14  GenHlth               229474 non-null  float64
 15  MentH

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression

train, test = train_test_split(df,
                               test_size=0.3,
                               stratify=df["Diabetes_binary"],
                               random_state=999)
len(train), len(test)

(160631, 68843)

In [26]:
train["Diabetes_binary"].value_counts()

Diabetes_binary
0.0    136063
1.0     24568
Name: count, dtype: int64

In [27]:
test["Diabetes_binary"].value_counts()

Diabetes_binary
0.0    58314
1.0    10529
Name: count, dtype: int64

In [28]:
152834/24742, 65500/10604 ## stratified sample method

(6.177107752000647, 6.176914371935119)

In [29]:
## Logistic Model with all feature in DataFrame
feature = train.drop("Diabetes_binary", axis=1).columns
diabetes = "Diabetes_binary"

x_train = train[feature]
y_train = train[diabetes]

x_test = test[feature]
y_test = test[diabetes]

model = LogisticRegression(solver='lbfgs', max_iter=500)
model.fit(x_train, y_train)

importance = permutation_importance(model, x_test, y_test)
importance = pd.Series(importance.importances_mean, index=feature)
importance.sort_values(ascending=False)
importance

HighBP                  0.001845
HighChol                0.001691
CholCheck               0.000177
BMI                     0.007333
Smoker                 -0.000110
Stroke                 -0.000032
HeartDiseaseorAttack    0.000224
PhysActivity           -0.000064
Fruits                 -0.000119
Veggies                -0.000020
HvyAlcoholConsump       0.000497
AnyHealthcare          -0.000046
NoDocbcCost            -0.000017
GenHlth                 0.006063
MentHlth                0.000110
PhysHlth                0.000285
DiffWalk                0.000116
Sex                     0.000206
Age                     0.000410
Education              -0.000293
Income                 -0.000046
dtype: float64

In [30]:
model.score(x_test, y_test)

0.8505875688160016

In [31]:
from sklearn.metrics import f1_score

y_pred = model.predict(x_test)
f1_score(y_test, y_pred)

np.float64(0.23603683897801545)

In [32]:
## Logistic Model without some features (minus feature importance) 
feature2 = train.drop(["Diabetes_binary","Stroke", "HeartDiseaseorAttack", "PhysActivity",
                       "Veggies", "AnyHealthcare", "NoDocbcCost",
                       "MentHlth", "DiffWalk", "Sex", "Age", "Education", "Income",
                       "Fruits", "Smoker"], axis=1).columns

x2_train = train[feature2]
y2_train = train[diabetes]

x2_test = test[feature2]
y2_test = test[diabetes]

model2 = LogisticRegression(solver='lbfgs', max_iter=500)
model2.fit(x2_train, y2_train)

importance2 = permutation_importance(model2, x2_test, y2_test)
importance2 = pd.Series(importance2.importances_mean, index=feature2)
importance.sort_values(ascending=False)
importance2

HighBP               0.003344
HighChol             0.003167
CholCheck            0.000305
BMI                  0.005749
HvyAlcoholConsump    0.000578
GenHlth              0.009233
PhysHlth             0.000456
dtype: float64

In [33]:
model2.score(x2_test, y2_test)

0.8512702816553608