In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Diabetes in America

Diabetes is an enormous public health problem: it is estimated to affect more than 9% of U.S. adults.

The goal of this project is to build a machine learning model that estimates the risk of having diabetes based on a few simple variables (income, education, age, BMI, etc.).

## The BRFSS survey

The Behavioral Risk Factor Surveillance System (BRFSS) survey asks the participants a number of questions regarding health and health-related behaviours.
Of the many questions asked of the participants, one asks whether the respondent has diabetes.

Navigate to the [Behavioral Risk Factor Surveillance System data portal](https://www.cdc.gov/brfss/annual_data/annual_Data.htm), and download the 2019 BRFSS Survey [Data](https://www.cdc.gov/brfss/annual_data/2019/files/LLCP2019ASC.zip).
Unzip the archive to extract the file LLCP2019.ASC, and place it in your `Data` folder.

The following cell will display the first row of the data file:

In [None]:
# Relative path to the unzipped BRFSS file.
# A forward slash works on Windows, macOS and Linux (a backslash only works on Windows).
path = 'Data/LLCP2019.ASC'
with open(path) as file:
    # read only the first record instead of loading the whole file
    first_line = file.readline()
print(first_line)

Each record in the BRFSS file is a string without delimiters to identify variables (i.e., the file format is *fixed-width*).
Variables are located at established positions in the string.
The [codebook](https://www.cdc.gov/brfss/annual_data/2019/pdf/codebook19_llcp-v2-508.HTML) describes the variables and field positions.

The following table contains the positions for several variables

| Variable | Start | End |
| :- | :-: | :-: |
| Diabetes | 127 | 127
| General health | 101 | 101
| Education level | 174 | 174
| Employment status | 188 | 188
| Income level | 191 | 192
| Weight (in Pounds) | 193 | 196
| Height (in ft/inches) | 197 | 200
| Smoking Status | 2007 | 2007
| Alcohol consumption (drinks per week) | 2014 | 2018
| Heavy drinkers | 2019 | 2019
| Physical activity | 2101 | 2101
| Physical activities to Strengthen your muscles | 240 | 242
| Body Mass Index (BMI) | 1998 | 2001
| Reported age (in five-year age categories) | 1981 | 1982
| Sex | 1980 | 1980
| Number of children in household | 2004 | 2004
| Race | 1979 | 1979
| Metropolitan Status | 1402 | 1402

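Don't forget that Python uses zero-indexing to reference characters in a string, so you will have to adjust the values in the table accordingly. For example, the diabetes variable (position 127) is `line[126]`, and the income level (positions 191-192) is `line[190:192]`.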

## Step 1: Create a DataFrame

In [None]:
# Diabetes dictionary: one entry per survey record, keyed by the 0-based record number.
# Every field is sliced out of the fixed-width line (table position minus 1, because
# Python strings are zero-indexed); codes that carry no usable answer become NaN.
diabetes_dict = {}
with open(path) as file:
    for counter, line in enumerate(file):  # (0, line 1), (1, line 2), (2, line 3), ...
        # education level (ordinal), position 174; '9' and blank are treated as missing
        education = line[173]
        if education in ['9', ' ']:
            education = np.nan
        else:
            education = int(education)
        # employment status (categorical), position 188
        # NOTE(review): '9' and blank are treated as missing, mirroring education --
        # confirm the non-response codes for this variable against the codebook.
        employment = line[187]
        if employment in ['9', ' ']:
            employment = np.nan
        else:
            employment = int(employment)
        # income level (ordinal), positions 191-192; '77', '99' and blank are treated as missing
        income = line[190:192]
        if income in ['77', '99', '  ']:
            income = np.nan
        else:
            income = int(income)
        # reported age in five-year categories (ordinal), positions 1981-1982;
        # category '14' is treated as missing
        age = line[1980:1982]
        if age == '14':
            age = np.nan
        else:
            age = int(age)
        # computed body mass index (numerical), positions 1998-2001
        bmi = line[1997:2001]
        if bmi == '    ':  # 4 blank spaces
            bmi = np.nan
        else:
            bmi = float(bmi) / 100  # 2 implied decimal places
        # diabetes (binary target), position 127
        diabetes_flag = line[126]
        if diabetes_flag == '1':
            diabetes_flag = 1  # yes
        elif diabetes_flag in ['2', '3', '4']:
            diabetes_flag = 0  # no
        else:
            diabetes_flag = np.nan  # any other code is treated as missing
        diabetes_dict[counter] = {
            'education': education,
            'employment': employment,
            'income': income,
            'age': age,
            'bmi': bmi,
            'diabetes': diabetes_flag,
        }

In [None]:
# Turn the {record number: {column: value}} dictionary into a table, one row per record
diabetes = pd.DataFrame.from_dict(data=diabetes_dict, orient='index')
# preview the first 20 records
diabetes.head(n=20)

In [None]:
# Save the dataframe to disk.
# to_csv returns None when given a path, so its result must not be assigned back to
# `diabetes` -- the later cells still need the DataFrame.
diabetes.to_csv('Data/diabetes.csv')

In [None]:
# number of missing values in each column
diabetes.isna().sum()

In [None]:
# Drop rows whose diabetes label is missing: the target cannot be imputed.
# Reassigning (instead of inplace=True) keeps the cell a plain expression of its input.
diabetes = diabetes.dropna(subset=['diabetes'])

## Step 2: Machine Learning pipeline

**goal**: predict whether a person has diabetes

In [None]:
# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

# models
from sklearn.linear_model import SGDClassifier # classifier for large datasets
from sklearn.neighbors import KNeighborsClassifier
# evaluation metrics
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,precision_score, precision_score, balanced_accuracy_score, f1_score

# cross validation
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split

# grid search 
from sklearn.model_selection import GridSearchCV

# Display the pipeline: with display='diagram', estimators and pipelines are rendered
# as diagrams when a cell outputs them (instead of their plain-text repr)
from sklearn import set_config                     
set_config(display='diagram')

In [None]:
# target vector: the diabetes label
y = diabetes['diabetes']
# feature matrix: the predictor columns
features = ['education', 'income', 'age', 'bmi']
X = diabetes.loc[:, features]

In [None]:
# Train/test split.
# random_state makes the split (and therefore every metric below) reproducible;
# stratify=y keeps the share of diabetes cases equal in both sets, which matters
# because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# numerical features: fill missing values with the column median, then scale
numeric_features = ['bmi']
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_processor = Pipeline(steps=numeric_steps)

In [None]:
# ordinal features: fill missing values with the most frequent category, then scale
ordinal_features = ['education', 'income', 'age']
ordinal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', MaxAbsScaler()),
]
ordinal_processor = Pipeline(steps=ordinal_steps)

In [None]:
# combined preprocessor: route each group of columns to its own sub-pipeline
column_transformers = [
    ('num', numeric_processor, numeric_features),
    ('ord', ordinal_processor, ordinal_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# pipeline = preprocessing (imputer + scaler) + polynomial features + ML model
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly_features', PolynomialFeatures(degree=2)),
    # Logistic regression trained with stochastic gradient descent.
    # The loss is spelled 'log_loss': the old name 'log' was deprecated in
    # scikit-learn 1.1 and removed in 1.3. random_state fixes the shuffling so
    # that the fitted model is reproducible.
    ('clf', SGDClassifier(loss='log_loss', alpha=1, n_jobs=-1,
                          class_weight='balanced', random_state=42)),
])

We have an imbalanced dataset: most people in the survey do not have diabetes.

In [None]:
# number of records in each class of the target
diabetes['diabetes'].value_counts()

`class_weight='balanced'` places more emphasis on the minority class by weighting each class inversely to its frequency.

In [None]:
# fit the preprocessing steps and the classifier on the training set
pipe.fit(X=X_train, y=y_train)

## Step 3: assess the accuracy of the model

In [None]:
# predicted labels for the held-out test set
y_test_pred = pipe.predict(X=X_test)

In [None]:
# accuracy: the fraction of test records that are classified correctly
accuracy_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# recall: if a person has diabetes, how often is our prediction correct?
recall_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# precision: if we predict that a person has diabetes, how often is the prediction correct?
precision_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
# confusion matrix: rows are the true classes, columns are the predicted classes
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

Too many false positives!

##  Assignment: Improve the diabetes predictor

Ideas:

- add more features (employment status, general health, smoking status, alcoholic drinks per week, physical activity, sex, race, etc)
- tune the hyperparameters (alpha, degree, etc.)
- Adjust the classification threshold (see the "classification threshold" notebook)