In [None]:
pip install fancyimpute

In [1]:
# Data information: https://www.drivendata.org/competitions/66/flu-shot-learning/page/211/

# Install Dependencies

In [18]:
import os
import sys
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.multioutput import MultiOutputClassifier
from pandas.api.types import CategoricalDtype

In [21]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26707 entries, 0 to 26706
Data columns (total 36 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

In [22]:
training_data = pd.read_csv('training_set_features.csv')
training_labels = pd.read_csv('training_set_labels.csv')
training_data.head()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [5]:
training_labels[['h1n1_vaccine','seasonal_vaccine']]
training_labels.groupby(['h1n1_vaccine','seasonal_vaccine']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,h1n1_vaccine,seasonal_vaccine,count
0,0,0,13295
1,0,1,7738
2,1,0,977
3,1,1,4697


# EDA & Missing Data Removal

In [None]:
ProfileReport(training_data)

In [51]:
# Drop where household_adults or household_children are NaN; cannot coerce to int if NaN exists
training_data = training_data.dropna(axis=0, subset=['household_adults'])
training_data = training_data.dropna(axis=0, subset=['household_children'])

# Employment industry & occupation have nearly 50% missing data, so we can't reasonably impute
# -- We are dropping these fields from the data

del training_data['employment_industry']
del training_data['employment_occupation']
del training_data['health_insurance']
training_data.dropna()

In [52]:
# Create ordered categories where it applies

cat_concern   = CategoricalDtype(categories=['0','1','2','3'], ordered=True)
cat_knowledge = CategoricalDtype(categories=['0','1','2'], ordered=True)
cat_opinion   = CategoricalDtype(categories=['1','2','3','4','5'], ordered=True)
cat_age       = CategoricalDtype(categories=['18 - 34 Years','35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'], ordered=True)
cat_edu       = CategoricalDtype(categories=['<12 Years','12 Years','Some College','College Graduate'], ordered=True)
cat_income    = CategoricalDtype(categories=['Below Poverty','<= $75,000, Above Poverty','> $75,000'], ordered=True)

In [76]:

##### Coerce Data Types


training_data = training_data.astype({
   'respondent_id'                 : 'object' # respondent_id
    ,'h1n1_concern'                :  'category' # h1n1+concern
    ,'h1n1_knowledge'              :  'category' # h1n1_knowledge
    ,'behavioral_antiviral_meds'   : 'bool' # behavioral_antiviral_meds
    ,'behavioral_avoidance'        : 'bool' # behavioral_avoidance
    ,'behavioral_face_mask'        : 'bool' # behavioral_face_mask
    ,'behavioral_wash_hands'       : 'bool' # behavioral_wash_hands
    ,'behavioral_large_gatherings' : 'bool' # behavioral_large_gatherings
    ,'behavioral_outside_home'     : 'bool' # behavioral_outside_home
    ,'behavioral_touch_face'       : 'bool' # behavioral_touch_face
    ,'doctor_recc_h1n1'            : 'bool' # doctor_recc_h1n1
    ,'doctor_recc_seasonal'        : 'bool' # doctor_recc_seasonal
    ,'chronic_med_condition'       : 'bool' # chronic_med_condition
    ,'child_under_6_months'        : 'bool' # child_under_6_months
    ,'health_worker'               : 'bool' # health_worker
    #,'health_insurance'            : 'bool' # health_insurance
    ,'opinion_h1n1_vacc_effective' :  'category' # opinion_h1n1_vacc_effective
    ,'opinion_h1n1_risk'           :  'category' # opinion_h1n1_risk
    ,'opinion_h1n1_sick_from_vacc' :  'category' # opinion_h1n1_sick_from_vacc
    ,'opinion_seas_vacc_effective' :  'category' # opinion_seas_vacc_effective
    ,'opinion_seas_risk'           :  'category' # opinion_seas_risk
    ,'opinion_seas_sick_from_vacc' :  'category' # opinion_sas_sick_from_vacc
    ,'age_group'                   :  'category' # age_group
    ,'education'                   :  'category' # education
    ,'race'                        : 'category' # race
    ,'sex'                         : 'category' # sex
    ,'income_poverty'              :  cat_income # income-poverty
    ,'marital_status'              : 'category' # marital_status
    ,'rent_or_own'                 : 'category' # rent_or_own
    ,'employment_status'           : 'category' # employment_status
    ,'hhs_geo_region'              : 'category' # hhs_geo_region
    ,'census_msa'                  : 'category' # census_msa
    ,'household_adults'            : 'int64' # household_adults
    ,'household_children'          : 'int64' # household_children
    #,'employment_industry'         : 'category' # employment_industry
    #,'employment_occupation'       : 'category' # emlpoyment_occupation
    })

training_data['h1n1_knowledge'].astype(cat_knowledge)
training_data['income_poverty'].astype(cat_income)
training_data['opinion_h1n1_vacc_effective']
training_data['opinion_h1n1_risk'].astype(cat_opinion)
training_data['opinion_h1n1_sick_from_vacc'].astype(cat_opinion)
training_data['opinion_seas_vacc_effective'].astype(cat_opinion)
training_data['opinion_seas_risk'].astype(cat_opinion)
training_data['opinion_seas_sick_from_vacc'].astype(cat_opinion)
training_data['age_group'].astype(cat_age)
training_data['education'].astype(cat_edu)

0                     NaN
1                12 Years
3                12 Years
4            Some College
5                12 Years
               ...       
26700            12 Years
26701    College Graduate
26702        Some College
26703    College Graduate
26706        Some College
Name: education, Length: 19642, dtype: category
Categories (4, object): [<12 Years < 12 Years < Some College < College Graduate]

# Modeling

## Scikit-Learn MultiOutput Classifier
Has the advantage of outputting both the H1N1 prediction and seasonal prediction.