In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.multioutput import MultiOutputClassifier 
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [2]:
X = pd.read_csv('data/training_set_features.csv')
y = pd.read_csv('data/training_set_labels.csv')

In [3]:
data = pd.concat([X,y], axis = 1)
data = data.drop('respondent_id', axis = 1)

In [4]:
data.shape

(26707, 37)

In [5]:
data.dtypes

h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                             object
income_poverty           

## Baseline Model
To get a baseline, we will use a logistic regression model. We will OneHotEncode all of the data and will fill missing values with the mode.

In [6]:
X = X.drop('respondent_id', axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [7]:
X_train.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
7184,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,fcxhlnwr,mxkfnird
142,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",0.0,0.0,dotnnunm,mxkfnird
24319,3.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,Below Poverty,Married,Own,Employed,mlyzmhmf,"MSA, Principle City",3.0,1.0,atmlpfrs,mxkfnird
20906,3.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Not in Labor Force,oxchjgsf,Non-MSA,1.0,0.0,,
17446,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,Non-MSA,1.0,0.0,,


In [8]:
baseline_imputer = SimpleImputer(strategy = 'most_frequent')
X_train = pd.DataFrame(baseline_imputer.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(baseline_imputer.transform(X_test), columns = X_test.columns)

In [9]:
dummies = OneHotEncoder(drop = 'first', sparse = False)
X_train = pd.DataFrame(dummies.fit_transform(X_train), columns = dummies.get_feature_names())
X_test = pd.DataFrame(dummies.transform(X_test), columns = dummies.get_feature_names())

Index(['x0_1.0', 'x0_2.0', 'x0_3.0', 'x1_1.0', 'x1_2.0', 'x2_1.0', 'x3_1.0',
       'x4_1.0', 'x5_1.0', 'x6_1.0',
       ...
       'x34_qxajmpny', 'x34_rcertsgn', 'x34_tfqavkke', 'x34_ukymxvdu',
       'x34_uqqtjvyb', 'x34_vlluhbov', 'x34_xgwztkwe', 'x34_xqwwgdyp',
       'x34_xtkaffoo', 'x34_xzmlyyjv'],
      dtype='object', length=118)