# Model for Prediction of H1N1 Vaccination Likelihood

## Import Dependencies

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import numpy as np
import pandas as pd

## Data Import and Pre-Processing

In [2]:
# Import features training data set  
features_data = os.path.join("Data", "training_set_features.csv")
features_df = pd.read_csv(features_data, index_col="respondent_id")
features_df.head()

# Import features testing data set  
features_testing_data = os.path.join("Data", "test_set_features.csv")
features_testing_df = pd.read_csv(features_testing_data, index_col="respondent_id")
features_testing_df.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [3]:
# Import labels training data set, leave only H1N1 label
labels_data = os.path.join("Data", "training_set_labels.csv")
labels = pd.read_csv(labels_data, index_col="respondent_id")
labels_df = labels.drop(columns=['seasonal_vaccine'])
labels_df.head()

Unnamed: 0_level_0,h1n1_vaccine
respondent_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


### Process training dataset

In [4]:
# Separate dataframes into 2: one with object data columns and one with numerical data columns 
object_features_df = features_df.select_dtypes(include=['object'])
numerical_features_df = features_df.select_dtypes(exclude=['object'])

# Drop columns with meaningless data, check mode of each column
object_features_df = object_features_df.drop(["hhs_geo_region", 'employment_industry', "employment_occupation"], axis=1)
object_df_mode = object_features_df.mode()

numerical_df_median_1 = numerical_features_df.median(skipna=True)
numerical_df_median_2 = pd.DataFrame(numerical_df_median_1)
numerical_df_median = numerical_df_median_2.transpose()

# Impute NaN values in columns with mode (most frequent value) of respective column
for i in object_features_df.columns.values:
    object_features_df[i] = object_features_df[i].fillna(value=object_df_mode[i].iloc[0])
    
for i in numerical_features_df.columns.values:
    numerical_features_df[i] = numerical_features_df[i].fillna(value=numerical_df_median[i].iloc[0])

numerical_features_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,1.0,2.0,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,4.0,4.0,2.0,5.0,4.0,2.0,0.0,0.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,0.0


### Process testing dataset

In [5]:
# Separate dataframes into 2: one with object data columns and one with numerical data columns 
object_features_testing_df = features_testing_df.select_dtypes(include=['object'])
numerical_features_testing_df = features_testing_df.select_dtypes(exclude=['object'])

# Drop columns with meaningless data, check mode of each column
object_features_testing_df = object_features_testing_df.drop(["hhs_geo_region", 'employment_industry', "employment_occupation"], axis=1)
object_testing_df_mode = object_features_testing_df.mode()

numerical_testing_df_median_1 = numerical_features_testing_df.median(skipna=True)
numerical_testing_df_median_2 = pd.DataFrame(numerical_testing_df_median_1)
numerical_testing_df_median = numerical_testing_df_median_2.transpose()

# Impute NaN values in columns with mode (most frequent value) of respective column
for i in object_features_testing_df.columns.values:
    object_features_testing_df[i] = object_features_testing_df[i].fillna(value=object_testing_df_mode[i].iloc[0])
    
for i in numerical_features_testing_df.columns.values:
    numerical_features_testing_df[i] = numerical_features_testing_df[i].fillna(value=numerical_testing_df_median[i].iloc[0])

numerical_features_testing_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,5.0,1.0,1.0,5.0,1.0,1.0,1.0,0.0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,1.0,1.0,4.0,1.0,1.0,3.0,0.0
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,2.0,5.0,4.0,4.0,1.0,0.0
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,4.0,2.0,1.0,0.0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,5.0,2.0,4.0,4.0,4.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,4.0,1.0,1.0,5.0,2.0,2.0,1.0,3.0
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,4.0,3.0,1.0,4.0,3.0,1.0,1.0,0.0
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0,0.0


### Encode training dataset

In [6]:
# Encode each category in the objects dataframe 
from sklearn.preprocessing import LabelEncoder

# Make label encoder for each column
cat_object_features_df = object_features_df.apply(LabelEncoder().fit_transform)
cat_object_features_df.head()

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,1,3,0,2,1,0,1,2
1,1,0,3,1,2,1,1,0,0
2,0,2,3,1,0,1,0,0,0
3,4,0,3,0,2,1,1,1,1
4,2,3,3,0,0,0,0,0,0


In [7]:
# Make one-hot encoding for non-binary columns
encoded_df = pd.get_dummies(cat_object_features_df, columns=["age_group", "education", "race", 
                                                             "income_poverty","employment_status",
                                                             "census_msa"])
encoded_df.head()

Unnamed: 0_level_0,sex,marital_status,rent_or_own,age_group_0,age_group_1,age_group_2,age_group_3,age_group_4,education_0,education_1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,0,0,0,0,1,0,0,1,...,1,0,0,1,0,1,0,0,0,1
1,1,1,1,0,1,0,0,0,1,0,...,1,0,0,1,1,0,0,1,0,0
2,1,1,0,1,0,0,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0
3,0,1,1,0,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0


In [8]:
# Assert the numerical_features_df, encoded_df and labels_df rows match in order to join arrays
np.testing.assert_array_equal(encoded_df.index.values, labels_df.index.values)
np.testing.assert_array_equal(encoded_df.index.values, numerical_features_df.index.values)

In [9]:
# Join arrays
joined_df = numerical_features_df.join(encoded_df)
training_df = joined_df.join(labels_df)
training_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2,h1n1_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,0,1,0,1,0,0,0,1,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,1,1,0,0,1,0,0,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0,1,0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,1,0,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,1,0,0,0,1,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1,0,0,1,0,0,0,1,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1,0,0,1,0,0,1,0,0,0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1,0,0,1,0,0,0,0,1,0


### Encode testing dataset

In [10]:
# Encode each category in the objects dataframe 
from sklearn.preprocessing import LabelEncoder

# Make label encoder for each column
cat_object_features_testing_df = object_features_testing_df.apply(LabelEncoder().fit_transform)
cat_object_features_testing_df.head()

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
26707,1,2,1,0,1,1,1,0,0
26708,0,0,3,1,2,1,1,0,2
26709,3,2,3,1,1,0,0,0,2
26710,4,0,3,0,0,0,0,1,0
26711,1,0,0,0,0,1,0,0,2


In [11]:
# Make one-hot encoding for non-binary columns
encoded_testing_df = pd.get_dummies(cat_object_features_testing_df, columns=["age_group", "education", "race", 
                                                                             "income_poverty","employment_status",
                                                                             "census_msa"])
encoded_testing_df.head()

Unnamed: 0_level_0,sex,marital_status,rent_or_own,age_group_0,age_group_1,age_group_2,age_group_3,age_group_4,education_0,education_1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,0,1,1,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
26708,1,1,1,1,0,0,0,0,1,0,...,1,0,0,1,1,0,0,0,0,1
26709,1,0,0,0,0,0,1,0,0,0,...,1,0,1,0,1,0,0,0,0,1
26710,0,0,0,0,0,0,0,1,1,0,...,1,1,0,0,0,1,0,1,0,0
26711,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1


In [12]:
# Assert the numerical_features_testing_df, and encoded_testing_df rows match in order to join arrays
np.testing.assert_array_equal(encoded_testing_df.index.values, numerical_features_testing_df.index.values)

In [13]:
# Join arrays
testing_df = numerical_features_testing_df.join(encoded_testing_df)
testing_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,1,0,1,0,0,1,0,0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,1,0,0,0,0,1
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1,0,1,0,1,0,0,0,0,1
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,1,0,0,0,1,0,1,0,0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1,1,0,0,1,0,0,0,1,0
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1,0,0,1,1,0,0,0,0,1
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,1,0,1,0,0
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1,1,0,0,0,1,0,1,0,0


## Data Exploration

In [14]:
# Examine distribution of target variable
H1N1_count_gb = training_df["h1n1_vaccine"].value_counts(normalize=True)
H1N1_count_gb
# H1N1 flu vaccine target has moderately imbalanced classes, can proceed with model building

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

In [15]:
# Examine pearson correlation matrix to see if some variables should be excluded from analysis
training_df.corr(method='pearson').unstack().sort_values().drop_duplicates()
# Customary to keep variables with correlation coefficients smaller than absolute value of 0.8

employment_status_0          employment_status_1       -0.893544
income_poverty_1             income_poverty_0          -0.786905
race_3                       race_0                    -0.577294
census_msa_1                 census_msa_0              -0.568035
census_msa_2                 census_msa_0              -0.534093
                                                          ...   
age_group_4                  employment_status_1        0.498880
opinion_seas_risk            opinion_h1n1_risk          0.562976
behavioral_large_gatherings  behavioral_outside_home    0.580881
doctor_recc_h1n1             doctor_recc_seasonal       0.603152
h1n1_concern                 h1n1_concern               1.000000
Length: 1177, dtype: float64

In [16]:
# Keep the employment status feature which has more members
print(f"Employment_status_0: {training_df['employment_status_0'].sum()}")
print(f"Employment_status_1: {training_df['employment_status_1'].sum()}")

# Keep empoyment status 0
training_df = training_df.drop(['employment_status_1'], axis=1)
training_df.head()

Employment_status_0: 15023
Employment_status_1: 10231


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_2,census_msa_0,census_msa_1,census_msa_2,h1n1_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1,0,0,1,0,0,0,0,1,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1,0,0,1,1,0,1,0,0,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,1,0,1,0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,1,0,0,1,0,1,0,0,0


In [17]:
# Determine which features to keep for model

# Separate data by those that got H1N1 vaccine (H1N1_1) and those that didn't (H1N1_0)
H1N1_1_df = training_df.loc[training_df["h1n1_vaccine"] == 1]
H1N1_0_df = training_df.loc[training_df["h1n1_vaccine"] == 0]

# Find means for each feature in two datasets
H1N1_1_mean = H1N1_1_df.mean()
H1N1_0_mean = H1N1_0_df.mean()

# Calculate normalized differences of averages of the features for the two datasets
# Values that differ most between two datasets are more likely to be useful as a differentiator between the two classes  
differences = (H1N1_1_mean.subtract(H1N1_0_mean))/ H1N1_1_mean
normalized_diffs_avg = differences.abs().sort_values()
normalized_diffs_avg

employment_status_0            0.000918
census_msa_2                   0.002697
census_msa_0                   0.003615
census_msa_1                   0.007753
household_children             0.011059
opinion_seas_sick_from_vacc    0.012702
household_adults               0.014950
race_2                         0.023739
health_insurance               0.026440
race_1                         0.027330
education_3                    0.028024
race_3                         0.037262
behavioral_large_gatherings    0.056643
sex                            0.064188
behavioral_avoidance           0.066374
behavioral_outside_home        0.071560
behavioral_wash_hands          0.078527
age_group_4                    0.079559
income_poverty_0               0.081030
age_group_1                    0.094309
opinion_h1n1_sick_from_vacc    0.096889
opinion_seas_vacc_effective    0.106545
behavioral_touch_face          0.108932
age_group_2                    0.115564
h1n1_knowledge                 0.126779


In [18]:
# Keep only features with normalized differences of averages greater than 0.10
columns_to_drop = normalized_diffs_avg.loc[lambda x: x < 0.1].keys().tolist()
print(f"Count of columns dropped: {len(columns_to_drop)}")

training_df = training_df.drop(columns=columns_to_drop, axis=1)
training_df.head()

Count of columns dropped: 21


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_face_mask,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,...,age_group_2,age_group_3,education_0,education_1,education_2,race_0,income_poverty_1,income_poverty_2,employment_status_2,h1n1_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,1,0,0,0,1,0,0
1,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,1,0,0
4,2.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [19]:
# Transform testing dataset
testing_df = testing_df.drop(['employment_status_1'], axis=1)
testing_df = testing_df.drop(columns=columns_to_drop, axis=1)
testing_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_face_mask,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,child_under_6_months,health_worker,...,age_group_0,age_group_2,age_group_3,education_0,education_1,education_2,race_0,income_poverty_1,income_poverty_2,employment_status_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,0,1,0
26709,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,0,1,0,0
26710,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
26711,3.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0
53411,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1,0,0,1,0,0,0,0,1,0
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,1,0
53413,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,0,1,0,0,0,0,0,0,0


## Model Building

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [21]:
# Split training dataset into X(features) and y(label)
X = training_df.drop('h1n1_vaccine', axis=1)
y = training_df['h1n1_vaccine'].values.reshape(-1,1)
print(X.shape, y.shape)

(26707, 26) (26707, 1)


In [22]:
# Split data into training and validation
X_train, X_eval, y_train, y_eval = train_test_split(X, y, random_state=42)

### Fit model

In [23]:
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Fit(train) model using training data
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# Test model using validation dataset
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Evaluation Data Score: {classifier.score(X_eval, y_eval)}")

Training Data Score: 0.8334997503744384
Evaluation Data Score: 0.8398981578553243


In [26]:
# Make predictions on validation dataset
predictions = classifier.predict_proba(X_eval)
preds = predictions[:, 1].tolist()
preds

[0.10124236298821268,
 0.0998796322664861,
 0.08416560330104253,
 0.21026187615004538,
 0.15171627786321235,
 0.7245229923469841,
 0.18091775095349566,
 0.5032158391613019,
 0.05912512105835711,
 0.6128856810648406,
 0.42617218688939495,
 0.07244395350928473,
 0.0808748019233434,
 0.0632851347709696,
 0.07087145767732908,
 0.04035076480589866,
 0.26070912061190094,
 0.06048948722711226,
 0.08387209992406243,
 0.3063803509904693,
 0.711391074464071,
 0.04956900002089506,
 0.10798735737107507,
 0.012186161827628824,
 0.12748213567816688,
 0.04433237842810664,
 0.1755068236430977,
 0.20029193598021472,
 0.06662378036218405,
 0.17792438549829426,
 0.05896537370637206,
 0.05838162943545066,
 0.0372589580716181,
 0.14060687464194802,
 0.03315500895861523,
 0.12004258278093088,
 0.21620348945562656,
 0.889861031675426,
 0.11115457839310215,
 0.04730599419831341,
 0.12802873990896696,
 0.25833754877960585,
 0.07618430219148165,
 0.10921758514224153,
 0.5103743727588828,
 0.013144652492805874,


In [27]:
# Check AUC score on validation dataset
roc_auc_score(y_eval, preds)

0.8285680935731516

### Retrain model on full dataset

In [28]:
classifier.fit(X, y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Model Testing

In [29]:
test_predictions = classifier.predict_proba(testing_df)
test_predictions[:,:1]

array([[0.87334144],
       [0.9551204 ],
       [0.53102949],
       ...,
       [0.87433851],
       [0.94396263],
       [0.48785841]])

## Submission to DataDriven.org

https://www.drivendata.org/competitions/66/flu-shot-learning/page/210/

In [30]:
# Open submission file
submission_file = os.path.join("Data", "submission_format.csv")
submission = pd.read_csv(submission_file, index_col = "respondent_id")

# Assert the testing_df, and submission rows match in order to join arrays
np.testing.assert_array_equal(testing_df.index.values, submission.index.values)

In [31]:
# Populate submission file with H1N1 predictions
submission['h1n1_vaccine'] = test_predictions[:,:1]
submission

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.873341,0.7
26708,0.955120,0.7
26709,0.531029,0.7
26710,0.536045,0.7
26711,0.755093,0.7
...,...,...
53410,0.596905,0.7
53411,0.905376,0.7
53412,0.874339,0.7
53413,0.943963,0.7


In [32]:
# Save submission
submission.to_csv("my_submission.csv", index=True)
!head my_submission.csv

respondent_id,h1n1_vaccine,seasonal_vaccine
26707,0.873341441453652,0.7
26708,0.9551204039233024,0.7
26709,0.5310294908979329,0.7
26710,0.5360449491064614,0.7
26711,0.7550928761396916,0.7
26712,0.596313615461277,0.7
26713,0.6612621890371435,0.7
26714,0.8931724170963743,0.7
26715,0.9619812979871755,0.7
