# Model for Prediction of H1N1 and Seasonal Flu Vaccination Likelihood

## Import Dependencies

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import numpy as np
import pandas as pd

## Data Import and Pre-Processing

In [2]:
# Import features training data set  
features_data = os.path.join("Data", "training_set_features.csv")
features_df = pd.read_csv(features_data, index_col="respondent_id")
features_df.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [3]:
# Import labels training data set, leave only H1N1 label
labels_data = os.path.join("Data", "training_set_labels.csv")
labels = pd.read_csv(labels_data, index_col="respondent_id")
labels_df = labels.drop(columns=['seasonal_vaccine'])
labels_df.head()

Unnamed: 0_level_0,h1n1_vaccine
respondent_id,Unnamed: 1_level_1
0,0
1,0
2,0
3,0
4,0


In [4]:
# Separate dataframes into 2: one with object data columns and one with numerical data columns 
object_features_df = features_df.select_dtypes(include=['object'])
numerical_features_df = features_df.select_dtypes(exclude=['object'])

# Drop columns with meaningless data, check mode of each column
object_features_df = object_features_df.drop(["hhs_geo_region", 'employment_industry', "employment_occupation"], axis=1)
object_df_mode = object_features_df.mode()

numerical_df_median_1 = numerical_features_df.median(skipna=True)
numerical_df_median_2 = pd.DataFrame(numerical_df_median_1)
numerical_df_median = numerical_df_median_2.transpose()

# Impute NaN values in columns with mode (most frequent value) of respective column
for i in object_features_df.columns.values:
    object_features_df[i] = object_features_df[i].fillna(value=object_df_mode[i].iloc[0])
    
for i in numerical_features_df.columns.values:
    numerical_features_df[i] = numerical_features_df[i].fillna(value=numerical_df_median[i].iloc[0])

numerical_features_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,1.0,2.0,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,4.0,4.0,2.0,5.0,4.0,2.0,0.0,0.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,0.0


In [5]:
# Encode each category in the objects dataframe 
from sklearn.preprocessing import LabelEncoder

# Make label encoder for each column
cat_object_features_df = object_features_df.apply(LabelEncoder().fit_transform)
cat_object_features_df.head()

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,1,3,0,2,1,0,1,2
1,1,0,3,1,2,1,1,0,0
2,0,2,3,1,0,1,0,0,0
3,4,0,3,0,2,1,1,1,1
4,2,3,3,0,0,0,0,0,0


In [6]:
# Make one-hot encoding for non-binary columns
encoded_df = pd.get_dummies(cat_object_features_df, columns=["age_group", "education", "race", 
                                                             "income_poverty","employment_status",
                                                             "census_msa"])
encoded_df

Unnamed: 0_level_0,sex,marital_status,rent_or_own,age_group_0,age_group_1,age_group_2,age_group_3,age_group_4,education_0,education_1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,0,0,0,0,1,0,0,1,...,1,0,0,1,0,1,0,0,0,1
1,1,1,1,0,1,0,0,0,1,0,...,1,0,0,1,1,0,0,1,0,0
2,1,1,0,1,0,0,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0
3,0,1,1,0,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,0,1,0,0,0,0,0,1,0,0,...,1,1,0,0,0,1,0,0,0,1
26703,1,1,1,1,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,1,0
26704,0,1,0,0,0,0,1,0,0,0,...,1,1,0,0,1,0,0,1,0,0
26705,0,0,1,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1


In [7]:
# Assert the numerical_features_df, encoded_df and labels_df rows match in order to join arrays
np.testing.assert_array_equal(encoded_df.index.values, labels_df.index.values)
np.testing.assert_array_equal(encoded_df.index.values, numerical_features_df.index.values)

In [8]:
# Join arrays
joined_df = numerical_features_df.join(encoded_df)
training_df = joined_df.join(labels_df)
training_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2,h1n1_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,0,1,0,1,0,0,0,1,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,1,1,0,0,1,0,0,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0,1,0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,1,0,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,1,0,0,0,1,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1,0,0,1,0,0,0,1,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1,0,0,1,0,0,1,0,0,0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1,0,0,1,0,0,0,0,1,0


## Data Exploration

In [40]:
# Examine distribution of target variable
H1N1_count_gb = training_df["h1n1_vaccine"].value_counts(normalize=True)
H1N1_count_gb
# H1N1 flu vaccine target has moderately imbalanced classes

0    0.787546
1    0.212454
Name: h1n1_vaccine, dtype: float64

In [13]:
# Examine pearson correlation matrix to see if some variables should be excluded from analysis
training_df.corr(method='pearson').unstack().sort_values().drop_duplicates()
# Customary to keep variables with correlation coefficients smaller than absolute value of 0.8

employment_status_0          employment_status_1       -0.893544
income_poverty_1             income_poverty_0          -0.786905
race_3                       race_0                    -0.577294
census_msa_1                 census_msa_0              -0.568035
census_msa_2                 census_msa_0              -0.534093
                                                          ...   
age_group_4                  employment_status_1        0.498880
opinion_seas_risk            opinion_h1n1_risk          0.562976
behavioral_large_gatherings  behavioral_outside_home    0.580881
doctor_recc_h1n1             doctor_recc_seasonal       0.603152
h1n1_concern                 h1n1_concern               1.000000
Length: 1177, dtype: float64

In [29]:
# Keep the employment status feature which has more members
print(f"Employment_status_0: {training_df['employment_status_0'].sum()}")
print(f"Employment_status_1: {training_df['employment_status_1'].sum()}")

# Keep empoyment status 0
training_df = training_df.drop(['employment_status_1'], axis=1)
training_df.head()

Employment_status_0: 15023
Employment_status_1: 10231


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_2,census_msa_0,census_msa_1,census_msa_2,h1n1_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1,0,0,1,0,0,0,0,1,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1,0,0,1,1,0,1,0,0,0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,1,0,1,0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,1,0,0,1,0,1,0,0,0


In [62]:
# Determine which features to keep for model

# Separate data by those that got H1N1 vaccine (H1N1_1) and those that didn't (H1N1_0)
H1N1_1_df = training_df.loc[training_df["h1n1_vaccine"] == 1]
H1N1_0_df = training_df.loc[training_df["h1n1_vaccine"] == 0]

# Find means for each feature in two datasets
H1N1_1_mean = H1N1_1_df.mean()
H1N1_0_mean = H1N1_0_df.mean()

# Determine normalized averages of which features show greatest differences in two datasets
# Values that differ most among two datasets are more likely to be useful a differentiator between classes  
differences = (H1N1_1_mean.subtract(H1N1_0_mean))/ H1N1_1_mean
differences.abs().sort_values()

employment_status_0            0.000918
census_msa_2                   0.002697
census_msa_0                   0.003615
census_msa_1                   0.007753
household_children             0.011059
opinion_seas_sick_from_vacc    0.012702
household_adults               0.014950
race_2                         0.023739
health_insurance               0.026440
race_1                         0.027330
education_3                    0.028024
race_3                         0.037262
behavioral_large_gatherings    0.056643
sex                            0.064188
behavioral_avoidance           0.066374
behavioral_outside_home        0.071560
behavioral_wash_hands          0.078527
age_group_4                    0.079559
income_poverty_0               0.081030
age_group_1                    0.094309
opinion_h1n1_sick_from_vacc    0.096889
opinion_seas_vacc_effective    0.106545
behavioral_touch_face          0.108932
age_group_2                    0.115564
h1n1_knowledge                 0.126779
