# Model for Prediction of Seasonal Vaccination Likelihood

## Import Dependencies

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os
import numpy as np
import pandas as pd

## Data Import and Pre-Processing

In [2]:
# Import features training data set  
features_data = os.path.join("Data", "training_set_features.csv")
features_df = pd.read_csv(features_data, index_col="respondent_id")
features_df.head()

# Import features testing data set  
features_testing_data = os.path.join("Data", "test_set_features.csv")
features_testing_df = pd.read_csv(features_testing_data, index_col="respondent_id")
features_testing_df.head()

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"> $75,000",Not Married,Rent,Employed,mlyzmhmf,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,"> $75,000",Married,Own,Employed,lrircsnp,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,lrircsnp,"MSA, Not Principle City",1.0,0.0,,
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,lzgpxyit,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird


In [4]:
# Import labels training data set, leave only H1N1 label
labels_data = os.path.join("Data", "training_set_labels.csv")
labels = pd.read_csv(labels_data, index_col="respondent_id")
labels_df = labels.drop(columns=['h1n1_vaccine'])
labels_df.head()

Unnamed: 0_level_0,seasonal_vaccine
respondent_id,Unnamed: 1_level_1
0,0
1,1
2,0
3,1
4,0


### Process training dataset

In [5]:
# Separate dataframes into 2: one with object data columns and one with numerical data columns 
object_features_df = features_df.select_dtypes(include=['object'])
numerical_features_df = features_df.select_dtypes(exclude=['object'])

# Drop columns with meaningless data, check mode of each column
object_features_df = object_features_df.drop(["hhs_geo_region", 'employment_industry', "employment_occupation"], axis=1)
object_df_mode = object_features_df.mode()

numerical_df_median_1 = numerical_features_df.median(skipna=True)
numerical_df_median_2 = pd.DataFrame(numerical_df_median_1)
numerical_df_median = numerical_df_median_2.transpose()

# Impute NaN values in columns with mode (most frequent value) of respective column
for i in object_features_df.columns.values:
    object_features_df[i] = object_features_df[i].fillna(value=object_df_mode[i].iloc[0])
    
for i in numerical_features_df.columns.values:
    numerical_features_df[i] = numerical_features_df[i].fillna(value=numerical_df_median[i].iloc[0])

numerical_features_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,3.0,1.0,2.0,2.0,1.0,2.0,0.0,0.0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,1.0,2.0,2.0,0.0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,3.0,5.0,5.0,4.0,1.0,0.0,0.0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,3.0,3.0,2.0,3.0,1.0,4.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,5.0,2.0,2.0,0.0,0.0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,4.0,2.0,2.0,5.0,1.0,1.0,1.0,0.0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,4.0,4.0,2.0,5.0,4.0,2.0,0.0,0.0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,0.0


### Process testing dataset

In [6]:
# Separate dataframes into 2: one with object data columns and one with numerical data columns 
object_features_testing_df = features_testing_df.select_dtypes(include=['object'])
numerical_features_testing_df = features_testing_df.select_dtypes(exclude=['object'])

# Drop columns with meaningless data, check mode of each column
object_features_testing_df = object_features_testing_df.drop(["hhs_geo_region", 'employment_industry', "employment_occupation"], axis=1)
object_testing_df_mode = object_features_testing_df.mode()

numerical_testing_df_median_1 = numerical_features_testing_df.median(skipna=True)
numerical_testing_df_median_2 = pd.DataFrame(numerical_testing_df_median_1)
numerical_testing_df_median = numerical_testing_df_median_2.transpose()

# Impute NaN values in columns with mode (most frequent value) of respective column
for i in object_features_testing_df.columns.values:
    object_features_testing_df[i] = object_features_testing_df[i].fillna(value=object_testing_df_mode[i].iloc[0])
    
for i in numerical_features_testing_df.columns.values:
    numerical_features_testing_df[i] = numerical_features_testing_df[i].fillna(value=numerical_testing_df_median[i].iloc[0])

numerical_features_testing_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,5.0,1.0,1.0,5.0,1.0,1.0,1.0,0.0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,1.0,1.0,4.0,1.0,1.0,3.0,0.0
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,1.0,5.0,4.0,2.0,5.0,4.0,4.0,1.0,0.0
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,4.0,2.0,1.0,0.0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,1.0,5.0,2.0,4.0,4.0,4.0,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,1.0,1.0,1.0
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1.0,0.0,4.0,1.0,1.0,5.0,2.0,2.0,1.0,3.0
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,4.0,3.0,1.0,4.0,3.0,1.0,1.0,0.0
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,1.0,2.0,3.0,4.0,4.0,3.0,2.0,1.0,0.0


### Encode training dataset

In [7]:
# Encode each category in the objects dataframe 
from sklearn.preprocessing import LabelEncoder

# Make label encoder for each column
cat_object_features_df = object_features_df.apply(LabelEncoder().fit_transform)
cat_object_features_df.head()

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,3,1,3,0,2,1,0,1,2
1,1,0,3,1,2,1,1,0,0
2,0,2,3,1,0,1,0,0,0
3,4,0,3,0,2,1,1,1,1
4,2,3,3,0,0,0,0,0,0


In [8]:
# Make one-hot encoding for non-binary columns
encoded_df = pd.get_dummies(cat_object_features_df, columns=["age_group", "education", "race", 
                                                             "income_poverty","employment_status",
                                                             "census_msa"])
encoded_df.head()

Unnamed: 0_level_0,sex,marital_status,rent_or_own,age_group_0,age_group_1,age_group_2,age_group_3,age_group_4,education_0,education_1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,0,0,0,0,1,0,0,1,...,1,0,0,1,0,1,0,0,0,1
1,1,1,1,0,1,0,0,0,1,0,...,1,0,0,1,1,0,0,1,0,0
2,1,1,0,1,0,0,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0
3,0,1,1,0,0,0,0,1,1,0,...,1,0,0,1,0,1,0,0,1,0
4,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,1,0,0,1,0,0


In [9]:
# Assert the numerical_features_df, encoded_df and labels_df rows match in order to join arrays
np.testing.assert_array_equal(encoded_df.index.values, labels_df.index.values)
np.testing.assert_array_equal(encoded_df.index.values, numerical_features_df.index.values)

In [10]:
# Join arrays
joined_df = numerical_features_df.join(encoded_df)
training_df = joined_df.join(labels_df)
training_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,0,1,0,1,0,0,0,1,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0,0,1,1,0,0,1,0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,0,1,0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0,0,1,0,1,0,0,1,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,0,0,1,0,0,0,1,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1,0,0,1,0,0,0,1,0,0
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,1,0,0,1,0,0,1,0,0,1
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1,0,0,1,0,0,0,0,1,0


### Encode testing dataset

In [11]:
# Encode each category in the objects dataframe 
from sklearn.preprocessing import LabelEncoder

# Make label encoder for each column
cat_object_features_testing_df = object_features_testing_df.apply(LabelEncoder().fit_transform)
cat_object_features_testing_df.head()

Unnamed: 0_level_0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
26707,1,2,1,0,1,1,1,0,0
26708,0,0,3,1,2,1,1,0,2
26709,3,2,3,1,1,0,0,0,2
26710,4,0,3,0,0,0,0,1,0
26711,1,0,0,0,0,1,0,0,2


In [12]:
# Make one-hot encoding for non-binary columns
encoded_testing_df = pd.get_dummies(cat_object_features_testing_df, columns=["age_group", "education", "race", 
                                                                             "income_poverty","employment_status",
                                                                             "census_msa"])
encoded_testing_df.head()

Unnamed: 0_level_0,sex,marital_status,rent_or_own,age_group_0,age_group_1,age_group_2,age_group_3,age_group_4,education_0,education_1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,0,1,1,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,1,0,0
26708,1,1,1,1,0,0,0,0,1,0,...,1,0,0,1,1,0,0,0,0,1
26709,1,0,0,0,0,0,1,0,0,0,...,1,0,1,0,1,0,0,0,0,1
26710,0,0,0,0,0,0,0,1,1,0,...,1,1,0,0,0,1,0,1,0,0
26711,0,1,0,0,1,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,1


In [13]:
# Assert the numerical_features_testing_df, and encoded_testing_df rows match in order to join arrays
np.testing.assert_array_equal(encoded_testing_df.index.values, numerical_features_testing_df.index.values)

In [14]:
# Join arrays
testing_df = numerical_features_testing_df.join(encoded_testing_df)
testing_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_1,employment_status_2,census_msa_0,census_msa_1,census_msa_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0,0,1,0,1,0,0,1,0,0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,1,0,0,0,0,1
26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,...,1,0,1,0,1,0,0,0,0,1
26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1,1,0,0,0,1,0,1,0,0
26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,0,1,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1,1,0,0,1,0,0,0,1,0
53411,3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,...,1,0,0,1,1,0,0,0,0,1
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,1,0,1,0,1,0,0
53413,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1,1,0,0,0,1,0,1,0,0


## Data Exploration

In [15]:
# Examine distribution of target variable
Seasonal_count_gb = training_df["seasonal_vaccine"].value_counts(normalize=True)
Seasonal_count_gb
# Seasonal flu vaccine target has balanced classes

0    0.534392
1    0.465608
Name: seasonal_vaccine, dtype: float64

In [16]:
# Examine pearson correlation matrix to see if some variables should be excluded from analysis
training_df.corr(method='pearson').unstack().sort_values().drop_duplicates()
# Customary to keep variables with correlation coefficients smaller than absolute value of 0.8

employment_status_0      employment_status_1           -0.893544
income_poverty_0         income_poverty_1              -0.786905
race_0                   race_3                        -0.577294
census_msa_0             census_msa_1                  -0.568035
                         census_msa_2                  -0.534093
                                                          ...   
age_group_4              employment_status_1            0.498880
opinion_h1n1_risk        opinion_seas_risk              0.562976
behavioral_outside_home  behavioral_large_gatherings    0.580881
doctor_recc_h1n1         doctor_recc_seasonal           0.603152
h1n1_concern             h1n1_concern                   1.000000
Length: 1177, dtype: float64

In [17]:
# Keep the employment status feature which has more members
print(f"Employment_status_0: {training_df['employment_status_0'].sum()}")
print(f"Employment_status_1: {training_df['employment_status_1'].sum()}")

# Keep empoyment status 0
training_df = training_df.drop(['employment_status_1'], axis=1)
training_df.head()

Employment_status_0: 15023
Employment_status_1: 10231


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,race_3,income_poverty_0,income_poverty_1,income_poverty_2,employment_status_0,employment_status_2,census_msa_0,census_msa_1,census_msa_2,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1,0,0,1,0,0,0,0,1,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,1,0,0,1,1,0,1,0,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,1,0,1,0,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1,0,0,1,0,0,0,1,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1,1,0,0,1,0,1,0,0,0


In [18]:
# Determine which features to keep for model

# Separate data by those that got Seasonal vaccine (Seasonal_1) and those that didn't (Seasonal_0)
Seasonal_1_df = training_df.loc[training_df["seasonal_vaccine"] == 1]
Seasonal_0_df = training_df.loc[training_df["seasonal_vaccine"] == 0]

# Find means for each feature in two datasets
Seasonal_1_mean = Seasonal_1_df.mean()
Seasonal_0_mean = Seasonal_0_df.mean()

# Calculate normalized differences of averages of the features for the two datasets
# Values that differ most between two datasets are more likely to be useful as a differentiator between the two classes  
differences = (Seasonal_1_mean.subtract(Seasonal_0_mean))/ Seasonal_1_mean
normalized_diffs_avg = differences.abs().sort_values()
normalized_diffs_avg

income_poverty_0               0.015189
census_msa_2                   0.030327
opinion_h1n1_sick_from_vacc    0.031706
census_msa_1                   0.049060
census_msa_0                   0.049392
behavioral_antiviral_meds      0.051794
education_3                    0.056602
health_insurance               0.063785
education_0                    0.072843
opinion_seas_sick_from_vacc    0.078883
child_under_6_months           0.086985
behavioral_avoidance           0.088914
race_3                         0.096942
behavioral_wash_hands          0.097921
opinion_h1n1_vacc_effective    0.100088
marital_status                 0.101032
h1n1_knowledge                 0.110525
education_2                    0.114665
household_adults               0.117253
income_poverty_1               0.117381
behavioral_outside_home        0.138937
behavioral_touch_face          0.151949
behavioral_large_gatherings    0.156906
h1n1_concern                   0.159032
age_group_3                    0.167009


In [19]:
# Keep only features with normalized differences of averages greater than 0.10
columns_to_drop = normalized_diffs_avg.loc[lambda x: x < 0.1].keys().tolist()
print(f"Count of columns dropped: {len(columns_to_drop)}")

training_df = training_df.drop(columns=columns_to_drop, axis=1)
training_df.head()

Count of columns dropped: 14


Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_face_mask,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,health_worker,...,education_1,education_2,race_0,race_1,race_2,income_poverty_1,income_poverty_2,employment_status_0,employment_status_2,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,1,0,0,0
1,3.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,1,0,1
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,0,1,0,0
3,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0,0,0,0,0,0,1,0,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0


In [20]:
# Transform testing dataset
testing_df = testing_df.drop(['employment_status_1'], axis=1)
testing_df = testing_df.drop(columns=columns_to_drop, axis=1)
testing_df

Unnamed: 0_level_0,h1n1_concern,h1n1_knowledge,behavioral_face_mask,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,doctor_recc_seasonal,chronic_med_condition,health_worker,...,age_group_4,education_1,education_2,race_0,race_1,race_2,income_poverty_1,income_poverty_2,employment_status_0,employment_status_2
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
26707,2.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,1,0,1,0
26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,1,0
26709,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,1,0,1,0
26710,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1,0,0,0,0,0,0,0,0,0
26711,3.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53410,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
53411,3.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,1,1,0
53412,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
53413,3.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Model Building

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [22]:
# Split training dataset into X(features) and y(label)
X = training_df.drop('seasonal_vaccine', axis=1)
y = training_df['seasonal_vaccine'].values.reshape(-1,1)
print(X.shape, y.shape)

(26707, 33) (26707, 1)


In [23]:
# Split data into training and validation
X_train, X_eval, y_train, y_eval = train_test_split(X, y, random_state=42)

### Fit model

In [24]:
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
# Fit(train) model using training data
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
# Test model using validation dataset
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Evaluation Data Score: {classifier.score(X_eval, y_eval)}")

Training Data Score: 0.7690464303544683
Evaluation Data Score: 0.781039388947132


In [27]:
# Make predictions on validation dataset
predictions = classifier.predict_proba(X_eval)
preds = predictions[:, 1].tolist()
preds

[0.1732597137103928,
 0.1342080475043342,
 0.7629151720251196,
 0.24012881555784216,
 0.29729846543765137,
 0.937993966258021,
 0.7689718907232223,
 0.9021502558194013,
 0.1538424149586581,
 0.9121845547004584,
 0.9481454799249699,
 0.25784593841263376,
 0.19117775161176612,
 0.2361684845400617,
 0.053518266611997156,
 0.5494775824246497,
 0.23475356463561187,
 0.1854391749566809,
 0.20410579363101627,
 0.06093879164768876,
 0.8148049638679907,
 0.18160716993702838,
 0.7990370216066478,
 0.11309791315582239,
 0.24753988990199505,
 0.14119869684250166,
 0.24982997670262158,
 0.7857624061223579,
 0.32816400152957925,
 0.12627701804428093,
 0.8928729123176015,
 0.6276421286037605,
 0.6109068510316554,
 0.30768584433943913,
 0.07493025598631387,
 0.3597499769461655,
 0.4535083600411578,
 0.9523076675539103,
 0.7176787116734158,
 0.14270650731549037,
 0.3255420486174209,
 0.7447779711451169,
 0.06087932195782043,
 0.33644522798692056,
 0.8307232099155006,
 0.1052952295714996,
 0.25473536323

In [28]:
# Check AUC score on validation dataset
roc_auc_score(y_eval, preds)

0.8500077136895472

### Retrain model on full dataset

In [29]:
classifier.fit(X, y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## Model Testing

In [30]:
test_predictions = classifier.predict_proba(testing_df)
test_predictions[:,:1]

array([[0.79324943],
       [0.94789454],
       [0.24817749],
       ...,
       [0.84174312],
       [0.6788471 ],
       [0.50611575]])

## Submission to DataDriven.org

https://www.drivendata.org/competitions/66/flu-shot-learning/page/210/

In [31]:
# Open submission file
submission_file = os.path.join("my_submission.csv")
submission = pd.read_csv(submission_file, index_col = "respondent_id")

# Assert the testing_df, and submission rows match in order to join arrays
np.testing.assert_array_equal(testing_df.index.values, submission.index.values)

In [32]:
# Populate submission file with H1N1 predictions
submission['seasonal_vaccine'] = test_predictions[:,:1]
submission

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.873341,0.793249
26708,0.955120,0.947895
26709,0.531029,0.248177
26710,0.536045,0.103366
26711,0.755093,0.545256
...,...,...
53410,0.596905,0.513067
53411,0.905376,0.703503
53412,0.874339,0.841743
53413,0.943963,0.678847


In [33]:
# Save submission
submission.to_csv("my_submission.csv", index=True)
!head my_submission.csv

respondent_id,h1n1_vaccine,seasonal_vaccine
26707,0.8733414414536519,0.7932494328575974
26708,0.9551204039233024,0.9478945369918101
26709,0.5310294908979329,0.24817749111853105
26710,0.5360449491064614,0.10336637362908885
26711,0.7550928761396916,0.545256241939934
26712,0.596313615461277,0.12286737264079006
26713,0.6612621890371435,0.38993792147742534
26714,0.8931724170963743,0.8401885752483879
26715,0.9619812979871756,0.8844888321973068
