# Exploratory Data Analysis

In [142]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import numpy as np

In [19]:
# Load the feature table and the label table, then attach the labels to the
# features by respondent id (left join keeps every feature row).
train_data = pd.read_csv('training_set_features.csv')
label = pd.read_csv('training_set_labels.csv')
df = train_data.merge(label, on="respondent_id", how="left")

In [6]:
# Preview the first five rows of the merged frame
df.head(n=5)

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0,0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0,1
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [10]:
# Summarise the merged frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26707 entries, 0 to 26706
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   respondent_id                26707 non-null  int64  
 1   h1n1_concern                 26615 non-null  float64
 2   h1n1_knowledge               26591 non-null  float64
 3   behavioral_antiviral_meds    26636 non-null  float64
 4   behavioral_avoidance         26499 non-null  float64
 5   behavioral_face_mask         26688 non-null  float64
 6   behavioral_wash_hands        26665 non-null  float64
 7   behavioral_large_gatherings  26620 non-null  float64
 8   behavioral_outside_home      26625 non-null  float64
 9   behavioral_touch_face        26579 non-null  float64
 10  doctor_recc_h1n1             24547 non-null  float64
 11  doctor_recc_seasonal         24547 non-null  float64
 12  chronic_med_condition        25736 non-null  float64
 13  child_under_6_mo

## Checking for Null Values

In [11]:
# fraction of missing values in each column (0.0 = complete, 1.0 = entirely empty)
df.isna().mean()

respondent_id                  0.000000
h1n1_concern                   0.003445
h1n1_knowledge                 0.004343
behavioral_antiviral_meds      0.002658
behavioral_avoidance           0.007788
behavioral_face_mask           0.000711
behavioral_wash_hands          0.001573
behavioral_large_gatherings    0.003258
behavioral_outside_home        0.003070
behavioral_touch_face          0.004793
doctor_recc_h1n1               0.080878
doctor_recc_seasonal           0.080878
chronic_med_condition          0.036358
child_under_6_months           0.030704
health_worker                  0.030104
health_insurance               0.459580
opinion_h1n1_vacc_effective    0.014640
opinion_h1n1_risk              0.014528
opinion_h1n1_sick_from_vacc    0.014790
opinion_seas_vacc_effective    0.017299
opinion_seas_risk              0.019246
opinion_seas_sick_from_vacc    0.020107
age_group                      0.000000
education                      0.052683
race                           0.000000


Since *employment_industry*, *employment_occupation*, and *health_insurance* each have around 50% of their rows empty, I dropped all three columns.

In [20]:
# Drop the three sparsely populated columns.
# errors="ignore" keeps this cell idempotent: because it reassigns `df`,
# re-running it after the columns are already gone would otherwise raise a KeyError.
df = df.drop(
    columns=['employment_industry', 'employment_occupation', 'health_insurance'],
    errors="ignore",
)
# fraction of missing values per remaining column
df.isna().sum() / len(df)

respondent_id                  0.000000
h1n1_concern                   0.003445
h1n1_knowledge                 0.004343
behavioral_antiviral_meds      0.002658
behavioral_avoidance           0.007788
behavioral_face_mask           0.000711
behavioral_wash_hands          0.001573
behavioral_large_gatherings    0.003258
behavioral_outside_home        0.003070
behavioral_touch_face          0.004793
doctor_recc_h1n1               0.080878
doctor_recc_seasonal           0.080878
chronic_med_condition          0.036358
child_under_6_months           0.030704
health_worker                  0.030104
opinion_h1n1_vacc_effective    0.014640
opinion_h1n1_risk              0.014528
opinion_h1n1_sick_from_vacc    0.014790
opinion_seas_vacc_effective    0.017299
opinion_seas_risk              0.019246
opinion_seas_sick_from_vacc    0.020107
age_group                      0.000000
education                      0.052683
race                           0.000000
sex                            0.000000


## Imputing Missing Values

Since the dataset contains many missing values, I imputed the columns one by one, predicting each column's missing values from the other columns.

In [38]:
# pipeline
# One-hot encoding step, used for the nominal categorical columns
categorical_transformer = Pipeline([("categorical", OneHotEncoder())])

# Integer (ordinal) encoding step, used for the columns passed as "binary"
binary_transformer = Pipeline([("ordinal", OrdinalEncoder())])

In [160]:
def print_best_score_and_parameter(search):
    """Print the best score and best parameter set of a fitted search object.

    Parameters
    ----------
    search : object
        Any object exposing ``best_score_`` and ``best_params_`` attributes.

    Returns
    -------
    None
        The two summary lines are written to standard output.
    """
    summary_lines = (
        f"Best Score: {search.best_score_}",
        f"Best parameters: {search.best_params_}",
    )
    for summary_line in summary_lines:
        print(summary_line)

### Education

To impute the values for the *education* column, I first dropped the rows where *employment_status* is NA. I then used *age_group*, *employment_status*, *race*, *sex*, *census_msa*, and *hhs_geo_region* to predict *education*.

In [108]:
# subset relevant columns: the predictors plus the education target
education_data = df[[
    'age_group', 'employment_status', 'race', 'sex',
    'census_msa', 'hhs_geo_region', 'education',
]]
# keep only rows with a known employment status; reset_index() moves the
# previous row labels into an "index" column and renumbers the rows from 0
education_data_rna = (
    education_data
    .dropna(subset=['employment_status'])
    .reset_index()
)

In [109]:
# count of missing values per column in the education subset
education_data_rna.isna().sum()

index                  0
age_group              0
employment_status      0
race                   0
sex                    0
census_msa             0
hhs_geo_region         0
education            157
dtype: int64

In [110]:
# select rows where education is null and use as test dataset
# The index values collected here are labels, so the rows are selected with
# label-based .loc rather than positional .iloc; this stays correct even if the
# frame's index is not a fresh 0..n-1 range.
education_nan_index = education_data_rna.index[education_data_rna['education'].isna()].tolist()
education_test = education_data_rna.loc[education_nan_index]

In [117]:
# rows with a known education value form the training set
education_train = education_data_rna.dropna(subset=['education'])

In [120]:
# columns that are one-hot encoded
education_categorical = [
    'employment_status',
    'race',
    'census_msa',
    'hhs_geo_region',
]
# columns that are ordinal (integer) encoded
education_binary = [
    'age_group',
    'sex',
]

In [129]:
# Route each column group to its encoder: one-hot for the categorical list,
# ordinal for the "binary" list.
education_preprocessor = ColumnTransformer([
    ("cat", categorical_transformer, education_categorical),
    ("binary", binary_transformer, education_binary),
])

In [136]:
# create training data
education_train_transform = education_preprocessor.fit_transform(education_train[education_binary + education_categorical])

# Keep the fitted label encoder: the model predicts integer class ids, and
# education_label_encoder.inverse_transform(...) is needed to turn them back
# into the original education categories when filling the missing values.
education_label_encoder = OrdinalEncoder()
education_train_label = (
    education_label_encoder
    .fit_transform(education_train['education'].to_numpy().reshape(-1, 1))
    .astype(int)
    .ravel()  # estimators expect a 1-D target, not an (n, 1) column vector
)

In [131]:
# encode the rows with missing education using the already-fitted preprocessor
education_test_transform = education_preprocessor.transform(
    education_test[[*education_binary, *education_categorical]]
)

In [143]:
# multi-class XGBoost classifier used as the base estimator for the education search
education_model = xgb.XGBClassifier(
    use_label_encoder=False,
    verbosity=0,
    objective="multi:softmax",
)

In [144]:
# candidate hyper-parameter values sampled by the randomized search
education_parameter = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": list(range(1, 8, 2)),
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    "n_estimators": list(range(1, 8)),
}

In [157]:
# Randomized search over education_parameter: 50 sampled candidates, each scored
# with micro-averaged F1 under 5-fold cross-validation.
# random_state is fixed so the sampled candidates (and therefore the reported
# best score / best parameters) are reproducible across kernel restarts.
random_search_education = RandomizedSearchCV(
    estimator=education_model,
    param_distributions=education_parameter,
    n_iter=50,
    scoring="f1_micro",
    cv=5,
    random_state=42,
)

In [158]:
# run the hyper-parameter search on the encoded training data
random_search_education.fit(
    X=education_train_transform,
    y=education_train_label,
)

RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                                           subsample=None, tree_method=None,
                                           use_label_encoder=False,
                                    

In [161]:
# report the best cross-validated score and the parameters that achieved it
print_best_score_and_parameter(search=random_search_education)

Best Score: 0.4346076349578432
Best parameters: {'n_estimators': 7, 'min_child_weight': 5, 'max_depth': 8, 'learning_rate': 0.3, 'gamma': 0.4, 'colsample_bytree': 0.7}
