Importing necessary libraries and settings

In [1]:
import pandas as pd
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
# to display output inside the notebook

# feature engineering
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score

# Silence all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells will not be shown either.
import warnings
warnings.filterwarnings('ignore')

# Seed kept in one place so stochastic steps can share it.
RANDOM_SEED = 0

# Render up to 100 columns when a wide DataFrame is displayed.
pd.options.display.max_columns = 100

Features and target

In [2]:
# Read the three competition CSVs in one pass; each one is indexed by
# respondent_id. The order of file names matches the order of the targets
# on the left-hand side.
features_df, labels_df, test_features_df = (
    pd.read_csv(csv_name, index_col="respondent_id")
    for csv_name in (
        'training_set_features.csv',
        'training_set_labels.csv',
        'test_set_features.csv',
    )
)

In [3]:
# Report (rows, columns) for each loaded frame, one per line.
print(
    f'shape of features_df: {features_df.shape}',
    f'shape of labels_df: {labels_df.shape}',
    f'shape of test_features_df: {test_features_df.shape}',
    sep='\n',
)

shape of features_df: (26707, 35)
shape of labels_df: (26707, 2)
shape of test_features_df: (26708, 35)


In [4]:
# separating the two classes
labels_df_h1n1 = labels_df[['h1n1_vaccine']]
labels_df_seasonal = labels_df[['seasonal_vaccine']]

Preprocessing and preparing the data

In [5]:
# Share of missing values per feature, in percent, most-missing first.
(
    features_df.isna()
    .sum()
    .div(len(features_df))
    .mul(100)
    .sort_values(ascending=False)
)

employment_occupation          50.436215
employment_industry            49.912008
health_insurance               45.957989
income_poverty                 16.561201
doctor_recc_h1n1                8.087767
doctor_recc_seasonal            8.087767
rent_or_own                     7.645936
employment_status               5.477965
marital_status                  5.272026
education                       5.268282
chronic_med_condition           3.635751
child_under_6_months            3.070356
health_worker                   3.010447
opinion_seas_sick_from_vacc     2.010709
opinion_seas_risk               1.924589
opinion_seas_vacc_effective     1.729884
opinion_h1n1_sick_from_vacc     1.479013
opinion_h1n1_vacc_effective     1.464036
opinion_h1n1_risk               1.452803
household_children              0.932340
household_adults                0.932340
behavioral_avoidance            0.778822
behavioral_touch_face           0.479275
h1n1_knowledge                  0.434343
h1n1_concern    

In [6]:
# Split the feature names by dtype: object-typed columns are treated as
# categorical, everything else as numeric.
numeric_cols = features_df.columns[features_df.dtypes.ne('object')].to_numpy()
non_numeric_cols = features_df.columns[features_df.dtypes.eq('object')].to_numpy()

print(f'numerical columns: {len(numeric_cols)}')
print(f'non numerical (categorical) columns: {len(non_numeric_cols)}')

numerical columns: 23
non numerical (categorical) columns: 12


In [7]:
# Numeric columns: standardise, then fill the remaining gaps with the column
# mean. Scaling comes first on purpose here: per the scikit-learn docs,
# StandardScaler disregards NaNs when fitting and passes them through on
# transform, so the imputer afterwards fills them with the mean of the
# already-scaled column.
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: treat a missing value as its own 'missing' category,
# then one-hot encode. handle_unknown='ignore' makes a category that appears
# only at transform time (e.g. in the test set) encode as all zeros instead
# of raising a ValueError.
non_numeric_preprocessing_steps = Pipeline([
    ('simple_imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline; any column that is in
# neither list is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_preprocessing_steps, numeric_cols),
        ('non_numeric', non_numeric_preprocessing_steps, non_numeric_cols),
    ],
    remainder="drop",
)

In [37]:
# Fit the preprocessor on the training features and transform them in one
# step, then apply that same fitted transformation to the test features.
# Without the fit, transform() raises NotFittedError on a fresh kernel
# because no earlier cell fits `preprocessor`.
features_df_preprocess = pd.DataFrame(preprocessor.fit_transform(features_df))
test_features_df_preprocess = pd.DataFrame(preprocessor.transform(test_features_df))

# Train on whole dataset

In [48]:
# One independent binary classifier per target. random_state is set from the
# notebook-wide seed so results stay reproducible if a stochastic solver is
# chosen later.
model_h1n1 = LogisticRegression(random_state=RANDOM_SEED)
model_seasonal = LogisticRegression(random_state=RANDOM_SEED)

# Pass the target as a 1-D Series: fitting on a one-column DataFrame makes
# scikit-learn emit a DataConversionWarning, which the global warnings filter
# would otherwise hide.
model_h1n1.fit(features_df_preprocess, labels_df_h1n1['h1n1_vaccine'])
model_seasonal.fit(features_df_preprocess, labels_df_seasonal['seasonal_vaccine'])

None

In [49]:
# Class-probability predictions on the preprocessed test features, in the
# same order as the targets on the left: H1N1 model first, seasonal second.
preds1_h1n1, preds2_seasonal = (
    fitted_model.predict_proba(test_features_df_preprocess)
    for fitted_model in (model_h1n1, model_seasonal)
)

In [50]:
# Take the second predict_proba column for each target and wrap it in a
# single-column frame that reuses the test set's index.
y_preds1_h1n1 = pd.Series(
    preds1_h1n1[:, 1],
    index=test_features_df.index,
    name="h1n1_vaccine",
).to_frame()

y_preds2_seasonal = pd.Series(
    preds2_seasonal[:, 1],
    index=test_features_df.index,
    name="seasonal_vaccine",
).to_frame()

print(f"y_preds1_h1n1.shape: {y_preds1_h1n1.shape}")
print(f"y_preds2_seasonal.shape: {y_preds2_seasonal.shape}")

y_preds1_h1n1.shape: (26708, 1)
y_preds2_seasonal.shape: (26708, 1)


In [45]:
# Write one CSV per target, keeping the index as the first column.
# NOTE(review): the H1N1 file name is spelled 'submision'; left unchanged
# here in case something else reads that exact path.
y_preds1_h1n1.to_csv(path_or_buf='h1n1_submision.csv', index=True)
y_preds2_seasonal.to_csv(path_or_buf='seasonal_submission.csv', index=True)

In [47]:
# Combine both targets into one frame on the shared index and write the
# combined submission file.
joined_df = y_preds1_h1n1.join(y_preds2_seasonal)
joined_df.to_csv('final_submission.csv', index=True)

# Preview goes last: only a cell's final expression is rendered, so a
# mid-cell head() would be silently discarded.
joined_df.head()