In [19]:
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
train_data = pd.read_csv('./Data_Test_Train_Formats/training_set_features.csv')
train_labels = pd.read_csv('./Data_Test_Train_Formats/training_set_labels.csv')
test_data = pd.read_csv('./Data_Test_Train_Formats/test_set_features.csv')
df = train_labels.merge(train_data, on='respondent_id')

In [3]:
all_cols = [l for l in train_data.columns.values]
level_features = ['h1n1_concern',
       'h1n1_knowledge','opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc']

## Silly way of dealing with NAN values -  drop them from everything.

In [4]:
df_dropna = df.dropna()
logreg = sklearn.linear_model.LogisticRegression()

In [5]:
logreg.fit(df_dropna[level_features], df_dropna['h1n1_vaccine'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [6]:
logreg.score(df_dropna[level_features], df_dropna['h1n1_vaccine']) # accuracy of 79% is pretty good to start with

0.7960229920770545

# Instead, try a pipeline to impute various  replacements for 'level feature' data

In [7]:
Xtrain, Xtest, ytrain, ytest =  sklearn.model_selection.train_test_split(train_data, train_labels)

In [8]:
[c for c in Xtrain.columns if c not in level_features] # write up the data dictionary....

['respondent_id',
 'behavioral_antiviral_meds',
 'behavioral_avoidance',
 'behavioral_face_mask',
 'behavioral_wash_hands',
 'behavioral_large_gatherings',
 'behavioral_outside_home',
 'behavioral_touch_face',
 'doctor_recc_h1n1',
 'doctor_recc_seasonal',
 'chronic_med_condition',
 'child_under_6_months',
 'health_worker',
 'health_insurance',
 'age_group',
 'education',
 'race',
 'sex',
 'income_poverty',
 'marital_status',
 'rent_or_own',
 'employment_status',
 'hhs_geo_region',
 'census_msa',
 'household_adults',
 'household_children',
 'employment_industry',
 'employment_occupation']

In [26]:
# Build a pipeline for Categorical variables (aka the rest) that includes

def impute_categorical_df(df):
    # Imputation of missing values using Iterative Imputer on a categorical dataframe
    # imputes features from all other features
    # adapted from https://stackoverflow.com/questions/64900801/implementing-knn-imputation-on-categorical-variables-in-an-sklearn-pipeline
    # credit Lucca Massaron
    II = IterativeImputer(random_state=42)

    # Need to encode the labels as numeric values using LabelEnconder
    df = df.apply(lambda series: pd.Series(
        LabelEncoder().fit_transform(series[series.notnull()]),
        index=series[series.notnull()].index
    ))
    
    # use sklearn IterativeImputer to fit a model that predicts missing labels, then fill them
    imp_cat = IterativeImputer(estimator=RandomForestClassifier(), 
                           initial_strategy='most_frequent',
                           max_iter=10, random_state=0)
    
    return imp_cat.fit_transform(df)

impute_categorical_df(Xtrain.head(15)) # add impute_categorical_df to a pipeline





array([[ 0.,  3.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  1.,  1.,  2.,  2.,  2.,  2.,  2.,  1.,  0.,  1.,
         0.,  1.,  1.,  0.,  6.,  1.,  0.,  0.,  0.,  6.],
       [ 1.,  3.,  2.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  2.,  3.,  3.,  2.,  1.,  1.,  4.,  2.,  3.,  0.,
         0.,  0.,  0.,  0.,  6.,  1.,  2.,  0.,  4.,  1.],
       [ 2.,  0.,  2.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  1.,
         0.,  0.,  0.,  1.,  0.,  0.,  2.,  0.,  0.,  4.,  0.,  3.,  1.,
         0.,  0.,  0.,  1.,  6.,  0.,  1.,  0.,  5.,  4.],
       [ 3.,  2.,  2.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
         0.,  0.,  0.,  3.,  1.,  1.,  3.,  3.,  1.,  2.,  2.,  3.,  1.,
         0.,  0.,  0.,  0.,  6.,  0.,  2.,  0.,  1.,  0.],
       [ 4.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,
         0.,  0.,  0.,  2.,  0.,  0.,  2.,  0.,  0.,  3.,  0.,  3.,  0.,
         0.,  1.,  1.,  0.,  1.,  

# One hot encoding the Categorical variables - but they are ALL categorical!

In [28]:
OH = sklearn.preprocessing.OneHotEncoder()
OH.fit_transform(impute_categorical_df(Xtrain.head(15)))















KeyboardInterrupt: 