<img src="https://m1.quebecormedia.com/emp/emp/bender8175858f-7dbd-4927-85c5-85557e800b98_ORIGINAL.jpg?impolicy=crop-resize&x=0&y=0&w=1000&h=745&width=925&height=925" style="float: left; margin: 30px; height: 75px">

# Approaching Categorical Variables 


In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [30]:
import warnings
# NOTE(review): called with no category or module filter, so every warning raised
# after this cell runs is suppressed for the rest of the session. Anything a later
# cell would have warned about will not be shown -- confirm this is intended.
warnings.filterwarnings('ignore')

In [9]:
# Read the cleaned training set and discard the respondent identifier column
# in a single expression, so the cell yields the same frame every time it runs.
data = pd.read_csv("data/train_clean.csv").drop(columns=["respondent_id"])

In [10]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_h1n1,...,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,h1n1_vaccine,seasonal_vaccine
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,no_response,no_response,0,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,0,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,no_response,...,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,0,0
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,no_response,no_response,0,1
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0,0


In [11]:
from sklearn import preprocessing

## H1N1 Vaccine

<font style="font-size: 2rem; color: blue">
Before building any kind of model, it is essential to set up cross-validation. We have already seen the label/target distribution, and we know that this
is a binary classification problem with skewed targets. We will therefore use
StratifiedKFold to split the data here.
</font>

In [12]:
from sklearn import model_selection

In [13]:
# Add a "kfold" column holding the placeholder -1 for every row; the real fold
# numbers are written into it further down.
data = data.assign(kfold=-1)

In [14]:
# Shuffle the rows before folds are assigned. The fixed random_state makes the
# shuffle -- and therefore fold membership and every score computed from it --
# identical on each Restart & Run All.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [15]:
# Pull the H1N1 target column out of the frame as a plain array and show
# which container type it is.
y = data["h1n1_vaccine"].values
print(type(y))

<class 'numpy.ndarray'>


In [17]:
# Build the cross-validation splitter. StratifiedKFold is the KFold variant that
# returns stratified folds: each fold keeps the percentage of samples of every
# class found in the full target.
kf = model_selection.StratifiedKFold(
    n_splits=5,
)

In [22]:
# Write each fold's number into "kfold" for the rows that the splitter places
# in that fold's validation part; the training indices are not needed here.
for fold_id, (_train_idx, valid_idx) in enumerate(kf.split(X=data, y=y)):
    data.loc[valid_idx, "kfold"] = fold_id

In [23]:
# Check that every fold received (nearly) the same number of rows.
data["kfold"].value_counts()

0    5342
1    5342
2    5341
3    5341
4    5341
Name: kfold, dtype: int64

In [25]:
# Check that the target variable has the same distribution in each fold.
for k in range(5):
    fold_targets = data.loc[data["kfold"] == k, "h1n1_vaccine"]
    print(f"fold: k = {k}")
    print(fold_targets.value_counts())


fold: k = 0
0    4207
1    1135
Name: h1n1_vaccine, dtype: int64
fold: k = 1
0    4207
1    1135
Name: h1n1_vaccine, dtype: int64
fold: k = 2
0    4207
1    1134
Name: h1n1_vaccine, dtype: int64
fold: k = 3
0    4206
1    1135
Name: h1n1_vaccine, dtype: int64
fold: k = 4
0    4206
1    1135
Name: h1n1_vaccine, dtype: int64


## Try Logistic Regression on H1N1

In [26]:
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing


In [28]:
# List every column name currently in the frame (features, both targets and kfold).
data.columns

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation', 'h1n1_vaccine', 'seasonal_vaccine', 'kfold'],
      dtype='object')

In [32]:
def run(fold):
    """Train and score a one-hot + logistic-regression model on one fold.

    Rows of the global ``data`` frame whose ``kfold`` equals ``fold`` form the
    validation set; all remaining rows form the training set. Every column
    except the two household counts, the two vaccine targets and ``kfold`` is
    cast to ``str`` and one-hot encoded; the excluded columns are not given to
    the model.

    ``data`` is only read, never modified, so the function can be called
    repeatedly and in any order without changing notebook state.

    Parameters
    ----------
    fold : int
        Value of the ``kfold`` column to hold out for validation.

    Returns
    -------
    float
        ROC AUC of the predicted ``h1n1_vaccine`` probabilities on the held-out
        fold. The same value is also printed.

    Raises
    ------
    ValueError
        If no row has ``kfold == fold``.
    """
    # Columns that are not one-hot encoded: household counts, both targets, fold id.
    excluded = ('household_adults', 'household_children', 'h1n1_vaccine', 'seasonal_vaccine', 'kfold')
    features = [f for f in data.columns if f not in excluded]

    # Hold one of the folds out as the validation set; the rest is training data.
    is_valid = data.kfold == fold
    if not is_valid.any():
        raise ValueError(f"no rows with kfold == {fold}")

    # Cast to strings on a derived frame instead of writing back into `data`,
    # so the shared frame keeps its original dtypes between calls.
    encoded_input = data[features].astype(str)

    # The encoder learns its categories from training and validation rows
    # together (features only, never labels), as the original cell did.
    ohe = preprocessing.OneHotEncoder()
    ohe.fit(encoded_input)
    x_train = ohe.transform(encoded_input[~is_valid])
    x_valid = ohe.transform(encoded_input[is_valid])

    y_train = data.loc[~is_valid, "h1n1_vaccine"].values
    y_valid = data.loc[is_valid, "h1n1_vaccine"].values

    # Fit logistic regression on the training folds only.
    model = linear_model.LogisticRegression()
    model.fit(x_train, y_train)

    # Probability of the positive class for each validation row.
    valid_preds = model.predict_proba(x_valid)[:, 1]

    auc = metrics.roc_auc_score(y_valid, valid_preds)
    print(f"For fold = {fold}, AUC = {auc}")
    return auc
    

In [33]:
# Evaluate the model once per fold; each call prints that fold's AUC.
for fold_id in range(5):
    run(fold_id)

For fold = 0, AUC = 0.8680397365833533
For fold = 1, AUC = 0.8640424130539723
For fold = 2, AUC = 0.8631044085841644
For fold = 3, AUC = 0.8604871161608862
For fold = 4, AUC = 0.859028113812657
