<font style="font-size: 2rem; color: blue">

Building a neural network for our survey data using StratifiedKFold cross-validation and entity embeddings.
   
 
</font>

In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [16]:
from sklearn import preprocessing

In [17]:
from sklearn import model_selection

In [18]:
# Load the cleaned training survey and discard the respondent identifier column.
df = pd.read_csv("../data/train_clean.csv").drop(columns=["respondent_id"])

In [19]:
# Stratified 5-fold splitter for the H1N1 target: each fold keeps roughly the
# same percentage of samples per class as the full data set.
# The splitter itself does not shuffle (shuffle=False is the default, spelled out here).
kf = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

In [20]:
# Placeholder fold id (-1) for every row; real fold numbers are filled in below.
df = df.assign(kfold=-1)

In [21]:
# Shuffle the rows before splitting. The fixed random_state makes the row order,
# and therefore the fold membership, identical on every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [22]:
# Target used to stratify the folds: the H1N1 vaccine label.
y = df["h1n1_vaccine"].values

In [23]:
# Record, for every row, the number of the fold in which it is a validation row.
for fold_number, (_, valid_rows) in enumerate(kf.split(X=df, y=y)):
    df.loc[valid_rows, "kfold"] = fold_number

In [24]:
def fit_model_h1n1(fold):
    """Train the embedding network with one fold held out and report its H1N1 AUC.

    Reads the module-level ``df``, which must already contain the ``kfold``
    column. Every column except the two household counts, the two vaccine
    targets and ``kfold`` is treated as a categorical feature: it is cast to
    string and label-encoded (the encoder is fitted on all rows, training and
    validation alike). A fresh model from ``create_model`` is trained on the
    rows whose ``kfold`` differs from ``fold`` and evaluated on the rest.

    The encoding is done on a local copy, so the module-level ``df`` is left
    untouched and every call starts from the same raw values.

    Parameters
    ----------
    fold : int
        Value of ``kfold`` identifying the validation rows.

    Returns
    -------
    float
        ROC AUC of the predicted class-1 probability on the validation fold
        (the same value is also printed).
    """
    excluded = ('household_adults', 'household_children',
                'h1n1_vaccine', 'seasonal_vaccine', 'kfold')
    # Everything that is not excluded is fed to the network as a categorical input.
    features = [col for col in df.columns if col not in excluded]

    encoded = df.copy()
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        # Whole-column assignment replaces the column, so it takes the encoder's
        # integer dtype. `.loc[:, feat] = ...` writes into the existing column
        # and, on recent pandas versions, can leave it as object dtype, which
        # Keras cannot turn into a tensor.
        encoded[feat] = lbl_enc.fit_transform(encoded[feat].astype(str).values)

    # Hold one of the five folds out for validation; train on the other four.
    df_train = encoded[encoded["kfold"] != fold].reset_index(drop=True)
    df_valid = encoded[encoded["kfold"] == fold].reset_index(drop=True)

    model = create_model(encoded, features)

    # The model has one input per feature, so pass one 1-D array per column.
    xtrain = [df_train[feat].values for feat in features]
    xvalid = [df_valid[feat].values for feat in features]

    ytrain = df_train["h1n1_vaccine"].values
    yvalid = df_valid["h1n1_vaccine"].values

    # One-hot targets to match the two-unit softmax output.
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)

    model.fit(
        xtrain,
        ytrain_cat,
        validation_data=(xvalid, yvalid_cat),
        verbose=0,
        batch_size=32,
        epochs=100,
    )

    # Column 1 of the softmax output is the predicted probability of class 1.
    valid_preds = model.predict(xvalid)[:, 1]
    auc = metrics.roc_auc_score(yvalid, valid_preds)
    print(f"For fold = {fold}, AUC = {auc}")

    # Clear the Keras session state before the next fold is trained.
    K.clear_session()
    return auc


In [25]:
def create_model(data, catcols):
    """Build and compile the entity-embedding classifier.

    Each column in ``catcols`` gets its own single-value input followed by an
    embedding layer named after the column. The embeddings are concatenated and
    passed through two Dense(45, relu) blocks with dropout and batch
    normalisation, ending in a two-unit softmax.

    Parameters
    ----------
    data : DataFrame
        Frame used only to count the distinct values of every column in
        ``catcols``.
    catcols : list of str
        Names of the columns to embed; also used as the embedding layer names.

    Returns
    -------
    Model
        Compiled Keras model (loss ``binary_crossentropy``, optimizer ``adam``)
        with one input per entry of ``catcols``.
    """
    column_inputs = []
    column_embeddings = []
    for col_name in catcols:
        cardinality = int(data[col_name].nunique())
        # Embedding width: half the cardinality (rounded up), capped at 50.
        width = int(min(np.ceil(cardinality / 2), 50))

        col_input = layers.Input(shape=(1,))
        embedded = layers.Embedding(cardinality + 1, width, name=col_name)(col_input)
        embedded = layers.SpatialDropout1D(0.3)(embedded)
        # Drop the length-1 sequence axis so the embeddings can be concatenated.
        embedded = layers.Reshape(target_shape=(width, ))(embedded)

        column_inputs.append(col_input)
        column_embeddings.append(embedded)

    hidden = layers.Concatenate()(column_embeddings)
    hidden = layers.BatchNormalization()(hidden)
    # Two identical Dense -> Dropout -> BatchNorm blocks.
    for _ in range(2):
        hidden = layers.Dense(45, activation="relu")(hidden)
        hidden = layers.Dropout(0.3)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    class_probs = layers.Dense(2, activation="softmax")(hidden)
    network = Model(inputs=column_inputs, outputs=class_probs)
    network.compile(loss='binary_crossentropy', optimizer='adam')
    return network

<font style="font-size: 2rem; color: blue">

Results for H1N1 data. 
   
 
</font>

In [26]:
# Fold 0 is the validation fold; the remaining folds are used for training.
print("The Neural Network results for predicting H1N1 Flu")
score = fit_model_h1n1(fold=0)

The Neural Network results for predicting H1N1 Flu
For fold = 0, AUC = 0.8632551788554632


In [27]:
# Fold 1 is the validation fold.
score = fit_model_h1n1(fold=1)

For fold = 1, AUC = 0.8731076483603475


In [28]:
# Fold 2 is the validation fold.
score = fit_model_h1n1(fold=2)

For fold = 2, AUC = 0.8743907127157265


In [29]:
# Fold 3 is the validation fold.
score = fit_model_h1n1(fold=3)

For fold = 3, AUC = 0.8671434346989093


In [30]:
# Fold 4 is the validation fold.
score = fit_model_h1n1(fold=4)

For fold = 4, AUC = 0.8638480584690216


<font style="font-size: 2rem; color: blue">

Results for the seasonal flu survey.
   
 
</font>

In [31]:
# Reload the cleaned training survey for the seasonal-vaccine target,
# again without the respondent identifier column.
df = pd.read_csv("../data/train_clean.csv").drop(columns=["respondent_id"])

In [32]:
# Stratified 5-fold splitter for the seasonal target: each fold keeps roughly
# the same percentage of samples per class as the full data set.
# The splitter itself does not shuffle (shuffle=False is the default, spelled out here).
kf_seas = model_selection.StratifiedKFold(n_splits=5, shuffle=False)

In [33]:
# Placeholder fold id (-1) for every row; real fold numbers are filled in below.
df = df.assign(kfold_seas=-1)

In [34]:
# Shuffle the rows before splitting. The fixed random_state makes the row order,
# and therefore the fold membership, identical on every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [35]:
# Target used to stratify the folds: the seasonal vaccine label.
y = df["seasonal_vaccine"].values

In [36]:
# Record, for every row, the number of the fold in which it is a validation row.
# Uses the splitter created for this section (kf_seas) rather than the `kf`
# object left over from the H1N1 section.
for fold_number, (_, valid_rows) in enumerate(kf_seas.split(X=df, y=y)):
    df.loc[valid_rows, "kfold_seas"] = fold_number

In [37]:
def fit_model_seas(fold):
    """Train the embedding network with one fold held out and report its seasonal AUC.

    Reads the module-level ``df``, which must already contain the
    ``kfold_seas`` column. Every column except the two household counts, the
    two vaccine targets and ``kfold_seas`` is treated as a categorical feature:
    it is cast to string and label-encoded (the encoder is fitted on all rows,
    training and validation alike). A fresh model from ``create_model`` is
    trained on the rows whose ``kfold_seas`` differs from ``fold`` and
    evaluated on the rest.

    The encoding is done on a local copy, so the module-level ``df`` is left
    untouched and every call starts from the same raw values.

    Parameters
    ----------
    fold : int
        Value of ``kfold_seas`` identifying the validation rows.

    Returns
    -------
    float
        ROC AUC of the predicted class-1 probability on the validation fold
        (the same value is also printed).
    """
    excluded = ('household_adults', 'household_children',
                'h1n1_vaccine', 'seasonal_vaccine', 'kfold_seas')
    # Everything that is not excluded is fed to the network as a categorical input.
    features = [col for col in df.columns if col not in excluded]

    encoded = df.copy()
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        # Whole-column assignment replaces the column, so it takes the encoder's
        # integer dtype. `.loc[:, feat] = ...` writes into the existing column
        # and, on recent pandas versions, can leave it as object dtype, which
        # Keras cannot turn into a tensor.
        encoded[feat] = lbl_enc.fit_transform(encoded[feat].astype(str).values)

    # Hold one of the five folds out for validation; train on the other four.
    df_train = encoded[encoded["kfold_seas"] != fold].reset_index(drop=True)
    df_valid = encoded[encoded["kfold_seas"] == fold].reset_index(drop=True)

    model = create_model(encoded, features)

    # The model has one input per feature, so pass one 1-D array per column.
    xtrain = [df_train[feat].values for feat in features]
    xvalid = [df_valid[feat].values for feat in features]

    ytrain = df_train["seasonal_vaccine"].values
    yvalid = df_valid["seasonal_vaccine"].values

    # One-hot targets to match the two-unit softmax output.
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.to_categorical(yvalid)

    model.fit(
        xtrain,
        ytrain_cat,
        validation_data=(xvalid, yvalid_cat),
        verbose=0,
        batch_size=32,
        epochs=100,
    )

    # Column 1 of the softmax output is the predicted probability of class 1.
    valid_preds = model.predict(xvalid)[:, 1]
    auc = metrics.roc_auc_score(yvalid, valid_preds)
    print(f"For fold = {fold}, AUC = {auc}")

    # Clear the Keras session state before the next fold is trained.
    K.clear_session()
    return auc

In [38]:
def create_model(data, catcols):
    """Build and compile the entity-embedding classifier.

    NOTE: this re-defines ``create_model`` with the same behaviour as the
    definition earlier in this notebook; from this cell on, this is the
    definition in effect.

    Each column in ``catcols`` gets its own single-value input followed by an
    embedding layer named after the column. The embeddings are concatenated and
    passed through two Dense(45, relu) blocks with dropout and batch
    normalisation, ending in a two-unit softmax.

    Parameters
    ----------
    data : DataFrame
        Frame used only to count the distinct values of every column in
        ``catcols``.
    catcols : list of str
        Names of the columns to embed; also used as the embedding layer names.

    Returns
    -------
    Model
        Compiled Keras model (loss ``binary_crossentropy``, optimizer ``adam``)
        with one input per entry of ``catcols``.
    """
    input_tensors, embedded_tensors = [], []
    for feature_name in catcols:
        n_levels = int(data[feature_name].nunique())
        # Embedding width: half the number of levels (rounded up), at most 50.
        emb_width = int(min(np.ceil(n_levels / 2), 50))

        feature_in = layers.Input(shape=(1,))
        feature_emb = layers.Embedding(n_levels + 1, emb_width, name=feature_name)(feature_in)
        feature_emb = layers.SpatialDropout1D(0.3)(feature_emb)
        # Drop the length-1 sequence axis so the embeddings can be concatenated.
        feature_emb = layers.Reshape(target_shape=(emb_width, ))(feature_emb)

        input_tensors.append(feature_in)
        embedded_tensors.append(feature_emb)

    net = layers.BatchNormalization()(layers.Concatenate()(embedded_tensors))
    # Two identical Dense -> Dropout -> BatchNorm blocks.
    for _ in range(2):
        net = layers.Dense(45, activation="relu")(net)
        net = layers.Dropout(0.3)(net)
        net = layers.BatchNormalization()(net)

    softmax_out = layers.Dense(2, activation="softmax")(net)
    compiled = Model(inputs=input_tensors, outputs=softmax_out)
    compiled.compile(loss='binary_crossentropy', optimizer='adam')
    return compiled

In [39]:
# Fold 0 is the validation fold; the remaining folds are used for training.
print("The Neural Network results for predicting Seasonal Flu")
fit_model_seas(fold=0)
    

The Neural Network results for predicting Seasonal Flu
For fold = 0, AUC = 0.8672824079257675


In [40]:
# Fold 1 is the validation fold.
fit_model_seas(fold=1)

For fold = 1, AUC = 0.8587448427092335


In [41]:
# Fold 2 is the validation fold.
fit_model_seas(fold=2)

For fold = 2, AUC = 0.8568116222577445


In [42]:
# Fold 3 is the validation fold.
fit_model_seas(fold=3)

For fold = 3, AUC = 0.8554497401906874


In [None]:
# Fold 4 is the validation fold.
fit_model_seas(fold=4)