# Categorical encodings

The data for this notebook comes from this Kaggle [competition](https://www.kaggle.com/c/wids2018datathon/). You are given a dataset of survey questions and results from a developing country. Your goal is to predict the gender of the respondent based on the other answers he/she provided. Use the Kaggle API to download the data:

kaggle competitions download -c wids2018datathon -p /data/yinterian/WiDS18

In [104]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random

In [105]:
from pandas_summary import DataFrameSummary

## Dataset

In [213]:
# Directory that holds the competition files (train.csv is read from here below).
# NOTE(review): absolute, machine-specific path — adjust it for your environment.
PATH = Path("/data/yinterian/WiDS18/")

In [214]:
# Load the raw training table. low_memory=False makes pandas infer each column's dtype
# from the whole file rather than chunk by chunk.
train = pd.read_csv(PATH/"train.csv", low_memory=False)
train.head()

Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,0,3,32,3.0,,323011,3854,481,1975,1,...,99.0,,99,,99,,99,,99,
1,1,2,26,,8.0,268131,2441,344,1981,1,...,,,1,,2,,2,,2,
2,2,1,16,,7.0,167581,754,143,1995,1,...,1.0,,2,,2,,2,,2,
3,3,4,44,5.0,,445071,5705,604,1980,1,...,,,2,,2,,99,,99,
4,4,4,43,,6.0,436161,5645,592,1958,1,...,,,1,,1,,1,,1,


In [215]:
# `train_id` is dropped so it is not used as a feature (in the preview above it simply
# mirrors the row index). errors="ignore" keeps the cell idempotent: re-running it after
# the column is already gone is a no-op instead of a KeyError.
train = train.drop(columns=["train_id"], errors="ignore")
train.head()

Unnamed: 0,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,DG3,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,3,32,3.0,,323011,3854,481,1975,1,3,...,99.0,,99,,99,,99,,99,
1,2,26,,8.0,268131,2441,344,1981,1,8,...,,,1,,2,,2,,2,
2,1,16,,7.0,167581,754,143,1995,1,3,...,1.0,,2,,2,,2,,2,
3,4,44,5.0,,445071,5705,604,1980,1,3,...,,,2,,2,,99,,99,
4,4,43,,6.0,436161,5645,592,1958,1,3,...,,,1,,1,,1,,1,


## Cleaning columns with too many NAs

In [216]:
# Number of missing values in a single column (AA5), as a first look at sparsity.
train["AA5"].isnull().sum()

12602

In [217]:
# Missing-value count for every column; used to choose the drop threshold below.
train.isnull().sum()

AA3                     0
AA4                     0
AA5                 12602
AA6                  5653
AA7                     0
AA14                    0
AA15                    0
DG1                     0
is_female               0
DG3                     0
DG3A                    0
DG3A_OTHERS         18205
DG4                     0
DG4_OTHERS          18255
DG5_1                   0
DG5_2                   0
DG5_3                   0
DG5_4                   0
DG5_5                   0
DG5_6                   0
DG5_7                   0
DG5_8                   0
DG5_9                   0
DG5_10                  0
DG5_11                  0
DG5_96                  0
DG6                     0
DG8a                    0
DG8b                    0
DG8c                    0
                    ...  
FB28_2_OTHERS       18253
FB28_3_OTHERS       18255
FB28_4_OTHERS       18253
FB28_96_OTHERS      18254
FB29_1                  0
FB29_2                  0
FB29_3                  0
FB29_4      

In [218]:
## dropping columns with too many nulls
# A column is discarded when it has more than this many missing values.
NULL_THRESHOLD = 12000

# Count nulls once for the whole frame and drop all sparse columns in a single call,
# instead of mutating `train` in place while iterating over its columns.
null_counts = train.isnull().sum()
sparse_cols = null_counts[null_counts > NULL_THRESHOLD].index
train = train.drop(columns=sparse_cols)

In [219]:
# Per-column summary of the remaining columns (pandas_summary's DataFrameSummary).
DataFrameSummary(train).summary()

Unnamed: 0,AA3,AA4,AA6,AA7,AA14,AA15,DG1,is_female,DG3,DG3A,...,LN2_2,LN2_3,LN2_4,LN2_RIndLngBEOth,LN2_WIndLngBEOth,GN1,GN2,GN3,GN4,GN5
count,18255,18255,12602,18255,18255,18255,18255,18255,18255,18255,...,18255,18255,18255,,,14230,18255,18255,18255,18255
mean,2.37146,28.5583,6.79352,291361,8030.51,352.038,1978.07,0.537113,3.94248,4.48973,...,2.15157,2.83073,2.83588,,,5.59065,6.76308,7.42191,8.92588,8.81737
std,1.13052,9.82263,0.769568,98126.5,22061.6,179.744,14.7407,0.498634,8.66617,8.54588,...,1.40702,1.59432,1.59286,,,18.2108,20.5373,21.7525,24.5233,24.4132
min,1,11,6,111011,96,24,1917,0,1,1,...,1,1,1,,,1,1,1,1,1
25%,1,21,6,216071,949,178,1969,0,3,4,...,1,1,1,,,1,1,1,1,1
50%,2,31,7,313011,2902,354,1981,1,3,4,...,1,3,3,,,2,2,2,2,2
75%,3,34,7,348181,4609,523,1990,1,3,4,...,3,4,4,,,3,3,3,3,3
max,4,44,8,448051,99999,633,2001,1,99,99,...,5,5,5,,,99,99,99,99,99
counts,18255,18255,12602,18255,18255,18255,18255,18255,18255,18255,...,18255,18255,18255,11341,11344,14230,18255,18255,18255,18255
uniques,4,22,3,1050,907,450,79,2,9,8,...,5,5,5,57,58,6,6,6,6,6


In [220]:
# (rows, columns) left after dropping the sparse columns.
train.shape

(18255, 421)

In [221]:
# Checkpoint the cleaned table so the sections below can start from this file.
train.to_csv(PATH/"train_421_cols.csv", index=False)

## Picking columns for embeddings

In [294]:
# Reload the cleaned checkpoint written above (this rebinds `train`).
train = pd.read_csv(PATH/"train_421_cols.csv")

In [295]:
# Target as float32 (the loss used later, binary_cross_entropy_with_logits, is called with
# this array as its float target); every other column is a feature.
Y = train["is_female"].values.astype(np.float32)
X = train.drop(columns=["is_female"])

In [296]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Keep every fitted encoder, keyed by column name, so the exact same value -> integer-code
# mapping can be re-applied to other data (e.g. the competition test set) or inverted.
label_encoders = {}
for col in X.columns:
    # Missing answers are filled before encoding: "NA" for text columns, 0 for numeric ones.
    # NOTE(review): for numeric columns a genuine 0 and a missing value end up in the same
    # category — confirm that this is intended.
    fill_value = "NA" if X.dtypes[col] == "object" else 0
    X[col] = X[col].fillna(fill_value)
    le = LabelEncoder()
    # Replace the raw values by consecutive integer codes 0..n_classes-1.
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

In [297]:
# Mark every (already integer-coded) column as categorical so that `.cat.categories`
# can be used below to count the distinct values per column.
for col in X.columns:
    X[col] = X[col].astype('category')

In [298]:
# Hold out 20% of the rows for validation; random_state is fixed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=3)
X_train.head()

Unnamed: 0,AA3,AA4,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,DG4,...,LN2_2,LN2_3,LN2_4,LN2_RIndLngBEOth,LN2_WIndLngBEOth,GN1,GN2,GN3,GN4,GN5
11369,1,10,2,424,482,252,62,2,3,4,...,0,0,0,38,38,0,1,1,1,1
1250,2,17,1,831,831,410,63,2,3,5,...,0,3,3,33,32,1,2,2,0,0
7527,1,8,0,301,310,166,53,2,3,0,...,0,0,0,38,38,2,1,1,1,1
13476,2,14,2,664,564,300,69,2,3,4,...,2,2,2,14,14,3,2,2,2,2
13406,0,12,3,587,111,76,58,2,3,2,...,1,3,3,14,14,1,0,2,2,2


In [299]:
# Map column name -> number of distinct categories, keeping only the columns that have
# more than 2 categories. These are the columns that will get an embedding.
emb_c = {n: len(col.cat.categories) for n,col in X.items() if len(col.cat.categories) > 2}
#emb_c

In [459]:
# One (number of categories, embedding size) pair per embedded column, in emb_c order.
# The embedding size is half the number of categories (rounded up), capped at 50.
# Both the cap of 50 and the (c+1)//2 rule are arbitrary (we should play with these numbers).
emb_szs = [(c, min(50, (c+1)//2)) for _,c in emb_c.items()]
#emb_szs

In [461]:
# Names of the columns to embed, in the same order as emb_szs.
# Snapshot them into a list: a dict_keys object is a live view of emb_c (it would change if
# emb_c were later modified), whereas a plain list is a stable column indexer for pandas.
emb_cols = list(emb_c)

## PyTorch dataset

In [460]:
# All variables are categorical, but some of them have just two values.
# `emb_cols` names the variables we plan to embed; the remaining ones are passed as floats.
class WiDSDataset(Dataset):
    """Dataset that splits a feature frame into embedded and pass-through columns.

    Args:
        X: pandas DataFrame of integer-coded features.
        Y: array-like of targets, indexable by row position and with a length.
        emb_cols: iterable of column names in ``X`` that will be embedded.
        emb_szs: optional list of (n_categories, embedding_size) pairs stored on the
            dataset for convenience. When omitted, the notebook-level ``emb_szs``
            is used if it exists (previous behaviour), otherwise ``None``.

    Each item is ``[x_emb, x_rest, y]`` where ``x_emb`` is an int64 vector with the
    ``emb_cols`` values and ``x_rest`` is a float32 vector with all other columns.
    """

    def __init__(self, X, Y, emb_cols, emb_szs=None):
        # emb_cols is consumed twice below (select, then drop); materialise it once so
        # one-shot iterables and dict views behave the same as a list.
        emb_cols = list(emb_cols)
        if emb_szs is None:
            # Backward compatibility: earlier versions read the global directly.
            emb_szs = globals().get("emb_szs")
        # astype() returns a fresh array, so the caller's frame is never modified.
        self.X1 = X.loc[:, emb_cols].values.astype(np.int64)
        self.X2 = X.drop(columns=emb_cols).values.astype(np.float32)
        self.emb_szs = emb_szs
        self.y = Y

    def __len__(self):
        """Number of rows (taken from the target array)."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``[embedded codes, remaining features, target]`` for row ``idx``."""
        return [self.X1[idx], self.X2[idx], self.y[idx]]

In [462]:
# Wrap the train and validation splits; both use the same set of embedded columns.
train_ds = WiDSDataset(X_train, y_train, emb_cols)
valid_ds = WiDSDataset(X_val, y_val, emb_cols)

In [463]:
# Small batches for the quick model sanity check below; the loaders are re-created with a
# larger batch size before the real training run.
batch_size = 5
train_dl = DataLoader(train_ds, batch_size=batch_size)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [305]:
# Inspect one item: [embedded-column codes, remaining columns as floats, target].
valid_ds[0]

[array([  0,   4,   3, 200, 229, 129,  72,   0,   3,   5,   2,   5,   3,
          1,   3,   0,   0,   3,   0,   3,   5,   0,   0,   0,   1, 136,
          0,   8,   3,   0,   1,   7,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   2,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   1,   2,   0,   0,   1,   3,
          1,   3,   2,   1,   2,   3,   2,   2,   2,   0,   0,   1,   0,
          0,   4,   1,   2,   2,   2,   2,   2,   2,   1,   1,   1,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   4,   4,   2,   2,   2,
          2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,   2,
          2,   2,   2,   2,   0,   2,   2,   6,   6,   6,   6,   6,   6,
          1,   6,   6,   6,   6,   6,   6,   6,   0

## Model

In [451]:
from torch.nn.init import kaiming_uniform, kaiming_normal

In [498]:
# modified from fast.ai
class MixedInputModel(nn.Module):
    """Two-layer classifier over embedded categorical columns plus pass-through columns.

    Args:
        emb_szs: iterable of (n_categories, embedding_size) pairs, one per embedded
            column, in the same order as the columns of ``x_cat``.
        n_cont: number of pass-through (non-embedded) input columns.
        hidden_sz: width of the hidden layer (default 100).
        emb_p: dropout probability applied to the concatenated embeddings (default 0.5).
        p: dropout probability applied after the hidden activation (default 0.2).

    The defaults reproduce the previously hard-coded architecture.
    """

    def __init__(self, emb_szs, n_cont, hidden_sz=100, emb_p=0.5, p=0.2):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(c, s) for c, s in emb_szs])
        n_emb = sum(e.embedding_dim for e in self.embs)
        self.n_emb, self.n_cont = n_emb, n_cont
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, hidden_sz)
        self.lin2 = nn.Linear(hidden_sz, 1)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(hidden_sz)
        self.emb_drop = nn.Dropout(emb_p)
        self.drops = nn.Dropout(p)

    def forward(self, x_cat, x_cont):
        """Return one raw logit per row (shape ``(batch, 1)``); no sigmoid is applied.

        Args:
            x_cat: integer tensor; column ``i`` is looked up in embedding ``i``.
            x_cont: float tensor with ``n_cont`` columns.
        """
        # Look up each categorical column in its own embedding table and concatenate.
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embs)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        # Batch-normalise the pass-through columns before joining them with the embeddings.
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = self.lin2(x)
        return x

In [499]:
# Number of pass-through (non-embedded) columns, read from the dataset instead of being
# hard-coded, so it stays correct if the column selection changes.
n_cont = train_ds.X2.shape[1]
model = MixedInputModel(emb_szs, n_cont)

In [481]:
# Smoke test: push a single batch through the untrained model.
x1,x2,y = next(iter(train_dl))
# NOTE(review): Variable is a legacy wrapper; on PyTorch >= 0.4 tensors can be used directly.
x1 = Variable(x1)
x2 = Variable(x2)
# Add a trailing dimension so the targets line up with the model output (one value per row).
y = Variable(y).unsqueeze(1)
out = model(x1, x2)
#out

In [482]:
# `out` holds logits (it is passed to binary_cross_entropy_with_logits below), so a
# positive logit is read as predicted class 1.
pred = (out > 0.0).float()

In [483]:
# Number of correct predictions in this batch.
(pred == y).float().sum()

Variable containing:
 260
[torch.FloatTensor of size 1]

In [484]:
# Loss of the untrained model on this batch, as a baseline reference.
F.binary_cross_entropy_with_logits(out, y)

Variable containing:
 0.7801
[torch.FloatTensor of size 1]

## Training

In [500]:
def get_optimizer(model, lr = 0.01, wd = 0.0):
    """Build an Adam optimizer over the trainable parameters of ``model``.

    Args:
        model: module whose parameters with ``requires_grad=True`` are optimised.
        lr: learning rate.
        wd: weight decay passed to Adam.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    # Frozen parameters (requires_grad=False) are left out of the optimizer.
    trainable = [param for param in model.parameters() if param.requires_grad]
    return torch.optim.Adam(trainable, lr=lr, weight_decay=wd)

In [501]:
def train_model(model, optim, train_dl=None, verbose=False):
    """Train ``model`` for one pass over ``train_dl`` and return the mean loss.

    Args:
        model: module called as ``model(x1, x2)`` and returning one logit per row.
        optim: optimizer already bound to the model's parameters.
        train_dl: iterable of ``(x1, x2, y)`` batches. When omitted, the notebook's
            current global ``train_dl`` is looked up at call time (the old default was
            frozen at definition time and went stale once the loader was re-created).
        verbose: if True, print the running mean loss after every batch.

    Returns:
        The sample-weighted mean binary cross-entropy over the pass, as a float.
    """
    if train_dl is None:
        train_dl = globals()["train_dl"]

    # Run batches on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    total = 0
    sum_loss = 0.0
    for x1, x2, y in train_dl:
        batch = y.shape[0]
        x1 = x1.to(device)
        x2 = x2.to(device)
        # The model emits shape (batch, 1); give the targets the same shape.
        y = y.to(device).unsqueeze(1)

        out = model(x1, x2)
        loss = F.binary_cross_entropy_with_logits(out, y)

        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        # loss is a 0-dim tensor; .item() extracts the Python float
        # (indexing it with [0] raises on PyTorch >= 0.4).
        sum_loss += batch * loss.item()
        if verbose: print(sum_loss/total)
    return sum_loss/total

In [502]:
def val_loss(model, valid_dl):
    """Evaluate ``model`` on ``valid_dl``; print and return (mean loss, accuracy).

    Args:
        model: module called as ``model(x1, x2)`` and returning one logit per row.
        valid_dl: iterable of ``(x1, x2, y)`` batches.

    Returns:
        Tuple ``(loss, accuracy)``: the sample-weighted mean binary cross-entropy and
        the fraction of rows whose thresholded logit (> 0) equals the target.
    """
    # Run batches on whatever device the model lives on instead of assuming CUDA.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    total = 0
    sum_loss = 0.0
    correct = 0
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x1, x2, y in valid_dl:
            batch = y.shape[0]
            x1 = x1.to(device)
            x2 = x2.to(device)
            # The model emits shape (batch, 1); give the targets the same shape.
            y = y.to(device).unsqueeze(1)

            out = model(x1, x2)
            loss = F.binary_cross_entropy_with_logits(out, y)
            # .item() extracts the Python number from a 0-dim tensor
            # (indexing it with [0] raises on PyTorch >= 0.4).
            sum_loss += batch * loss.item()
            total += batch
            pred = (out > 0).float()
            correct += (pred == y).float().sum().item()
    print("val loss", sum_loss/total, correct/total)
    return sum_loss/total, correct/total

In [503]:
from datetime import datetime

def train_loop(model, epochs, lr=0.01, wd=0.0, train_loader=None, valid_loader=None):
    """Train ``model`` for ``epochs`` epochs, printing train and validation metrics.

    Args:
        model: the network to train.
        epochs: number of full passes over the training loader.
        lr: Adam learning rate.
        wd: Adam weight decay.
        train_loader: optional training loader; defaults to the notebook's ``train_dl``.
        valid_loader: optional validation loader; defaults to the notebook's ``valid_dl``.
    """
    # Fall back to the notebook-level loaders so existing calls keep working unchanged.
    if train_loader is None:
        train_loader = train_dl
    if valid_loader is None:
        valid_loader = valid_dl

    # A fresh optimizer is created per call, so optimizer state does not carry over
    # between separate train_loop invocations.
    optimizer = get_optimizer(model, lr = lr, wd = wd)
    for _ in range(epochs):
        loss = train_model(model, optimizer, train_loader)
        print("loss ", loss)
        val_loss(model, valid_loader)

In [504]:
# Loaders for the real training run.
batch_size = 500
# Reshuffle the training rows every epoch so mini-batches differ between epochs;
# the validation order does not matter, so it stays fixed.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [505]:
# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Width of the pass-through input is read from the dataset instead of hard-coding 172.
model = MixedInputModel(emb_szs, train_ds.X2.shape[1]).to(device)

In [506]:
# Try the highest learning rate that doesn't cycle
#optim = get_optimizer(model, lr = 0.1, wd = 0.0)
#train_model(model, optim, train_dl, verbose=True)

In [507]:
# Full training run: 10 epochs with Adam (lr=0.05, weight decay 1e-5).
train_loop(model, epochs=10, lr=0.05, wd=0.00001)

loss  0.40526977476927395
val loss 0.29699555219803336 0.866885784716516
loss  0.2750424739964137
val loss 0.2437903297556423 0.9000273897562312
loss  0.24441620868726874
val loss 0.2540404100927122 0.8918104628868803
loss  0.22775121667848094
val loss 0.23680879757261317 0.9060531361270885
loss  0.2194080888139356
val loss 0.25075806512453236 0.9005751848808545
loss  0.2133990087490544
val loss 0.24533881665386262 0.9030402629416598
loss  0.20408456937658137
val loss 0.2584028082558698 0.9041358531909066
loss  0.19421941843531745
val loss 0.259794986164756 0.9052314434401534
loss  0.19766427119834165
val loss 0.26451290132933464 0.9000273897562312
loss  0.18953184935694622
val loss 0.2613539876014819 0.901122980005478
