In [1]:
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils

# Make the parent directory importable before pulling in the project modules.
sys.path.append('../')

from utils import load_wids_xy_data, get_mappers
from EmbeddingModelV2 import EmbeddingModel, train_model

# Directory holding the WiDS data files. Export WIDS_DATA_DIR to point the
# notebook at another machine's copy; the fallback is the original local path.
# The value stays a plain string with its trailing slash in case the loader
# builds file paths by string concatenation -- TODO confirm against utils.
DATA_DIR = os.environ.get('WIDS_DATA_DIR', '/Users/timlee/data/wids/')


In [2]:
# Load the WiDS data with `is_female` as the prediction target. For this
# target the loader hands back three objects, unpacked here as training
# features, training labels and test features (naming taken from the
# variables; the loader itself lives in utils -- verify there).
X, y, X_test = load_wids_xy_data(DATA_DIR, target='is_female')

loading training data ...
loading test data ...
complete ...
formatting ...
imputing missing values ...
(18255, 1122) (18255,) (27285, 1122)


In [3]:
# Columns routed to embeddings vs. the full set of columns handed to the
# mapper as categorical.
emb_cols = ['DG3', 'DG4']
cat_cols = X.columns

# get_mappers returns the remapped frame plus the lookup structures the model
# constructor needs (embedding sizes and index<->column maps).
X_mapped, mappers, emb_szs, idx2col, col2idx = get_mappers(X, cat_cols, emb_cols)

# Column order of the remapped frame; this is what the model is told about.
cat_onehot_cols = X_mapped.columns

converting to category ...
0
100
200
300
400
500
600
700
800
900
1000
1100
calculating cardinality
remapping columns to int
complete


In [4]:
# Collect the constructor arguments first, then build the network from them.
# output_dim=1 gives a single output, matching the 'binary' ml_type used when
# training; layer_sizes are presumably the hidden-layer widths -- verify in
# EmbeddingModelV2.
model_kwargs = dict(
    emb_szs=emb_szs,
    cat_cols=cat_onehot_cols,
    idx2col=idx2col,
    col2idx=col2idx,
    layer_sizes=[500, 100],
    output_dim=1,
)
em = EmbeddingModel(**model_kwargs)

number of emb feats: 2
total embedding parameters 12


In [5]:
bz = 50          # mini-batch size
n_train = 18200  # number of leading rows used for training; any rows beyond this are not used in this cell

# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in 1.0; both return the frame as a NumPy array.
X_tensor = torch.from_numpy(X_mapped.head(n_train).values)
# Labels as a column vector so they line up with a single model output per row.
y_tensor = torch.from_numpy(y[:n_train]).view(-1, 1)

train = data_utils.TensorDataset(X_tensor, y_tensor)
train_loader = data_utils.DataLoader(train, batch_size=bz, shuffle=True)
# size_average=False sums the per-sample losses over a batch instead of
# averaging them. Newer PyTorch spells this reduction='sum'; the legacy
# keyword is kept so the cell still runs on the older API this notebook
# otherwise targets.
loss_fn = torch.nn.BCELoss(size_average=False)
# Keyword arguments forwarded to train_model via **params.
params = {
    'weight_decay': 0.01,
    'n_epoches': 2,
    'learning_rate': 0.01,
    'ml_type': 'binary'
}

In [6]:
# Train the binary model on the mini-batch loader; epoch count, learning rate,
# weight decay and ml_type are supplied through `params`.
train_model(em, train_loader, loss_fn, **params)

learning rate 0.010000
[1/2] - 1200/18200 loss: 25.528437, acc: 0.757600
[1/2] - 2450/18200 loss: 19.337873, acc: 0.833600
[1/2] - 3700/18200 loss: 18.758203, acc: 0.835200
[1/2] - 4950/18200 loss: 17.701558, acc: 0.847200
[1/2] - 6200/18200 loss: 15.136698, acc: 0.873600
[1/2] - 7450/18200 loss: 15.061308, acc: 0.874400
[1/2] - 8700/18200 loss: 15.817006, acc: 0.874400
[1/2] - 9950/18200 loss: 15.926250, acc: 0.864000
[1/2] - 11200/18200 loss: 14.881642, acc: 0.875200
[1/2] - 12450/18200 loss: 15.016040, acc: 0.878400
[1/2] - 13700/18200 loss: 14.973339, acc: 0.874400
[1/2] - 14950/18200 loss: 14.955606, acc: 0.876800
[1/2] - 16200/18200 loss: 15.689525, acc: 0.868800
[1/2] - 17450/18200 loss: 14.206887, acc: 0.873600
[2/2] - 1200/18200 loss: 14.373886, acc: 0.884000
[2/2] - 2450/18200 loss: 15.510150, acc: 0.868800
[2/2] - 3700/18200 loss: 13.434170, acc: 0.898400
[2/2] - 4950/18200 loss: 14.609803, acc: 0.869600
[2/2] - 6200/18200 loss: 15.039789, acc: 0.877600
[2/2] - 7450/18200 lo

### Trying to Predict Level of Education

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.init import kaiming_normal

# Re-index the raw target codes as consecutive class ids starting at 0:
# codes 1..12 become 0..11, and the two out-of-sequence codes 96 and 99
# take the last two slots (12 and 13).
y_mapper = {code: idx for idx, code in enumerate(list(range(1, 13)) + [96, 99])}

In [3]:
# With DG4 as the target the loader returns four objects: labels come back
# for the test split as well.
X, y, X_test, y_test = load_wids_xy_data(DATA_DIR, target='DG4')

# Translate raw DG4 codes to contiguous class ids; a code missing from
# y_mapper raises KeyError rather than being silently dropped.
y = np.array(list(map(y_mapper.__getitem__, y)))
y_test = np.array(list(map(y_mapper.__getitem__, y_test)))

loading training data ...
loading test data ...
complete ...
formatting ...
imputing missing values ...
(18255, 1121) (18255,) (27285, 1121) (27285,)


In [4]:
# Stack the train and test rows into one feature frame and one label vector.
# The original row indices are kept as-is (no re-indexing on concat).
X_join = pd.concat((X, X_test), axis=0)
y_join = np.concatenate((y, y_test), axis=0)
(X_join.shape, y_join.shape)

((45540, 1121), (45540,))

In [5]:
# Every column is passed as categorical, and the embedding column list is the
# same object, so all columns are requested as embeddings.
cat_cols = X.columns
emb_cols = cat_cols 
# NOTE(review): the mappers are built from X (the training frame) even though
# a joined train+test frame, X_join, was assembled in the previous cell and is
# not used here -- confirm which frame was intended.
X_mapped, mappers, emb_szs, idx2col, col2idx = get_mappers(X, cat_cols, emb_cols)

converting to category ...
0
100
200
300
400
500
600
700
800
900
1000
1100
calculating cardinality
remapping columns to int
complete


In [6]:
bz = 50          # mini-batch size
n_train = 45000  # upper bound on the number of leading rows used for training

# Number of classes = largest class id + 1, taken over train and test labels.
# Converted to a plain Python int so it can be used directly as a tensor
# dimension and passed on to train_model without carrying a NumPy scalar.
n_classes = int(y_join.max()) + 1

# NOTE(review): X_mapped and y come from the training split only, so when that
# split has fewer than n_train rows the cap below does not truncate anything.
# n_classes, by contrast, is computed from the joined labels. If training on
# the joined train+test data was intended, the mapper cell and the label slice
# here have to be switched together -- confirm.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in 1.0.
X_tensor = torch.from_numpy(X_mapped.head(n_train).values)
# scatter_ requires int64 indices; .long() guarantees that even on platforms
# where NumPy's default integer is 32-bit.
y_tensor = torch.from_numpy(y[:n_train]).long().view(-1, 1)
# One-hot encode: start from zeros and write a 1 at each row's class column.
y_1hot_tensor = torch.zeros([y_tensor.shape[0], n_classes])
y_1hot_tensor.scatter_(1, y_tensor, 1)

train = data_utils.TensorDataset(X_tensor, y_1hot_tensor)
train_loader = data_utils.DataLoader(train, batch_size=bz, shuffle=True)
# NOTE(review): this is a multi-label loss applied to one-hot targets. For
# mutually exclusive classes CrossEntropyLoss is the usual choice, but that
# would require train_model to accept class-index targets -- confirm before
# changing.
loss_fn = torch.nn.MultiLabelSoftMarginLoss()
# Keyword arguments forwarded to train_model via **params.
params = {
    'weight_decay': 0.01,
    'n_epoches': 2,
    'learning_rate': 0.01,
    'ml_type':'multi',
    'n_classes': n_classes
}
cat_onehot_cols = X_mapped.columns

In [7]:
# Build the multi-class model. The output width is taken from n_classes (the
# same value that sizes the one-hot targets) instead of a hard-coded 14, so
# the model output and the target tensor cannot drift apart if the set of
# observed classes changes.
# NOTE(review): the binary model earlier in this notebook is given
# X_mapped.columns for cat_cols, whereas this one is given X.columns --
# confirm the two are equivalent when every column is embedded.
em = EmbeddingModel(emb_szs=emb_szs,
                    cat_cols=cat_cols,
                    idx2col=idx2col,
                    col2idx=col2idx,
                    layer_sizes=[1000, 300, 100],
                    output_dim=int(n_classes))

number of emb feats: 1121
total embedding parameters 3494


In [8]:
# Train the multi-class model; `params` carries ml_type='multi' and n_classes
# in addition to the optimisation settings.
train_model(em, train_loader, loss_fn, **params)

learning rate 0.010000
[1/2] - 1200/18200 loss: 0.296072, acc: 0.156000
[1/2] - 2450/18200 loss: 0.287184, acc: 0.289600
[1/2] - 3700/18200 loss: 0.283537, acc: 0.336000
[1/2] - 4950/18200 loss: 0.283725, acc: 0.352800
[1/2] - 6200/18200 loss: 0.285810, acc: 0.354400
[1/2] - 7450/18200 loss: 0.287987, acc: 0.348800
[1/2] - 8700/18200 loss: 0.287494, acc: 0.388800
[1/2] - 9950/18200 loss: 0.288795, acc: 0.316800
[1/2] - 11200/18200 loss: 0.288188, acc: 0.311200
[1/2] - 12450/18200 loss: 0.288168, acc: 0.356000
[1/2] - 13700/18200 loss: 0.287874, acc: 0.363200
[1/2] - 14950/18200 loss: 0.288887, acc: 0.317600
[1/2] - 16200/18200 loss: 0.288068, acc: 0.348800
[1/2] - 17450/18200 loss: 0.288274, acc: 0.340800
[2/2] - 1200/18200 loss: 0.289014, acc: 0.330400
[2/2] - 2450/18200 loss: 0.288992, acc: 0.353600
[2/2] - 3700/18200 loss: 0.288761, acc: 0.343200
[2/2] - 4950/18200 loss: 0.290257, acc: 0.334400
[2/2] - 6200/18200 loss: 0.289255, acc: 0.342400
[2/2] - 7450/18200 loss: 0.289408, acc: 