In [1]:
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils

# Make the parent directory importable before pulling in the project-local modules below.
sys.path.append('../')

from utils import load_wids_xy_data, get_mappers
from EmbeddingModelV2 import EmbeddingModel, train_model

# Folder passed to `load_wids_xy_data`. Set the WIDS_DATA_DIR environment variable to
# point at your own copy of the data; the fallback is the original author's local path,
# so existing setups keep working unchanged.
DATA_DIR = os.environ.get('WIDS_DATA_DIR', '/Users/timlee/data/wids/')


In [2]:
# Load the train/test data with `is_female` as the prediction target.
# The recorded output reports X as (18255, 1122), y as (18255,) and X_test as (27285, 1122).
X, y, X_test = load_wids_xy_data(DATA_DIR, target='is_female')

loading training data ...
loading test data ...
complete ...
formatting ...
imputing missing values ...
(18255, 1122) (18255,) (27285, 1122)


In [8]:
# DG3 and DG4 are handed to get_mappers as the embedding columns; every column of X
# is passed as a categorical column.
emb_cols = ['DG3', 'DG4']
cat_cols = X.columns

(X_mapped,
 mappers,
 emb_szs,
 idx2col,
 col2idx) = get_mappers(X, cat_cols, emb_cols)

# Columns of the mapped frame; passed to the model as `cat_cols` in the next cell.
cat_onehot_cols = X_mapped.columns

converting to category ...
0
100
200
300
400
500
600
700
800
900
1000
1100
calculating cardinality
remapping columns to int
complete


In [12]:
# Model for the binary `is_female` task: a single output value, fed to BCELoss below.
# `layer_sizes` is presumably the widths of the hidden layers — confirm in EmbeddingModelV2.
em = EmbeddingModel(
    emb_szs=emb_szs,
    cat_cols=cat_onehot_cols,
    idx2col=idx2col,
    col2idx=col2idx,
    layer_sizes=[500, 100],
    output_dim=1,
)

number of emb feats: 2
total embedding parameters 12


In [13]:
# Training inputs for the binary `is_female` model: tensors, DataLoader, loss and
# the keyword arguments forwarded to `train_model`.
bz = 50
n_train = 18200  # rows used for training; the loader output above reports 18255 rows in total

# `.values` yields the same ndarray as `DataFrame.as_matrix()`, which is deprecated and
# was removed in pandas 1.0; `.values` works on both old and new pandas.
X_tensor = torch.from_numpy(X_mapped.head(n_train).values)
y_tensor = torch.from_numpy(y[:n_train]).view(-1,1)  # column vector, one label per row

train = data_utils.TensorDataset(X_tensor, y_tensor)
train_loader = data_utils.DataLoader(train, batch_size=bz, shuffle=True)

# size_average=False sums the per-sample losses of a batch instead of averaging them.
# Newer PyTorch spells this `reduction='sum'`; the legacy flag is kept so the cell still
# runs on the PyTorch version the notebook was written against.
loss_fn = torch.nn.BCELoss(size_average=False)
params = {
    'weight_decay': 0.01,
    'n_epoches': 2,
    'learning_rate': 0.01,
    'ml_type': 'binary'
}

In [14]:
# Train `em` on the mini-batches from `train_loader`, forwarding `params` as keyword arguments.
# NOTE(review): the recorded log counts epochs as [k/5] although `params` sets 'n_epoches': 2,
# and the run ends in a KeyboardInterrupt during epoch 3 — the output looks stale relative
# to the cell above; re-run from a fresh kernel before relying on these numbers.
train_model(em, train_loader, loss_fn, **params)

learning rate 0.010000
[1/5] - 1200/18200 loss: 26.951455, acc: 0.744000
[1/5] - 2450/18200 loss: 21.749182, acc: 0.798400
[1/5] - 3700/18200 loss: 17.843097, acc: 0.839200
[1/5] - 4950/18200 loss: 16.634552, acc: 0.856800
[1/5] - 6200/18200 loss: 14.265708, acc: 0.872000
[1/5] - 7450/18200 loss: 17.090805, acc: 0.854400
[1/5] - 8700/18200 loss: 15.977530, acc: 0.864000
[1/5] - 9950/18200 loss: 15.197267, acc: 0.868800
[1/5] - 11200/18200 loss: 16.880825, acc: 0.858400
[1/5] - 12450/18200 loss: 15.599335, acc: 0.867200
[1/5] - 13700/18200 loss: 15.000434, acc: 0.884800
[1/5] - 14950/18200 loss: 15.283147, acc: 0.868800
[1/5] - 16200/18200 loss: 15.157902, acc: 0.873600
[1/5] - 17450/18200 loss: 14.455345, acc: 0.886400
[2/5] - 1200/18200 loss: 12.560787, acc: 0.893600
[2/5] - 2450/18200 loss: 14.101243, acc: 0.883200
[2/5] - 3700/18200 loss: 15.416921, acc: 0.870400
[2/5] - 4950/18200 loss: 14.056465, acc: 0.888800
[2/5] - 6200/18200 loss: 13.518464, acc: 0.885600
[2/5] - 7450/18200 lo

[2/5] - 11200/18200 loss: 12.384076, acc: 0.903200
[2/5] - 12450/18200 loss: 11.000812, acc: 0.912800
[2/5] - 13700/18200 loss: 10.069328, acc: 0.925600
[2/5] - 14950/18200 loss: 9.635882, acc: 0.924000
[2/5] - 16200/18200 loss: 11.057190, acc: 0.908800
[2/5] - 17450/18200 loss: 10.711087, acc: 0.912800
[3/5] - 1200/18200 loss: 9.319447, acc: 0.920000
[3/5] - 2450/18200 loss: 8.549238, acc: 0.934400
[3/5] - 3700/18200 loss: 12.547142, acc: 0.895200
[3/5] - 4950/18200 loss: 11.653435, acc: 0.912800
[3/5] - 6200/18200 loss: 10.314143, acc: 0.913600
[3/5] - 7450/18200 loss: 10.092259, acc: 0.916000
[3/5] - 8700/18200 loss: 10.701959, acc: 0.914400
[3/5] - 9950/18200 loss: 10.003985, acc: 0.906400
[3/5] - 11200/18200 loss: 11.213393, acc: 0.908800
[3/5] - 12450/18200 loss: 11.391655, acc: 0.909600
[3/5] - 13700/18200 loss: 12.616872, acc: 0.901600
[3/5] - 14950/18200 loss: 10.876661, acc: 0.912000
[3/5] - 16200/18200 loss: 9.989320, acc: 0.924800
[3/5] - 17450/18200 loss: 10.472764, acc: 0

KeyboardInterrupt: 

### Trying to Predict Level of Education

In [28]:
# Reload the data with `DG4` as the target; this overwrites the X / y / X_test from the
# `is_female` experiment. Recorded shapes: X (18255, 1121), y (18255,),
# X_test (27285, 1121), y_test (27285,).
# NOTE(review): four values are unpacked here, whereas the earlier `is_female` call
# unpacked three from the same function — presumably the loader changed between runs,
# or it returns test labels only for some targets; confirm against utils.load_wids_xy_data.
X, y, X_test, y_test = load_wids_xy_data(DATA_DIR, target='DG4')

loading training data ...
loading test data ...
complete ...
formatting ...
imputing missing values ...
(18255, 1121) (18255,) (27285, 1121) (27285,)


In [29]:
# Stack the train and test splits row-wise (features and labels) and show the joined shapes.
X_join = pd.concat([X, X_test], axis=0)
y_join = np.concatenate([y, y_test], axis=0)
(X_join.shape, y_join.shape)

((45540, 1121), (45540,))

In [30]:
# In this experiment every column of X is passed both as a categorical column and as
# an embedding column (the same Index object is used for both).
# NOTE(review): `X_join` / `y_join` were built in the previous cell but are not used
# here — the mappers are fitted on X (18255 rows) only. Confirm whether X_join was intended.
cat_cols = emb_cols = X.columns

(X_mapped,
 mappers,
 emb_szs,
 idx2col,
 col2idx) = get_mappers(X, cat_cols, emb_cols)

converting to category ...
0
100
200
300
400
500
600
700
800
900
1000
1100
calculating cardinality
remapping columns to int
complete


In [24]:
# Training inputs for the multi-class `DG4` model: tensors, DataLoader, loss and
# the keyword arguments forwarded to `train_model`.
bz = 50
# NOTE(review): this cap is larger than the 18255 rows reported for X, so if X_mapped was
# built from X, `.head()` and the slice below simply return every row. 45000 only makes
# sense for the 45540-row joined data — confirm which frame was intended.
n_train = 45000

# `.values` yields the same ndarray as `DataFrame.as_matrix()`, which is deprecated and
# was removed in pandas 1.0; `.values` works on both old and new pandas.
X_tensor = torch.from_numpy(X_mapped.head(n_train).values)
# NOTE(review): NLLLoss expects class-index targets of shape (N,); whether train_model
# flattens this (N, 1) tensor is not visible here — TODO confirm.
y_tensor = torch.from_numpy(y[:n_train]).view(-1,1)

train = data_utils.TensorDataset(X_tensor, y_tensor)
train_loader = data_utils.DataLoader(train, batch_size=bz, shuffle=True)

# size_average=False sums the per-sample losses of a batch instead of averaging them.
# Newer PyTorch spells this `reduction='sum'`; the legacy flag is kept so the cell still
# runs on the PyTorch version the notebook was written against.
loss_fn = torch.nn.NLLLoss(size_average=False)
params = {
    'weight_decay': 0.01,
    'n_epoches': 5,
    'learning_rate': 0.01,
    'ml_type':'multi'
}
cat_onehot_cols = X_mapped.columns

In [26]:
# Model for the multi-class `DG4` task.
# NLLLoss scores an (N, C) matrix of per-class log-probabilities against class indices in
# [0, C-1], so the model needs one output per class; the previous `output_dim=1` cannot
# represent more than a single class. max(label) + 1 is used rather than the number of
# distinct labels so that non-contiguous label codes still index a valid output column.
# NOTE(review): assumes `y` holds non-negative integer class codes and that `output_dim`
# is the width of the final layer — confirm against utils / EmbeddingModelV2.
n_classes = int(np.max(y)) + 1

# NOTE(review): the binary model above was given `cat_cols=cat_onehot_cols`
# (X_mapped.columns); here the raw `cat_cols` (X.columns) is passed — confirm they agree.
em = EmbeddingModel(emb_szs=emb_szs,
                    cat_cols=cat_cols,
                    idx2col=idx2col,
                    col2idx=col2idx,
                    layer_sizes=[500,100],
                    output_dim=n_classes)

number of emb feats: 1121
total embedding parameters 3494


In [27]:
# Train the multi-class model, forwarding `params` as keyword arguments.
# NOTE(review): the recorded run fails right after printing the learning rate with
# "AttributeError: 'tuple' object has no attribute 'float'". The traceback is not
# preserved in this notebook, so the failing expression inside train_model / the model
# is unknown — inspect what is being passed where a tensor is expected.
train_model(em, train_loader, loss_fn, **params)

learning rate 0.010000


AttributeError: 'tuple' object has no attribute 'float'