In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import io
import os
import json
import time
import sys
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch import tensor
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

In [3]:
sys.path.append("src/")
from constants import *

## Experiment 4 - NN Classification with baseline features

In [4]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import IterableDataset
from itertools import chain, islice


class InteractionsStream(IterableDataset):

    def __init__(self, prep_data_dir=PREPARED_DATA_DIR, file_num=None,
                 sample='train', user_col='User', item_col='Movie',
                 dv_col='Rating',
                 end_token='.h5', start_token='user_{}_data_',
                 baseline_feats=False, model_type='regression',
                 chunksize=10):

        if file_num is None:
            self.files = [os.path.join(prep_data_dir, x) for x in
                          _find_files(prep_data_dir,
                                      start_token.format(sample),
                                      end_token)]
        else:
            self.files = [
                os.path.join(prep_data_dir,
                             start_token.format(sample)+str(file_num)+
                             end_token)]
        print(self.files)
        self.user_col = user_col
        self.item_col = item_col
        self.baseline_feats = baseline_feats
        self.sample = sample
        self.chunksize = chunksize
        self.model_type = model_type
        self.dv_col = dv_col
        self.cat_cols = [self.user_col, self.item_col]
        
        if baseline_feats:
            self.numeric_cols = [
                'days_since_first_user_rating',
                'sqrt_days_since_first_user_rating',
                'rating_age_days_user', 'rating_age_weeks_user',
                'rating_age_months_user', 'mean_ratings_user',
                'num_ratings_user', 'days_since_first_item_rating',
                'sqrt_days_since_first_item_rating',
                'rating_age_days_item', 'rating_age_weeks_item',
                'rating_age_months_item', 'mean_ratings_movie',
                'weighted_mean_ratings_movie', 'num_ratings_movie']
        else:
            self.numeric_cols = []            

    def read_file(self, fn):
        
        if self.sample == 'train':
            df = pd.read_hdf(fn, key='stage', iterator=True,
                             chunksize=self.chunksize)
        else:
            df = pd.read_hdf(fn, key='stage')
        
        return df
    
    def get_dv_for_classification(self, dv_lst):
        
        if self.model_type == 'classification':
            return [int(x-1) for x in dv_lst]
        else:
            return [int(x) for x in dv_lst]

    def process_data(self, fn):

        print('read data')
        data = self.read_file(fn)

        print('create an iterable')
        if self.sample == 'train':
            if self.baseline_feats:
                for row in data:
                    x1 = row[self.cat_cols].values.tolist()
                    x2 = row[self.numeric_cols].values.tolist()
                    y = self.get_dv_for_classification(
                            row[self.dv_col].tolist())
                    yield (x1, x2, y)
            else:
                for row in data:
                    user = row[self.user_col].tolist()
                    item = row[self.item_col].tolist()
                    y = self.get_dv_for_classification(
                            row[self.dv_col].tolist())
                    yield (user, item), y
        else:
            if self.baseline_feats:
                for i, row in data.iterrows():
                    y = int(row[self.dv_col]-1) if self.model_type == 'classification' else row[self.dv_col]
                    yield (row[self.cat_cols].tolist(),
                           row[self.numeric_cols].tolist(), 
                           y)
            else:
                for i, row in data.iterrows():
                    y = int(row[self.dv_col]-1) if self.model_type == 'classification' else row[self.dv_col]
                    yield (row[self.user_col],
                           row[self.item_col]), y

    def get_stream(self, files):
        return chain.from_iterable(map(self.process_data, files))

    def __iter__(self):
        return self.get_stream(self.files)

In [5]:
import torch
from torch import nn


class TabularModel(nn.Module):
    """
    Defines the neural network for tabular data
    """

    def __init__(self, embedding_sizes, n_cont, n_classes=5):
        super().__init__()
        self.embeddings = nn.ModuleList(
            [nn.Embedding(categories, size) for
             categories, size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_emb, self.n_cont, self.n_classes = n_emb, n_cont, n_classes
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 200)
        self.lin2 = nn.Linear(200, 70)
        self.lin3 = nn.Linear(70, self.n_classes)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(200)
        self.bn3 = nn.BatchNorm1d(70)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)


    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)

        return x

In [61]:
import torch
import torch.optim as torch_optim
import torch.nn.functional as F
from torch import tensor
from sklearn.metrics import roc_auc_score
from tqdm import tqdm


def choose_embedding_size(cat_cols, cat_num_values, min_emb_dim=100):
    """
    cat_cols: list of categorical columns
    cat_num_values: list of number of unique values for each categorical column
    """
    embedded_cols = dict(zip(cat_cols, cat_num_values))
    embedding_sizes = [(n_categories, min(min_emb_dim, (n_categories+1)//2))
                       for _, n_categories in embedded_cols.items()]
    return embedding_sizes


def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')


def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)


class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)


def get_optimizer(model, lr = 0.001, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim


def construct_tensor(a):
    final = []
    for i in a:
        out = []
        for j in i:
            out.append(j.tolist())
        out1 = []
        for item in zip(*out):
            out1.append(list(item))
        final += out1
    return tensor(final)


def construct_tensor_test(a):
    out = []
    for i in a:
        out.append(i.tolist())
        out1 = []
        for item in zip(*out):
            out1.append(list(item))
    return tensor(out1)


def construct_tensor_y(a):
    out = []
    for i in a:
        out += i.tolist()
    return tensor(out)


def train_model(model, optim, train_dl, train_size, chunksize, batch_size,
                device, loss_fn=F.cross_entropy):
    model.train()
    total = 0
    sum_loss = 0
    with tqdm(total=train_size // (batch_size * chunksize)) as pbar:
        for x1, x2, y in train_dl:
            x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                         construct_tensor_y(y))
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            batch = y.size()[0]
            output = model(x1, x2)
            loss = loss_fn(output, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            total += batch
            sum_loss += loss.item()
            pbar.update(1)
    return sum_loss/total


def val_loss(model, valid_dl, test_size, batch_size,
             device, loss_fn=F.cross_entropy):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    sum_auc_macro = 0
    sum_auc_micro = 0
    num_aucs = 0
    with tqdm(total=test_size // (batch_size)) as pbar:
        for x1, x2, y in valid_dl:
            x1, x2 = construct_tensor_test(x1), construct_tensor_test(x2)
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            current_batch_size = y.size()[0]
            out = model(x1, x2)
            loss = loss_fn(out, y)
            sum_loss += loss.item()
            total += current_batch_size
            pred = torch.max(out, 1)[1]
            pred_prob = F.softmax(out, dim=1)
            y_onehot = F.one_hot(y)
            correct += (pred == y).float().sum().item()
            pred_prob = pred_prob.cpu().detach().numpy()
            y_onehot = y_onehot.cpu().detach().numpy()
            try:
                sum_auc_macro += roc_auc_score(y_onehot, pred_prob,
                                               average='macro')
                sum_auc_micro += roc_auc_score(y_onehot, pred_prob,
                                               average='micro')
                num_aucs += current_batch_size
            except:
                continue
                
            pbar.update(1)
    print("valid loss %.3f, accuracy %.3f, macro auc %.3f and micro auc %.3f" % (
        sum_loss/total, correct/total, sum_auc_macro/num_aucs, sum_auc_micro/num_aucs))
    return sum_loss/total, correct/total, sum_auc_macro/num_aucs, sum_auc_micro/num_aucs


def train_loop(model, train_dl, valid_dl, epochs, train_size,
               test_size, chunksize, batch_size, device, lr=0.01,
               wd=0.0, loss_fn=F.cross_entropy):
    optim = get_optimizer(model, lr = lr, wd = wd)
    start = time.time()
    losses = []
    for i in range(epochs):
        stats = {'epoch': i+1}
        train_loss = train_model(model, optim, train_dl, train_size,
                                 chunksize, batch_size, device,
                                 loss_fn)
        print("training loss: ", train_loss)
        stats['train_loss'] = train_loss
        loss, acc, auc_macro, auc_micro = val_loss(
            model, valid_dl, test_size, batch_size, device, loss_fn)
        print('time taken: %0.2f' % (time.time() - start))
        stats['test_loss'] = loss
        stats['test_acc'] = acc
        stats['test_auc_macro'] = auc_macro
        stats['test_auc_micro'] = auc_micro
        losses.append(stats)
    return losses

In [62]:
# GLOBALS
FILE_NUM = 1
N_USERS = 480189
N_ITEMS = 17770
BATCH_SIZE = 50
CHUNKSIZE = 100
TRAIN_SIZE = 22851074 # corresponds to FILE_NUM
VAL_SIZE = 962152     # corresponds to FILE_NUM
TEST_SIZE = 240538    # corresponds to FILE_NUM

In [63]:
# choose embedding size

cat_cols = ['User', 'Movie']
cat_num_values = [N_USERS, N_ITEMS]
embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 100)

In [64]:
embedding_sizes

[(480189, 100), (17770, 100)]

In [65]:
# dataset

from torch.utils.data import DataLoader

train_dataset = InteractionsStream(
    file_num=FILE_NUM, baseline_feats=True, model_type='classification',
    sample='train', chunksize=CHUNKSIZE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=False)

test_dataset = InteractionsStream(file_num=FILE_NUM, baseline_feats=True,
                                  model_type='classification',
                                  sample='test')
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         shuffle=False)

['/data/kaggle/netflix-prize-data/prepared_data_for_NN_modelling/user_train_data_1.h5']
['/data/kaggle/netflix-prize-data/prepared_data_for_NN_modelling/user_test_data_1.h5']


In [66]:
device = get_default_device()
device

device(type='cuda')

In [67]:
n_cont = len(train_loader.dataset.numeric_cols)
print('number of numeric vars: ', n_cont)

net = TabularModel(embedding_sizes, n_cont, 5)

number of numeric vars:  15


In [68]:
to_device(net, device)

TabularModel(
  (embeddings): ModuleList(
    (0): Embedding(480189, 100)
    (1): Embedding(17770, 100)
  )
  (lin1): Linear(in_features=215, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=5, bias=True)
  (bn1): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [69]:
losses = train_loop(model=net, train_dl=train_loader,
                    valid_dl=test_loader, epochs=2,
                    train_size=TRAIN_SIZE, test_size=TEST_SIZE,
                    chunksize=CHUNKSIZE, batch_size=BATCH_SIZE,
                    device=device, lr=0.05, wd=0.00001,
                    loss_fn=F.cross_entropy)

  0%|          | 0/4570 [00:00<?, ?it/s]

read data
create an iterable


4571it [30:18,  2.51it/s]                          
  0%|          | 0/4810 [00:00<?, ?it/s]

training loss:  0.0002537636473082755
read data
create an iterable


 90%|█████████ | 4352/4810 [06:53<00:43, 10.53it/s]
  0%|          | 0/4570 [00:00<?, ?it/s]

valid loss 0.025, accuracy 0.441, macro auc 0.014 and micro auc 0.016
time taken: 2231.54
read data
create an iterable


4571it [30:43,  2.48it/s]                          
  0%|          | 0/4810 [00:00<?, ?it/s]

training loss:  0.000253764094867131
read data
create an iterable


 90%|█████████ | 4352/4810 [06:53<00:43, 10.54it/s]

valid loss 0.025, accuracy 0.440, macro auc 0.014 and micro auc 0.016
time taken: 4488.11





In [70]:
losses

[{'epoch': 1,
  'train_loss': 0.0002537636473082755,
  'test_loss': 0.025352873244562046,
  'test_acc': 0.44095319658432347,
  'test_auc_macro': 0.014386494047214069,
  'test_auc_micro': 0.01558661101017521},
 {'epoch': 2,
  'train_loss': 0.000253764094867131,
  'test_loss': 0.025394514972358027,
  'test_acc': 0.4396062160656528,
  'test_auc_macro': 0.014356277362117713,
  'test_auc_micro': 0.015562970227170028}]

In [71]:
net

TabularModel(
  (embeddings): ModuleList(
    (0): Embedding(480189, 100)
    (1): Embedding(17770, 100)
  )
  (lin1): Linear(in_features=215, out_features=200, bias=True)
  (lin2): Linear(in_features=200, out_features=70, bias=True)
  (lin3): Linear(in_features=70, out_features=5, bias=True)
  (bn1): BatchNorm1d(15, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [73]:
preds = []
pred_probs = []
actuals = []
actuals_onehot = []
with torch.no_grad():
    for x1, x2, y in test_loader:
        x1, x2 = construct_tensor_test(x1), construct_tensor_test(x2)
        x1 = x1.to(device)
        x2 = x2.to(device)
        y = y.to(device)
        out = net(x1, x2)
        pred = torch.max(out, 1)[1]
        pred_prob = F.softmax(out, dim=1)
        y_onehot = F.one_hot(y)
        preds.append(pred)
        pred_probs.append(pred_prob)
        actuals.append(y)
        actuals_onehot.append(y_onehot)

read data
create an iterable


In [74]:
len(preds), len(pred_probs), len(actuals), len(actuals_onehot)

(4811, 4811, 4811, 4811)

In [83]:
final_preds = [item for sublist in preds for
               item in sublist.cpu().detach().tolist()]
final_pred_probs = [item for sublist in pred_probs for
                    item in sublist.cpu().detach().numpy()]
final_actuals = [item for sublist in actuals for
                 item in sublist.cpu().detach().tolist()]
final_actuals_onehot = [item for sublist in actuals_onehotonehot for
                        item in sublist.cpu().detach().numpy()]

In [84]:
len(final_preds), len(final_pred_probs), len(final_actuals), len(final_actuals_onehot)

(240538, 240538, 240538, 240538)

In [90]:
final_pred_probs = np.array(final_pred_probs)
final_actuals_onehot = np.array(final_actuals_onehot)
final_actuals = np.array(final_actuals)
final_preds = np.array(final_preds)

In [91]:
final_pred_probs.shape, final_actuals_onehot.shape, final_pred_probs[0]

((240538, 5),
 (240538, 5),
 array([0.03358189, 0.10595833, 0.48191768, 0.32772338, 0.0508187 ],
       dtype=float32))

In [92]:
from sklearn.metrics import mean_squared_error, roc_auc_score

acc = (final_actuals == final_preds).mean()
auc_macro = roc_auc_score(y_true=final_actuals_onehot,
                          y_score=final_pred_probs, average='macro')
auc_micro = roc_auc_score(y_true=final_actuals_onehot,
                          y_score=final_pred_probs, average='micro')
rmse = np.sqrt(mean_squared_error(y_true=final_actuals,
                                  y_pred=final_preds))

print('Test Accuracy: %0.3f' % (acc))
print('Test Macro AUC: %0.3f' % (auc_macro))
print('Test Micro AUC: %0.3f' % (auc_micro))
print('Test RMSE: %0.3f' % (rmse))

Test Accuracy: 0.440
Test Macro AUC: 0.718
Test Micro AUC: 0.781
Test RMSE: 1.038


In [93]:
# save model
model_fn = os.path.join(MODEL_DIR, 'model_NN_DeepMF_withBaseline_classification_{}_E2.pt'.format(
    FILE_NUM))
torch.save(net.state_dict, model_fn)

  "type " + obj.__name__ + ". It won't be checked "
