# Experiments
1. NN Classification
2. NN Regression

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import io
import os
import json
import time
import sys
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch import tensor
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

In [3]:
sys.path.append("/data/ExternalTest/MAD/src/")
from constants import *
from metadata_utils import _find_files
from baseline_feats_utils import feat_type_feats_dct

In [4]:
# set global variables based on exploratory analysis
USER2IDX_SEGGE20_FN = os.path.join(METADATA_DIR, 'user2idx_segGE20.json')
USER2IDX_SEGLE3_FN = os.path.join(METADATA_DIR, 'user2idx_segLE3.json')
USER2IDX_SEG4TO19_FN = os.path.join(METADATA_DIR, 'user2idx_seg4-19.json')

In [5]:
user2idx_dct = {'GE20': USER2IDX_SEGGE20_FN,
                'LE3': USER2IDX_SEGLE3_FN,
                '4-19': USER2IDX_SEG4TO19_FN}

In [6]:
feat_type_feats_dct

{'user': ['num_interactions',
  'mean_price_interactions',
  'earliest_interaction_date',
  'min_num_interactions_per_pdt',
  'max_num_interactions_per_pdt',
  'mean_num_interactions_per_pdt',
  'min_num_interactions_per_ont',
  'max_num_interactions_per_ont',
  'mean_num_interactions_per_ont',
  'min_num_interactions_per_brand',
  'max_num_interactions_per_brand',
  'mean_num_interactions_per_brand'],
 'item': ['num_interactions',
  'earliest_interaction_date',
  'min_num_interactions_per_user',
  'max_num_interactions_per_user',
  'mean_num_interactions_per_user']}

## Experiment 1 - NN Classification with baseline features

In [7]:
from torch.utils.data import IterableDataset
from itertools import chain, islice


class InteractionsStream(IterableDataset):

    def __init__(self, sample, model_type, file_name=None,
                 interim_data_dir=INTERIM_DATA_DIR, user_col=USER_COL,
                 item_col=ITEM_COL, ontology_col=ONTOLOGY_COL,
                 brand_col=BRAND_COL, price_col=PRICE_COL, dv_col=DV_COL,
                 date_col=DATE_COL, end_token='.gz', chunksize=10,
                 segment='LE3', user2idx_dct=user2idx_dct):

        data_dir = interim_data_dir
        
        if file_name is None:
            files = _find_files(data_dir, end_token)
            if sample == 'train':
                self.files = [os.path.join(data_dir, x) for x in files
                              if not x.startswith('0005')]
            elif sample == 'test':
                self.files = [os.path.join(data_dir, x) for x in files
                              if x.startswith('0005')]
        else:
            self.files = [os.path.join(data_dir, file_name)]
        print(self.files)
        
        self.model_type = model_type
        self.segment = segment
        self.user_col = user_col
        self.item_col = item_col
        self.ontology_col = ontology_col
        self.brand_col = brand_col
        self.price_col = price_col
        self.date_col = date_col
        self.dv_col = dv_col
        self.feat_type_feats_dct = feat_type_feats_dct
        self.chunksize = chunksize
        user_feats = ['{}_{}'.format(self.user_col, x) for x in
                      self.feat_type_feats_dct['user']
                      if x != 'earliest_interaction_date']
        user_feats.append('{}_days_since_earliest_interaction'.format(
            self.user_col))
        item_feats = ['{}_{}'.format(self.item_col, x) for x in
                      self.feat_type_feats_dct['item']
                      if x != 'earliest_interaction_date']
        item_feats.append('{}_days_since_earliest_interaction'.format(
            self.item_col))
        self.numeric_feats = [self.price_col] + user_feats + item_feats
        if self.segment == 'GE20':
            self.cat_feats = [self.user_col, self.item_col,
                              self.ontology_col, self.brand_col]
        else:
            self.cat_feats = [self.item_col, self.ontology_col,
                              self.brand_col]
        if self.segment == 'GE20':
            self.user2idx = json.load(open(user2idx_dct.get(self.segment)))
        else:
            self.user2idx = None
        

    def read_file(self, fn):
        
        df = pd.read_csv(fn, compression='gzip', sep='|', iterator=True,
                         chunksize=self.chunksize)
        return df
    
    
    def get_dv_for_classification(self, dv_lst):
        
        if self.model_type == 'classification':
            return [int(x-1) for x in dv_lst]
        else:
            return [int(x) for x in dv_lst]
        
    
    def _segment_filter(self, num_interactions_lst, feat_type, feats_lst):
        
        if self.segment != 'GE20':
            idxs = [i for i, x in enumerate(num_interactions_lst)
                    if x < 20]
        elif self.segment == 'GE20':
            idxs = [i for i, x in enumerate(num_interactions_lst)
                    if x >= 20]
        
        if idxs:
            new_feats_lst = [feats_lst[i] for i in idxs]
            if (self.segment == 'GE20') and (feat_type == 'cat'):
                new_feats_lst = []
                for i in idxs:
                    out = feats_lst[i]
                    out[0] = self.user2idx[str(out[0])]
                    new_feats_lst.append(out)
            return new_feats_lst

    
    def process_data(self, fn):

        print('read data')
        data = self.read_file(fn)

        for row in data:
            num_interactions = row['uuid_num_interactions'].values.tolist()
            
            x1 = row[self.cat_feats].values.tolist()
            x2 = row[self.numeric_feats].values.tolist()
            y = self.get_dv_for_classification(
                    row[self.dv_col].tolist())
            x1 = self._segment_filter(num_interactions, 'cat', x1)
            if x1:
                x2 = self._segment_filter(num_interactions, 'numeric', x2)
                y = self._segment_filter(num_interactions, 'dv', y)
                yield (x1, x2, y)
            else:
                continue

    
    def get_stream(self, files):
        return chain.from_iterable(map(self.process_data, files))

    
    def __iter__(self):
        return self.get_stream(self.files)

In [8]:
class ProductRecommendationModel(nn.Module):
    """
    Defines the neural network for product recommendation
    """

    def __init__(self, embedding_sizes, n_cont, n_classes=3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for
                                         categories, size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_emb, self.n_cont, self.n_classes = n_emb, n_cont, n_classes
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 300)
        self.lin2 = nn.Linear(300, 100)
        self.lin3 = nn.Linear(100, self.n_classes)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(300)
        self.bn3 = nn.BatchNorm1d(100)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)


    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)

        return x

In [9]:
import torch.optim as torch_optim
from torch import tensor
from sklearn.metrics import roc_auc_score
from tqdm import tqdm


def choose_embedding_size(cat_cols, cat_num_values, min_emb_dim=100):
    """
    cat_cols: list of categorical columns
    cat_num_values: list of number of unique values for each categorical column
    """

    embedded_cols = dict(zip(cat_cols, cat_num_values))
    embedding_sizes = [(n_categories, min(min_emb_dim, (n_categories+1)//2))
                       for _, n_categories in embedded_cols.items()]
    return embedding_sizes


def get_default_device():
    """Pick GPU if available, else CPU"""

    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')


def to_device(data, device):
    """Move tensor(s) to chosen device"""

    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)


class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""

    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)


def get_optimizer(model, lr = 0.001, wd = 0.0):

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim


def construct_tensor(a):

    final = []
    for i in a:
        out = []
        for j in i:
            out.append(j.tolist())
        out1 = []
        for item in zip(*out):
            out1.append(list(item))
        final += out1
    return tensor(final)


def construct_tensor_y(a):

    out = []
    for i in a:
        out += i.tolist()
    return tensor(out)


def train_model(model, optim, train_dl, train_size, chunksize, batch_size,
                device, loss_fn=F.cross_entropy):

    model.train()
    total = 0
    sum_loss = 0
    with tqdm(total=train_size // (batch_size * chunksize)) as pbar:
        for x1, x2, y in train_dl:
            x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                         construct_tensor_y(y))
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            batch = y.size()[0]
            output = model(x1, x2)
            loss = loss_fn(output, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            total += batch
            sum_loss += loss.item()
            pbar.update(1)
    return sum_loss/total


def val_loss(model, valid_dl, test_size, chunksize, batch_size,
             device, loss_fn=F.cross_entropy):

    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    sum_auc_macro = 0
    sum_auc_micro = 0
    num_aucs = 0
    with tqdm(total=test_size // (batch_size * chunksize)) as pbar:
        for x1, x2, y in valid_dl:
            x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                         construct_tensor_y(y))
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            batch = y.size()[0]
            out = model(x1, x2)
            loss = loss_fn(out, y)
            sum_loss += loss.item()
            total += batch
            pred = torch.max(out, 1)[1]
            pred_prob = F.softmax(out, dim=1)
            y_onehot = F.one_hot(y)
            correct += (pred == y).float().sum().item()
            pred_prob = pred_prob.cpu().detach().numpy()
            y_onehot = y_onehot.cpu().detach().numpy()
            try:
                sum_auc_macro += roc_auc_score(y_onehot, pred_prob,
                                               average='macro')
                sum_auc_micro += roc_auc_score(y_onehot, pred_prob,
                                               average='micro')
                num_aucs += 1
            except:
                continue

            pbar.update(1)
    print("valid loss %.3f, accuracy %.3f, macro auc %.3f and micro auc %.3f" % (
        sum_loss/total, correct/total, sum_auc_macro/num_aucs, sum_auc_micro/num_aucs))
    return sum_loss/total, correct/total, sum_auc_macro/num_aucs, sum_auc_micro/num_aucs


def train_loop(model, train_dl, valid_dl, epochs, train_size,
               test_size, chunksize, batch_size, device, lr=0.01,
               wd=0.0, loss_fn=F.cross_entropy):

    optim = get_optimizer(model, lr = lr, wd = wd)
    start = time.time()
    losses = []
    for i in range(epochs):
        stats = {'epoch': i+1}
        train_loss = train_model(model, optim, train_dl, train_size,
                                 chunksize, batch_size, device,
                                 loss_fn)
        print("training loss: ", train_loss)
        stats['train_loss'] = train_loss
        loss, acc, auc_macro, auc_micro = val_loss(
            model, valid_dl, test_size, chunksize, batch_size, device, loss_fn)
        print('time taken: %0.2f' % (time.time() - start))
        stats['test_loss'] = loss
        stats['test_acc'] = acc
        stats['test_auc_macro'] = auc_macro
        stats['test_auc_micro'] = auc_micro
        losses.append(stats)
    return losses

### Segment - < 20

In [10]:
# GLOBALS
TRAIN_FILE_NAME = '0000_part_00.gz'
TEST_FILE_NAME = '0005_part_07.gz'
SEGMENT = 'LE3'
N_USERS = 4881444
N_ITEMS = 1175648
N_ONTOLOGIES = 801
N_BRANDS = 1686
BATCH_SIZE = 50
CHUNKSIZE = 100
TRAIN_SIZE = 4812995 # corresponds to FILE_NAME
TEST_SIZE = 1371989    # corresponds to FILE_NAME

In [12]:
# choose embedding size

if SEGMENT != 'GE20':
    cat_cols = [ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_ITEMS, N_ONTOLOGIES, N_BRANDS]
else:
    cat_cols = [USER_COL, ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_USERS, N_ITEMS, N_ONTOLOGIES, N_BRANDS]

embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 150)

In [13]:
embedding_sizes

[(1175648, 150), (801, 150), (1686, 150)]

In [14]:
# dataset

from torch.utils.data import DataLoader

train_dataset = InteractionsStream(
    file_name=TRAIN_FILE_NAME, model_type='classification',
    sample='train', chunksize=CHUNKSIZE, segment=SEGMENT)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=False)

test_dataset = InteractionsStream(
    file_name=TEST_FILE_NAME, model_type='classification',
    sample='test', chunksize=CHUNKSIZE, segment=SEGMENT)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         shuffle=False)

['/data/ExternalTest_Data/MAD/interim/0000_part_00.gz']
['/data/ExternalTest_Data/MAD/interim/0005_part_07.gz']


In [15]:
train_loader.batch_size

50

In [16]:
device = get_default_device()
device

device(type='cuda')

In [17]:
n_cont = len(train_loader.dataset.numeric_feats)
print('number of numeric vars: ', n_cont)

net = ProductRecommendationModel(embedding_sizes, n_cont, 3)

number of numeric vars:  18


In [18]:
net

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1175648, 150)
    (1): Embedding(801, 150)
    (2): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=468, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [19]:
to_device(net, device)

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1175648, 150)
    (1): Embedding(801, 150)
    (2): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=468, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [20]:
from itertools import islice

for x1, x2, y in islice(train_loader, 2):
    x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                 construct_tensor_y(y))
    print(x1, '\t', x1.shape)
    print('\n')
    print(x2)
    print('\n')
    print(y, '\t', y.shape)
    print('\n\n\n')

read data
tensor([[ 720408,     727,     346],
        [ 674750,     783,    1147],
        [ 886279,     512,    1592],
        [ 922935,     217,     757],
        [ 247773,     104,     808],
        [ 631158,     241,     965],
        [1004812,     617,    1133],
        [ 208059,     104,     547],
        [1158573,     165,     101],
        [1134965,     659,    1246],
        [ 173913,     657,    1327],
        [ 133463,     783,    1437],
        [ 134684,     512,    1144],
        [ 902684,     165,     862],
        [ 681135,     104,     235],
        [1158573,     165,     101],
        [ 489617,      87,    1229],
        [ 574000,     512,    1246],
        [1030658,     757,    1211],
        [ 611497,     696,     444],
        [ 884114,     423,    1647],
        [ 273820,     427,     832],
        [  71791,     737,     124],
        [  63868,     512,    1246],
        [ 493093,     217,    1327],
        [ 492524,     512,    1508],
        [ 494617,     512,  

tensor([[ 815583,     423,     454],
        [ 232899,     619,    1144],
        [1142639,     336,     381],
        ...,
        [ 532674,     799,    1394],
        [ 832950,     423,     454],
        [   5648,     708,     226]]) 	 torch.Size([650, 3])


tensor([[2.7990e+03, 9.0000e+00, 8.1775e+02,  ..., 9.0000e+00, 1.0533e+00,
         5.0611e-01],
        [5.9900e+02, 5.0000e+00, 1.7678e+03,  ..., 6.0000e+00, 1.0651e+00,
         3.6043e-01],
        [9.4500e+02, 2.0000e+00, 9.4500e+02,  ..., 4.7000e+01, 1.1414e+00,
         2.1497e-01],
        ...,
        [6.5000e+03, 3.0000e+00, 6.5000e+03,  ..., 7.0000e+00, 1.0535e+00,
         5.2946e-01],
        [2.7990e+03, 4.0000e+00, 2.5240e+03,  ..., 1.0000e+01, 1.0386e+00,
         3.9329e-02],
        [4.4900e+02, 1.6000e+01, 4.9469e+03,  ..., 9.0000e+00, 1.1287e+00,
         2.0370e-03]])


tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [21]:
for x1, x2, y in islice(test_loader, 2):
    x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                 construct_tensor_y(y))
    x1 = x1.to(device)
    x2 = x2.to(device)
    y = y.to(device)
    print(x1)
    print('\n')
    print(x2)
    print('\n')
    print(y)
    print('\n\n\n')
    print('shape of y: ', y.size())
    print('shape of x1: ', x1.size())
    print('shape of x2: ', x2.size())
    out = net(x1, x2)
    print('model output: ', out)
    loss = F.cross_entropy(out, y)
    print('Loss: ', loss.item())

read data
tensor([[530312,    676,    301],
        [344594,    382,    660],
        [442471,    104,    238],
        ...,
        [778171,    678,    660],
        [ 42138,    431,     27],
        [338068,    364,   1182]], device='cuda:0')


tensor([[1.8750e+04, 1.0000e+00, 1.7500e+04,  ..., 5.0000e+00, 1.0357e+00,
         6.6850e+01],
        [3.4590e+04, 3.0000e+00, 4.4765e+04,  ..., 2.4000e+01, 1.1236e+00,
         6.7049e+01],
        [2.1999e+04, 1.1000e+01, 1.5918e+04,  ..., 2.1000e+01, 1.1140e+00,
         6.5578e+01],
        ...,
        [1.3990e+04, 1.0000e+00, 9.1000e+03,  ..., 7.0000e+00, 1.0286e+00,
         4.3703e+01],
        [9.9900e+02, 1.8000e+01, 5.2206e+03,  ..., 7.0000e+00, 1.0964e+00,
         4.3920e+01],
        [2.5990e+04, 8.0000e+00, 3.3349e+04,  ..., 1.2000e+01, 1.0694e+00,
         6.8808e+01]], device='cuda:0')


tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [22]:
losses = train_loop(model=net, train_dl=train_loader,
                    valid_dl=test_loader, epochs=3,
                    train_size=TRAIN_SIZE, test_size=TEST_SIZE,
                    chunksize=CHUNKSIZE, batch_size=BATCH_SIZE,
                    device=device, lr=0.02, wd=0.00001,
                    loss_fn=F.cross_entropy)

  0%|          | 0/962 [00:00<?, ?it/s]

read data


963it [04:40,  3.43it/s]                         
  0%|          | 0/274 [00:00<?, ?it/s]

training loss:  0.0005126195600893918
read data


 53%|█████▎    | 145/274 [01:14<01:06,  1.95it/s]
  0%|          | 0/962 [00:00<?, ?it/s]

valid loss 0.000, accuracy 0.926, macro auc 0.525 and micro auc 0.967
time taken: 354.64
read data


963it [04:41,  3.42it/s]                         
  0%|          | 0/274 [00:00<?, ?it/s]

training loss:  0.0004887084268028866
read data


 53%|█████▎    | 145/274 [01:16<01:08,  1.89it/s]
  0%|          | 0/962 [00:00<?, ?it/s]

valid loss 0.000, accuracy 0.933, macro auc 0.557 and micro auc 0.969
time taken: 713.12
read data


963it [04:48,  3.33it/s]                         
  0%|          | 0/274 [00:00<?, ?it/s]

training loss:  0.0004896451841191066
read data


 53%|█████▎    | 145/274 [01:16<01:08,  1.89it/s]

valid loss 0.000, accuracy 0.928, macro auc 0.559 and micro auc 0.966
time taken: 1078.79





In [23]:
losses

[{'epoch': 1,
  'train_loss': 0.0005126195600893918,
  'test_loss': 0.00044533589014774744,
  'test_acc': 0.9257399659073299,
  'test_auc_macro': 0.5251515309534442,
  'test_auc_micro': 0.9667040375041938},
 {'epoch': 2,
  'train_loss': 0.0004887084268028866,
  'test_loss': 0.0004100935259715875,
  'test_acc': 0.933364326669766,
  'test_auc_macro': 0.5570571219986813,
  'test_auc_micro': 0.9694310671958396},
 {'epoch': 3,
  'train_loss': 0.0004896451841191066,
  'test_loss': 0.0004525444943295622,
  'test_acc': 0.9276873805465158,
  'test_auc_macro': 0.5594984226243153,
  'test_auc_micro': 0.9662497053403207}]

### Segment >= 20

In [8]:
# GLOBALS
TRAIN_FILE_NAME = '0000_part_00.gz'
TEST_FILE_NAME = '0005_part_07.gz'
SEGMENT = '>=20'
N_USERS = 1444170
N_ITEMS = 1175648
N_ONTOLOGIES = 801
N_BRANDS = 1686
BATCH_SIZE = 20
CHUNKSIZE = 100
TRAIN_SIZE = 4812995 # corresponds to FILE_NAME
TEST_SIZE = 1371989    # corresponds to FILE_NAME

In [9]:
# choose embedding size

if SEGMENT == '<20':
    cat_cols = [ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_ITEMS, N_ONTOLOGIES, N_BRANDS]
else:
    cat_cols = [USER_COL, ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_USERS, N_ITEMS, N_ONTOLOGIES, N_BRANDS]

embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 150)

In [10]:
embedding_sizes

[(1444170, 150), (1175648, 150), (801, 150), (1686, 150)]

In [11]:
# dataset

from torch.utils.data import DataLoader

train_dataset = InteractionsStream(
    file_name=TRAIN_FILE_NAME, model_type='classification',
    sample='train', chunksize=CHUNKSIZE, segment='>=20')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=False)

test_dataset = InteractionsStream(
    file_name=TEST_FILE_NAME, model_type='classification',
    sample='test', chunksize=CHUNKSIZE, segment='>=20')
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         shuffle=False)

['/data/ExternalTest_Data/MAD/interim/0000_part_00.gz']
['/data/ExternalTest_Data/MAD/interim/0005_part_07.gz']


In [12]:
device = get_default_device()
device

device(type='cuda')

In [13]:
n_cont = len(train_loader.dataset.numeric_feats)
print('number of numeric vars: ', n_cont)

net = ProductRecommendationModel(embedding_sizes, n_cont, 3)

number of numeric vars:  18


In [14]:
net

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1444170, 150)
    (1): Embedding(1175648, 150)
    (2): Embedding(801, 150)
    (3): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=618, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [15]:
to_device(net, device)

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1444170, 150)
    (1): Embedding(1175648, 150)
    (2): Embedding(801, 150)
    (3): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=618, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [16]:
train_dataset.cat_feats

['uuid', 'sourceprodid', 'ontology', 'brand']

In [17]:
from itertools import islice

for x1, x2, y in islice(train_loader, 2):
    x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                 construct_tensor_y(y))
    print(x1, '\t', x1.shape)
    print('\n')
    print(x2)
    print('\n')
    print(y, '\t', y.shape)
    print('\n\n\n')

read data
tensor([[      0, 1084662,     431,    1356],
        [     84,  383428,     512,    1246],
        [    166, 1082098,     217,    1327],
        ...,
        [   1236,  867060,     695,     906],
        [   1300,  459801,     696,    1437],
        [   1356,  424042,     739,     708]]) 	 torch.Size([1540, 4])


tensor([[8.9900e+02, 9.2000e+01, 1.7840e+03,  ..., 6.0000e+00, 1.0883e+00,
         4.0625e-03],
        [9.9900e+02, 5.7700e+02, 1.7922e+03,  ..., 1.5000e+01, 1.0766e+00,
         4.3403e-03],
        [4.1990e+03, 3.2000e+01, 5.2756e+03,  ..., 7.0000e+00, 1.0177e+00,
         5.9375e-03],
        ...,
        [1.2990e+03, 3.9460e+03, 1.6930e+03,  ..., 6.0000e+00, 1.0583e+00,
         4.2176e-02],
        [7.9900e+02, 5.9000e+01, 3.6607e+03,  ..., 1.1000e+01, 1.0694e+00,
         2.2638e-01],
        [6.9900e+02, 5.9500e+02, 3.4337e+03,  ..., 4.0000e+00, 1.0638e+00,
         0.0000e+00]])


tensor([0, 0, 0,  ..., 0, 0, 0]) 	 torch.Size([1540])




tensor([[    543, 

In [18]:
for x1, x2, y in islice(test_loader, 2):
    x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                 construct_tensor_y(y))
    x1 = x1.to(device)
    x2 = x2.to(device)
    y = y.to(device)
    print(x1)
    print('\n')
    print(x2)
    print('\n')
    print(y)
    print('\n\n\n')
    print('shape of y: ', y.size())
    print('shape of x1: ', x1.size())
    out = net(x1, x2)
    print('model output: ', out)
    loss = F.cross_entropy(out, y)
    print('Loss: ', loss.item())

read data
tensor([[ 659123,  911340,     431,    1480],
        [  71264,  329759,     222,    1437],
        [ 445621,  898016,     591,    1327],
        ...,
        [1015707,  677610,     277,    1197],
        [ 484325,   18975,     542,     557],
        [ 841773,  838605,     512,    1246]], device='cuda:0')


tensor([[1.0990e+03, 7.7600e+02, 2.6948e+03,  ..., 4.0000e+00, 1.0271e+00,
         3.8233e+01],
        [2.9900e+02, 4.7700e+02, 1.0494e+03,  ..., 5.0000e+00, 1.0723e+00,
         1.2464e+02],
        [5.2950e+03, 2.0500e+02, 7.3025e+03,  ..., 1.3000e+01, 1.0593e+00,
         5.2573e+01],
        ...,
        [1.6990e+03, 2.8000e+01, 2.6732e+03,  ..., 6.0000e+00, 1.0906e+00,
         5.2098e+01],
        [1.2999e+04, 3.7000e+02, 2.6119e+04,  ..., 1.4000e+01, 1.0940e+00,
         5.5663e+01],
        [1.4990e+03, 2.4900e+02, 4.9271e+03,  ..., 5.0000e+00, 1.0452e+00,
         2.6743e+01]], device='cuda:0')


tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')




shape of y: 

In [20]:
losses = train_loop(model=net, train_dl=train_loader,
                    valid_dl=test_loader, epochs=2,
                    train_size=TRAIN_SIZE, test_size=TEST_SIZE,
                    chunksize=CHUNKSIZE, batch_size=BATCH_SIZE,
                    device=device, lr=0.02, wd=0.00001,
                    loss_fn=F.cross_entropy)

  0%|          | 1/2406 [00:00<07:31,  5.32it/s]

read data


2407it [07:29,  5.35it/s]                          
  0%|          | 1/685 [00:00<01:33,  7.31it/s]

training loss:  0.00014746794868605878
read data


100%|██████████| 685/685 [01:31<00:00,  7.48it/s]
  0%|          | 1/2406 [00:00<07:27,  5.37it/s]

valid loss 0.000, accuracy 0.937, macro auc 0.637 and micro auc 0.973
time taken: 541.33
read data


2407it [07:24,  5.41it/s]                          
  0%|          | 1/685 [00:00<01:30,  7.57it/s]

training loss:  0.00014503995077135603
read data


100%|██████████| 685/685 [01:28<00:00,  7.74it/s]

valid loss 0.000, accuracy 0.937, macro auc 0.636 and micro auc 0.973
time taken: 1074.76





In [21]:
losses

[{'epoch': 1,
  'train_loss': 0.00014746794868605878,
  'test_loss': 0.0001891156478376139,
  'test_acc': 0.9365029674558143,
  'test_auc_macro': 0.6369750546379287,
  'test_auc_micro': 0.9731460712555777},
 {'epoch': 2,
  'train_loss': 0.00014503995077135603,
  'test_loss': 0.0001883261116955,
  'test_acc': 0.9365464466618839,
  'test_auc_macro': 0.6356256690320318,
  'test_auc_micro': 0.9726313888331596}]