In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import io
import os
import json
import time
import sys
import math
import copy
import pickle
import zipfile
from textwrap import wrap
from pathlib import Path
from itertools import zip_longest
from collections import defaultdict
from urllib.error import URLError
from urllib.request import urlopen

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch import optim
from torch import tensor
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler

In [3]:
sys.path.append("/data/ExternalTest/MAD/src/")
from constants import *
from metadata_utils import _find_files
from baseline_feats_utils import feat_type_feats_dct

In [4]:
user2idx_dct = {'GE20': USER2IDX_SEGGE20_FN,
                'LE3': USER2IDX_SEGLE3_FN,
                '4-19': USER2IDX_SEG4TO19_FN}

In [5]:
from torch.utils.data import IterableDataset
from itertools import chain, islice


class InteractionsStream(IterableDataset):

    def __init__(self, sample, model_type, file_name=None,
                 interim_data_dir=INTERIM_DATA_DIR, user_col=USER_COL,
                 item_col=ITEM_COL, ontology_col=ONTOLOGY_COL,
                 brand_col=BRAND_COL, price_col=PRICE_COL, dv_col=DV_COL,
                 date_col=DATE_COL, end_token='.gz', chunksize=10,
                 segment='LE3', user2idx_dct=user2idx_dct):

        data_dir = interim_data_dir
        
        if file_name is None:
            files = _find_files(data_dir, end_token)
            if sample == 'train':
                self.files = [os.path.join(data_dir, x) for x in files
                              if not x.startswith('0005')]
            elif sample == 'test':
                self.files = [os.path.join(data_dir, x) for x in files
                              if x.startswith('0005')]
        else:
            self.files = [os.path.join(data_dir, file_name)]
        print(self.files)
        
        self.model_type = model_type
        self.segment = segment
        self.user_col = user_col
        self.item_col = item_col
        self.ontology_col = ontology_col
        self.brand_col = brand_col
        self.price_col = price_col
        self.date_col = date_col
        self.dv_col = dv_col
        self.feat_type_feats_dct = feat_type_feats_dct
        self.chunksize = chunksize
        user_feats = ['{}_{}'.format(self.user_col, x) for x in
                      self.feat_type_feats_dct['user']
                      if x != 'earliest_interaction_date']
        user_feats.append('{}_days_since_earliest_interaction'.format(
            self.user_col))
        item_feats = ['{}_{}'.format(self.item_col, x) for x in
                      self.feat_type_feats_dct['item']
                      if x != 'earliest_interaction_date']
        item_feats.append('{}_days_since_earliest_interaction'.format(
            self.item_col))
        self.numeric_feats = [self.price_col] + user_feats + item_feats
        if self.segment == 'GE20':
            self.cat_feats = [self.user_col, self.item_col,
                              self.ontology_col, self.brand_col]
        else:
            self.cat_feats = [self.item_col, self.ontology_col,
                              self.brand_col]
        if self.segment == 'GE20':
            self.user2idx = json.load(open(user2idx_dct.get(self.segment)))
        else:
            self.user2idx = None
        

    def read_file(self, fn):
        
        df = pd.read_csv(fn, compression='gzip', sep='|', iterator=True,
                         chunksize=self.chunksize)
        return df
    
    
    def get_dv_for_classification(self, dv_lst):
        
        if self.model_type == 'classification':
            return [int(x-1) for x in dv_lst]
        else:
            return [int(x) for x in dv_lst]
        
    
    def _segment_filter(self, num_interactions_lst, feat_type, feats_lst):
        
        if self.segment != 'GE20':
            idxs = [i for i, x in enumerate(num_interactions_lst)
                    if x < 20]
        elif self.segment == 'GE20':
            idxs = [i for i, x in enumerate(num_interactions_lst)
                    if x >= 20]
        
        if idxs:
            new_feats_lst = [feats_lst[i] for i in idxs]
            if (self.segment == 'GE20') and (feat_type == 'cat'):
                new_feats_lst = []
                for i in idxs:
                    out = feats_lst[i]
                    out[0] = self.user2idx[str(out[0])]
                    new_feats_lst.append(out)
            return new_feats_lst

    
    def process_data(self, fn):

        print('read data')
        data = self.read_file(fn)

        for row in data:
            num_interactions = row['uuid_num_interactions'].values.tolist()
            
            x1 = row[self.cat_feats].values.tolist()
            x2 = row[self.numeric_feats].values.tolist()
            y = self.get_dv_for_classification(
                    row[self.dv_col].tolist())
            x1 = self._segment_filter(num_interactions, 'cat', x1)
            if x1:
                x2 = self._segment_filter(num_interactions, 'numeric', x2)
                y = self._segment_filter(num_interactions, 'dv', y)
                yield (x1, x2, y)
            else:
                continue

    
    def get_stream(self, files):
        return chain.from_iterable(map(self.process_data, files))

    
    def __iter__(self):
        return self.get_stream(self.files)

In [6]:
class ProductRecommendationModel(nn.Module):
    """
    Defines the neural network for product recommendation
    """

    def __init__(self, embedding_sizes, n_cont, n_classes=3):
        super().__init__()
        self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for
                                         categories, size in embedding_sizes])
        n_emb = sum(e.embedding_dim for e in self.embeddings)
        self.n_emb, self.n_cont, self.n_classes = n_emb, n_cont, n_classes
        self.lin1 = nn.Linear(self.n_emb + self.n_cont, 300)
        self.lin2 = nn.Linear(300, 100)
        self.lin3 = nn.Linear(100, self.n_classes)
        self.bn1 = nn.BatchNorm1d(self.n_cont)
        self.bn2 = nn.BatchNorm1d(300)
        self.bn3 = nn.BatchNorm1d(100)
        self.emb_drop = nn.Dropout(0.6)
        self.drops = nn.Dropout(0.3)


    def forward(self, x_cat, x_cont):
        x = [e(x_cat[:, i]) for i, e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        x = torch.cat([x, x2], 1)
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)

        return x

In [7]:
import torch.optim as torch_optim
from torch import tensor
from sklearn.metrics import roc_auc_score
from tqdm import tqdm


def choose_embedding_size(cat_cols, cat_num_values, min_emb_dim=100):
    """
    cat_cols: list of categorical columns
    cat_num_values: list of number of unique values for each categorical column
    """

    embedded_cols = dict(zip(cat_cols, cat_num_values))
    embedding_sizes = [(n_categories, min(min_emb_dim, (n_categories+1)//2))
                       for _, n_categories in embedded_cols.items()]
    return embedding_sizes


def get_default_device():
    """Pick GPU if available, else CPU"""

    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')


def to_device(data, device):
    """Move tensor(s) to chosen device"""

    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)


class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""

    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)


def get_optimizer(model, lr = 0.001, wd = 0.0):

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch_optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim


def construct_tensor(a):

    final = []
    for i in a:
        out = []
        for j in i:
            out.append(j.tolist())
        out1 = []
        for item in zip(*out):
            out1.append(list(item))
        final += out1
    return tensor(final)


def construct_tensor_y(a):

    out = []
    for i in a:
        out += i.tolist()
    return tensor(out)


def train_model(model, optim, train_dl, train_size, chunksize, batch_size,
                device, loss_fn=F.cross_entropy):

    model.train()
    total = 0
    sum_loss = 0
    with tqdm(total=train_size // (batch_size * chunksize)) as pbar:
        for x1, x2, y in train_dl:
            x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                         construct_tensor_y(y))
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            batch = y.size()[0]
            output = model(x1, x2)
            loss = loss_fn(output, y)
            optim.zero_grad()
            loss.backward()
            optim.step()
            total += batch
            sum_loss += loss.item()
            pbar.update(1)
    return sum_loss/total


def val_loss(model, valid_dl, test_size, chunksize, batch_size,
             device, loss_fn=F.cross_entropy):

    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    sum_auc_macro = 0
    sum_auc_micro = 0
    num_aucs = 0
    with tqdm(total=test_size // (batch_size * chunksize)) as pbar:
        for x1, x2, y in valid_dl:
            x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                         construct_tensor_y(y))
            x1 = x1.to(device)
            x2 = x2.to(device)
            y = y.to(device)
            batch = y.size()[0]
            out = model(x1, x2)
            loss = loss_fn(out, y)
            sum_loss += loss.item()
            total += batch
            pred = torch.max(out, 1)[1]
            pred_prob = F.softmax(out, dim=1)
            y_onehot = F.one_hot(y)
            correct += (pred == y).float().sum().item()
            pred_prob = pred_prob.cpu().detach().numpy()
            y_onehot = y_onehot.cpu().detach().numpy()
            try:
                sum_auc_macro += roc_auc_score(y_onehot, pred_prob,
                                               average='macro')
                sum_auc_micro += roc_auc_score(y_onehot, pred_prob,
                                               average='micro')
                num_aucs += 1
            except:
                continue

            pbar.update(1)
    print("valid loss %.3f, accuracy %.3f, macro auc %.3f and micro auc %.3f" % (
        sum_loss/total, correct/total, sum_auc_macro/num_aucs, sum_auc_micro/num_aucs))
    return sum_loss/total, correct/total, sum_auc_macro/num_aucs, sum_auc_micro/num_aucs


def train_loop(model, train_dl, valid_dl, epochs, train_size,
               test_size, chunksize, batch_size, device, lr=0.01,
               wd=0.0, loss_fn=F.cross_entropy):

    optim = get_optimizer(model, lr = lr, wd = wd)
    start = time.time()
    losses = []
    for i in range(epochs):
        stats = {'epoch': i+1}
        train_loss = train_model(model, optim, train_dl, train_size,
                                 chunksize, batch_size, device,
                                 loss_fn)
        print("training loss: ", train_loss)
        stats['train_loss'] = train_loss
        loss, acc, auc_macro, auc_micro = val_loss(
            model, valid_dl, test_size, chunksize, batch_size, device, loss_fn)
        print('time taken: %0.2f' % (time.time() - start))
        stats['test_loss'] = loss
        stats['test_acc'] = acc
        stats['test_auc_macro'] = auc_macro
        stats['test_auc_micro'] = auc_micro
        losses.append(stats)
    return losses

In [8]:
# GLOBALS
SEGMENT = 'GE20'
N_USERS = 1444170
N_ITEMS = 1175648
N_ONTOLOGIES = 801
N_BRANDS = 1686
BATCH_SIZE = 20
CHUNKSIZE = 100
TRAIN_SIZE = 173044425
TEST_SIZE = 34608886

In [9]:
# choose embedding size

if SEGMENT != 'GE20':
    cat_cols = [ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_ITEMS, N_ONTOLOGIES, N_BRANDS]
else:
    cat_cols = [USER_COL, ITEM_COL, ONTOLOGY_COL, BRAND_COL]
    cat_num_values = [N_USERS, N_ITEMS, N_ONTOLOGIES, N_BRANDS]

embedding_sizes = choose_embedding_size(cat_cols, cat_num_values, 150)

In [10]:
embedding_sizes

[(1444170, 150), (1175648, 150), (801, 150), (1686, 150)]

In [11]:
# dataset

from torch.utils.data import DataLoader

train_dataset = InteractionsStream(
    file_name=None, model_type='classification',
    sample='train', chunksize=CHUNKSIZE, segment=SEGMENT)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                          shuffle=False)

test_dataset = InteractionsStream(
    file_name=None, model_type='classification',
    sample='test', chunksize=CHUNKSIZE, segment=SEGMENT)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         shuffle=False)

['/data/ExternalTest_Data/MAD/interim/0000_part_00.gz', '/data/ExternalTest_Data/MAD/interim/0001_part_07.gz', '/data/ExternalTest_Data/MAD/interim/0003_part_07.gz', '/data/ExternalTest_Data/MAD/interim/0004_part_01.gz', '/data/ExternalTest_Data/MAD/interim/0004_part_05.gz', '/data/ExternalTest_Data/MAD/interim/0002_part_00.gz', '/data/ExternalTest_Data/MAD/interim/0003_part_04.gz', '/data/ExternalTest_Data/MAD/interim/0001_part_01.gz', '/data/ExternalTest_Data/MAD/interim/0000_part_01.gz', '/data/ExternalTest_Data/MAD/interim/0002_part_03.gz', '/data/ExternalTest_Data/MAD/interim/0000_part_03.gz', '/data/ExternalTest_Data/MAD/interim/0000_part_02.gz', '/data/ExternalTest_Data/MAD/interim/0001_part_06.gz', '/data/ExternalTest_Data/MAD/interim/0001_part_02.gz', '/data/ExternalTest_Data/MAD/interim/0001_part_05.gz', '/data/ExternalTest_Data/MAD/interim/0002_part_05.gz', '/data/ExternalTest_Data/MAD/interim/0002_part_01.gz', '/data/ExternalTest_Data/MAD/interim/0004_part_03.gz', '/data/Ex

In [13]:
device = get_default_device()
device

device(type='cuda')

In [14]:
n_cont = len(train_loader.dataset.numeric_feats)
print('number of numeric vars: ', n_cont)

net = ProductRecommendationModel(embedding_sizes, n_cont, 3)

number of numeric vars:  18


In [15]:
net

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1444170, 150)
    (1): Embedding(1175648, 150)
    (2): Embedding(801, 150)
    (3): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=618, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [16]:
to_device(net, device)

ProductRecommendationModel(
  (embeddings): ModuleList(
    (0): Embedding(1444170, 150)
    (1): Embedding(1175648, 150)
    (2): Embedding(801, 150)
    (3): Embedding(1686, 150)
  )
  (lin1): Linear(in_features=618, out_features=300, bias=True)
  (lin2): Linear(in_features=300, out_features=100, bias=True)
  (lin3): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm1d(18, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn2): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn3): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (emb_drop): Dropout(p=0.6, inplace=False)
  (drops): Dropout(p=0.3, inplace=False)
)

In [17]:
losses = train_loop(model=net, train_dl=train_loader,
                    valid_dl=test_loader, epochs=1,
                    train_size=TRAIN_SIZE, test_size=TEST_SIZE,
                    chunksize=CHUNKSIZE, batch_size=BATCH_SIZE,
                    device=device, lr=0.02, wd=0.00001,
                    loss_fn=F.cross_entropy)

  0%|          | 1/86522 [00:00<4:48:30,  5.00it/s]

read data


  3%|▎         | 2407/86522 [07:44<4:31:09,  5.17it/s]

read data


  4%|▎         | 3095/86522 [09:53<4:15:18,  5.45it/s]

read data


  4%|▍         | 3781/86522 [12:04<4:18:53,  5.33it/s]

read data


  7%|▋         | 6120/86522 [19:35<4:15:11,  5.25it/s]

read data


 10%|▉         | 8510/86522 [27:05<4:08:11,  5.24it/s]

read data


 13%|█▎        | 10917/86522 [34:46<3:56:15,  5.33it/s]

read data


 15%|█▌        | 13293/86522 [42:10<3:46:34,  5.39it/s]

read data


 18%|█▊        | 15633/86522 [49:38<3:46:05,  5.23it/s]

read data


 21%|██        | 17973/86522 [57:09<3:36:03,  5.29it/s]

read data


 24%|██▎       | 20343/86522 [1:04:30<3:23:02,  5.43it/s]

read data


 26%|██▋       | 22713/86522 [1:11:51<3:14:57,  5.45it/s]

read data


 29%|██▉       | 25061/86522 [1:19:13<3:05:36,  5.52it/s]

read data


 32%|███▏      | 27449/86522 [1:26:41<3:05:04,  5.32it/s]

read data


 34%|███▍      | 29797/86522 [1:34:02<2:50:34,  5.54it/s]

read data


 37%|███▋      | 32185/86522 [1:41:31<2:48:35,  5.37it/s]

read data


 40%|███▉      | 34575/86522 [1:48:58<2:59:54,  4.81it/s]

read data


 43%|████▎     | 36915/86522 [1:56:26<2:34:12,  5.36it/s]

read data


 45%|████▌     | 39286/86522 [2:03:48<2:24:34,  5.45it/s]

read data


 46%|████▌     | 39970/86522 [2:05:57<2:25:17,  5.34it/s]

read data


 49%|████▉     | 42362/86522 [2:13:24<2:16:00,  5.41it/s]

read data


 52%|█████▏    | 44701/86522 [2:20:53<2:11:30,  5.30it/s]

read data


 54%|█████▍    | 47078/86522 [2:28:16<1:59:42,  5.49it/s]

read data


 57%|█████▋    | 49467/86522 [2:35:42<2:04:22,  4.97it/s]

read data


 58%|█████▊    | 50150/86522 [2:37:49<1:55:48,  5.23it/s]

read data


 59%|█████▉    | 50834/86522 [2:39:57<1:51:16,  5.35it/s]

read data


 62%|██████▏   | 53224/86522 [2:47:23<1:44:05,  5.33it/s]

read data


 64%|██████▍   | 55600/86522 [2:54:47<1:42:41,  5.02it/s]

read data


 67%|██████▋   | 57992/86522 [3:02:14<1:28:29,  5.37it/s]

read data


 70%|██████▉   | 60339/86522 [3:09:35<1:19:39,  5.48it/s]

read data


 73%|███████▎  | 62729/86522 [3:17:04<1:14:58,  5.29it/s]

read data


 75%|███████▌  | 65134/86522 [3:24:43<1:08:18,  5.22it/s]

read data


 78%|███████▊  | 67481/86522 [3:32:10<58:13,  5.45it/s]  

read data


 81%|████████  | 69829/86522 [3:39:37<50:57,  5.46it/s]  

read data


 83%|████████▎ | 72235/86522 [3:47:23<46:01,  5.17it/s]

read data


 86%|████████▌ | 74606/86522 [3:54:45<36:33,  5.43it/s]

read data


 89%|████████▉ | 76982/86522 [4:02:12<29:30,  5.39it/s]

read data


 92%|█████████▏| 79353/86522 [4:09:35<22:01,  5.43it/s]

read data


 94%|█████████▍| 81743/86522 [4:17:06<15:41,  5.07it/s]

read data


 97%|█████████▋| 84119/86522 [4:24:31<07:26,  5.38it/s]

read data


86524it [4:32:14,  5.30it/s]                           
  0%|          | 1/17304 [00:00<38:25,  7.50it/s]

training loss:  0.0001807517974001042
read data


 14%|█▎        | 2379/17304 [05:30<33:49,  7.35it/s]

read data


 27%|██▋       | 4706/17304 [10:50<27:43,  7.57it/s]

read data


 31%|███       | 5391/17304 [12:23<26:24,  7.52it/s]

read data


 45%|████▌     | 7795/17304 [18:05<22:08,  7.16it/s]

read data


 59%|█████▊    | 10128/17304 [23:34<16:33,  7.22it/s]

read data


 72%|███████▏  | 12491/17304 [28:55<10:45,  7.46it/s]

read data


 86%|████████▌ | 14874/17304 [34:17<05:36,  7.21it/s]

read data


100%|█████████▉| 17247/17304 [39:37<00:07,  7.25it/s]

valid loss 0.000, accuracy 0.937, macro auc 0.639 and micro auc 0.973
time taken: 18712.36





In [18]:
losses

[{'epoch': 1,
  'train_loss': 0.0001807517974001042,
  'test_loss': 0.00020090498921754253,
  'test_acc': 0.9368723338407677,
  'test_auc_macro': 0.6392410360275745,
  'test_auc_micro': 0.9732561114418942}]

In [19]:
# save model's state dict
model_fn = os.path.join(MODEL_DIR, 'Class_model_SegGE20_E1.pt')
torch.save(net.state_dict, model_fn)

  "type " + obj.__name__ + ". It won't be checked "


In [20]:
# save and checkpoint model
model_ckpt_fn = os.path.join(MODEL_DIR, 'Class_model_SegGE20_E1_ckpt.pt')
torch.save({
            'epoch': 1,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': get_optimizer(
                net, lr = 0.02, wd = 0.00001).state_dict(),
            'loss': losses[-1]['test_loss'],
            'acc': losses[-1]['test_acc'],
            'auc_macro': losses[-1]['test_auc_macro'],
            'auc_micro': losses[-1]['test_auc_micro']
            }, model_ckpt_fn)

In [21]:
# testing

preds = []
pred_probs = []
actuals = []
actuals_onehot = []
with torch.no_grad():
    for x1, x2, y in test_loader:
        x1, x2, y = (construct_tensor(x1), construct_tensor(x2),
                     construct_tensor_y(y))
        x1 = x1.to(device)
        x2 = x2.to(device)
        y = y.to(device)
        out = net(x1, x2)
        pred = torch.max(out, 1)[1]
        pred_prob = F.softmax(out, dim=1)
        y_onehot = F.one_hot(y)
        preds.append(pred)
        pred_probs.append(pred_prob)
        actuals.append(y)
        actuals_onehot.append(y_onehot)

read data
read data
read data
read data
read data
read data
read data
read data


In [22]:
(len(preds), len(pred_probs), len(actuals), len(actuals_onehot),
 actuals_onehot[0])

(17305,
 17305,
 17305,
 17305,
 tensor([[1, 0, 0],
         [1, 0, 0],
         [1, 0, 0],
         ...,
         [1, 0, 0],
         [0, 1, 0],
         [1, 0, 0]], device='cuda:0'))

In [23]:
final_preds = [item for sublist in preds for
               item in sublist.cpu().detach().tolist()]
final_pred_probs = [item for sublist in pred_probs for
                    item in sublist.cpu().detach().tolist()]
final_actuals = [item for sublist in actuals for
                 item in sublist.cpu().detach().tolist()]
final_actuals_onehot = F.one_hot(tensor(final_actuals))

In [24]:
(len(final_preds), len(final_pred_probs), len(final_actuals),
 len(final_actuals_onehot), final_actuals_onehot[:5],
 final_pred_probs[:5], final_actuals[:5])

(23251612,
 23251612,
 23251612,
 23251612,
 tensor([[1, 0, 0],
         [1, 0, 0],
         [1, 0, 0],
         [1, 0, 0],
         [1, 0, 0]]),
 [[0.9744861721992493, 0.022715192288160324, 0.002798564499244094],
  [0.9864744544029236, 0.01223214715719223, 0.001293312176130712],
  [0.9910107851028442, 0.008373735472559929, 0.0006154446164146066],
  [0.9869650602340698, 0.011990916915237904, 0.0010439311154186726],
  [0.9823639392852783, 0.016432181000709534, 0.0012038853019475937]],
 [0, 0, 0, 0, 0])

In [25]:
final_pred_probs = np.array(final_pred_probs)
final_actuals_onehot = np.array(final_actuals_onehot)
final_actuals = np.array(final_actuals)
final_preds = np.array(final_preds)

In [26]:
print(final_pred_probs.shape)
print(final_actuals_onehot.shape)
print(final_pred_probs[0])
print(final_actuals_onehot[:2])

(23251612, 3)
(23251612, 3)
[0.97448617 0.02271519 0.00279856]
[[1 0 0]
 [1 0 0]]


In [27]:
from sklearn.metrics import mean_squared_error, roc_auc_score

acc = (final_actuals == final_preds).mean()
auc_macro = roc_auc_score(y_true=final_actuals_onehot,
                          y_score=final_pred_probs, average='macro')
auc_micro = roc_auc_score(y_true=final_actuals_onehot,
                          y_score=final_pred_probs, average='micro')
rmse = np.sqrt(mean_squared_error(y_true=final_actuals,
                                  y_pred=final_preds))

print('Test Accuracy: %0.3f' % (acc))
print('Test Macro AUC: %0.3f' % (auc_macro))
print('Test Micro AUC: %0.3f' % (auc_micro))
print('Test RMSE: %0.3f' % (rmse))

Test Accuracy: 0.937
Test Macro AUC: 0.632
Test Micro AUC: 0.974
Test RMSE: 0.277


In [28]:
prediction_df = pd.DataFrame(
    {'actual': final_actuals,
     'actual_onehot': final_actuals_onehot.tolist(),
     'pred_class': final_preds,
     'pred_prob': final_pred_probs.tolist()})

print(prediction_df.shape)
prediction_df.head()

(23251612, 4)


Unnamed: 0,actual,actual_onehot,pred_class,pred_prob
0,0,"[1, 0, 0]",0,"[0.9744861721992493, 0.022715192288160324, 0.0..."
1,0,"[1, 0, 0]",0,"[0.9864744544029236, 0.01223214715719223, 0.00..."
2,0,"[1, 0, 0]",0,"[0.9910107851028442, 0.008373735472559929, 0.0..."
3,0,"[1, 0, 0]",0,"[0.9869650602340698, 0.011990916915237904, 0.0..."
4,0,"[1, 0, 0]",0,"[0.9823639392852783, 0.016432181000709534, 0.0..."


In [29]:
pred_fn = os.path.join(PREDICTION_DIR, 'prediction_SegGE20_E1.csv.gz')
prediction_df.to_csv(pred_fn, compression='gzip', index=False)