In [1]:
import os
# Artifact directories, all written relative to the notebook's working directory.
train_path = '../../raw_train_artifact'
test_path = '../../raw_test_artifact'
embedding_path = '../../embedding_artifact'
input_path = '../../input_artifact'
model_path = '../../model_artifact'
output_path = '../../output_artifact'
# The per-split input files sit in a sub-folder of the input artifact directory.
input_split_path = f'{input_path}/input_split'

In [2]:
import sys
import gc
gc.enable()
import time
import re

import numpy as np
import pandas as pd
# Pandas display configuration.
# Fully-qualified option keys are used on purpose: short keys are resolved by
# pattern matching, and a bare 'precision' is ambiguous (OptionError) on pandas
# versions that also expose 'styler.format.precision'.
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.precision', 5)
pd.set_option('display.float_format', '{:.5f}'.format)

import tqdm
import joblib
import json

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from gensim.models import Word2Vec
import torch
from torch import nn
import torch.nn.functional as F

In [3]:
import logging

# Log file written next to the notebook; every message is also echoed to stdout.
log_path = '[1.3]LSTM with Creative, Advertiser & Product Embedding Sequence.log'

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object for the same name, so
# re-running this cell would attach a second pair of handlers and every message
# would be emitted twice. Detach and close whatever a previous run left behind.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s %(levelname)-s: %(message)s', datefmt='%H:%M:%S')

# File handler: persistent record of the run.
fh = logging.FileHandler(log_path)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)

# Stream handler: same records, shown in the notebook output.
sh = logging.StreamHandler(sys.stdout)
sh.setLevel(logging.INFO)
sh.setFormatter(formatter)
logger.addHandler(sh)

logger.info(f'Restart notebook\n==========================\n{time.ctime()}\n==========================')

17:49:03 INFO: Restart notebook
Wed Jun  3 17:49:03 2020


In [4]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info('Device in Use: {}'.format(DEVICE))

# The memory report below relies on CUDA-only queries, which raise when DEVICE
# is the CPU fallback, so it is only produced for a CUDA device.
if DEVICE.type == 'cuda':
    torch.cuda.empty_cache()
    # `memory_cached` is the deprecated name of `memory_reserved`; prefer the
    # new name and only fall back on PyTorch builds that predate it.
    _reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    t = torch.cuda.get_device_properties(DEVICE).total_memory/1024**3
    c = _reserved_bytes(DEVICE)/1024**3
    a = torch.cuda.memory_allocated(DEVICE)/1024**3
    logger.info('CUDA Memory: Total {:.2f} GB, Cached {:.2f} GB, Allocated {:.2f} GB'.format(t,c,a))

17:49:03 INFO: Device in Use: cuda
17:49:03 INFO: CUDA Memory: Total 8.00 GB, Cached 0.00 GB, Allocated 0.00 GB


## Data Loader

### General Utility

In [9]:
# Registry of the embedding variables that can be loaded.
# For each variable:
#   - 'embedding_artifact': location of the saved Word2Vec model, resolved against
#     `embedding_path` so the notebook does not depend on one machine's absolute path
#   - 'train_file_prefix' / 'test_file_prefix': prefix of the per-split JSON files
#     (the full name is '<prefix>_<split_id>.json' inside `input_split_path`)
inp_embed_artifact = {
    'creative': {
        'embedding_artifact': os.path.join(embedding_path, 'creative_id_embed_s160_w64_cbow_38168zon'),
        'train_file_prefix': 'train_creative_agg_user',
        'test_file_prefix': 'test_creative_agg_user'
    },
    'ad': {
        'embedding_artifact': os.path.join(embedding_path, 'ad_id_embed_s160_w64_cbow_ibfi8g78'),
        'train_file_prefix': 'train_ad_agg_user',
        'test_file_prefix': 'test_ad_agg_user'
    },
    'advertiser': {
        'embedding_artifact': os.path.join(embedding_path, 'advertiser_id_embed_s128_w64_cbow_n4re8tds'),
        'train_file_prefix': 'train_advertiser_agg_user',
        'test_file_prefix': 'test_advertiser_agg_user'
    },
    'product': {
        'embedding_artifact': os.path.join(embedding_path, 'product_id_embed_s128_w64_cbow_8yemmp45'),
        'train_file_prefix': 'train_product_agg_user',
        'test_file_prefix': 'test_product_agg_user'
    }
}

def get_truth(split_id, logger=None):
    """
    Load the ground-truth array of one training split and split it by column.

    Reads `train_truth_<split_id>.npy` from `input_split_path`. Column 0 is returned
    untouched as the user ids; columns 1 and 2 are returned as long tensors holding
    the age and gender targets.
    """
    tic = time.time()

    with open(os.path.join(input_split_path, f'train_truth_{split_id}.npy'), 'rb') as src:
        labels = np.load(src)

    user_col, age_col, gender_col = labels[:, 0], labels[:, 1], labels[:, 2]
    out_age = torch.from_numpy(age_col).long()
    out_gender = torch.from_numpy(gender_col).long()

    # Drop the local reference to the full array before collecting.
    del labels
    gc.collect()

    if logger:
        logger.info(f'Target output ready after {time.time()-tic:.2f}s')
    return user_col, out_age, out_gender

def get_embed_seq(split_id, embed_var, inp_user, max_seq=100, train=True, logger=None):
    """
    Build one embedding sequence per user for the requested embedding variable.

    `embed_var` must be a key of `inp_embed_artifact`. The Word2Vec artifact registered
    for it is loaded, then `<prefix>_<split_id>.json` (train or test prefix, depending
    on `train`) is read from `input_split_path`. For every user in `inp_user` the first
    `max_seq` keys stored under `str(user)` are looked up in the embedding and stacked
    into a float tensor.

    Returns the list of per-user tensors and an array with the index of the last
    element of each sequence (sequence length minus one).
    """
    assert embed_var in inp_embed_artifact
    artifact = inp_embed_artifact[embed_var]

    tic = time.time()
    w2v = Word2Vec.load(artifact['embedding_artifact'])
    if logger:
        logger.info(f'{embed_var.capitalize()} embedding artifact is loaded after {time.time()-tic:.2f}s')

    tic = time.time()
    prefix_key = 'train_file_prefix' if train else 'test_file_prefix'
    seq_path = os.path.join(input_split_path, f'{artifact[prefix_key]}_{split_id}.json')
    with open(seq_path, 'r') as src:
        keys_by_user = json.load(src)

    inp_seq = [
        torch.from_numpy(
            np.stack([w2v.wv[key] for key in keys_by_user[str(user)][:max_seq]], axis=0)
        ).float()
        for user in inp_user
    ]
    inp_last_idx = np.array([seq.shape[0] for seq in inp_seq]) - 1

    # The embedding model and the raw JSON are only needed while building the sequences.
    del w2v, keys_by_user
    gc.collect()

    if logger:
        logger.info(f'{embed_var.capitalize()} embedding sequence ready after {time.time()-tic:.2f}s')
    return inp_seq, inp_last_idx

In [11]:
def prepare_train(split_id, max_seq=100, logger=None):
    """
    Assemble everything needed to train on one split.

    Loads the ground truth of the split, then the creative, advertiser and product
    embedding sequences for its users. The last-index array returned is the one
    computed from the creative sequences.
    """
    if logger:
        logger.info(f'Preparing Training Split-{split_id}')

    inp_user, out_age, out_gender = get_truth(split_id, logger=logger)

    # Same call for each embedding variable, in a fixed order.
    loaded = [
        get_embed_seq(split_id, embed_var, inp_user, max_seq=max_seq, logger=logger)
        for embed_var in ('creative', 'advertiser', 'product')
    ]
    (inp_creative_seq, inp_last_idx), (inp_advertiser_seq, _), (inp_product_seq, _) = loaded

    del inp_user
    gc.collect()

    return out_age, out_gender, inp_creative_seq, inp_advertiser_seq, inp_product_seq, inp_last_idx

def prepare_test(split_id, max_seq=100, logger=None, split_size=100000):
    """
    Get the user ids and the creative, advertiser and product embedding sequences
    of one test split.

    The users of a split are a contiguous slice of `test_idx_shuffle.npy` (read from
    `input_split_path`): split `k` (1-based) covers positions
    `(k-1)*split_size` up to `k*split_size`.

    Args:
        split_id: 1-based index of the test split.
        max_seq: maximum number of sequence elements kept per user.
        logger: optional logger used for progress messages.
        split_size: number of users per split (defaults to 100000).

    Returns:
        inp_user, inp_creative_seq, inp_advertiser_seq, inp_product_seq, inp_last_idx,
        where `inp_last_idx` is the one computed from the creative sequences.
    """
    if logger: logger.info(f'Preparing Test Split-{split_id}')

    idx_path = os.path.join(input_split_path, 'test_idx_shuffle.npy')
    with open(idx_path, 'rb') as f:
        test_idx = np.load(f)
    inp_user = test_idx[(split_id-1)*split_size:split_id*split_size]
    del test_idx
    _ = gc.collect()

    inp_creative_seq, inp_last_idx = get_embed_seq(split_id, 'creative', inp_user, max_seq=max_seq, train=False, logger=logger)
    inp_advertiser_seq, _ = get_embed_seq(split_id, 'advertiser', inp_user, max_seq=max_seq, train=False, logger=logger)
    inp_product_seq, _ = get_embed_seq(split_id, 'product', inp_user, max_seq=max_seq, train=False, logger=logger)

    _ = gc.collect()

    return inp_user, inp_creative_seq, inp_advertiser_seq, inp_product_seq, inp_last_idx

## Model

In [None]:
class LSTM_Extraction_Layer(nn.Module):
    """
    Feature extraction layer
    - Layer 1: BiLSTM + Dropout + Layernorm
    - Layer 2: LSTM with Residual Connection + Dropout + Layernorm
    - Layer 3: LSTM + Batchnorm + ReLU + Dropout

    Args:
        embed_size: size of each element of the input embedding sequence.
        lstm_hidden_size: hidden size of the BiLSTM; every later stage works on
            2*lstm_hidden_size features.
        rnn_dropout: dropout probability applied after the first two recurrent stages.
        mlp_dropout: dropout probability applied to the extracted feature vector.
    """
    def __init__(self, embed_size, lstm_hidden_size, rnn_dropout=0.2, mlp_dropout=0.4, **kwargs):
        super(LSTM_Extraction_Layer, self).__init__(**kwargs)
        self.embed_size = embed_size
        self.lstm_hidden_size = lstm_hidden_size
        # Keep the probabilities under *_rate names: `self.mlp_dropout` below is the
        # nn.Dropout module itself, so the plain names cannot be reused for floats.
        self.rnn_dropout_rate = rnn_dropout
        self.mlp_dropout_rate = mlp_dropout

        self.bi_lstm = nn.LSTM(input_size=embed_size, hidden_size=lstm_hidden_size, bias=True, bidirectional=True)
        self.rnn_dropout_1 = nn.Dropout(p=rnn_dropout)
        self.layernorm_1 = nn.LayerNorm(2*lstm_hidden_size)
        self.lstm_1 = nn.LSTM(input_size=2*lstm_hidden_size, hidden_size=2*lstm_hidden_size)
        self.rnn_dropout_2 = nn.Dropout(p=rnn_dropout)
        self.layernorm_2 = nn.LayerNorm(2*lstm_hidden_size)
        self.lstm_2 = nn.LSTM(input_size=2*lstm_hidden_size, hidden_size=2*lstm_hidden_size)
        self.batchnorm = nn.BatchNorm1d(2*lstm_hidden_size)
        self.mlp_dropout = nn.Dropout(p=mlp_dropout)

    def forward(self, inp_embed, inp_last_idx):
        """
        inp_embed: (batch_size, max_seq_length, embed_size)
        inp_last_idx: per-sample position of the time step whose output is extracted
        Returns a (batch_size, 2*lstm_hidden_size) feature tensor.
        """
        # The LSTMs are sequence-first, hence the permute.
        bilstm_out, _ = self.bi_lstm(inp_embed.permute(1,0,2))                            # (max_seq_length, batch_size, embed_size) -> (max_seq_length, batch_size, 2*lstm_hidden_size)
        bilstm_out = self.layernorm_1(self.rnn_dropout_1(bilstm_out))                     # (max_seq_length, batch_size, 2*lstm_hidden_size)
        lstm_out, _ = self.lstm_1(bilstm_out)                                             # (max_seq_length, batch_size, 2*lstm_hidden_size)
        lstm_out = self.rnn_dropout_2(lstm_out)                                           # (max_seq_length, batch_size, 2*lstm_hidden_size)
        # Residual connection around the second recurrent stage.
        lstm_out = self.layernorm_2(lstm_out+bilstm_out)                                  # (max_seq_length, batch_size, 2*lstm_hidden_size)
        lstm_out, _ = self.lstm_2(lstm_out)                                               # (max_seq_length, batch_size, 2*lstm_hidden_size)
        # Back to batch-first, then pick one time step per sample.
        lstm_out = lstm_out.permute(1,0,2)[np.arange(len(inp_last_idx)), inp_last_idx,:]  # (batch_size, 2*lstm_hidden_size)
        lstm_out = self.mlp_dropout(F.relu(self.batchnorm(lstm_out)))                     # (batch_size, 2*lstm_hidden_size)
        return lstm_out
    
class MLP_Classification_Layer(nn.Module):
    """
    Multilayer Perceptron Classification Layer
    - Layer 1: Linear + Batchnorm + ReLU + Dropout
    - Layer 2: Linear + Batchnorm + ReLU + Dropout
    - Layer 3: Linear

    Args:
        inp_size: number of input features.
        out_size: number of output units of the last linear layer.
        dropout: dropout probability used after both hidden layers.
    """
    def __init__(self, inp_size, out_size, dropout=0.4, **kwargs):
        super(MLP_Classification_Layer, self).__init__(**kwargs)
        self.inp_size = inp_size
        self.out_size = out_size
        self.dropout = dropout

        self.mlp_1 = nn.Linear(inp_size, 1024)
        self.batchnorm_1 = nn.BatchNorm1d(1024)
        # The constructor parameter is `dropout`; it drives both dropout modules.
        self.mlp_dropout_1 = nn.Dropout(p=dropout)
        self.mlp_2 = nn.Linear(1024, 512)
        self.batchnorm_2 = nn.BatchNorm1d(512)
        self.mlp_dropout_2 = nn.Dropout(p=dropout)
        self.mlp_3 = nn.Linear(512, out_size)

    def forward(self, inp):
        """
        inp: (batch_size, inp_size)
        Returns the (batch_size, out_size) output of the last linear layer.
        """
        mlp_out = self.mlp_1(inp)                                                         # (batch_size, 1024)
        mlp_out = self.mlp_dropout_1(F.relu(self.batchnorm_1(mlp_out)))                   # (batch_size, 1024)
        mlp_out = self.mlp_2(mlp_out)                                                     # (batch_size, 512)
        mlp_out = self.mlp_dropout_2(F.relu(self.batchnorm_2(mlp_out)))                   # (batch_size, 512)
        mlp_out = self.mlp_3(mlp_out)                                                     # (batch_size, out_size)
        return mlp_out
    
class Multi_Seq_LSTM_Classifier(nn.Module):
    """
    Placeholder for the multi-sequence classifier (not implemented yet).

    A class header without a body does not parse, which makes the whole cell
    fail; this stub keeps the cell runnable until the model is written.
    """
    def __init__(self, *args, **kwargs):
        # TODO: implement. Fail loudly instead of handing back an empty module.
        raise NotImplementedError('Multi_Seq_LSTM_Classifier is not implemented yet')

In [None]:
class LSTM_Classifier(nn.Module):
    """
    Single-sequence classifier: a three-stage recurrent stack followed by an MLP head.
    - BiLSTM + Dropout + Layernorm
    - LSTM with residual connection + Dropout + Layernorm
    - LSTM, one time step picked per sample, then Batchnorm + ReLU + Dropout
    - Linear(1024) and Linear(512), each followed by Batchnorm + ReLU + Dropout
    - Linear(out_size)
    """
    def __init__(self, embed_size, lstm_hidden_size, out_size, rnn_dropout=0.2, mlp_dropout=0.4, **kwargs):
        super().__init__(**kwargs)
        self.embed_size, self.lstm_hidden_size, self.out_size = embed_size, lstm_hidden_size, out_size
        self.rnn_dropout, self.mlp_dropout = rnn_dropout, mlp_dropout

        # Both directions of the BiLSTM are concatenated, so everything downstream is this wide.
        width = 2 * lstm_hidden_size

        # Recurrent stack
        self.bi_lstm = nn.LSTM(input_size=embed_size, hidden_size=lstm_hidden_size, bias=True, bidirectional=True)
        self.rnn_dropout_1 = nn.Dropout(p=rnn_dropout)
        self.layernorm_1 = nn.LayerNorm(width)
        self.lstm_1 = nn.LSTM(input_size=width, hidden_size=width)
        self.rnn_dropout_2 = nn.Dropout(p=rnn_dropout)
        self.layernorm_2 = nn.LayerNorm(width)
        self.lstm_2 = nn.LSTM(input_size=width, hidden_size=width)

        # MLP head
        self.batchnorm_1 = nn.BatchNorm1d(width)
        self.mlp_dropout_1 = nn.Dropout(p=mlp_dropout)
        self.mlp_1 = nn.Linear(width, 1024)
        self.batchnorm_2 = nn.BatchNorm1d(1024)
        self.mlp_dropout_2 = nn.Dropout(p=mlp_dropout)
        self.mlp_2 = nn.Linear(1024, 512)
        self.batchnorm_3 = nn.BatchNorm1d(512)
        self.mlp_dropout_3 = nn.Dropout(p=mlp_dropout)
        self.mlp_3 = nn.Linear(512, out_size)

    def forward(self, inp_embed, inp_last_idx):
        """
        inp_embed: (batch_size, max_seq_length, embed_size)
        inp_last_idx: per-sample position of the time step fed to the MLP head
        Returns a (batch_size, out_size) tensor.
        """
        # Stage 1: (max_seq_length, batch_size, embed_size) -> (max_seq_length, batch_size, 2*lstm_hidden_size)
        stage_1, _ = self.bi_lstm(inp_embed.permute(1, 0, 2))
        stage_1 = self.layernorm_1(self.rnn_dropout_1(stage_1))

        # Stage 2, with a residual connection back to stage 1.
        stage_2, _ = self.lstm_1(stage_1)
        stage_2 = self.layernorm_2(self.rnn_dropout_2(stage_2) + stage_1)

        # Stage 3, then one time step per sample: (batch_size, 2*lstm_hidden_size)
        stage_3, _ = self.lstm_2(stage_2)
        batch_rows = np.arange(len(inp_last_idx))
        feat = stage_3.permute(1, 0, 2)[batch_rows, inp_last_idx, :]
        feat = self.mlp_dropout_1(F.relu(self.batchnorm_1(feat)))

        # Head: (batch_size, 1024) -> (batch_size, 512) -> (batch_size, out_size)
        hidden = self.mlp_dropout_2(F.relu(self.batchnorm_2(self.mlp_1(feat))))
        hidden = self.mlp_dropout_3(F.relu(self.batchnorm_3(self.mlp_2(hidden))))
        return self.mlp_3(hidden)