In [None]:
import torch
import torch.nn as nn
import os
import sys
import pickle as pk
import numpy as np
import random

from sklearn.metrics import roc_auc_score


In [None]:
# Locate the project workspace. Outside Google Colab the current directory is used;
# on Colab, Google Drive is mounted and the data archives are unpacked locally.
workspace_dir = '.'
try:
    # Importing google.colab fails with ImportError when not running on Colab,
    # which is how the two environments are told apart.
    from google.colab import drive
    drive.mount( '/content/drive/' )

    # NOTE(review): this relative path only points inside the mount when the
    # working directory is /content -- presumably the Colab default; confirm.
    workspace_dir = os.path.join( '.' , 'drive', 'My Drive', 'DIN-pytorch')
    # Presumably lets the project modules stored in the Drive folder be imported -- verify.
    sys.path.append( workspace_dir)
    # Drop any previously extracted ./data, then unpack both archives into the
    # current directory and list ./data as a sanity check.
    ! rm -rf data
    ! tar zxf "{workspace_dir}/data.tar.gz" -C ./
    ! tar zxf "{workspace_dir}/loader.tar.gz" -C ./
    ! ls -al data   
except ImportError:
    # Not on Colab: keep workspace_dir = '.' and skip mounting/extraction.
    pass

In [None]:
from model import DIN, DIEN, DynamicGRU
from DataLoader import MyDataSet

%load_ext autoreload
%autoreload 2

In [None]:
# ---- Model hyper-parameters -------------------------------------------------
MAX_LEN = 100        # passed to MyDataSet as max_length
EMBEDDING_DIM = 18   # passed to the model constructor
# Settings kept for reference only (currently disabled):
# HIDDEN_SIZE_ATTENTION = [80, 40]
# HIDDEN_SIZE_FC = [200, 80]
# ACTIVATION_LAYER = 'LeakyReLU' # lr = 0.01

# ---- Optimizer (Adam) -------------------------------------------------------
LR = 0.001
BETA1, BETA2 = 0.5, 0.99

# ---- Training schedule ------------------------------------------------------
BATCH_SIZE, EPOCH_TIME, TEST_ITER = 128, 20, 1000   # batch size, epochs, train steps between test-set evaluations

# ---- Reproducibility / hardware ---------------------------------------------
RANDOM_SEED = 19940808
USE_CUDA = True      # CUDA is only used when it is also available at runtime

In [None]:
# Paths of the train/test splits and of the three pickled vocabularies,
# all located directly under ./data.
train_file, test_file, uid_voc, mid_voc, cat_voc = (
    os.path.join( './data', file_name)
    for file_name in (
        "local_train_splitByUser",
        "local_test_splitByUser",
        "uid_voc.pkl",
        "mid_voc.pkl",
        "cat_voc.pkl",
    )
)

In [None]:
# Choose the compute device and the matching legacy tensor type.
# The CPU is the fallback whenever CUDA is disabled in the config or unavailable.
if not ( USE_CUDA and torch.cuda.is_available()):
    device = torch.device( 'cpu')
    dtype = torch.FloatTensor
else:
    print( "Cuda is avialable" )
    device = torch.device('cuda')
    dtype = torch.cuda.FloatTensor

In [None]:
# Fix all random seeds so that runs are reproducible
def same_seeds(seed = RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch with ``seed``; when CUDA is
    available the current and all CUDA devices are seeded as well. cuDNN is
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: integer seed; defaults to ``RANDOM_SEED``.
    """
    # cuDNN: deterministic kernels, no auto-tuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Python, NumPy and PyTorch generators.
    for seed_fn in ( random.seed, np.random.seed, torch.manual_seed):
        seed_fn( seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed( seed)
        torch.cuda.manual_seed_all( seed)  # covers the multi-GPU case

# Initialize parameters
def weights_init( m):
    """Initialise the parameters of module ``m`` according to its class name.

    Intended to be passed to ``model.apply``. The rule is selected by substring
    match on the class name:

    * contains ``Conv``      -> weight ~ N(0.0, 0.02), bias = 0
    * contains ``BatchNorm`` -> weight ~ N(1.0, 0.02), bias = 0
    * contains ``Linear``    -> weight ~ N(0.0, 0.02), bias left as it is

    Modules matching none of the names are left untouched. A parameter that
    does not exist on the module (e.g. ``bias=False`` or ``affine=False``) is
    skipped instead of raising ``AttributeError``.

    Args:
        m: the ``nn.Module`` to initialise in place.
    """
    classname = m.__class__.__name__
    # Either attribute may be missing or None depending on how the layer was built.
    weight = getattr( m, 'weight', None)
    bias = getattr( m, 'bias', None)

    if 'Conv' in classname:
        if weight is not None:
            nn.init.normal_( weight.data, 0.0, 0.02)
        if bias is not None:
            nn.init.constant_( bias.data, 0)
    elif 'BatchNorm' in classname:
        if weight is not None:
            nn.init.normal_( weight.data, 1.0, 0.02)
        if bias is not None:
            nn.init.constant_( bias.data, 0)
    elif 'Linear' in classname:
        if weight is not None:
            nn.init.normal_( weight.data, 0.0, 0.02)


def eval_output( scores, target, loss_function = torch.nn.functional.binary_cross_entropy_with_logits):
    """Compute loss, accuracy and ROC-AUC for a batch of raw model scores.

    Args:
        scores: tensor of raw (pre-sigmoid) model outputs.
        target: tensor of labels, compared element-wise against the rounded
            sigmoid of ``scores``.
        loss_function: callable ``(scores, target) -> loss``; defaults to
            binary cross entropy on logits.

    Returns:
        ``(loss, accuracy, auc)``. ``loss`` and ``accuracy`` are tensors;
        ``auc`` is a ``numpy.float64`` and is NaN when ``target`` contains
        fewer than two distinct labels, because ROC-AUC is undefined there.
    """
    # Both operands are cast to the notebook-wide tensor type `dtype`.
    loss = loss_function( scores.type( dtype) , target.type( dtype))

    # Threshold the predicted probability at 0.5.
    y_pred = scores.sigmoid().round()
    accuracy = ( y_pred == target).type( dtype).mean()

    labels = target.detach().cpu()
    if labels.unique().numel() < 2:
        # A single-class batch has no defined ROC curve; report NaN rather
        # than letting the metric abort the whole training run.
        auc = np.float64( 'nan')
    else:
        # Wrapped so callers can rely on `.item()` whatever scalar type
        # the installed scikit-learn returns.
        auc = np.float64( roc_auc_score( labels, scores.detach().cpu()))
    return loss, accuracy, auc

In [None]:
# Vocabularies: dicts mapping a description (string) to a type index (int).
# scikit-learn's LabelEncoder would be a cleaner alternative, but it is not used in this project:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder
#
# SECURITY: pickle.load can execute arbitrary code -- only load vocabulary files from a trusted source.
# Each file is opened in a `with` block so its handle is closed right after loading.
with open( uid_voc, 'rb') as fin:
    user_map = pk.load( fin)
with open( mid_voc, 'rb') as fin:
    material_map = pk.load( fin)
with open( cat_voc, 'rb') as fin:
    category_map = pk.load( fin)

# Vocabulary sizes, used to size the model's embedding tables.
n_uid, n_mid, n_cat = len( user_map), len( material_map), len( category_map)

In [None]:
# Seed first so that dataset construction and shuffling are reproducible.
same_seeds( RANDOM_SEED)

# Train and test datasets share the same vocabularies and sequence length cap.
dataset_train, dataset_test = (
    MyDataSet( split_file, user_map, material_map, category_map, max_length = MAX_LEN)
    for split_file in ( train_file, test_file)
)

# Only the training loader shuffles; the test loader keeps the file order.
loader_train, loader_test = (
    torch.utils.data.DataLoader( split_dataset, batch_size = BATCH_SIZE, shuffle = do_shuffle)
    for split_dataset, do_shuffle in ( ( dataset_train, True), ( dataset_test, False))
)

# Alternative (disabled): restore previously pickled loaders.
# with open( 'loader.pkl', 'rb') as fin:
#     loader_train, loader_test = pk.load(fin, encoding="bytes") 

In [None]:
# Build the model, move it to the target device and initialise its weights.
model = DIEN(  n_uid, n_mid, n_cat, EMBEDDING_DIM ).to( device)
model.apply( weights_init)

# Optimizer only: the loss itself is computed inside eval_output.
optimizer = torch.optim.Adam( model.parameters(), LR, ( BETA1, BETA2))


def _to_device( batch):
    """Return ``batch`` as a list with every non-None item moved to ``device``."""
    return [ item.to( device) if item is not None else None for item in batch]


model.train()
# Number of training batches processed so far, across epochs.
# (Named `global_step` rather than `iter` so the builtin `iter` is not shadowed.)
global_step = 0
for epoch in range( EPOCH_TIME):

    for i, data in enumerate( loader_train):
        global_step += 1

        data = _to_device( data)
        target = data[-1]  # the last element of a batch is used as the label

        model.zero_grad()

        scores = model( data, neg_sample = False)

        loss, accuracy, auc = eval_output( scores, target)

        loss.backward()
        optimizer.step( )

        # float( auc) accepts both a NumPy scalar and a plain Python float.
        print( "\r[%d/%d][%d/%d]\tloss:%.5f\tacc:%.5f\tauc:%.5f"%( epoch + 1, EPOCH_TIME, i + 1, len( loader_train), loss.item(), accuracy.item(), float( auc) ) ,end='')

        # Every TEST_ITER training steps, evaluate on the complete test set.
        if global_step % TEST_ITER == 0:
            model.eval()
            with torch.no_grad():
                score_list = []; target_list = []
                # Separate names keep the training-loop variables intact.
                for test_batch in loader_test:
                    test_batch = _to_device( test_batch)
                    score_list.append( model( test_batch, neg_sample = False))
                    target_list.append( test_batch[-1])
                # Metrics are computed once over the concatenated test predictions.
                test_scores = torch.cat( score_list, dim = -1)
                test_target = torch.cat( target_list, dim = -1)
                test_loss, test_accuracy, test_auc = eval_output( test_scores, test_target)
                print( "\tTest Set\tloss:%.5f\tacc:%.5f\tauc:%.5f"%( test_loss.item(), test_accuracy.item(), float( test_auc) ) )
            model.train()