In [1]:

import pandas as pd
import numpy as np
import time
import pickle
import re 
import matplotlib.pyplot as plt
import nvstrings
import warnings
import cudf as gd
import torch
import torch.nn
import nvcategory

from librmm_cffi import librmm
from numba import cuda
from sklearn.metrics import roc_auc_score
from datetime import date
from fastai.callbacks import *
from fastai import *
from fastai.tabular import *
from fastai.text import *
from fastai.metrics import accuracy
from multiprocessing import Process
from os import listdir
from os.path import isfile, join
warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
import os

# Restrict CUDA to a single device by index; the value must be a string in the environment.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'

In [None]:
# Display the index of the CUDA device PyTorch currently has selected.
torch.cuda.current_device()


<h2> Functions </h2>

In [None]:
#########################
#                       #
# Metrics and callbacks #
#                       #
#########################

def write_pkl(obj, file_path=None):
    """Pickle ``obj`` to ``file_path``, replacing any file already there.

    Parameters
    ----------
    obj : object
        Any picklable object; serialized with pickle protocol 4.
    file_path : str, optional
        Destination path. When omitted (or empty) a name of the form
        ``MM_DD_YYYY_HH-MM-SS.pkl`` built from the current local time is
        used, relative to the working directory.

    Returns
    -------
    None
    """
    # Local imports: `os` is only imported in a later notebook cell, and the
    # top-level `date` import cannot provide a time of day.
    import os
    from datetime import datetime

    if not file_path:
        # A `date` object has no time component, so formatting it with
        # %H-%M-%S always produced 00-00-00 and every dump made on the same
        # day landed on the same file. `datetime.now()` gives a real timestamp.
        stamp = datetime.now().strftime("%m_%d_%Y_%H-%M-%S")
        # The payload is pickle bytes, so name it accordingly (was '.csv').
        file_path = f'{stamp}.pkl'
    if os.path.exists(file_path):
        os.remove(file_path)
    # Write in slices of at most 2**31 - 1 bytes — presumably to stay clear of
    # platforms where a single write of 2 GiB or more fails.
    max_bytes = 2**31 - 1
    bytes_out = pickle.dumps(obj, protocol=4)
    with open(file_path, 'wb') as f_out:
        for idx in range(0, len(bytes_out), max_bytes):
            f_out.write(bytes_out[idx:idx+max_bytes])


def get_mean_reciprocal_rank(sub):
    """Return the mean of ``1 / (1 + order)`` over rows where ``reference == item_id``.

    ``order`` is filled in on the GPU by a grouped cuDF kernel over ``row_id``.
    As a side effect, prints whether the resulting frame contains any nulls.
    The result is the mean of a pandas column, so it is NaN when no row has
    ``reference == item_id``.
    """
    # sub is a pandas dataframe
    # sub should have the following columns:
    # 'row_id', 'prob', 'reference', 'item_id'
    # sorted by prob in descending order for each group
    sub = gd.from_pandas(sub)
    
    # Kernel handed to apply_grouped: threads stride over the rows they are
    # given and store each row's index i into `order`.
    # NOTE(review): presumably cuDF binds these parameters to incols/outcols by
    # name and invokes the kernel once per row_id group, which would make
    # `order` the 0-based position inside the group — confirm for the cuDF
    # version in use. Do not rename the parameters without checking.
    def get_order_in_group(prob,row_id,order):
        for i in range(cuda.threadIdx.x, len(prob), cuda.blockDim.x):
            order[i] = i

    dg = sub.groupby('row_id',method="cudf").apply_grouped(get_order_in_group,incols=['prob','row_id'],
                                  outcols={'order': np.int32},
                                  tpb=32)

    dg = dg.to_pandas()
    # Turn the 0-based position into a reciprocal rank: position 0 -> 1.0, 1 -> 0.5, ...
    dg['order'] = 1.0/(1+dg['order'])
    # Keep only the rows whose candidate item is the reference item.
    dg = dg[dg['reference']==dg['item_id']]
    # Diagnostic output: True if any value in the filtered frame is null.
    print(dg.isnull().values.any())
    return dg['order'].mean()

def auroc_score(input, target):
    """Compute ROC AUC from column 1 of ``input`` against ``target``.

    Both arguments are moved to the CPU and converted to NumPy arrays before
    being passed to ``roc_auc_score``.
    """
    scores = input.cpu().numpy()[:, 1]
    labels = target.cpu().numpy()
    return roc_auc_score(labels, scores)

# Callback to calculate AUC at the end of each epoch
class AUROC(Callback):
    """Callback that reports an 'AUROC' metric from the non-training batches of each epoch."""

    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs):
        """Keep a reference to the learner; extra keyword arguments are ignored."""
        self.learn = learn

    def on_train_begin(self, **kwargs):
        """Register the extra metric column with the learner's recorder."""
        self.learn.recorder.add_metric_names(['AUROC'])

    def on_epoch_begin(self, **kwargs):
        """Reset the per-epoch buffers of outputs and targets."""
        self.output = []
        self.target = []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        """Collect outputs and targets for batches that are not training batches."""
        if train:
            return
        self.output.append(last_output)
        self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        """Append the epoch's AUROC to ``last_metrics`` when anything was collected."""
        if len(self.output) == 0:
            # Nothing collected this epoch: return None, as the original did implicitly.
            return None
        stacked_output = torch.cat(self.output)
        stacked_target = torch.cat(self.target)
        probabilities = F.softmax(stacked_output, dim=1)
        score = auroc_score(probabilities, stacked_target)
        return add_metrics(last_metrics, [score])

        
def get_idx(x):
    """Look up ``str(x)`` in ``id_to_index``; nulls and unknown ids map to 0."""
    if pd.isnull(x):
        return 0
    return id_to_index.get(str(x), 0)

<h1> <center> Data Processing </center> </h1>

In [None]:
# Lookup table consulted by get_idx. Unpickling executes arbitrary code, so
# only load cache files that this project produced itself.
with open('cache/id_to_index.pkl', 'rb') as pkl_file:
    id_to_index = pickle.load(pkl_file)

In [None]:
%%time
# Load the tabular data, then discard every column whose name begins with
# 'delta' or 'is'.
data_pair = pd.read_pickle('cache/data_pair_all.pkl')
data_pair = data_pair.drop(
    columns=[name for name in data_pair.columns if name.startswith(('delta', 'is'))]
)

In [None]:
%%time
data_context = pd.read_csv('cache/context_info.csv')
# Replace the clicked-out item ids by their get_idx lookup value
# (0 for missing or unknown ids).
data_context['past_clickout_item'] = data_context['past_clickout_item'].map(get_idx)
data_context['future_clickout_item'] = data_context['future_clickout_item'].map(get_idx)

In [None]:
%%time
# Left join: every data_pair row is kept and gains the context columns for its row_id.
data_pair = pd.merge(data_pair, data_context, on='row_id', how='left')

In [None]:
# Candidate price minus the price of the past / future clickout.
data_pair['past_clickout_price_diff'] = data_pair['price'].sub(data_pair['past_clickout_price'])
data_pair['future_clickout_price_diff'] = data_pair['price'].sub(data_pair['future_clickout_price'])

In [None]:
# Split on clickout_missing: rows with 0 go to train, rows above 0 go to test.
train = data_pair.loc[data_pair['clickout_missing'] == 0]
test = data_pair.loc[data_pair['clickout_missing'] > 0]
print(train.shape, test.shape)

<h3> Input Data </h3> 

In [None]:
%%time 
# Columns handed to fastai as categorical variables.
cat_names = [
    'user_id', 'item_id', 'platform', 'city', 'device', 'current_filters',
    'past_clickout_available',
    'past_clickout_is_same',
    'past_clickout_impression_valid',
    'future_clickout_available',
    'future_clickout_is_same',
    'future_clickout_impression_valid',
]

# Columns handed to fastai as continuous variables: three fixed ones, every
# train column that starts with 'count' / 'delta_' or contains 'rank', then
# the clickout context features.
cont_names = ['price', 'candidate_order', 'item_count']
cont_names.extend(
    col for col in train.columns
    if col.startswith(('count', 'delta_')) or 'rank' in col
)
cont_names.extend([
    'past_clickout_step_diff',
    'past_clickout_timestamp_diff',
    'past_clickout_price',
    'past_clickout_price_mean',
    'past_clickout_price_std',
    'past_clickout_price_diff',
    'cur_item_count_past',
    'future_clickout_step_diff',
    'future_clickout_timestamp_diff',
    'future_clickout_price',
    'future_clickout_price_mean',
    'future_clickout_price_std',
    'future_clickout_price_diff',
    'cur_item_count_future',
])

procs = [FillMissing, Categorify, Normalize]

# Flag used by split_from_df below: True where row_id is a multiple of 5.
train['is_va'] = (train['row_id'] % 5) == 0

test_list = TabularList.from_df(
    test, path='./', cat_names=cat_names, cont_names=cont_names
)

data_tab = (
    TabularList.from_df(
        train, path='./', cat_names=cat_names, cont_names=cont_names, procs=procs
    )
    .split_from_df('is_va')
    .label_from_df(cols='target')
    .add_test(test_list)
    .databunch(num_workers=10, bs=1024)
)

In [None]:
%%time 
# Pickle the assembled DataBunch; with no path given, write_pkl generates the file name itself.
write_pkl(data_tab)
# Drop the notebook's reference to the merged frame (train/test were sliced from it earlier).
del data_pair

In [None]:
%%time
# Tabular model with two hidden layers (128, 64). No built-in metrics are
# requested; AUROC is reported through the callback instead. Embedding widths
# are fixed explicitly for the six listed categorical columns.
learn = tabular_learner(
    data_tab,
    layers=[128, 64],
    metrics=None,
    callback_fns=AUROC,  # wd=0.2 (disabled)
    emb_szs={
        'user_id': 32,
        'item_id': 32,
        'platform': 4,
        'city': 8,
        'device': 1,
        'current_filters': 8,
    },
)

In [None]:
# Run fastai's learning-rate finder on the learner.
learn.lr_find()

In [None]:
# Plot what the recorder captured — presumably the loss-vs-learning-rate curve from lr_find.
learn.recorder.plot()

In [None]:
# Train for one cycle; the callback saves the model under the name 'tab_nn'
# whenever the monitored 'AUROC' value improves.
learn.fit_one_cycle(
    1,
    max_lr=slice(3e-3),
    callbacks=[
        SaveModelCallback(learn, every='improvement', monitor='AUROC', name='tab_nn'),
    ],
)

<h2> Performance Evaluation</h2>

In [None]:
%%time
# Predictions and targets from get_preds() with its default dataset —
# presumably the validation split; confirm against the fastai version in use.
yp,y_valid = learn.get_preds()

In [None]:
%%time
# Rows flagged is_va, paired with column 1 of the predictions, ordered by
# row_id and prob (both descending).
# NOTE(review): assumes yp is in the same row order as the is_va rows of train — confirm.
cv = (
    train.loc[train['is_va'] > 0, ['row_id', 'reference', 'item_id']]
    .assign(prob=yp.numpy()[:, 1])
    .sort_values(by=['row_id', 'prob'], ascending=False)
)
print('here')
# del train

In [None]:
# Show the column labels of cv.
cv.columns

In [None]:
# Cast the three id columns to 32-bit integers before cv is passed to the GPU MRR helper.
cv = cv.astype({'row_id': 'int32', 'reference': 'int32', 'item_id': 'int32'})

In [None]:
%%time
# ROC AUC of column 1 of the predictions against the flattened targets.
auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
# Mean reciprocal rank over cv (columns row_id, prob, reference, item_id).
mean_reciprocal_rank = get_mean_reciprocal_rank(cv)
# Both values are reused later in the submission file name.
print('mean_reciprocal_rank %.4f, AUC %.4f'%(mean_reciprocal_rank,auc))

In [None]:
%%time
# Predictions for the test set attached via add_test; the second return value is discarded.
yps,_ = learn.get_preds(DatasetType.Test)

In [None]:
%%time
# Store column 1 of the test predictions as 'target'.
# NOTE(review): assumes yps follows the row order of `test` — confirm.
test['target'] = yps.numpy()[:,1]
# Keep only the columns needed to build the submission.
test = test['row_id,user_id,session_id,timestamp,step,item_id,target'.split(',')]
test.head()

In [None]:
%%time
# Descending on both keys, so within each row_id the highest target comes first.
test = test.sort_values(by=['row_id','target'],ascending=False) # larger probs first

In [None]:
# Preview the sorted test frame.
test.head()

In [None]:
%%time
# One row per row_id: its item_ids joined by spaces, in the order they appear in `test`.
sub = (
    test[['row_id', 'item_id']]
    .copy()
    .groupby('row_id')['item_id']
    .apply(lambda items: ' '.join(map(str, items)))
    .rename('new_item_recommendations')
    .reset_index()
)

# Reduce test to one row per row_id, then attach the joined recommendations
# and keep only the session key next to them.
test = test.drop_duplicates(subset=['row_id'])
sub = test.merge(sub, on='row_id', how='left')[['session_id', 'new_item_recommendations']]

In [None]:
%%time
from datetime import datetime

sample_sub = pd.read_csv('/datasets/trivago/data/submission_popular.csv')
# sample_sub = pd.read_csv('../input/submission_popular.csv')
# The merge below is keyed on session_id, so it must be unique in the sample file.
assert sample_sub.session_id.is_unique
sub = sample_sub.merge(sub, on='session_id', how='left')

# Timestamp for the output file name, e.g. 2019-06-30-12-34-56.
clock = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Overwrite the sample recommendations only where the model produced some.
mask = sub.new_item_recommendations.notnull()
sub.loc[mask, 'item_recommendations'] = sub.loc[mask, 'new_item_recommendations']
sub = sub.drop(columns='new_item_recommendations')
out = 'fastai_%s_mrr_%.4f_auc_%.4f.csv'%(clock,mean_reciprocal_rank,auc)
sub.to_csv(out, index=False)

sub.head()