In [1]:
import pandas as pd
import numpy as np
import time
import pickle
import re 
import matplotlib.pyplot as plt
import nvstrings
import warnings
import cudf as gd
import torch
import torch.nn

from numba import cuda
from sklearn.metrics import roc_auc_score
from datetime import date
from fastai.callbacks import *
from fastai import *
from fastai.tabular import *
from fastai.text import *
from fastai.metrics import accuracy
from multiprocessing import Process
from os import listdir
from os.path import isfile, join
warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
import os

# Limit CUDA to the single GPU with this index by exporting CUDA_VISIBLE_DEVICES.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = f'{GPU_id}'

In [3]:
torch.cuda.current_device()

0


<h2> Functions </h2>

In [4]:
##################################
#                                #
# Helpers, metrics and callbacks #
#                                #
##################################

def write_pkl(obj, file_path=None):
    """Pickle ``obj`` to ``file_path``, replacing any file already there.

    Args:
        obj: Any picklable object.
        file_path: Destination path. When falsy, a name built from the current
            local date and time (``MM_DD_YYYY_HH-MM-SS``) is used in the
            working directory. The default name keeps its historical ``.csv``
            suffix even though the content is a pickle stream.

    The serialized bytes are written in slices of at most ``2**31 - 1`` bytes
    instead of a single ``write`` call.
    """
    # Imported locally because the notebook's import cell only provides
    # ``date``. A ``date`` has no time of day, so formatting %H-%M-%S on it
    # always produced 00-00-00 and same-day default names overwrote each other.
    from datetime import datetime

    if not file_path:
        date_time = datetime.fromtimestamp(time.time()).strftime("%m_%d_%Y_%H-%M-%S")
        file_path = f'{date_time}.csv'
    if os.path.exists(file_path):
        os.remove(file_path)
    max_bytes = 2**31 - 1
    bytes_out = pickle.dumps(obj, protocol=4)
    with open(file_path, 'wb') as f_out:
        # Write in bounded slices so no single write exceeds max_bytes.
        for idx in range(0, len(bytes_out), max_bytes):
            f_out.write(bytes_out[idx:idx+max_bytes])


def get_mean_reciprocal_rank(sub):
    """Return the mean of 1/(1+order) over rows where reference == item_id.

    The frame is moved to the GPU with cuDF, an ``order`` column is filled per
    ``row_id`` group by a numba CUDA kernel, and the result is brought back to
    pandas for the final filtering and averaging.

    Args:
        sub: pandas DataFrame with the columns 'row_id', 'prob', 'reference'
            and 'item_id'. Rows are expected to be sorted by ``prob`` in
            descending order within each ``row_id`` group; this function does
            not sort them itself.

    Returns:
        The mean of the reciprocal ``order`` values of the rows whose
        ``reference`` equals ``item_id``.

    Side effects:
        Prints whether the filtered frame contains any null value.
    """
    sub = gd.from_pandas(sub)
    
    # Kernel handed to apply_grouped. Each thread starts at its threadIdx.x and
    # steps by blockDim.x, writing the element's own index into ``order``.
    # NOTE(review): presumably cuDF passes per-group slices, so ``i`` is the
    # 0-based position inside the group - confirm against the cuDF version used.
    # The parameter names must match the incols/outcols names given below.
    def get_order_in_group(prob,row_id,order):
        for i in range(cuda.threadIdx.x, len(prob), cuda.blockDim.x):
            order[i] = i

    dg = sub.groupby('row_id',method="cudf").apply_grouped(get_order_in_group,incols=['prob','row_id'],
                                  outcols={'order': np.int32},
                                  tpb=32)

    dg = dg.to_pandas()
    # Convert the 0-based order into a reciprocal rank: 1, 1/2, 1/3, ...
    dg['order'] = 1.0/(1+dg['order'])
    # Keep only the rows whose candidate item is the reference item.
    dg = dg[dg['reference']==dg['item_id']]
    # Debug output: True if any null value is left in the filtered frame.
    print(dg.isnull().values.any())
    return dg['order'].mean()

def auroc_score(input, target):
    """Return the ROC AUC of column 1 of ``input`` against ``target``.

    Both tensors are moved to the CPU and converted to numpy before being
    passed to ``roc_auc_score``.
    """
    column_one_scores = input.cpu().numpy()[:, 1]
    labels = target.cpu().numpy()
    return roc_auc_score(labels, column_one_scores)

# Callback that computes AUC when each epoch finishes
class AUROC(Callback):
    """Collect non-training batch outputs and report their AUROC per epoch."""

    _order = -20  # Needs to run before the recorder

    def __init__(self, learn, **kwargs):
        self.learn = learn

    def on_train_begin(self, **kwargs):
        """Register the extra metric column with the learner's recorder."""
        self.learn.recorder.add_metric_names(['AUROC'])

    def on_epoch_begin(self, **kwargs):
        """Start every epoch with empty output/target buffers."""
        self.output = []
        self.target = []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        """Buffer the batch output and target when not in training mode."""
        if train:
            return
        self.output.append(last_output)
        self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        """Append the AUROC of the buffered batches to ``last_metrics``.

        Returns None when nothing was buffered during the epoch.
        """
        if len(self.output) == 0:
            return None
        probabilities = F.softmax(torch.cat(self.output), dim=1)
        labels = torch.cat(self.target)
        return add_metrics(last_metrics, [auroc_score(probabilities, labels)])

        
def get_idx(x):
    """Look up ``str(x)`` in the global ``id_to_index``; nulls and unknown keys give 0."""
    if pd.isnull(x):
        return 0
    return id_to_index.get(str(x), 0)

<h1> <center> Data Processing </center> </h1>

In [5]:
# Mapping consulted by get_idx; the pickle is a local cache file.
with open('cache/id_to_index.pkl', 'rb') as pkl_file:
    id_to_index = pickle.load(pkl_file)

In [6]:
%%time
#load tabular data 
data_pair = pd.read_pickle('cache/data_pair_all.pkl')
data_pair = data_pair.drop(columns = [c for c in data_pair.columns if c.startswith('delta') or c.startswith('is')])

CPU times: user 17.1 s, sys: 31.5 s, total: 48.5 s
Wall time: 51.6 s


In [7]:
%%time
# Read the cached context CSV; it is left-joined onto data_pair by row_id in a later cell.
data_context = pd.read_csv('cache/context_info_multi_2_more.csv')

CPU times: user 8.25 s, sys: 1.61 s, total: 9.86 s
Wall time: 10 s


In [8]:
# Names of the data_context columns containing 'clickout_item'; these are re-encoded with get_idx below.
clickout_list = [c for c in data_context.columns if 'clickout_item' in c]

In [9]:
data_context['past_clickout_item_0'].unique()

array([      0,  109038, 1032816,   65685, ..., 5997900, 2633858,   99769, 2343502])

In [10]:
# Replace every value in the clickout item columns with its id_to_index entry (0 when null or unknown).
# WARNING: this overwrites the columns in place, so the cell is not idempotent. Running it a second
# time would look up the already-mapped indices as if they were raw ids. Reload data_context first.
data_context[clickout_list ] = data_context[clickout_list].applymap(get_idx)

In [11]:
%%time
# Left join on row_id: every data_pair row is kept and gains the data_context columns.
data_pair = data_pair.merge(data_context, on='row_id', how='left')

CPU times: user 1min 2s, sys: 1min 23s, total: 2min 25s
Wall time: 2min 25s


In [12]:
# Split on clickout_missing: rows equal to 0 form the train set, rows above 0 the test set
train = data_pair.loc[data_pair['clickout_missing'] == 0]
test = data_pair.loc[data_pair['clickout_missing'] > 0]
print(train.shape, test.shape)

(42756036, 89) (5762533, 89)


In [13]:
# train['is_va'] = train.row_id%5 == 0

<h3> Input Data </h3> 

In [14]:
# Name fragments: any data_pair column containing one of these is treated as categorical.
cat_cols = [
    "clickout_available",
    'clickout_is_same',
    'clickout_impression_valid',
    "has_changed_sort",
    "sort_metric",
]
# Name fragments: any data_pair column containing one of these is treated as continuous.
cont_cols = [
    "clickout_step_diff",
    "price",
    "clickout_timestamp_diff",
    'clickout_price_mean',
    'clickout_price_std',
    "cur_item_interaction_image_count",
    "cur_item_interaction_info_count",
    "cur_item_interaction_deal_count",
    "cur_item_interaction_rating_count",
    "clickout_count",
    "other_clicked_item_interaction_image_count",
    "other_clicked_item_interaction_info_count",
    "other_clicked_item_interaction_deal_count",
    "other_clicked_item_interaction_rating_count",
    "num_of_search_poi",
]

In [15]:
%%time 
cat_names = ['user_id','item_id','platform','city','device','current_filters']
cat_names += [c for c in data_pair.columns for sub in cat_cols if sub in c]

cont_names = ['price','candidate_order','item_count'] + [i for i in train.columns if i.startswith('count') or 'rank' in i or i.startswith('delta_')]
cont_names += [c for c in data_pair.columns for sub in cont_cols if sub in c]
cont_names = list(set(cont_names))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 261 µs


In [16]:
cat_names

['user_id',
 'item_id',
 'platform',
 'city',
 'device',
 'current_filters',
 'past_clickout_available_0',
 'past_clickout_is_same_0',
 'past_clickout_impression_valid_0',
 'past_clickout_available_1',
 'past_clickout_is_same_1',
 'past_clickout_impression_valid_1',
 'has_changed_sort_past',
 'sort_metric_past',
 'future_clickout_available_0',
 'future_clickout_is_same_0',
 'future_clickout_impression_valid_0',
 'future_clickout_available_1',
 'future_clickout_is_same_1',
 'future_clickout_impression_valid_1',
 'has_changed_sort_future',
 'sort_metric_future']

In [17]:
cont_names

['price_rank_norm',
 'future_clickout_timestamp_diff_1',
 'other_clicked_item_interaction_rating_count_future',
 'count_item_user_id_session_id_rank_norm',
 'past_clickout_step_diff_1',
 'cur_item_interaction_info_count_future',
 'item_count',
 'past_clickout_price_mean_1',
 'cur_item_interaction_deal_count_future',
 'num_of_search_poi_future',
 'future_clickout_step_diff_0',
 'count_item_user_id_session_id_rank',
 'past_clickout_timestamp_diff_1',
 'future_clickout_price_1',
 'cur_item_interaction_image_count_past',
 'future_clickout_timestamp_diff_0',
 'other_clicked_item_interaction_deal_count_future',
 'past_clickout_step_diff_0',
 'past_clickout_price_0',
 'count_item_user_id',
 'count_item_user_id_session_id',
 'cur_item_interaction_info_count_past',
 'other_clicked_item_interaction_image_count_future',
 'cur_item_interaction_deal_count_past',
 'item_count_rank_norm',
 'future_clickout_price_std_1',
 'price_rank',
 'other_clicked_item_interaction_deal_count_past',
 'past_clickout

In [18]:
%%time
procs = [FillMissing, Categorify, Normalize]
train['is_va'] = train.row_id%5 == 0

test_list = TabularList.from_df(test, path='./', cat_names=cat_names, cont_names=cont_names)

data_tab = (TabularList.from_df(train, path='./', cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_from_df('is_va')
                           .label_from_df(cols='target')
                           .add_test(test_list)
                           .databunch(num_workers=10,bs=1024))

CPU times: user 6min 35s, sys: 9min 35s, total: 16min 11s
Wall time: 14min 58s


In [None]:
%%time 
# write_pkl(data_tab, "./data_tab_1.csv")
# del data_pair

In [None]:
# %%time
# data_tab = pickle.load(open('data_tab_1.csv', 'rb'))
# data_tab.batch_size = 2048

In [19]:
%%time
learn = tabular_learner(data_tab, layers=[256, 128], metrics=None, callback_fns=AUROC,#wd=0.2,
        emb_szs = {'user_id': 32,'item_id':32,'platform':4,'city':8,'device':1,
                   'current_filters':8, 'searched_item_id_past': 32, 'searched_item_id_future': 32})

CPU times: user 6.4 s, sys: 1.55 s, total: 7.95 s
Wall time: 7.61 s


In [None]:
learn.model

In [None]:
# NOTE(review): presumably fastai's learning-rate range test, with the curve drawn by
# learn.recorder.plot() in the next cell - confirm. This cell was not part of the recorded run.
learn.lr_find()

In [None]:
learn.recorder.plot()

In [20]:
# Train for 2 epochs; SaveModelCallback writes 'tab_nn' whenever the monitored AUROC improves.
learn.fit_one_cycle(
    2,
    max_lr=slice(1e-3),
    callbacks=[
        SaveModelCallback(learn, every='improvement', monitor='AUROC', name='tab_nn'),
    ],
)

epoch,train_loss,valid_loss,AUROC,time
0,0.10192,0.103263,0.915998,13:13
1,0.097125,0.104041,0.91572,13:28


Better model found at epoch 0 with AUROC value: 0.9159981114231803.


<h2> Performance Evaluation</h2>

In [21]:
%%time
# NOTE(review): with no argument get_preds presumably scores the validation split - confirm.
# yp is indexed as [:, 1] below, and y_valid is used as the label vector for roc_auc_score.
yp,y_valid = learn.get_preds()

CPU times: user 40.7 s, sys: 48.8 s, total: 1min 29s
Wall time: 2min 6s


In [None]:
%%time
cv = train.loc[train['row_id']%5 == 0,['row_id','reference','item_id', 'target']].copy()
cv['prob'] = yp.numpy()[:,1]
cv = cv.sort_values(by=['row_id','prob'],ascending=False)

In [None]:
%%time
cv['rank']=cv.groupby('row_id')['prob'].rank(ascending=False) 
target_rank = cv[cv['target']==1]['rank']
target_rank.hist(bins=25)

In [None]:
%%time
auc = roc_auc_score(y_valid.numpy().ravel(),yp.numpy()[:,1])
mean_reciprocal_rank = (1/target_rank).mean() # get_mean_reciprocal_rank(cv)
print('mean_reciprocal_rank %.4f, AUC %.4f'%(mean_reciprocal_rank,auc))

In [None]:
%%time
# Score the test set attached through add_test; the second return value is discarded.
yps,_ = learn.get_preds(DatasetType.Test)

In [None]:
%%time
test['target'] = yps.numpy()[:,1]
test = test['row_id,user_id,session_id,timestamp,step,item_id,target'.split(',')]
test.head()

In [None]:
%%time
test = test.sort_values(by=['row_id','target'],ascending=False) # larger probs first

In [None]:
test.head()

In [None]:
%%time
sub = test[['row_id','item_id']].copy()
sub = sub.groupby('row_id')['item_id'].apply(lambda x: ' '.join([str(i) for i in x]))
sub = sub.to_frame()
sub.columns = ['new_item_recommendations']
sub = sub.reset_index()

test = test.drop_duplicates(subset=['row_id'])
sub = test.merge(sub,on='row_id',how='left')
sub = sub[['session_id','new_item_recommendations']]

In [None]:
%%time
sample_sub = pd.read_csv('/datasets/trivago/data/submission_popular.csv')
# sample_sub = pd.read_csv('../input/submission_popular.csv')
assert sample_sub.shape[0] == sample_sub.session_id.unique().shape[0]
sub = sample_sub.merge(sub,on='session_id',how='left')

from datetime import datetime
clock = "{}".format(datetime.now()).replace(' ','-').replace(':','-').split('.')[0]

mask = sub.new_item_recommendations.isnull() == 0
sub.loc[mask,'item_recommendations'] = sub.loc[mask,'new_item_recommendations']
sub = sub.drop('new_item_recommendations',axis=1)
out = 'fastai_%s_mrr_%.4f_auc_%.4f.csv'%(clock,mean_reciprocal_rank,auc)
sub.to_csv(out,index=False)

sub.head()

In [None]:
# Reload a previously written submission file for inspection (file name is hard-coded).
df = pd.read_csv('fastai_2019-06-30-06-32-46_mrr_0.6400_auc_0.9175.csv')

In [None]:
df.head()

In [None]:
# Reload a second, earlier submission file for comparison (file name is hard-coded).
df2 = pd.read_csv('fastai_2019-06-28-22-53-15_mrr_0.1726_auc_0.9266.csv')

In [None]:
df2.head()