In [1]:
import os
# Restrict this process to one GPU. This runs before the cell that imports
# torch / cudf / numba so the setting is in place when they are loaded.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(GPU_id)

In [2]:
import pandas as pd
import numpy as np
from fastai.tabular import *
import time
import matplotlib.pyplot as plt
import nvstrings
import warnings
import cudf as gd
from numba import cuda
from sklearn.metrics import roc_auc_score
import torch
from fastai.callbacks import SaveModelCallback
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas/fastai warnings that may
# point at real problems — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

%matplotlib inline

### Functions

In [3]:
def get_mean_reciprocal_rank(sub):
    """Return the mean reciprocal rank of the rows where 'reference' == 'item_id'.

    Parameters
    ----------
    sub : pandas.DataFrame
        Needs the columns 'row_id', 'prob', 'reference' and 'item_id'.
        Rows are expected to be ordered by 'prob' (descending) inside each
        'row_id' group already; nothing is sorted here.

    Returns
    -------
    Mean of 1 / (1 + order) over the matching rows, where 'order' is the
    index written by the grouped kernel below.
    """
    gpu_frame = gd.from_pandas(sub)

    # The kernel's parameter names are kept equal to the incols / outcols
    # entries passed to apply_grouped below.
    def number_rows_in_group(prob, row_id, order):
        # Each thread starts at its own thread index and advances by the block
        # size, writing the element's index as its order.
        # NOTE(review): assumes apply_grouped hands the kernel one group's
        # slice at a time, so the index is the position inside the group.
        for pos in range(cuda.threadIdx.x, len(prob), cuda.blockDim.x):
            order[pos] = pos

    grouped = gpu_frame.groupby('row_id', method="cudf")
    ranked = grouped.apply_grouped(
        number_rows_in_group,
        incols=['prob', 'row_id'],
        outcols={'order': np.int32},
        tpb=32,
    ).to_pandas()

    # 0-based order -> reciprocal rank, kept only where the candidate is the reference.
    reciprocal_rank = 1.0 / (1 + ranked['order'])
    is_hit = ranked['reference'] == ranked['item_id']
    return reciprocal_rank[is_hit].mean()

def auroc_score(input, target):
    """Compute ROC AUC from per-class scores and their targets.

    Both arguments are moved to the CPU and converted to NumPy. Column 1 of
    `input` is the score handed to `roc_auc_score`, so `input` needs at least
    two columns.
    """
    positive_scores = input.cpu().numpy()[:, 1]
    labels = target.cpu().numpy()
    return roc_auc_score(labels, positive_scores)

# Callback to calculate AUC at the end of each epoch
class AUROC(Callback):
    """Collect non-training batch outputs during an epoch and report their AUROC."""

    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs):
        self.learn = learn

    def on_train_begin(self, **kwargs):
        """Register the extra 'AUROC' metric name with the learner's recorder."""
        self.learn.recorder.add_metric_names(['AUROC'])

    def on_epoch_begin(self, **kwargs):
        """Start the epoch with empty output and target buffers."""
        self.output = []
        self.target = []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        """Buffer the outputs and targets of every batch that is not a training batch."""
        if train:
            return
        self.output.append(last_output)
        self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        """Append the epoch's AUROC to `last_metrics`.

        The buffered batches are concatenated, a softmax over dim 1 is applied
        and the result is scored with `auroc_score`. Nothing is returned when
        no batch was buffered during the epoch.
        """
        if not self.output:
            return None
        logits = torch.cat(self.output)
        labels = torch.cat(self.target)
        probabilities = F.softmax(logits, dim=1)
        return add_metrics(last_metrics, [auroc_score(probabilities, labels)])

### Preprocessing

In [14]:
%%time
# Load the cached `data_pair` frame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load cache files
# that were produced by a trusted run of this project.
data_pair = pd.read_pickle('cache/data_pair_all.pkl')

CPU times: user 11.3 s, sys: 16 s, total: 27.3 s
Wall time: 27.3 s


In [15]:
%%time
# Split on the clickout_missing flag: rows with 0 become `train`, rows with a
# positive value become `test`.
train = data_pair.loc[data_pair['clickout_missing'] == 0]
test = data_pair.loc[data_pair['clickout_missing'] > 0]
print(train.shape, test.shape)

# The combined frame is no longer referenced by name after this point.
del data_pair

(42756036, 46) (5762533, 46)
CPU times: user 8.12 s, sys: 8.92 s, total: 17 s
Wall time: 17 s


In [16]:
%%time
# 'is_va' is the train/validation flag that the split cell adds to `train`.
# It matches the 'is_' prefix, so it is excluded explicitly: re-running this
# cell after the split cell must not turn the flag into a model feature (it is
# also absent from `test`).
split_flag = 'is_va'

# Categorical features: the explicit id/context columns plus every 'is_*' flag.
cat_names = ['user_id','item_id','platform','city','device','current_filters'] + [
    col for col in train.columns if col.startswith('is_') and col != split_flag]

# Continuous features: the explicit columns plus every column that starts with
# 'count' or 'delta_', or has 'rank' anywhere in its name.
cont_names = ['price','candidate_order','item_count'] + [
    col for col in train.columns
    if col.startswith('count') or 'rank' in col or col.startswith('delta_')]

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 123 µs


In [17]:
%%time
# Preprocessors attached to the training list.
procs = [FillMissing, Categorify, Normalize]

# Rows whose row_id is divisible by VALID_MODULUS form the validation split.
VALID_MODULUS = 5
train['is_va'] = (train.row_id % VALID_MODULUS) == 0

# Arguments shared by the train and test item lists.
list_kwargs = dict(path='./', cat_names=cat_names, cont_names=cont_names)

test_list = TabularList.from_df(test, **list_kwargs)
# The training list stays inside the chain so that no extra name keeps it alive.
data = (TabularList.from_df(train, procs=procs, **list_kwargs)
        .split_from_df('is_va')
        .label_from_df(cols='target')
        .add_test(test_list)
        .databunch(num_workers=8, bs=1024))

CPU times: user 3min 21s, sys: 2min 36s, total: 5min 58s
Wall time: 5min 46s


In [18]:
%%time

# Embedding width per categorical column.
# NOTE(review): 'has_past' does not match the cat_names selection visible in
# this notebook (explicit names plus 'is_*' columns) — confirm the key is intended.
embedding_sizes = {
    'user_id': 16,
    'item_id': 16,
    'platform': 4,
    'city': 8,
    'device': 1,
    'current_filters': 8,
    'has_past': 1,
}

# Two hidden layers; AUROC is reported through the callback instead of `metrics`.
# Optional: pass wd=0.2 to set weight decay (disabled here).
learn = tabular_learner(data, layers=[64, 32], metrics=None, callback_fns=AUROC,
                        emb_szs=embedding_sizes)

# Save the weights under 'tab_nn' whenever the monitored AUROC improves.
best_model_saver = SaveModelCallback(learn, every='improvement', monitor='AUROC', name='tab_nn')
learn.fit_one_cycle(1, max_lr=slice(3e-3), callbacks=[best_model_saver])

epoch,train_loss,valid_loss,AUROC,time
0,0.114635,0.113934,0.887146,13:50


Better model found at epoch 0 with AUROC value: 0.8871461408447601.
CPU times: user 10min 51s, sys: 2min 50s, total: 13min 42s
Wall time: 13min 58s


In [None]:
%%time
# Predictions and targets for the validation split (the dataset get_preds uses by default).
yp, y_valid = learn.get_preds(DatasetType.Valid)

In [None]:
%%time
# Key columns of the validation rows, paired with the predicted score in column 1.
# NOTE(review): relies on get_preds returning rows in the same order as the
# is_va rows of `train` — confirm.
ranking_columns = ['row_id', 'reference', 'item_id']
cv = train.loc[train['is_va'] > 0, ranking_columns].copy()
cv['prob'] = yp.numpy()[:, 1]

# Both keys descending, so the highest probability comes first inside each row_id.
cv = cv.sort_values(by=['row_id', 'prob'], ascending=False)

In [None]:
# Drop the notebook-level reference to the training frame; none of the cells
# below read `train` again.
del train

In [None]:
%%time
# Validation AUC from the flattened targets and the score in column 1.
valid_labels = y_valid.numpy().ravel()
valid_scores = yp.numpy()[:, 1]
auc = roc_auc_score(valid_labels, valid_scores)

# Ranking quality on the same validation rows.
mean_reciprocal_rank = get_mean_reciprocal_rank(cv)

print('mean_reciprocal_rank %.4f, AUC %.4f'%(mean_reciprocal_rank,auc))

In [None]:
%%time
# Only the predictions are needed for the test set; the second element
# returned by get_preds is discarded.
yps = learn.get_preds(DatasetType.Test)[0]

In [None]:
%%time
# Store the score from column 1 of the test predictions as 'target'.
test['target'] = yps.numpy()[:, 1]

# Keep only the key columns, the candidate item and its score.
submission_columns = ['row_id', 'user_id', 'session_id', 'timestamp', 'step', 'item_id', 'target']
test = test[submission_columns]
test.head()

In [None]:
%%time
# Order candidates so that, inside every row_id, larger scores come first.
test = test.sort_values(by=['row_id', 'target'], ascending=[False, False])

In [None]:
# Preview the first rows of `test` after the sort in the previous cell.
test.head()

In [None]:
%%time
# Build one space-separated string of item ids per row_id. Rows keep the order
# they have in `test` inside each group, so `test` must already be sorted with
# the best candidate first.
recommendations = (
    test.groupby('row_id')['item_id']
        .apply(lambda items: ' '.join(str(item) for item in items))
        .to_frame('new_item_recommendations')
        .reset_index()
)

# One key row per row_id. `test` itself is deliberately left untouched:
# overwriting it with the de-duplicated rows would make a re-run of this cell
# produce single-item recommendation lists.
row_keys = test.drop_duplicates(subset=['row_id'])

sub = row_keys.merge(recommendations, on='row_id', how='left')
sub = sub[['session_id', 'new_item_recommendations']]

In [None]:
%%time
# Sample submission that the model's recommendations are merged onto.
# Alternative location: /datasets/trivago/data/submission_popular.csv
sample_sub = pd.read_csv('../input/submission_popular.csv')

# Every session_id may appear only once in the sample submission.
assert len(sample_sub) == len(sample_sub.session_id.unique())

# Left merge: every row of the sample submission is kept.
# NOTE(review): `sub` is not checked for duplicate session_id values; if it
# has any, this merge returns more rows than the sample file — confirm.
sub = sample_sub.merge(sub, on='session_id', how='left')

In [None]:
from datetime import datetime

# Timestamp for the file name, formatted as YYYY-MM-DD-HH-MM-SS.
clock = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')

# Copy the model's ranking into item_recommendations wherever one exists.
# NOTE(review): presumably item_recommendations comes from the sample
# submission and is kept for the remaining rows — confirm.
mask = sub.new_item_recommendations.notnull()
sub.loc[mask, 'item_recommendations'] = sub.loc[mask, 'new_item_recommendations']
sub = sub.drop(columns='new_item_recommendations')

# The validation metrics are embedded in the output file name.
out = 'fastai_%s_mrr_%.4f_auc_%.4f.csv'%(clock,mean_reciprocal_rank,auc)
sub.to_csv(out, index=False)

In [None]:
# Preview the first rows of the submission frame that was written to disk.
sub.head()