### In this notebook, we get the last item viewed by the user before each clickout (within the same session), overall and per interaction type

In [26]:
# Restrict this process to a single GPU. The variable is set here, ahead of
# the cudf import in the following cell, so that it is already in place when
# the GPU libraries are loaded.
import os
GPU_id = 2
os.environ.update(CUDA_VISIBLE_DEVICES=str(GPU_id))

In [27]:
import cudf as gd
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Functions

In [28]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Call a method of ``words`` by name, handing it a fresh device buffer.

    Parameters
    ----------
    words : object
        Handle exposing ``size()`` and the method named by ``func``; the
        notebook passes the ``.data`` attribute of a cudf string column.
    func : str
        Name of the method to invoke, e.g. ``'compare'`` or ``'contains'``.
    arg : optional
        Single positional argument for the method; left out when ``None``.
    dtype : numpy dtype, default ``np.int32``
        Element type of the buffer allocated for the result.

    Returns
    -------
    The array allocated with ``librmm.device_array`` whose device pointer was
    passed to the method as ``devptr``.

    Raises
    ------
    AttributeError
        If ``words`` has no attribute called ``func``.
    """
    # Resolve the method by name instead of eval-ing a formatted source
    # string: `func` can then never be executed as arbitrary code, and an
    # unknown name fails before any device memory is allocated.
    method = getattr(words, func)
    res = librmm.device_array(words.size(), dtype=dtype)
    devptr = res.device_ctypes_pointer.value
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

def get_last_item_given_action_type(df, action_type, drop_reference=True):
    """For every click-out, fetch the row that directly precedes it among the
    click-outs and the rows of the requested action type.

    Only click-out rows and rows matching ``action_type`` are kept, in their
    existing order. Each click-out is paired with the row right before it in
    that filtered frame (a matching row or an earlier click-out). The pairing
    is kept only when both rows share the same ``user_id`` and ``session_id``.

    Parameters
    ----------
    df : cudf DataFrame
        Needs the columns ``user_id``, ``session_id``, ``action_type``,
        ``reference``, ``is_click_out``, ``step`` and ``timestamp``.
        Presumably ordered by user, session and step; the "previous row"
        logic relies on the row order (TODO confirm against the raw data).
    action_type : str
        Exact action type to look for, or ``'any'`` for every action type
        that contains the substring ``'interaction'``.
    drop_reference : bool, default True
        Drop the click-out's own ``reference`` column from the result.

    Returns
    -------
    cudf DataFrame
        One row per click-out with ``all_row_id``, ``row_id`` (0-based
        click-out counter), optionally ``reference``, and the columns
        ``last_viewed_item_{reference,step,timestamp}_<action_type>`` coming
        from a left merge (unmatched click-outs get no value there).
    """
    cols = ['user_id','session_id','action_type','reference','is_click_out','step','timestamp']
    data = df[cols]
    flag_col = 'is_%s'%action_type

    if action_type == 'any':
        # np.bool_ rather than the np.bool alias, which newer NumPy releases
        # no longer provide; both describe the same boolean dtype.
        data[flag_col] = on_gpu(data['action_type'].data,'contains',arg='interaction',dtype=np.bool_)
    else:
        data[flag_col] = on_gpu(data['action_type'].data,'compare',arg=action_type)
        data[flag_col] = data[flag_col] == 0  # compare reports 0 for a string match
    data[flag_col] = data[flag_col].astype('int32')

    # Keep click-outs plus rows of the requested type, then number them
    # consecutively so "previous row" is simply all_row_id - 1.
    data['keep'] = data['is_click_out'] + data[flag_col]
    data = data[data['keep']>0]

    data['all_row_id'] = np.arange(data.shape[0])

    data_click = data[data['is_click_out']>0]
    prev_ids = data_click['all_row_id'].to_pandas().values-1
    # A click-out sitting on the very first kept row has no predecessor.
    # Point it at row 0 so the positional lookup stays in bounds and reject
    # the pairing explicitly below, instead of borrowing an arbitrary row.
    has_prev = prev_ids >= 0
    prev_ids = np.where(has_prev, prev_ids, 0)
    data_last_view = data.iloc[prev_ids]

    same_user = data_last_view['user_id'].to_pandas().values == data_click['user_id'].to_pandas().values
    same_session = data_last_view['session_id'].to_pandas().values == data_click['session_id'].to_pandas().values
    data_last_view['is_same_user'] = same_user & has_prev
    data_last_view['is_same_session'] = same_session

    # Discard predecessors that belong to another user or another session.
    for flag in ['is_same_user','is_same_session']:
        data_last_view = data_last_view[data_last_view[flag]]
        data_last_view.drop_column(flag)

    # to align with data_click
    data_last_view['all_row_id'] = data_last_view['all_row_id']+1

    data_click['row_id'] = np.arange(data_click.shape[0])
    data_click = data_click[['all_row_id','row_id','reference']]

    data_last_view = data_last_view[['all_row_id','reference','step','timestamp']]
    ncols = ['all_row_id'] + ['last_viewed_item_%s_%s'%(name,action_type) for name in ['reference','step','timestamp']]
    data_last_view.columns = ncols

    data_click = data_click.merge(data_last_view,on='all_row_id',how='left')

    # NOTE: `all_row_id` stays in the result, so the repeated merges in
    # get_last_item produce suffixed copies of it.
    if drop_reference:
        data_click.drop_column('reference')

    return data_click

def get_last_item(data):
    """Build the last-item features of all click-outs for every action type.

    Adds a 0/1 ``is_click_out`` column to ``data`` in place, then collects the
    per-action-type tables from ``get_last_item_given_action_type`` and joins
    them on ``row_id``. The click-out's own ``reference`` is only kept from
    the first table.

    Returns the merged table (one row per click-out).
    """
    data['is_click_out'] = on_gpu(data['action_type'].data, 'compare', arg='clickout item')
    # compare reports 0 for a string match, so "== 0" marks the click-outs
    data['is_click_out'] = (data['is_click_out'] == 0).astype('int32')

    action_types = ('any',
                    'interaction item rating',
                    'interaction item image',
                    'interaction item info',
                    'interaction item deals')

    merged = None
    for position, action_type in enumerate(action_types):
        features = get_last_item_given_action_type(data, action_type,
                                                   drop_reference=position > 0)
        if position == 0:
            merged = features
        else:
            merged = merged.merge(features, on='row_id', how='left')
        print(action_type,'done')
    return merged
    

### Read data

In [29]:
# Root folder holding the raw competition files.
path = '/datasets/trivago/data/'
# The listing itself is discarded (it is not the cell's last expression);
# the call only makes the cell fail early when `path` cannot be read.
os.listdir(path)
# Folder for the derived features. exist_ok=True replaces the separate
# existence check and raises if 'cache' exists but is not a directory.
os.makedirs('cache', exist_ok=True)

In [30]:
%%time
# Load both splits with cudf (gd); the baseline submission is read with pandas
# and is not referenced again in the cells shown in this notebook.
train = gd.read_csv('%s/train.csv'%path)
test = gd.read_csv('%s/test.csv'%path)
submission = pd.read_csv('%s/submission_popular.csv'%path)
# Tag every row with its split before stacking, so the origin stays recoverable.
train['is_test'] = 0
test['is_test'] = 1
print("train & test",train.shape,test.shape)
# Train rows come first, test rows second.
data = gd.concat([train,test])
print('combined',data.shape)

train & test (15932992, 13) (3782335, 13)
combined (19715327, 13)
CPU times: user 2.88 s, sys: 1.35 s, total: 4.23 s
Wall time: 5.31 s


In [31]:
# The combined frame supersedes the per-split frames, so drop those names.
del train, test
data.head(5).to_pandas()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,is_test
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,,0
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,0
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,0
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,0
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,0


### Build features

In [32]:
%%time
# One row per click-out, with the last-item columns of every action type.
# Note: get_last_item also adds an `is_click_out` column to `data` itself.
data_click = get_last_item(data)

any done
interaction item rating done
interaction item image done
interaction item info done
interaction item deals done
CPU times: user 10.6 s, sys: 7.26 s, total: 17.8 s
Wall time: 29.1 s


In [33]:
# Order the rows by the click-out counter before looking at them
# (presumably the merges do not keep the original row order -- TODO confirm).
data_click = data_click.sort_values('row_id')
data_click.head(15).to_pandas()

Unnamed: 0,all_row_id_x,row_id,reference,last_viewed_item_reference_any,last_viewed_item_step_any,last_viewed_item_timestamp_any,all_row_id_y,last_viewed_item_reference_interaction item rating,last_viewed_item_step_interaction item rating,last_viewed_item_timestamp_interaction item rating,last_viewed_item_reference_interaction item image,last_viewed_item_step_interaction item image,last_viewed_item_timestamp_interaction item image,last_viewed_item_reference_interaction item info,last_viewed_item_step_interaction item info,last_viewed_item_timestamp_interaction item info,all_row_id,last_viewed_item_reference_interaction item deals,last_viewed_item_step_interaction item deals,last_viewed_item_timestamp_interaction item deals
38336,12,0,109038,109038.0,13,1541037542,0,,-1,-1,109038.0,13,1541037542,,-1,-1,0,,-1,-1
38337,13,1,1257342,109038.0,14,1541037543,1,109038.0,14,1541037543,109038.0,14,1541037543,109038.0,14,1541037543,1,109038.0,14,1541037543
38338,107,2,2795374,2795374.0,34,1541063863,7,,-1,-1,2795374.0,33,1541063851,2795374.0,34,1541063863,3,,-1,-1
38339,110,3,1032816,1032816.0,5,1541100314,8,,-1,-1,1032816.0,5,1541100314,,-1,-1,4,,-1,-1
38340,111,4,1032816,1032816.0,6,1541100322,9,1032816.0,6,1541100322,1032816.0,6,1541100322,1032816.0,6,1541100322,5,1032816.0,6,1541100322
38341,149,5,65685,65685.0,53,1541107497,10,,-1,-1,65685.0,53,1541107497,,-1,-1,6,,-1,-1
38342,150,6,1320460,65685.0,54,1541107500,11,65685.0,54,1541107500,65685.0,54,1541107500,65685.0,54,1541107500,7,65685.0,54,1541107500
38343,151,7,3143258,749441.0,1,1541062453,12,749441.0,1,1541062453,,-1,-1,,-1,-1,8,,-1,-1
38360,152,8,2552514,,-1,-1,13,,-1,-1,,-1,-1,,-1,-1,9,,-1,-1
38361,155,9,110591,,-1,-1,14,,-1,-1,,-1,-1,,-1,-1,10,,-1,-1


### Verify the features

In [34]:
# Share of click-outs whose clicked item equals each "last viewed" reference.
# The slice starts at 'reference' itself, hence the trivial 1.0 in the first line.
# NOTE(review): the int32 casts are written back to data_click and therefore
# affect every later cell; in the outputs shown in this notebook, missing
# references appear as 0 afterwards.
data_click['reference'] = data_click['reference'].astype('int32')
for col in data_click.columns[2:]:
    if 'reference' in col:
        data_click[col] = data_click[col].astype('int32')
        data_click['match'] = data_click['reference'] == data_click[col]
        match_rate = data_click['match'].astype('int32').mean()
        print('%s match rate %.4f'%(col, match_rate))
# 'match' was only a scratch column for the check above.
data_click.drop_column('match')

reference match rate 1.0000
last_viewed_item_reference_any match rate 0.3182
last_viewed_item_reference_interaction item rating match rate 0.1926
last_viewed_item_reference_interaction item image match rate 0.2911
last_viewed_item_reference_interaction item info match rate 0.2058
last_viewed_item_reference_interaction item deals match rate 0.1879


In [35]:
%%time
# Move the table to pandas, then turn negative values in every
# non-reference column from the third column onwards into NaN.
# Reference columns are left as they are.
data_click = data_click.to_pandas()
for col in data_click.columns[2:]:
    if 'reference' in col:
        continue
    data_click.loc[data_click[col] < 0, col] = np.nan

CPU times: user 1.3 s, sys: 2.15 s, total: 3.45 s
Wall time: 3.7 s


In [36]:
# Preview the pandas table after negative values were replaced by NaN.
data_click.head()

Unnamed: 0,all_row_id_x,row_id,reference,last_viewed_item_reference_any,last_viewed_item_step_any,last_viewed_item_timestamp_any,all_row_id_y,last_viewed_item_reference_interaction item rating,last_viewed_item_step_interaction item rating,last_viewed_item_timestamp_interaction item rating,last_viewed_item_reference_interaction item image,last_viewed_item_step_interaction item image,last_viewed_item_timestamp_interaction item image,last_viewed_item_reference_interaction item info,last_viewed_item_step_interaction item info,last_viewed_item_timestamp_interaction item info,all_row_id,last_viewed_item_reference_interaction item deals,last_viewed_item_step_interaction item deals,last_viewed_item_timestamp_interaction item deals
38336,12,0,109038,109038,13.0,1541038000.0,0.0,0,,,109038,13.0,1541038000.0,0,,,0.0,0,,
38337,13,1,1257342,109038,14.0,1541038000.0,1.0,109038,14.0,1541038000.0,109038,14.0,1541038000.0,109038,14.0,1541038000.0,1.0,109038,14.0,1541038000.0
38338,107,2,2795374,2795374,34.0,1541064000.0,7.0,0,,,2795374,33.0,1541064000.0,2795374,34.0,1541064000.0,3.0,0,,
38339,110,3,1032816,1032816,5.0,1541100000.0,8.0,0,,,1032816,5.0,1541100000.0,0,,,4.0,0,,
38340,111,4,1032816,1032816,6.0,1541100000.0,9.0,1032816,6.0,1541100000.0,1032816,6.0,1541100000.0,1032816,6.0,1541100000.0,5.0,1032816,6.0,1541100000.0


In [37]:
# (rows, columns) of the final feature table.
data_click.shape

(2115365, 20)

In [38]:
# Save the features into the cache folder; index=False leaves the row index out of the file.
data_click.to_csv('cache/more_last_viewed_item.csv',index=False)