In [1]:
import os

# Restrict this kernel to one GPU. This sits in the first cell so the variable
# is already set when the GPU libraries are imported in the next cell.
GPU_id = 6
os.environ.update(CUDA_VISIBLE_DEVICES=str(GPU_id))

In [2]:
import cudf as gd
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Functions

In [3]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Call the method named `func` on `words`, writing its result into a new device array.

    Parameters
    ----------
    words : object
        Must expose ``size()`` and a method called `func` that accepts a
        ``devptr`` keyword (in this notebook: the ``.data`` of a cudf string column).
    func : str
        Name of the method to invoke on `words`.
    arg : optional
        Single positional argument forwarded to the method; omitted when None.
    dtype : numpy dtype, default np.int32
        Element type of the output buffer.

    Returns
    -------
    The array allocated through ``librmm.device_array`` with ``words.size()``
    elements; its raw pointer is what the method received as ``devptr``.

    Raises
    ------
    AttributeError
        If `words` has no attribute named `func`.
    """
    res = librmm.device_array(words.size(), dtype=dtype)
    devptr = res.device_ctypes_pointer.value
    # Resolve the method by name instead of eval()-ing a formatted source string:
    # same call, but `func` can no longer smuggle arbitrary code into the kernel.
    method = getattr(words, func)
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

def get_count(data,cols):
    """Count rows per (cols, item_id) group and normalise by the per-cols total.

    Parameters
    ----------
    data : dataframe
        Frame holding the columns in `cols` plus ``item_id`` and ``step``.
        Non-null ``step`` entries are what gets counted.
    cols : sequence of str
        Grouping keys, e.g. ``['user_id', 'session_id']``.

    Returns
    -------
    dataframe with columns ``cols + ['item_id', 'count_item_<tag>',
    'count_item_<tag>_norm']`` where ``<tag>`` is ``'_'.join(cols)``.
    The ``_norm`` column is the item's count divided by the total count of its
    `cols` group.
    """
    # Accept any sequence (tuple, list, ...) without touching the caller's object.
    cols = list(cols)
    tag = '_'.join(cols)
    count_col = 'count_item_%s'%tag
    total_col = 'count_item_%s_all'%tag

    # Use the `data` argument; the previous body read the global
    # `data_interaction`, ignoring whatever frame the caller passed in.
    dg = data.groupby(cols+['item_id'],
            as_index=False).agg({'step':['count']})
    dg.columns = cols + ['item_id', count_col]

    totals = data.groupby(cols,
            as_index=False).agg({'step':['count']})
    totals.columns = cols + [total_col]

    dg = dg.merge(totals,on=cols,how='left')
    dg['count_item_%s_norm'%tag] = dg[count_col] / dg[total_col]
    # The group total was only needed as the denominator; drop it in place.
    dg.drop_column(total_col)
    return dg

### Read data

In [4]:
# Directory holding the raw CSV files; listing it shows which files are available.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path = '/datasets/trivago/data/'
os.listdir(path)

['item_metadata.csv', 'submission_popular.csv', 'test.csv', 'train.csv']

In [5]:
%%time
# Read both CSV splits with cudf and stack them into one frame (train rows first).
# os.path.join avoids the doubled slash that '%s/train.csv' % path produced,
# since `path` already ends with '/'.
train = gd.read_csv(os.path.join(path, 'train.csv'))
test = gd.read_csv(os.path.join(path, 'test.csv'))
data = gd.concat([train,test])

CPU times: user 2.48 s, sys: 1.16 s, total: 3.63 s
Wall time: 3.66 s


### Get browsing history

In [6]:
# Keep only the browsing-history fields, then relabel them so that the
# `reference` column is exposed as `item_id`.
data = data[['user_id', 'session_id', 'step', 'action_type', 'reference']]
cols = ['user_id', 'session_id', 'step', 'action_type', 'item_id']
data.columns = cols

In [7]:
%%time
# Flag rows whose action_type matches 'interaction'. on_gpu allocates the boolean
# result buffer and passes its pointer to the string method as `devptr`.
# Builtin `bool` is used instead of `np.bool`: the latter was only an alias for it
# and no longer exists in NumPy >= 1.24.
data['is_interaction'] = on_gpu(data['action_type'].data,'contains',
                                          arg='interaction',dtype=bool)
data_interaction = data[data['is_interaction']]
# NOTE(review): assumes item_id is integer-parsable on every interaction row — confirm.
data_interaction['item_id'] = data_interaction['item_id'].astype(int)
print(data.shape,data_interaction.shape)
# The unfiltered frame is not used again below; drop the reference to it.
del data

(19715327, 6) (15299599, 6)
CPU times: user 876 ms, sys: 184 ms, total: 1.06 s
Wall time: 1.07 s


In [8]:
# Preview the first rows; the slice is converted to pandas for display.
data_interaction.head().to_pandas()

Unnamed: 0,user_id,session_id,step,action_type,item_id,is_interaction
1,00RL8Z82B2Z1,aff3928535f48,2,interaction item image,666856,True
2,00RL8Z82B2Z1,aff3928535f48,3,interaction item image,666856,True
3,00RL8Z82B2Z1,aff3928535f48,4,interaction item image,666856,True
4,00RL8Z82B2Z1,aff3928535f48,5,interaction item image,109038,True
5,00RL8Z82B2Z1,aff3928535f48,6,interaction item image,666856,True


In [9]:
%%time
# Interaction counts per item at two granularities:
# within a (user, session) pair, and across all sessions of a user.
dg_user_session = get_count(data_interaction,cols=['user_id','session_id'])
dg_user = get_count(data_interaction,cols=['user_id'])

CPU times: user 1.06 s, sys: 696 ms, total: 1.76 s
Wall time: 1.77 s


In [10]:
# Inspect the first ten per-(user, session, item) count rows.
dg_user_session.head(10).to_pandas()

Unnamed: 0,user_id,session_id,item_id,count_item_user_id_session_id,count_item_user_id_session_id_norm
0,Q2KMY5VLQJKS,7ca2e808714b3,2862155,6,0.056075
1,ZHFF87Y6YUN8,fa47fa46eaf2c,2222862,21,0.283784
2,C972FKTHO7XE,e326419d23f5c,7993,38,0.094059
3,YL6GG78Y6U14,f4df9ec48f3eb,2861730,11,0.07483
4,40LFWDPQN12R,3d785f499e977,35548,11,0.038596
5,LWE8BPAF0BQY,cdf8d3ff46167,5202702,1,0.333333
6,L1TO8GJVVNRQ,a552be9157638,457111,2,0.5
7,OVAAGP1LAA2V,4a2f445b80a0a,10279558,11,0.03481
8,DRPZQ9GKVP4E,d7412ae524cf5,1172704,11,0.064327
9,546CWA73PWTF,767d8ffec7172,18055,2,0.105263


In [11]:
# Both count tables are built; the interaction frame is no longer referenced below.
del data_interaction

### Merge with clickout data

In [12]:
%%time
# Load the cached candidate table from disk.
# NOTE: read_pickle unpickles the file, which can run arbitrary code — only load
# cache files you produced or trust.
data_pair = pd.read_pickle('cache/data_pair_clickout_only.pkl')
# Positional row id (0..n-1); used further down to sort the merged result.
data_pair['all_row_id'] = np.arange(data_pair.shape[0])

CPU times: user 8.63 s, sys: 8.39 s, total: 17 s
Wall time: 14.8 s


In [13]:
%%time
# Convert only the join keys, the row id, target and step to a cudf frame,
# rather than all columns of data_pair.
cols = ['all_row_id','user_id','session_id','item_id','target','step']
agg_all = gd.from_pandas(data_pair[cols])

CPU times: user 37.6 s, sys: 30.9 s, total: 1min 8s
Wall time: 29.5 s


In [14]:
%%time
# Left-join both count tables onto the candidate rows; how='left' keeps candidates
# that have no matching interaction history.
agg_all = agg_all.merge(dg_user_session,on=['user_id','session_id','item_id'],how='left')
agg_all = agg_all.merge(dg_user,on=['user_id','item_id'],how='left')
# Print both shapes so the row counts before and after the joins can be compared.
print(agg_all.shape,data_pair.shape)
# Order rows by the positional id assigned to data_pair when it was loaded
# (presumably because the merges do not keep the input order — confirm).
agg_all = agg_all.sort_values(by='all_row_id')

(48518569, 10) (48518569, 20)
CPU times: user 2.45 s, sys: 1.24 s, total: 3.69 s
Wall time: 3.7 s


In [15]:
# Preview the merged features for the first few candidate rows.
agg_all.head().to_pandas()

Unnamed: 0,all_row_id,user_id,session_id,item_id,target,step,count_item_user_id_session_id,count_item_user_id_session_id_norm,count_item_user_id,count_item_user_id_norm
18592,0,XFE5BT9RNTQW,62f66f7671352,73376,0,6,-1,,-1,
18593,1,XFE5BT9RNTQW,62f66f7671352,10348476,0,6,-1,,-1,
18594,2,XFE5BT9RNTQW,62f66f7671352,407711,0,6,-1,,-1,
18595,3,XFE5BT9RNTQW,62f66f7671352,9882224,0,6,-1,,-1,
18596,4,XFE5BT9RNTQW,62f66f7671352,10455202,0,6,-1,,-1,


In [16]:
# Persist the merged count features as a pandas pickle under cache/.
# NOTE(review): the frame is converted with to_pandas() again in the next cell;
# converting once and reusing the result would avoid the duplicate work.
agg_all.to_pandas().to_pickle('cache/global_count.pkl')

In [17]:
# pandas copy of the merged features, used by the target statistics below.
dx = agg_all.to_pandas()

#### Does a higher mean of `target` indicate higher precision, and a higher sum indicate higher recall?

In [18]:
# Targets of candidates the user interacted with at least once (any session):
# total number of positives, then their share among the selected rows.
user_hits = dx.loc[dx['count_item_user_id'] > 0, 'target']
print(user_hits.sum(), user_hits.mean())

646418 0.25786004461365153


In [19]:
# Same statistics, restricted to candidates interacted with inside the same session.
session_hits = dx.loc[dx['count_item_user_id_session_id'] > 0, 'target']
print(session_hits.sum(), session_hits.mean())

601990 0.2865609261498329
