In [1]:
import os
# Restrict this process to a single GPU by setting CUDA_VISIBLE_DEVICES.
# NOTE(review): presumably this has to run before the GPU libraries in the
# next cell are imported for the setting to take effect -- confirm.
GPU_id = 6
os.environ['CUDA_VISIBLE_DEVICES'] = str(GPU_id)

In [2]:
import cudf as gd
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Functions

In [3]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Call a method of ``words`` by name and return its device-side output.

    Parameters
    ----------
    words : object
        Must expose ``size()`` and the method named by ``func`` (in this
        notebook it is the ``.data`` of a cudf string column).
    func : str
        Name of the method to invoke on ``words``. A dotted path is resolved
        attribute by attribute.
    arg : optional
        Single positional argument forwarded to the method. When ``None`` the
        method is called with the ``devptr`` keyword only.
    dtype : numpy dtype
        Element type of the output buffer.

    Returns
    -------
    The array allocated with ``librmm.device_array(words.size(), dtype)``.
    Its raw pointer is handed to the method through ``devptr``; the method is
    expected to write its result there.
    """
    from operator import attrgetter

    res = librmm.device_array(words.size(), dtype=dtype)
    # Resolve the method by attribute lookup rather than formatting source
    # text and passing it to eval(): same call, no string execution.
    method = attrgetter(func)(words)
    devptr = res.device_ctypes_pointer.value
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

def get_count(data,cols):
    """Count rows per item within each group of ``cols``, raw and normalised.

    Parameters
    ----------
    data : DataFrame
        Must contain the columns in ``cols`` plus ``'item_id'`` and ``'step'``.
        ``'step'`` is only used as the column that gets counted.
    cols : list of str
        Grouping key columns, e.g. ``['user_id', 'session_id']``.

    Returns
    -------
    DataFrame with one row per (``cols``..., ``item_id``) and the columns
    ``cols + ['item_id', 'count_item_<tag>', 'count_item_<tag>_norm']`` where
    ``<tag>`` is ``'_'.join(cols)``. The ``_norm`` column is the per-item
    count divided by the total count of the ``cols`` group.
    """
    tag = '_'.join(cols)
    count_col = 'count_item_%s'%tag
    total_col = 'count_item_%s_all'%tag

    # Group the frame that was passed in. The previous version read the
    # notebook-global `data_interaction` here and ignored `data` entirely.
    dg = data.groupby(cols+['item_id'],
            as_index=False).agg({'step':['count']})
    dg.columns = cols + ['item_id', count_col]

    # Total number of rows per `cols` group, used as the denominator.
    df = data.groupby(cols,
            as_index=False).agg({'step':['count']})
    df.columns = cols + [total_col]

    dg = dg.merge(df,on=cols,how='left')
    dg['count_item_%s_norm'%tag] = dg[count_col] / dg[total_col]
    # The denominator is only a helper column; remove it from the result.
    dg.drop_column(total_col)
    return dg

### Read data

In [4]:
# Directory holding the input files; listed so the available files are
# visible in the cell output. Note the trailing slash: later cells append
# '/<file>' to this value.
path = '/datasets/trivago/data/'
os.listdir(path)

['item_metadata.csv',
 'submission_popular.csv',
 'test.csv',
 'train.csv',
 'text_classification_full.pkl',
 'bin_itemmeta.csv']

In [5]:
%%time
# Load train and test with cudf and stack them into one frame, so the
# history computed below covers rows from both files.
train = gd.read_csv('%s/train.csv'%path)
test = gd.read_csv('%s/test.csv'%path)
data = gd.concat([train,test])

CPU times: user 2.68 s, sys: 1.31 s, total: 3.99 s
Wall time: 4.2 s


### Get browsing history

In [6]:
# Keep only the columns needed for the history features, then rename
# 'reference' to 'item_id' by assigning a new column list (positional rename).
cols = ['user_id', 'session_id', 'step', 'action_type', 'reference']
data = data[cols]
cols = ['user_id', 'session_id', 'step', 'action_type', 'item_id']
data.columns = cols

In [7]:
%%time
# Flag rows whose action_type matches 'interaction' (evaluated by the
# string column's `contains` method via on_gpu), keep only those rows, and
# cast their item_id to integer.
# np.bool_ replaces the deprecated alias np.bool, which newer NumPy releases
# no longer provide; both denote the same boolean dtype.
data['is_interaction'] = on_gpu(data['action_type'].data,'contains',
                                          arg='interaction',dtype=np.bool_)
data_interaction = data[data['is_interaction']]
data_interaction['item_id'] = data_interaction['item_id'].astype(int)
print(data.shape,data_interaction.shape)
# The unfiltered frame is not referenced again in this cell; drop the name.
del data

(19715327, 6) (15299599, 6)
CPU times: user 824 ms, sys: 244 ms, total: 1.07 s
Wall time: 1.13 s


In [8]:
# Preview the first interaction rows (converted to pandas for display).
data_interaction.head().to_pandas()

Unnamed: 0,user_id,session_id,step,action_type,item_id,is_interaction
1,00RL8Z82B2Z1,aff3928535f48,2,interaction item image,666856,True
2,00RL8Z82B2Z1,aff3928535f48,3,interaction item image,666856,True
3,00RL8Z82B2Z1,aff3928535f48,4,interaction item image,666856,True
4,00RL8Z82B2Z1,aff3928535f48,5,interaction item image,109038,True
5,00RL8Z82B2Z1,aff3928535f48,6,interaction item image,666856,True


In [9]:
%%time
# Per-item interaction counts at two granularities: within each
# (user, session) pair, and across everything recorded for a user.
dg_user_session = get_count(data_interaction,cols=['user_id','session_id'])
dg_user = get_count(data_interaction,cols=['user_id'])

CPU times: user 1.08 s, sys: 744 ms, total: 1.82 s
Wall time: 2.03 s


In [10]:
# Preview the per-session counts and their normalised values.
dg_user_session.head(10).to_pandas()

Unnamed: 0,user_id,session_id,item_id,count_item_user_id_session_id,count_item_user_id_session_id_norm
0,LGRKYE43DWYO,6b7edaef011ec,79838,13,0.333333
1,9VDG6GGTEKXE,9a570eb5210f9,41190,19,0.147287
2,PMMFBC5VA4G7,715997ff66ada,10259246,29,0.27619
3,QYOZ1MOX7PB1,c8c35692197bb,1100322,2,0.040816
4,9R81UQ2QVJXN,12d1b27ddcb36,5967496,1,1.0
5,6AOHTXSMLY3P,9640c7010f614,147888,19,0.159664
6,H0VYHQS69U4C,34c2b679d9d3c,5819070,48,0.352941
7,ZJRJ8RHCRA9E,6b8088b18d942,8275,1,1.0
8,8517XTQGS8Q4,25ca8b94ac9ab,82492,1,1.0
9,KL948I22H9GB,d5a46aaa1d747,2240588,15,0.033186


In [11]:
# The count tables are built; drop the reference to the interaction rows.
del data_interaction

### Merge with clickout data

In [12]:
%%time
# Load the cached clickout pairs from a local pickle.
# NOTE(review): unpickling executes arbitrary code -- only load cache files
# produced by a trusted run.
data_pair = pd.read_pickle('cache/data_pair_clickout_only.pkl')
# Sequential row id, used later to sort the merged frame back into this order.
data_pair['all_row_id'] = np.arange(data_pair.shape[0])

CPU times: user 10.7 s, sys: 7.36 s, total: 18.1 s
Wall time: 18.1 s


In [13]:
%%time
# Copy only the join keys, the row id, target and step into a cudf frame.
cols = ['all_row_id','user_id','session_id','item_id','target','step']
agg_all = gd.from_pandas(data_pair[cols])

CPU times: user 11.8 s, sys: 12.2 s, total: 24 s
Wall time: 24 s


In [14]:
%%time
# Left-join both count tables onto the clickout pairs; pairs without a
# matching interaction keep their row but get no count.
agg_all = agg_all.merge(dg_user_session,on=['user_id','session_id','item_id'],how='left')
agg_all = agg_all.merge(dg_user,on=['user_id','item_id'],how='left')
# Row counts should be equal: a left join on unique keys must not add rows.
print(agg_all.shape,data_pair.shape)
# Sort by all_row_id so rows line up with data_pair again after the merges.
agg_all = agg_all.sort_values(by='all_row_id')

(48518569, 10) (48518569, 20)
CPU times: user 3.17 s, sys: 1.43 s, total: 4.6 s
Wall time: 4.74 s


In [15]:
# Preview the merged features (converted to pandas for display).
agg_all.head().to_pandas()

Unnamed: 0,all_row_id,user_id,session_id,item_id,target,step,count_item_user_id_session_id,count_item_user_id_session_id_norm,count_item_user_id,count_item_user_id_norm
46498,0,9Z8H0R5BPH3H,b0d46e23f4544,1812701,0,2,-1,,-1,
46499,1,9Z8H0R5BPH3H,b0d46e23f4544,5164712,0,2,-1,,-1,
46500,2,9Z8H0R5BPH3H,b0d46e23f4544,9791940,0,2,-1,,-1,
46501,3,9Z8H0R5BPH3H,b0d46e23f4544,80878,0,2,-1,,-1,
46502,4,9Z8H0R5BPH3H,b0d46e23f4544,1336778,0,2,-1,,-1,


In [16]:
# Persist the merged count features as a pandas pickle in the cache directory.
agg_all.to_pandas().to_pickle('cache/global_count.pkl')

In [17]:
# Host-side pandas copy used for the quick checks below.
# NOTE(review): this repeats the to_pandas() conversion already done when
# writing the pickle; the result could be reused instead.
dx = agg_all.to_pandas()

#### Does a higher mean imply higher precision, and a higher sum higher recall?

In [18]:
# Targets of the rows with a positive user-level count: print their sum and mean.
_user_level_targets = dx.loc[dx.count_item_user_id > 0, 'target']
print(_user_level_targets.sum(), _user_level_targets.mean())

646418 0.25786004461365153


In [19]:
# Targets of the rows with a positive session-level count: print their sum and mean.
_session_level_targets = dx.loc[dx.count_item_user_id_session_id > 0, 'target']
print(_session_level_targets.sum(), _session_level_targets.mean())

601990 0.2865609261498329
