In [1]:
import os
# Pin this kernel to a single GPU. This is set here, ahead of the
# cudf / xgboost imports in the following cell.
GPU_id = 3
os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % GPU_id

In [2]:
import cudf as gd
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import time
import nvstrings
from librmm_cffi import librmm
import matplotlib.pyplot as plt
%matplotlib inline

### Functions

In [3]:
def on_gpu(words,func,arg=None,dtype=np.int32):
    """Call a method of ``words`` that writes its result into a device buffer.

    A buffer with ``words.size()`` elements of type ``dtype`` is allocated
    through ``librmm.device_array`` and its raw device pointer is handed to
    the method as the ``devptr`` keyword argument.

    Parameters
    ----------
    words : object
        Must provide ``size()`` and the method named by ``func``. In this
        notebook it is ``data['action_type'].data`` (presumably an nvstrings
        object -- confirm against the cudf version in use).
    func : str
        Plain attribute name of the method to call on ``words``.
    arg : object, optional
        Single positional argument forwarded to the method. When ``None``
        the method is called with ``devptr`` only.
    dtype : numpy dtype, optional
        Element type of the output buffer (default ``np.int32``).

    Returns
    -------
    The array returned by ``librmm.device_array``, after the method has
    been invoked with its device pointer.

    Raises
    ------
    AttributeError
        If ``words`` has no attribute called ``func``.
    """
    res = librmm.device_array(words.size(), dtype=dtype)
    # Resolve the method by name rather than eval()-ing a formatted string:
    # same call, but ``func`` can no longer smuggle arbitrary code into eval.
    method = getattr(words, func)
    devptr = res.device_ctypes_pointer.value
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

def get_count(data,cols):
    """Count how often each item occurs per group, raw and normalised.

    Parameters
    ----------
    data : DataFrame
        Frame with the columns in ``cols`` plus ``item_id`` and ``step``.
        It must support ``groupby(...).agg``, ``merge`` and ``drop_column``
        (a cudf DataFrame in this notebook).
    cols : list of str
        Grouping key columns, e.g. ``['user_id', 'session_id']``.

    Returns
    -------
    DataFrame
        One row per ``cols + ['item_id']`` combination with the columns
        ``cols``, ``item_id``, ``count_item_<tag>`` (rows in that
        combination) and ``count_item_<tag>_norm`` (that count divided by
        the number of rows in the ``cols`` group), where ``<tag>`` is
        ``'_'.join(cols)``.
    """
    tag = '_'.join(cols)

    # Rows per (group, item). The original body read the notebook-level
    # ``data_interaction`` here instead of the ``data`` argument, so the
    # function silently ignored whatever frame the caller passed in.
    dg = data.groupby(cols+['item_id'],
            as_index=False).agg({'step':['count']})
    dg.columns = cols + ['item_id', 'count_item_%s'%tag]

    # Rows per group, used as the normalisation denominator.
    df = data.groupby(cols,
            as_index=False).agg({'step':['count']})
    df.columns = cols + ['count_item_%s_all'%tag]

    dg = dg.merge(df,on=cols,how='left')
    dg['count_item_%s_norm'%tag] = dg['count_item_%s'%tag] / dg['count_item_%s_all'%tag]
    # The denominator is only a helper; remove it from the result.
    dg.drop_column('count_item_%s_all'%tag)
    del df
    return dg

### Read data

In [4]:
# Directory holding the raw CSV files; the listing shows what is available.
# NOTE(review): hard-coded absolute path -- adjust for your environment.
path = '/datasets/trivago/data/'
os.listdir(path)

['item_metadata.csv',
 'submission_popular.csv',
 'test.csv',
 'train.csv',
 'text_classification_full.pkl']

In [5]:
%%time
# Load both splits onto the GPU and stack them into one frame.
# os.path.join avoids the doubled slash that plain string formatting
# produced (``path`` already ends in '/') and keeps both reads consistent.
train = gd.read_csv(os.path.join(path, 'train.csv'))
test = gd.read_csv(os.path.join(path, 'test.csv'))
data = gd.concat([train,test])

CPU times: user 2.59 s, sys: 1.18 s, total: 3.76 s
Wall time: 3.87 s


### Get browsing history

In [6]:
# Keep only the columns needed for the browsing history and rename
# 'reference' to 'item_id'.
cols = ['user_id', 'session_id', 'step', 'action_type']
data = data[cols + ['reference']]
cols = cols + ['item_id']
data.columns = cols

In [7]:
%%time
# Flag rows whose action_type matches 'interaction'; the flag is computed
# on the GPU via on_gpu(). np.bool_ replaces the deprecated np.bool alias
# (removed in NumPy 1.24) and names the same boolean dtype.
data['is_interaction'] = on_gpu(data['action_type'].data,'contains',
                                          arg='interaction',dtype=np.bool_)
# Keep only the flagged rows and cast their item ids to integers.
data_interaction = data[data['is_interaction']]
data_interaction['item_id'] = data_interaction['item_id'].astype(int)
print(data.shape,data_interaction.shape)
# The unfiltered frame is not used again below; drop the reference.
del data

(19715327, 6) (15299599, 6)
CPU times: user 1.18 s, sys: 184 ms, total: 1.36 s
Wall time: 1.4 s


In [8]:
# Copy the first rows to pandas for display.
data_interaction.head().to_pandas()

Unnamed: 0,user_id,session_id,step,action_type,item_id,is_interaction
1,00RL8Z82B2Z1,aff3928535f48,2,interaction item image,666856,True
2,00RL8Z82B2Z1,aff3928535f48,3,interaction item image,666856,True
3,00RL8Z82B2Z1,aff3928535f48,4,interaction item image,666856,True
4,00RL8Z82B2Z1,aff3928535f48,5,interaction item image,109038,True
5,00RL8Z82B2Z1,aff3928535f48,6,interaction item image,666856,True


In [9]:
%%time
# Item interaction counts at two granularities: per (user, session) and
# per user across all of that user's sessions.
dg_user_session = get_count(data_interaction, ['user_id', 'session_id'])
dg_user = get_count(data_interaction, ['user_id'])

CPU times: user 1.26 s, sys: 696 ms, total: 1.96 s
Wall time: 2.13 s


In [10]:
# Peek at the per-(user, session) counts; the first ten rows are copied to pandas.
dg_user_session.head(10).to_pandas()

Unnamed: 0,user_id,session_id,item_id,count_item_user_id_session_id,count_item_user_id_session_id_norm
0,04QP3ZTT1P28,59db5d99a33b6,3053070,13,0.25
1,L68DGV71JNTP,6a7440e286941,5794878,12,0.151899
2,6779S82Q7TKF,33a55b96304b1,110187,1,0.5
3,VIYNRGFG2ZQN,7667296e44a87,3747780,5,0.089286
4,49SBKEH892HZ,45b34abe47add,2873641,7,0.042945
5,1168C8B2LC78,80e24caaef701,2524518,21,0.724138
6,MPT6F695GTY8,61aee69897a5c,2051545,2,0.4
7,X8WYO6EVFQ4L,9d375253e9978,1009861,25,0.19084
8,9KXETFW3XZ5A,43ca32df3d3f2,2655570,9,0.019651
9,JCZ8KN8EYBMR,c09a7dc35c1ce,8232,15,0.096154


In [11]:
# Drop the reference to the interaction rows; the cells shown below only
# use the two count tables derived from them.
del data_interaction

### Merge with clickout data

In [12]:
%%time
# NOTE(review): this pickle is not written anywhere in the visible part of
# the notebook, so it has to exist before this cell runs -- the recorded
# run failed here with FileNotFoundError.
data_pair = pd.read_pickle('cache/data_pair_clickout_only.pkl')
# Remember the original row position so the order can be restored later.
data_pair['all_row_id'] = np.arange(len(data_pair))

FileNotFoundError: [Errno 2] No such file or directory: 'cache/data_pair_clickout_only.pkl'

In [None]:
%%time
# Only the join keys, the target and the row id are needed for the merge.
cols = [
    'all_row_id',
    'user_id',
    'session_id',
    'item_id',
    'target',
    'step',
]
agg_all = gd.from_pandas(data_pair.loc[:, cols])

In [None]:
%%time
# Attach the per-(user, session) counts, then the per-user counts.
# Left joins keep every row of agg_all.
agg_all = agg_all.merge(
    dg_user_session,
    how='left',
    on=['user_id', 'session_id', 'item_id'],
)
agg_all = agg_all.merge(
    dg_user,
    how='left',
    on=['user_id', 'item_id'],
)
print(agg_all.shape, data_pair.shape)
# Put the rows back in data_pair order using the row id added earlier.
agg_all = agg_all.sort_values('all_row_id')

In [None]:
# Inspect the first merged rows (copied to pandas for display).
agg_all.head().to_pandas()

In [None]:
# Persist the merged count features as a pandas pickle.
# NOTE(review): presumably the cache/ directory must already exist -- confirm.
agg_all.to_pandas().to_pickle('cache/global_count.pkl')

In [None]:
# pandas copy of the merged features for the checks below.
# NOTE(review): this repeats the to_pandas() conversion done when the
# pickle was written; the result could be shared.
dx = agg_all.to_pandas()

#### Does a higher mean of `target` indicate higher precision, and a higher sum higher recall?

In [None]:
# Targets of rows whose item the user interacted with at least once:
# total (sum) and share (mean).
target_seen_by_user = dx.loc[dx['count_item_user_id'] > 0, 'target']
print(target_seen_by_user.sum(), target_seen_by_user.mean())

In [None]:
# Same check restricted to interactions within the same session.
target_seen_in_session = dx.loc[dx['count_item_user_id_session_id'] > 0, 'target']
print(target_seen_in_session.sum(), target_seen_in_session.mean())