In [1]:
import os
# Restrict CUDA to a single physical device. This cell runs before the cell
# that imports cudf/numba, so the setting is already in the environment when
# those libraries are loaded.
GPU_id = 2
os.environ.update({'CUDA_VISIBLE_DEVICES': '%d' % GPU_id})

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import time
import matplotlib.pyplot as plt
import nvstrings
import warnings
import cudf as gd
from numba import cuda
from sklearn.metrics import roc_auc_score
warnings.filterwarnings("ignore")

%matplotlib inline

### Functions

In [3]:
def ranknorm_in_group(df,col,ascending=True):
    """Add per-``row_id`` rank features for ``col`` to a cudf dataframe.

    The frame is sorted by ``['row_id', col]`` and every ``row_id`` group is
    then numbered 0..N-1 (N = group size) by a grouped CUDA kernel. This
    relies on the grouped apply keeping the pre-sorted order inside each
    group. Equal values receive distinct consecutive ranks.

    Parameters
    ----------
    df : cudf dataframe with the columns ``row_id``, ``all_row_id`` and ``col``.
    col : name of the column to rank.
    ascending : sort direction passed to ``sort_values`` (used for both keys).

    Returns
    -------
    The dataframe sorted by ``all_row_id`` with two extra columns:
    ``<col>_rank`` (int32 position inside the group) and
    ``<col>_rank_norm`` (float32, position divided by the group size).
    """
    df = df.sort_values(by=['row_id',col],ascending=ascending)

    # NOTE: the kernel's argument names are expected to match the names given
    # in ``incols`` / ``outcols`` below, so they must not be renamed.
    def _rank_kernel(row_id,rank,rank_norm):
        N = len(row_id)
        # Each thread handles indices threadIdx.x, threadIdx.x + blockDim.x, ...
        for i in range(cuda.threadIdx.x, N, cuda.blockDim.x):
            rank[i] = i
            rank_norm[i] = i*1.0/N

    df = df.groupby('row_id',method="cudf").apply_grouped(_rank_kernel,incols=['row_id'],
                                  outcols={'rank': np.int32,
                                          'rank_norm':np.float32},
                                  tpb=32)
    # Restore the caller's original row order.
    df = df.sort_values(by='all_row_id')

    # Prefix only the two kernel outputs with the ranked column's name. Matching
    # on the exact names (rather than on a 'rank' prefix) leaves input columns
    # untouched even when ``col`` itself starts with 'rank'.
    renamed = {'rank': '%s_rank'%col, 'rank_norm': '%s_rank_norm'%col}
    df.columns = [renamed.get(name, name) for name in df.columns]
    return df

### Load data pair and other features

In [4]:
%%time
# Load the cached inputs. NOTE(review): read_pickle can execute arbitrary code
# while unpickling, so these cache files must come from a trusted source.
data_pair = pd.read_pickle('cache/data_pair_clickout_only.pkl')
global_count = pd.read_pickle('cache/global_count.pkl')
data_last_view_gd = gd.read_csv('cache/more_last_viewed_item.csv')

# Remember each row's original position so the order can be restored after
# GPU-side sorts and merges.
data_pair['all_row_id'] = np.arange(len(data_pair))

CPU times: user 15.5 s, sys: 13.6 s, total: 29.1 s
Wall time: 29.1 s


### Combine features

In [5]:
%%time
# Copy the global count features into data_pair by position (the two frames
# are taken to share the same row order) and add a 0/1 flag marking negative
# counts, which this cell treats as "null".
global_count_cols = ['count_item_user_id_session_id','count_item_user_id']
for col in global_count_cols:
    norm_name = col+'_norm'
    flag_name = 'is_%s_null'%col
    data_pair[col] = global_count[col].values
    data_pair[norm_name] = global_count[norm_name].values
    data_pair[flag_name] = (data_pair[col]<0).astype(int)
    print('is_%s_null ratio'%col, data_pair[flag_name].mean())

is_count_item_user_id_session_id_null ratio 0.956702350392898
is_count_item_user_id_null ratio 0.9483320293308733
CPU times: user 676 ms, sys: 1.2 s, total: 1.88 s
Wall time: 1.87 s


In [6]:
%%time
# Build a slim GPU frame with only the join key (row_id) and the original row
# position (all_row_id) of every row in data_pair.
data_pair_gd = gd.from_pandas(data_pair[['all_row_id','row_id']])
# The return value is not used, so this relies on drop_column removing
# 'reference' in place — TODO confirm for the installed cudf version.
data_last_view_gd.drop_column('reference')
# Left join on row_id so that every row of data_pair_gd is kept.
data_last_view_gd = data_pair_gd.merge(data_last_view_gd,on='row_id',how='left')
# Sort by all_row_id so the rows line up with data_pair again; a later cell
# copies these columns into data_pair by position (.values).
data_last_view_gd = data_last_view_gd.sort_values('all_row_id')

CPU times: user 9.36 s, sys: 13.9 s, total: 23.2 s
Wall time: 23.5 s


In [7]:
# Show both shapes side by side; the row counts have to match because the next
# cell copies columns from data_last_view_gd into data_pair by position.
print(data_last_view_gd.shape,data_pair.shape)

(48518569, 17) (48518569, 26)


In [8]:
%%time
# Every merged column except the two key columns is a feature to bring back
# to the host-side frame.
cols = [name for name in data_last_view_gd.columns
        if name not in ('row_id','all_row_id')]

# Copy column by column; values are assigned by position.
for col in cols:
    data_pair[col] = data_last_view_gd[col].to_pandas().values
    print(col,'done')

# Drop the references to the GPU frames, which are no longer needed.
del data_last_view_gd, data_pair_gd

last_viewed_item_reference_any done
last_viewed_item_step_any done
last_viewed_item_timestamp_any done
last_viewed_item_reference_interaction item rating done
last_viewed_item_step_interaction item rating done
last_viewed_item_timestamp_interaction item rating done
last_viewed_item_reference_interaction item image done
last_viewed_item_step_interaction item image done
last_viewed_item_timestamp_interaction item image done
last_viewed_item_reference_interaction item info done
last_viewed_item_step_interaction item info done
last_viewed_item_timestamp_interaction item info done
last_viewed_item_reference_interaction item deals done
last_viewed_item_step_interaction item deals done
last_viewed_item_timestamp_interaction item deals done
CPU times: user 3.38 s, sys: 7.33 s, total: 10.7 s
Wall time: 10.7 s


### Add is_last and delta_last features

In [9]:
%%time
# Turn each raw last-viewed column into a model feature:
#   * '...reference...' columns -> is_<col>: 1 when the candidate item_id
#     equals the last viewed reference, else 0.
#   * all other columns          -> delta_<col>: the row's own 'step' /
#     'timestamp' (4th '_'-separated token of the column name) minus the
#     last-viewed value, as float.
# The raw columns are removed in ONE drop after the loop; dropping in place
# inside the loop would rebuild the whole frame once per column. The resulting
# columns and their order are the same either way.
last_item_cols = []
delta_last_item_cols = []
for i in cols:
    if 'reference' in i:
        col = 'is_%s'%i
        data_pair[col] = (data_pair['item_id'] == data_pair[i]).astype(int)
        last_item_cols.append(col)
    else:
        tag = i.split('_')[3]
        col = 'delta_%s'%i
        data_pair[col] = (data_pair[tag] - data_pair[i]).astype(float)
        delta_last_item_cols.append(col)
data_pair.drop(cols,axis=1,inplace=True)

CPU times: user 1min 52s, sys: 3min 5s, total: 4min 57s
Wall time: 4min 57s


In [10]:
# Display the column index after the raw last-viewed columns were replaced by
# their is_* / delta_* features.
data_pair.columns

Index(['row_id', 'candidate_order', 'item_id', 'price', 'row_id_count',
       'item_count', 'user_id', 'session_id', 'timestamp', 'step',
       'action_type', 'reference', 'platform', 'city', 'device',
       'current_filters', 'is_test', 'clickout_missing', 'target',
       'all_row_id', 'count_item_user_id_session_id',
       'count_item_user_id_session_id_norm',
       'is_count_item_user_id_session_id_null', 'count_item_user_id',
       'count_item_user_id_norm', 'is_count_item_user_id_null',
       'is_last_viewed_item_reference_any', 'delta_last_viewed_item_step_any',
       'delta_last_viewed_item_timestamp_any',
       'is_last_viewed_item_reference_interaction item rating',
       'delta_last_viewed_item_step_interaction item rating',
       'delta_last_viewed_item_timestamp_interaction item rating',
       'is_last_viewed_item_reference_interaction item image',
       'delta_last_viewed_item_step_interaction item image',
       'delta_last_viewed_item_timestamp_interaction 

### Rank normalization

In [11]:
%%time
# Columns ranked in ascending order (a smaller value means more clicks).
ascending_cols = ['price']
# Columns ranked in descending order (a greater value means more clicks).
descending_cols = ['item_count','count_item_user_id_session_id','count_item_user_id']

# Full list of columns to rank, ascending ones first.
to_rank_cols = [*ascending_cols, *descending_cols]

print(ascending_cols)
print(descending_cols)

['price']
['item_count', 'count_item_user_id_session_id', 'count_item_user_id']
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 102 µs


In [12]:
%%time
# One sort direction per column: True for the ascending_cols, False otherwise.
ascendings = [name in ascending_cols for name in to_rank_cols]

for col,ascending in zip(to_rank_cols,ascendings):
    # Move only the columns the ranking needs to the GPU.
    df = ranknorm_in_group(
        gd.from_pandas(data_pair[['row_id','all_row_id',col]]),
        col,
        ascending=ascending,
    )

    # Copy both rank features back by position (the result comes back sorted
    # by all_row_id).
    for suffix in ('rank','rank_norm'):
        feature = '%s_%s'%(col,suffix)
        data_pair[feature] = df[feature].to_pandas().values
    print(col,'done','ascending' if ascending else 'desending')
    del df # save gpu memory

price done ascending
item_count done desending
count_item_user_id_session_id done desending
count_item_user_id done desending
CPU times: user 8.7 s, sys: 12.1 s, total: 20.8 s
Wall time: 21 s


In [13]:
# Display the column index again after the *_rank / *_rank_norm features were
# added.
data_pair.columns

Index(['row_id', 'candidate_order', 'item_id', 'price', 'row_id_count',
       'item_count', 'user_id', 'session_id', 'timestamp', 'step',
       'action_type', 'reference', 'platform', 'city', 'device',
       'current_filters', 'is_test', 'clickout_missing', 'target',
       'all_row_id', 'count_item_user_id_session_id',
       'count_item_user_id_session_id_norm',
       'is_count_item_user_id_session_id_null', 'count_item_user_id',
       'count_item_user_id_norm', 'is_count_item_user_id_null',
       'is_last_viewed_item_reference_any', 'delta_last_viewed_item_step_any',
       'delta_last_viewed_item_timestamp_any',
       'is_last_viewed_item_reference_interaction item rating',
       'delta_last_viewed_item_step_interaction item rating',
       'delta_last_viewed_item_timestamp_interaction item rating',
       'is_last_viewed_item_reference_interaction item image',
       'delta_last_viewed_item_step_interaction item image',
       'delta_last_viewed_item_timestamp_interaction 

### Be aware there are sessions without positive items in training

In [14]:
%%time
# Maximum target per row_id: one row per row_id with the columns
# 'row_id' and 'max_target'.
dg = (
    data_pair
    .groupby('row_id')['target']
    .max()
    .rename('max_target')
    .reset_index()
)

CPU times: user 2.33 s, sys: 3.04 s, total: 5.36 s
Wall time: 5.36 s


In [15]:
# Print (a) the number of distinct row_ids with clickout_missing == 0 and
# (b) the number of row_ids whose maximum target is positive.
# NOTE(review): (b) is counted over all row_ids in dg; reading the gap as
# "row_ids without any positive item" presumes rows with clickout_missing != 0
# never have a positive target — verify.
print(data_pair.loc[data_pair['clickout_missing']==0,'row_id'].unique().shape,(dg['max_target']>0).sum())

(1856943,) 1856058


In [16]:
%%time
# Shape before building the mask.
print(data_pair.shape)

# Keep rows with clickout_missing == 0 whose row_id has at least one positive
# target. NOTE(review): the mask is only used to report a shape here;
# data_pair itself is not filtered in this cell.
has_positive = data_pair['row_id'].isin(dg.loc[dg['max_target']>0,'row_id'])
mask = (data_pair['clickout_missing']==0) & has_positive

print(data_pair[mask].shape)
# Printed again to show the frame itself is unchanged.
print(data_pair.shape)

(48518569, 49)
(42735233, 49)
(48518569, 49)
CPU times: user 9.6 s, sys: 8.7 s, total: 18.3 s
Wall time: 18.3 s


### Save the result

In [17]:
%%time
# Write the combined feature table to the cache, leaving out three columns
# that are not saved with it.
(
    data_pair
    .drop(['all_row_id','action_type','is_test'],axis=1)
    .to_pickle('cache/data_pair_all.pkl')
)

CPU times: user 23.1 s, sys: 27.4 s, total: 50.5 s
Wall time: 1min 5s
