In [14]:
import os
# Index of the GPU this kernel should be restricted to.  Environment values
# must be strings, hence the explicit formatting of the integer.
GPU_id = 2
os.environ['CUDA_VISIBLE_DEVICES'] = '%d' % GPU_id

In [15]:
import pandas as pd
import cudf as gd
import numpy as np
import multiprocessing
import pickle
import matplotlib.pyplot as plt

from os import listdir
from os.path import isfile, join
from multiprocessing import Process
from librmm_cffi import librmm

In [16]:
def on_gpu(words, func, arg=None, dtype=np.int32):
    """Call a method of ``words`` that writes its result into a device array.

    Parameters
    ----------
    words : object
        Must expose ``size()`` and the method named by ``func``; that method
        must accept a ``devptr`` keyword.  (Presumably an nvstrings-style
        object -- TODO confirm against callers.)
    func : str
        Attribute path of the method to call on ``words`` (dots are allowed,
        e.g. ``"a.b"`` resolves ``words.a.b``).
    arg : optional
        Single positional argument forwarded to the method when not None.
    dtype : numpy dtype
        Element type of the output buffer.

    Returns
    -------
    The ``librmm.device_array`` of length ``words.size()`` whose device
    pointer was handed to the method.

    Raises
    ------
    AttributeError
        If ``func`` does not name an attribute path on ``words``.
    """
    res = librmm.device_array(words.size(), dtype=dtype)
    # Resolve the method by attribute lookup rather than building a source
    # string for eval(): the call is the same, but `func` can no longer be
    # used to execute arbitrary code.
    method = words
    for part in func.split('.'):
        method = getattr(method, part)
    devptr = res.device_ctypes_pointer.value
    if arg is None:
        method(devptr=devptr)
    else:
        method(arg, devptr=devptr)
    return res

In [17]:

"""
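Values of the `action_type` column. The function below branches on the
clickout, interaction, sort-order and poi-search actions and skips the rest:

array(['search for poi', 'interaction item image', 'clickout item',
       'interaction item info', 'interaction item deals',
       'search for destination', 'filter selection',
       'interaction item rating', 'search for item',
       'change of sort order'], dtype=object)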
"""


# Slot of each per-item interaction action inside the 4-element counters
# (image, info, deals, rating) kept by get_future_or_past_info.
_INTERACTION_SLOT = {
    'interaction item image': 0,
    'interaction item info': 1,
    'interaction item deals': 2,
    'interaction item rating': 3,
}


def _to_item_id(value):
    """Return ``value`` as an int, or None if it is null or not integer-like."""
    if pd.isnull(value):
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None


def get_future_or_past_info(data_click, data, thread_id, num_cores, name, num_steps_to_look=1):
    """Build session-context features for one worker's share of the clickouts.

    For every row ``i`` of ``data_click`` with ``i % num_cores == thread_id``
    the rows of ``data`` after it (``name == 'future'``) or before it (any
    other ``name``, e.g. ``'past'``) are scanned while they belong to the same
    user and session.  Per clickout row the function records:

    * for the first ``num_steps_to_look`` clickouts met: availability, the
      clicked reference, step and timestamp differences, the clicked item's
      price, whether it equals the current reference, whether impressions
      were present, and the mean/std of the displayed prices;
    * whether a sort-order change was seen and its first reference;
    * image/info/deals/rating interaction counts for the current item and for
      items already recorded as clicked;
    * the total number of clickouts met and the number of poi searches.

    Parameters
    ----------
    data_click : pandas.DataFrame
        Rows to describe.  Columns read: ``reference``, ``all_row_id``,
        ``user_id``, ``session_id``, ``step``, ``timestamp``.
        ``all_row_id`` is used as a positional index into ``data``.
    data : pandas.DataFrame
        Full event log.  Columns read: ``user_id``, ``session_id``,
        ``reference``, ``action_type``, ``step``, ``timestamp``,
        ``impressions``, ``prices`` (the last two ``|``-separated strings).
    thread_id, num_cores : int
        Round-robin partition: this call handles rows whose position modulo
        ``num_cores`` equals ``thread_id``.
    name : str
        ``'future'`` scans forward; anything else scans backward.  Also used
        in the output column and file names.
    num_steps_to_look : int
        Number of neighbouring clickouts described in detail.

    Returns
    -------
    None.  The features are written to
    ``cache/sub_{name}_info_{thread_id}_multisteps.pkl`` (the ``cache``
    directory must already exist), indexed by the position in ``data_click``
    and carrying that position in a ``row_id`` column.
    """
    info = {}
    n_rows = data.shape[0]
    for i in range(data_click.shape[0]):
        if i % num_cores != thread_id:
            continue
        clickout_count = 0
        clickout_available = [False] * num_steps_to_look
        clickout_item = [0] * num_steps_to_look
        clickout_step_diff = [0] * num_steps_to_look
        clickout_timestamp_diff = [0] * num_steps_to_look
        clickout_price = [0] * num_steps_to_look
        clickout_is_same = [False] * num_steps_to_look
        clickout_impression_valid = [False] * num_steps_to_look
        clickout_price_mean = [0] * num_steps_to_look
        clickout_price_std = [0] * num_steps_to_look

        has_changed_sort = False
        sort_metric = "0"
        num_of_search_poi = 0
        # [image, info, deals, rating] counters, see _INTERACTION_SLOT.
        cur_counts = [0, 0, 0, 0]
        other_counts = [0, 0, 0, 0]
        # Integer ids of the clickouts stored so far in clickout_item.
        clicked_ids = set()

        row_i = data_click.iloc[i]
        cur_id = _to_item_id(row_i['reference'])
        all_row_id, user_id = row_i['all_row_id'], row_i['user_id']
        session_id, step = row_i['session_id'], row_i['step']
        if len(info) > 0 and len(info) % 3000 == 0 and thread_id == 0:
            print('thread 0 finished ', len(info))

        if name == 'future':
            j_range = range(all_row_id + 1, n_rows)
        else:
            # Clamp the stop so an oversized `step` can never produce negative
            # positions, which .iloc would silently wrap to the end of `data`.
            j_range = range(all_row_id - 1, max(all_row_id - step, -1), -1)

        for j in j_range:
            row_j = data.iloc[j]
            # Stop at the session boundary in both directions; the backward
            # scan previously relied on `step` alone to stay inside the session.
            if user_id != row_j['user_id'] or session_id != row_j['session_id']:
                break
            reference_j, action_j = row_j['reference'], row_j['action_type']

            if action_j == 'change of sort order' and not has_changed_sort:
                has_changed_sort = True
                sort_metric = reference_j
                continue
            if action_j == 'search for poi':
                num_of_search_poi += 1
                continue
            is_clickout = action_j.startswith('clickout')
            if not is_clickout and not action_j.startswith('interaction'):
                continue

            ref_id = _to_item_id(reference_j)
            slot = _INTERACTION_SLOT.get(action_j)
            if slot is not None and ref_id is not None and cur_id is not None:
                if ref_id == cur_id:
                    cur_counts[slot] += 1
                elif ref_id in clicked_ids:
                    # Null references stored earlier no longer abort this
                    # lookup (they used to raise inside a bare `except: pass`).
                    other_counts[slot] += 1

            if not is_clickout:
                continue
            # Only the neighbouring clickouts are described in detail.
            if clickout_count < num_steps_to_look:
                k = clickout_count
                clickout_available[k] = True
                clickout_item[k] = reference_j
                if ref_id is not None:
                    clicked_ids.add(ref_id)
                    if cur_id is not None:
                        clickout_is_same[k] = ref_id == cur_id
                clickout_step_diff[k] = row_j['step'] - step
                clickout_timestamp_diff[k] = row_j['timestamp'] - row_i['timestamp']
                impressions_j = row_j['impressions']
                # A clickout without impressions keeps its slot and is still
                # counted below; previously a `continue` here skipped the
                # counter so the next clickout overwrote this slot.
                if pd.notnull(impressions_j):
                    impressions = impressions_j.split("|")
                    prices = row_j['prices'].split("|")
                    price_values = list(map(float, prices))
                    clickout_impression_valid[k] = True
                    clickout_price_mean[k] = np.mean(price_values)
                    clickout_price_std[k] = np.std(price_values)
                    if ref_id is not None:
                        ref_key = str(ref_id)
                        if ref_key in impressions:
                            clickout_price[k] = int(prices[impressions.index(ref_key)])
            clickout_count += 1

        features = []
        for k in range(num_steps_to_look):
            features.extend([clickout_available[k], clickout_item[k], clickout_step_diff[k],
                             clickout_timestamp_diff[k], clickout_price[k], clickout_is_same[k],
                             clickout_impression_valid[k], clickout_price_mean[k],
                             clickout_price_std[k]])
        features.extend([has_changed_sort, sort_metric])
        features.extend(cur_counts)
        features.append(clickout_count)
        features.extend(other_counts)
        features.append(num_of_search_poi)
        info[i] = features

    cols = ["clickout_available", "clickout_item", "clickout_step_diff",
            "clickout_timestamp_diff", "clickout_price", 'clickout_is_same',
            'clickout_impression_valid', 'clickout_price_mean', 'clickout_price_std']
    df_cols = []
    for k in range(num_steps_to_look):
        df_cols.extend(["{}_{}_{}".format(name, c, k) for c in cols])
    summary_cols = ["has_changed_sort", "sort_metric", "cur_item_interaction_image_count",
                    "cur_item_interaction_info_count", "cur_item_interaction_deal_count",
                    "cur_item_interaction_rating_count", "clickout_count",
                    "other_clicked_item_interaction_image_count",
                    "other_clicked_item_interaction_info_count",
                    "other_clicked_item_interaction_deal_count",
                    "other_clicked_item_interaction_rating_count",
                    "num_of_search_poi"]
    df_cols.extend(["{}_{}".format(c, name) for c in summary_cols])
    if info:
        df = pd.DataFrame.from_dict(info, orient='index')
        df.columns = df_cols
    else:
        # A worker with no rows still writes a well-formed (empty) frame
        # instead of failing on the column assignment.
        df = pd.DataFrame(columns=df_cols)
    df['row_id'] = df.index
    df.to_pickle('cache/sub_{}_info_{}_multisteps.pkl'.format(name, thread_id))

In [5]:
%%time
# Read the train and test CSVs with cudf, tag each row with the file it came
# from (is_train = 1 / 0) and stack both into a single frame.
path = '/datasets/trivago/data'
train = gd.read_csv("%s/train.csv" % path)
train['is_train'] = 1
test = gd.read_csv("%s/test.csv" % path)
test['is_train'] = 0
data = gd.concat([train, test])
print('combined', data.shape)

combined (19715327, 13)
CPU times: user 2.77 s, sys: 1.54 s, total: 4.3 s
Wall time: 5.73 s


In [6]:
# Drop the per-file frames; only the combined `data` is used in the cells
# shown after this point.
del train, test

In [7]:
# Number the rows of the combined frame 0..n-1.  data_click carries a column
# of the same name that get_future_or_past_info uses as a positional index
# into `data` -- presumably the two are meant to agree; confirm.
data['all_row_id'] = np.arange(data.shape[0])
# Convert the cudf frame to pandas for the row-wise processing below.
# NOTE(review): this rebinding makes the cell non-idempotent -- a second run
# would call .to_pandas() on the pandas result; confirm before re-running.
data = data.to_pandas()

In [8]:
%%time
# Clickout rows cached as CSV (the file is produced outside the cells shown
# here).  get_future_or_past_info reads its reference, all_row_id, user_id,
# session_id, step and timestamp columns.
data_click = pd.read_csv('cache/data_clickout.csv')

CPU times: user 7.76 s, sys: 1.01 s, total: 8.78 s
Wall time: 8.8 s


In [9]:
# Expose the default integer index as a column; the feature frames built
# below carry a `row_id` holding the same positional number.
data_click['row_id'] = data_click.index

In [10]:
# Turn the 'unknown' placeholder into a real missing value so the
# pd.notnull / pd.isnull checks in the feature builder see it as missing.
# np.nan is passed explicitly: on pandas versions before 1.4,
# replace(scalar, None) ignores the None and forward-fills from the previous
# row instead of writing a null.  The result is assigned back rather than
# using inplace=True on a column selection, which is not guaranteed to
# modify the parent frame.
data_click['reference'] = data_click['reference'].replace('unknown', np.nan)
data['reference'] = data['reference'].replace('unknown', np.nan)

In [26]:
# Size of the clickout frame; the merged feature frames are compared against
# this row count in the "shape match" check below.
data_click.shape

(2115365, 16)

In [18]:
# Inspect the first rows of the combined event log (columns as consumed by
# get_future_or_past_info).
data.head(100)

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices,is_train,all_row_id
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,,1,0
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1,1
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1,2
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1,3
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1,4
5,00RL8Z82B2Z1,aff3928535f48,1541037532,6,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1,5
6,00RL8Z82B2Z1,aff3928535f48,1541037532,7,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1,6
7,00RL8Z82B2Z1,aff3928535f48,1541037532,8,interaction item image,666856,AU,"Sydney, Australia",mobile,,,,1,7
8,00RL8Z82B2Z1,aff3928535f48,1541037542,9,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1,8
9,00RL8Z82B2Z1,aff3928535f48,1541037542,10,interaction item image,109038,AU,"Sydney, Australia",mobile,,,,1,9


In [27]:
# Compute the backward- and forward-looking features in parallel, then merge
# the per-worker pickles into one CSV per direction.
num_cores = 50
path = 'cache'
for name in ['past', 'future']:
    proc_list = []
    for thread_id in range(num_cores):
        # Worker `thread_id` handles the data_click rows whose position modulo
        # num_cores equals thread_id; 2 = number of neighbouring clickouts.
        p = Process(target=get_future_or_past_info,
                    args=(data_click, data, thread_id, num_cores, name, 2))
        proc_list.append(p)
    for p in proc_list:
        p.start()
    for p in proc_list:
        p.join()

    # A worker that died leaves no file (or a stale one from an earlier run),
    # so fail loudly instead of merging incomplete results.
    failed = [idx for idx, proc in enumerate(proc_list) if proc.exitcode != 0]
    if failed:
        raise RuntimeError('{} workers failed: {}'.format(name, failed))

    # Read exactly the files written by this run (names mirror the worker's
    # to_pickle call) rather than everything in the cache matching a prefix,
    # which could pick up leftovers from a run with a different num_cores.
    files = ['sub_{}_info_{}_multisteps.pkl'.format(name, worker_id)
             for worker_id in range(num_cores)]
    df_list = []
    for f in files:
        df = pd.read_pickle(join(path, f))
        df_list.append(df)

    df = pd.concat(df_list)
    df = df.sort_values(by=['row_id'])
    print('shape match: ', df.shape[0] == data_click.shape[0])
    df.to_csv('cache/{}_info.csv'.format(name), index=False)

thread 0 finished  3000
thread 0 finished  6000
thread 0 finished  9000
thread 0 finished  12000
thread 0 finished  15000
thread 0 finished  18000
thread 0 finished  21000
thread 0 finished  24000
thread 0 finished  27000
thread 0 finished  30000
thread 0 finished  33000
thread 0 finished  36000
thread 0 finished  39000
thread 0 finished  42000
shape match:  True
thread 0 finished  3000
thread 0 finished  6000
thread 0 finished  9000
thread 0 finished  12000
thread 0 finished  15000
thread 0 finished  18000
thread 0 finished  21000
thread 0 finished  24000
thread 0 finished  27000
thread 0 finished  30000
thread 0 finished  33000
thread 0 finished  36000
thread 0 finished  39000
thread 0 finished  42000
shape match:  True


In [28]:
# Remove the per-worker pickles; their contents were merged into
# cache/past_info.csv and cache/future_info.csv by the previous cell.
! rm cache/sub*

In [29]:
# Reload the merged per-direction feature tables.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; columns such as sort_metric mix the "0"
# placeholder with text and otherwise come back with mixed Python types.
# NOTE(review): the recorded output of this cell looks like the tail of a
# DtypeWarning -- confirm the warning disappears with this option.
past_info = pd.read_csv('cache/past_info.csv', low_memory=False)
future_info = pd.read_csv('cache/future_info.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [30]:
# Expected width: 9 per-clickout features x 2 steps + 12 summary features +
# row_id = 31 columns.
future_info.shape

(2115365, 31)

In [31]:
# Join the backward- and forward-looking features on the clickout row id and
# persist them as a single context table.
data_context = past_info.merge(future_info, on='row_id', how='left')
# The file name has no placeholders, so the former `.format(name)` was a
# no-op whose only effect was to make this cell fail on a kernel where the
# worker loop (which defines `name`) had not been run.
data_context.to_csv('cache/context_info_multi_2_more.csv', index=False)

In [34]:
# The two inputs share only `row_id`, so the merged width is 30 + 30 + 1.
data_context.shape

(2115365, 61)

In [33]:
# List the merged feature names (past_* / *_past and future_* / *_future).
data_context.columns

Index(['past_clickout_available_0', 'past_clickout_item_0',
       'past_clickout_step_diff_0', 'past_clickout_timestamp_diff_0',
       'past_clickout_price_0', 'past_clickout_is_same_0',
       'past_clickout_impression_valid_0', 'past_clickout_price_mean_0',
       'past_clickout_price_std_0', 'past_clickout_available_1',
       'past_clickout_item_1', 'past_clickout_step_diff_1',
       'past_clickout_timestamp_diff_1', 'past_clickout_price_1',
       'past_clickout_is_same_1', 'past_clickout_impression_valid_1',
       'past_clickout_price_mean_1', 'past_clickout_price_std_1',
       'has_changed_sort_past', 'sort_metric_past',
       'cur_item_interaction_image_count_past',
       'cur_item_interaction_info_count_past',
       'cur_item_interaction_deal_count_past',
       'cur_item_interaction_rating_count_past', 'clickout_count_past',
       'other_clicked_item_interaction_image_count_past',
       'other_clicked_item_interaction_info_count_past',
       'other_clicked_item_int