In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive; the input and output folders used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

## Load the generated data

In [15]:
# Run mode switch:
#   True  -> build features for local cross-validation (reads otto-validation)
#   False -> build features for prediction (reads otto-origin)
valid_flag = True

# Root folders on the mounted Drive for the inputs and the generated feature files.
base_path = '/content/drive/MyDrive/input/otto'
output_path = '/content/drive/MyDrive/output/otto'

# Dataset folder selected by the run mode.
input_path = base_path + ('/otto-validation' if valid_flag else '/otto-origin')

In [16]:
# Integer codes that replace the event-type names in the `type` column.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_data(data_dir=None):
    """Read every parquet chunk of the dataset into one DataFrame.

    Chunks are discovered with the pattern ``{data_dir}/*_parquet/*`` and read
    in sorted path order: ``glob`` returns matches in arbitrary order, so
    without sorting the row order of the result could differ between runs.

    Args:
        data_dir: Folder that holds the ``*_parquet`` sub-folders. Defaults to
            the module-level ``input_path``.

    Returns:
        The concatenated chunks with a fresh 0..n-1 index. ``ts`` is divided
        by 1000 and stored as int32; ``type`` is replaced by its
        ``type_labels`` code and stored as int8.

    Raises:
        FileNotFoundError: If no file matches the chunk pattern.
    """
    if data_dir is None:
        data_dir = input_path

    chunk_files = sorted(glob.glob(f'{data_dir}/*_parquet/*'))
    if not chunk_files:
        raise FileNotFoundError(f'no parquet chunks found under {data_dir}/*_parquet/')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        # The rest of the notebook treats ts as seconds, so the raw values are
        # presumably milliseconds -- confirm against the dataset description.
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

all_df = load_data()

# Store the id columns as 32-bit integers to reduce memory use.
for id_column in ('session', 'aid'):
    all_df[id_column] = all_df[id_column].astype('int32')

print('All data has shape',all_df.shape)
# Span between the earliest and the latest event, divided down to the days printed below.
print('All:', (all_df['ts'].max() - all_df['ts'].min()) / 60 / 60 / 24, 'days')

All data has shape (171638757, 4)
All: 27.999953703703707 days


In [17]:
# Latest timestamp in the data; every look-back window is measured back from it.
all_period_ts_max = all_df['ts'].max()

# Window lengths in seconds.
ts_1day = 24 * 60 * 60
ts_1week = 7 * ts_1day
ts_2weeks = 2 * ts_1week
ts_4weeks = 4 * ts_1week

# Look-back windows, longest first (a 3-week window is currently not used).
# week_list holds the labels used in feature names, ts_list the matching lengths.
week_list = ['4weeks', '2weeks', '1week']
ts_list = [ts_4weeks, ts_2weeks, ts_1week]

In [18]:
# Preview the session id column (the output below confirms it is stored as int32).
all_df['session']

0                   0
1                   0
2                   0
3                   0
4                   0
               ...   
171638752    12539533
171638753    12539534
171638754    12539534
171638755    12539534
171638756    12539535
Name: session, Length: 171638757, dtype: int32

In [19]:
# One row per distinct id, in order of first appearance; the feature columns
# computed below are attached to these two frames.
aid_df = pd.DataFrame({'aid': all_df['aid'].unique()})
session_df = pd.DataFrame({'session': all_df['session'].unique()})
print('aid shape:', aid_df.shape, 'session shape:',session_df.shape)

aid shape: (1844284, 1) session shape: (12899779, 1)


In [20]:
def type_change(df, column_name, num, change_type):
    """Fill the gaps of one column and convert it to another dtype, in place.

    Args:
        df: DataFrame to modify.
        column_name: Name of the column to rewrite.
        num: Value substituted for missing entries before the conversion.
        change_type: Target dtype handed to ``astype``.

    Returns:
        The same ``df`` object, with ``column_name`` replaced.
    """
    filled = df[column_name].fillna(num)
    df[column_name] = filled.astype(change_type)
    return df

## Add aid features

In [21]:
# Only the Ntop_all most frequent aids of each event type receive a rank;
# every other aid gets -1.
Ntop_all = 1000000

# Boolean filtering always builds a new frame, so all_df itself is never
# modified and does not have to be copied first.
df = all_df

for period, window_sec in zip(week_list, ts_list):
    print('i=',period,'*******')
    # Windows are visited longest first, so each pass only narrows the
    # previous selection down to the events inside the current window.
    df = df[all_period_ts_max - df['ts'] < window_sec]
    print(df.shape)

    # Events per aid for each event type, most frequent first. Computed once
    # and shared by the count features and the rank features.
    counts_by_type = {
        type_name: df.loc[df['type'] == type_id, 'aid'].value_counts()
        for type_name, type_id in type_labels.items()
    }

    # Count features: aids without such an event in the window get 0.
    # Every count is stored as int32, because an int16 column would silently
    # wrap around for any aid with more than 32767 events.
    for type_name, counts in counts_by_type.items():
        aid_df[f'{type_name}_count_{period}'] = aid_df['aid'].map(counts).fillna(0).astype('int32')

    # Rank features: 0 is the most frequent aid of the type, -1 marks aids
    # outside the top Ntop_all (or without any such event in the window).
    for type_name, counts in counts_by_type.items():
        top_aids = counts.index[:Ntop_all]
        rank_by_aid = pd.Series(np.arange(len(top_aids)), index=top_aids)
        aid_df[f'{type_name}_rank_{period}'] = aid_df['aid'].map(rank_by_aid).fillna(-1).astype('int32')

    # Release the per-window intermediates before the next pass.
    del counts_by_type, counts, top_aids, rank_by_aid
    gc.collect()

i= 4weeks *******
(171638757, 4)
i= 2weeks *******
(63090916, 4)
i= 1week *******
(7683780, 4)


In [22]:
# Share of each aid's events that fall into the final week, relative to the
# 2-week and the 4-week window (the 3-week window is not computed).
for action in ('clicks', 'carts', 'orders'):
    last_week = aid_df[f'{action}_count_1week']
    for weeks in (2, 4):
        # The small constant keeps the division defined when the longer window has no events.
        ratio = last_week / (aid_df[f'{action}_count_{weeks}weeks'] + 0.000001)
        aid_df[f'aid_{action}_count_rate_1_{weeks}'] = ratio.astype('float32')

In [23]:
# Inspect the finished aid feature table (counts, ranks and count rates per window).
aid_df

Unnamed: 0,aid,clicks_count_4weeks,carts_count_4weeks,orders_count_4weeks,clicks_rank_4weeks,carts_rank_4weeks,orders_rank_4weeks,clicks_count_2weeks,carts_count_2weeks,orders_count_2weeks,...,orders_count_1week,clicks_rank_1week,carts_rank_1week,orders_rank_1week,aid_clicks_count_rate_1_2,aid_clicks_count_rate_1_4,aid_carts_count_rate_1_2,aid_carts_count_rate_1_4,aid_orders_count_rate_1_2,aid_orders_count_rate_1_4
0,1517085,85,13,3,257959,173381,194587,20,2,0,...,0,200249,223414,-1,0.300000,0.070588,0.500000,0.076923,0.000000,0.000000
1,1563459,74,0,0,286867,-1,-1,26,0,0,...,0,113772,-1,-1,0.384615,0.135135,0.000000,0.000000,0.000000,0.000000
2,1309446,4263,505,89,3123,1943,5458,1572,166,37,...,2,4984,6948,9149,0.094148,0.034717,0.072289,0.023762,0.054054,0.022472
3,16246,1095,120,39,21036,15821,15985,455,55,21,...,1,15703,54304,25534,0.136264,0.056621,0.054545,0.025000,0.047619,0.025641
4,1781822,42,4,1,443825,453860,372874,16,3,1,...,0,371586,144904,-1,0.187500,0.071429,0.333333,0.250000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1844279,1537666,2,1,0,-1,772692,-1,2,1,0,...,0,417326,103056,-1,1.000000,1.000000,0.999999,0.999999,0.000000,0.000000
1844280,537178,1,0,0,-1,-1,-1,1,0,0,...,0,559600,-1,-1,0.999999,0.999999,0.000000,0.000000,0.000000,0.000000
1844281,508024,1,0,0,-1,-1,-1,1,0,0,...,0,559601,-1,-1,0.999999,0.999999,0.000000,0.000000,0.000000,0.000000
1844282,123379,4,2,1,-1,590196,343578,4,2,1,...,1,276013,94874,21332,1.000000,1.000000,1.000000,1.000000,0.999999,0.999999


## Save aid features

In [24]:
# Write the aid feature table to the file that matches the run mode
# (valid_* when valid_flag is set, test_* otherwise).
aid_features_file = (
    f'{output_path}/valid_aid_features.parquet'
    if valid_flag
    else f'{output_path}/test_aid_features.parquet'
)
aid_df.to_parquet(aid_features_file)

# The table is not used again in this notebook, so drop it from memory.
del aid_df
gc.collect()

0

## Add session features

In [None]:
# Boolean filtering always builds a new frame, so all_df itself is never
# modified and does not have to be copied first.
df = all_df

for period, window_sec in zip(week_list, ts_list):
    print('i=',period,'*******')
    # Windows are visited longest first, so each pass only narrows the
    # previous selection down to the events inside the current window.
    df = df[all_period_ts_max - df['ts'] < window_sec]
    print(df.shape)

    # Number of events per session over all event types; sessions without
    # events in the window get 0.
    action_col = f'session_action_count_{period}'
    tmp = df.groupby('session').size()
    session_df[action_col] = session_df['session'].map(tmp)
    session_df = type_change(session_df, action_col, 0, 'int16')

    # The same count restricted to one event type at a time
    # (type codes: 0 = clicks, 1 = carts, 2 = orders).
    for type_name, type_id in (('click', 0), ('cart', 1), ('order', 2)):
        count_col = f'session_{type_name}_count_{period}'
        tmp = df[df['type'] == type_id].groupby('session').size()
        session_df[count_col] = session_df['session'].map(tmp)
        session_df = type_change(session_df, count_col, 0, 'int16')

    # Mean of the event-type codes per session; -1 marks sessions without
    # events in the window. Selecting `type` before aggregating avoids
    # averaging the other columns only to throw the result away.
    mean_col = f'session_type_mean_{period}'
    tmp = df.groupby('session')['type'].mean()
    session_df[mean_col] = session_df['session'].map(tmp)
    session_df = type_change(session_df, mean_col, -1, 'float32')

    # Share of each event type among the session's events. The small constant
    # keeps the division defined for sessions without events in the window.
    for type_name in ('click', 'cart', 'order'):
        share = session_df[f'session_{type_name}_count_{period}'] / (session_df[action_col] + 0.000001)
        session_df[f'session_{type_name}_rate_{period}'] = share.astype('float32')

del tmp
gc.collect()


i= 4weeks *******
(171638757, 4)
i= 2weeks *******
(63090916, 4)
i= 1week *******
(7683780, 4)


15

In [None]:
# Inspect the finished session feature table (counts, type mean and rates per window).
session_df

Unnamed: 0,session,session_action_count_4weeks,session_click_count_4weeks,session_cart_count_4weeks,session_order_count_4weeks,session_type_mean_4weeks,session_click_rate_4weeks,session_cart_rate_4weeks,session_order_rate_4weeks,session_action_count_2weeks,...,session_cart_rate_2weeks,session_order_rate_2weeks,session_action_count_1week,session_click_count_1week,session_cart_count_1week,session_order_count_1week,session_type_mean_1week,session_click_rate_1week,session_cart_rate_1week,session_order_rate_1week
0,0,147,142,3,2,0.047619,0.965986,0.020408,0.013605,61,...,0.00,0.0,0,0,0,0,-1.0,0.000000,0.0,0.0
1,1,27,19,8,0,0.296296,0.703704,0.296296,0.000000,1,...,0.00,0.0,0,0,0,0,-1.0,0.000000,0.0,0.0
2,2,13,13,0,0,0.000000,1.000000,0.000000,0.000000,4,...,0.00,0.0,0,0,0,0,-1.0,0.000000,0.0,0.0
3,3,226,200,21,5,0.137168,0.884956,0.092920,0.022124,20,...,0.15,0.0,0,0,0,0,-1.0,0.000000,0.0,0.0
4,4,3,2,0,1,0.666667,0.666666,0.000000,0.333333,0,...,0.00,0.0,0,0,0,0,-1.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12899774,12539531,3,3,0,0,0.000000,1.000000,0.000000,0.000000,3,...,0.00,0.0,3,3,0,0,0.0,1.000000,0.0,0.0
12899775,12539532,6,6,0,0,0.000000,1.000000,0.000000,0.000000,6,...,0.00,0.0,6,6,0,0,0.0,1.000000,0.0,0.0
12899776,12539533,1,1,0,0,0.000000,0.999999,0.000000,0.000000,1,...,0.00,0.0,1,1,0,0,0.0,0.999999,0.0,0.0
12899777,12539534,3,3,0,0,0.000000,1.000000,0.000000,0.000000,3,...,0.00,0.0,3,3,0,0,0.0,1.000000,0.0,0.0


## Save session features

In [None]:
# Write the session feature table to the file that matches the run mode
# (valid_* when valid_flag is set, test_* otherwise).
session_features_file = (
    f'{output_path}/valid_session_features.parquet'
    if valid_flag
    else f'{output_path}/test_session_features.parquet'
)
session_df.to_parquet(session_features_file)

In [None]:
# Sanity check: average number of events per session inside the 4-week window.
session_df['session_action_count_4weeks'].mean()

13.305557947930735

In [None]:
# Sanity check: average number of events per session inside the 2-week window.
session_df['session_action_count_2weeks'].mean()

4.890852471193499

In [None]:
# Sanity check: average number of events per session inside the 1-week window
# (sessions without events in that window count as 0).
session_df['session_action_count_1week'].mean()

0.5956520650470059