In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

## 生成データの読み込み

In [9]:
# trueならlocal cv用, falseならpred用
#valid_flag = True
valid_flag = False

base_path = '/content/drive/MyDrive/input/otto'
output_path = '/content/drive/MyDrive/output/otto'
if valid_flag:
  input_path = base_path + '/otto-validation'
else:
  input_path = base_path + '/otto-origin'

In [10]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_data():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(f'{input_path}/*_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

all_df = load_data()
# メモリ削減
all_df['session'] = all_df['session'].astype('int32')
all_df['aid'] = all_df['aid'].astype('int32')

print('All data has shape',all_df.shape)
all_df.head()
print('All:', (all_df['ts'].max() - all_df['ts'].min()) / 60 / 60 / 24, 'days')

All data has shape (223644219, 4)
All: 34.99989583333333 days


In [13]:
all_period_ts_max = all_df['ts'].max()
ts_1day = 60 * 60 * 24
ts_1week = ts_1day * 7
ts_2weeks = ts_1day * 7 * 2
#ts_3weeks = ts_1day * 7 * 3
ts_4weeks = ts_1day * 7 * 4
#week_list = ['4weeks', '3weeks', '2weeks', '1week']
#ts_list = [ts_4weeks, ts_3weeks, ts_2weeks, ts_1week]
week_list = ['4weeks', '2weeks', '1week']
ts_list = [ts_4weeks, ts_2weeks, ts_1week]

In [15]:
all_df['ts'].max()

1662328791

In [None]:
aid_df = pd.DataFrame(all_df['aid'].unique(), columns=["aid"])
#session_df = pd.DataFrame(all_df['session'].unique(), columns=["session"])
#print('aid shape:', aid_df.shape, 'session shape:',session_df.shape)
print('aid shape:', aid_df.shape)

aid shape: (1844284, 1)


In [None]:
def type_change(df, column_name, num, change_type):
    df[column_name] = df[column_name].fillna(num).astype(change_type)
    return df

In [None]:
all_df.head()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800,0
1,0,1563459,1659304904,0
2,0,1309446,1659367439,0
3,0,16246,1659367719,0
4,0,1781822,1659367871,0


## Add aid features

In [None]:
Ntop_all = 1000000

df = all_df.copy()

for i, t in zip(week_list, ts_list):
    print('i=',i,'*******')
    # 長い期間から順に削っていく
    df = df[all_period_ts_max - df['ts'] < t]
    print(df.shape)
    print('get aid feature by count....')
    # aidごとのclick, cart, order count
    top_clicks_num_df = df.loc[df['type']== 0,'aid'].value_counts()
    top_carts_num_df = df.loc[df['type']== 1,'aid'].value_counts()
    top_orders_num_df = df.loc[df['type']== 2,'aid'].value_counts()

    top_clicks_num_df = pd.DataFrame({'aid': top_clicks_num_df.index, f'aid_clicks_count_{i}': top_clicks_num_df.values})
    top_carts_num_df = pd.DataFrame({'aid': top_carts_num_df.index, f'aid_carts_count_{i}': top_carts_num_df.values})
    top_orders_num_df = pd.DataFrame({'aid': top_orders_num_df.index, f'aid_orders_count_{i}': top_orders_num_df.values})
    
    top_counts_df = top_clicks_num_df.merge(top_carts_num_df, how = 'outer', on = 'aid').merge(top_orders_num_df, how = 'outer', on = 'aid').fillna(0)
    del top_clicks_num_df, top_carts_num_df, top_orders_num_df
    gc.collect()

    top_counts_df['aid'] = top_counts_df['aid'].astype('int32')
    aid_df = aid_df.merge(top_counts_df, how = 'left', on = ['aid'])
    del top_counts_df
    gc.collect()

    aid_df[f'aid_clicks_count_{i}'] = aid_df[f'aid_clicks_count_{i}'].fillna(0).astype('int32')
    aid_df[f'aid_carts_count_{i}'] = aid_df[f'aid_carts_count_{i}'].fillna(0).astype('int16')
    aid_df[f'aid_orders_count_{i}'] = aid_df[f'aid_orders_count_{i}'].fillna(0).astype('int16')
    aid_df[f'aid_total_count_{i}'] = (aid_df[f'aid_clicks_count_{i}'] + aid_df[f'aid_carts_count_{i}'] + aid_df[f'aid_orders_count_{i}']).astype('int32')
    
    print('get aid feature by uu....')
    # aidごとのunique user (uu) count
    total_uu = df.groupby('aid')['session'].nunique()
    clicks_uu = df[df['type']== 0].groupby('aid')['session'].nunique()
    carts_uu = df[df['type']== 1].groupby('aid')['session'].nunique()
    orders_uu =df[df['type']== 2].groupby('aid')['session'].nunique()

    total_uu = pd.DataFrame({'aid': total_uu.index, f'aid_total_uu_{i}': total_uu.values})
    clicks_uu = pd.DataFrame({'aid': clicks_uu.index, f'aid_clicks_uu_{i}': clicks_uu.values})
    carts_uu = pd.DataFrame({'aid': carts_uu.index, f'aid_carts_uu_{i}': carts_uu.values})
    orders_uu = pd.DataFrame({'aid': orders_uu.index, f'aid_orders_uu_{i}': orders_uu.values})

    aid_uu_df = total_uu.merge(clicks_uu, how = 'outer', on = 'aid').merge(carts_uu, how = 'outer', on = 'aid').merge(orders_uu, how = 'outer', on = 'aid').fillna(0)
    aid_uu_df['aid'] = aid_uu_df['aid'].astype('int32')
    aid_df = aid_df.merge(aid_uu_df, how = 'left', on = ['aid'])
    del total_uu, clicks_uu, carts_uu, orders_uu
    gc.collect()

    aid_df[f'aid_total_uu_{i}'] = aid_df[f'aid_total_uu_{i}'].fillna(0).astype('int32')
    aid_df[f'aid_clicks_uu_{i}'] = aid_df[f'aid_clicks_uu_{i}'].fillna(0).astype('int32')
    aid_df[f'aid_carts_uu_{i}'] = aid_df[f'aid_carts_uu_{i}'].fillna(0).astype('int32')
    aid_df[f'aid_orders_uu_{i}'] = aid_df[f'aid_orders_uu_{i}'].fillna(0).astype('int32')
    
    # uu/(action count) ratio, この値が小さいほど一部のuserにactionが集中している
    aid_df[f'aid_total_uu_action_ratio_{i}'] = aid_df[f'aid_total_uu_{i}'] / (aid_df[f'aid_total_count_{i}'] + 0.000001)
    aid_df[f'aid_clicks_uu_action_ratio_{i}'] = aid_df[f'aid_clicks_uu_{i}'] / (aid_df[f'aid_clicks_count_{i}'] + 0.000001)
    aid_df[f'aid_carts_uu_action_ratio_{i}'] = aid_df[f'aid_carts_uu_{i}'] / (aid_df[f'aid_carts_count_{i}'] + 0.000001)
    aid_df[f'aid_orders_uu_action_ratio_{i}'] = aid_df[f'aid_orders_uu_{i}'] / (aid_df[f'aid_orders_count_{i}'] + 0.000001)

    aid_df[f'aid_total_uu_action_ratio_{i}']  = aid_df[f'aid_total_uu_action_ratio_{i}'].astype('float32')  
    aid_df[f'aid_clicks_uu_action_ratio_{i}'] = aid_df[f'aid_clicks_uu_action_ratio_{i}'].astype('float32') 
    aid_df[f'aid_carts_uu_action_ratio_{i}']  = aid_df[f'aid_carts_uu_action_ratio_{i}'].astype('float32')
    aid_df[f'aid_orders_uu_action_ratio_{i}'] = aid_df[f'aid_orders_uu_action_ratio_{i}'].astype('float32')
    
    print('get aid feature by ranking....')
    # rankの計算
    top_clicks = df.loc[df['type']== 0,'aid'].value_counts().index.values[:Ntop_all] 
    top_carts = df.loc[df['type']== 1,'aid'].value_counts().index.values[:Ntop_all]
    top_orders = df.loc[df['type']== 2,'aid'].value_counts().index.values[:Ntop_all]

    dic_clicks = dict()
    dic_carts = dict()
    dic_orders = dict()

    for j, v in enumerate(top_clicks):
        dic_clicks[v] = j
    for j, v in enumerate(top_carts):
        dic_carts[v] = j
    for j, v in enumerate(top_orders):
        dic_orders[v] = j
    del top_clicks, top_carts, top_orders
    gc.collect()

    # click, carts, order ranking
    aid_df[f'aid_clicks_rank_{i}'] = aid_df['aid'].apply(lambda x: dic_clicks[x] if x in dic_clicks else -1).astype('int32')
    aid_df[f'aid_carts_rank_{i}'] = aid_df['aid'].apply(lambda x: dic_carts[x] if x in dic_carts else -1).astype('int32')
    aid_df[f'aid_orders_rank_{i}'] = aid_df['aid'].apply(lambda x: dic_orders[x] if x in dic_orders else -1).astype('int32')
    del dic_clicks, dic_carts, dic_orders
    gc.collect()

    # aidごとの平均session action/click/cart/order count
    # ここでのsession featureはaid素性として使う用
    print('get aid feature by session info....')

    # total action
    tmp_session_df = df.groupby('session').size().reset_index(name=f'tmp_session_action_count_{i}')
    tmp_session_df = type_change(tmp_session_df, f'tmp_session_action_count_{i}', 0, 'int16')
    # click
    tmp = df.query('type==0').groupby('session').size().reset_index(name=f'tmp_session_click_count_{i}')
    tmp_session_df = tmp_session_df.merge(tmp, how = 'left', on = 'session')
    tmp_session_df = type_change(tmp_session_df, f'tmp_session_click_count_{i}', 0, 'int16')
    # cart
    tmp = df.query('type==1').groupby('session').size().reset_index(name=f'tmp_session_cart_count_{i}')
    tmp_session_df = tmp_session_df.merge(tmp, how = 'left', on = 'session')
    tmp_session_df = type_change(tmp_session_df, f'tmp_session_cart_count_{i}', 0, 'int16')
    # order
    tmp = df.query('type==2').groupby('session').size().reset_index(name=f'tmp_session_order_count_{i}')
    tmp_session_df = tmp_session_df.merge(tmp, how = 'left', on = 'session')
    tmp_session_df = type_change(tmp_session_df, f'tmp_session_order_count_{i}', 0, 'int16')
    # type_mean
    tmp = df.groupby('session').mean()['type'].reset_index(name=f'tmp_session_type_mean_{i}')
    tmp_session_df = tmp_session_df.merge(tmp, how = 'left', on = 'session')
    tmp_session_df = type_change(tmp_session_df, f'tmp_session_type_mean_{i}', -1, 'float32')

    df = df.merge(tmp_session_df, how = 'left', on = 'session')
    del tmp, tmp_session_df
    gc.collect()

    aid_df[f'aid_mean_session_action_count_{i}'] = df.groupby('aid')[f'tmp_session_action_count_{i}'].mean()
    aid_df[f'aid_mean_session_click_count_{i}'] = df.groupby('aid')[f'tmp_session_click_count_{i}'].mean()
    aid_df[f'aid_mean_session_cart_count_{i}'] = df.groupby('aid')[f'tmp_session_cart_count_{i}'].mean()
    aid_df[f'aid_mean_session_order_count_{i}'] = df.groupby('aid')[f'tmp_session_order_count_{i}'].mean()
    aid_df[f'aid_mean_session_type_mean_{i}'] = df.groupby('aid')[f'tmp_session_type_mean_{i}'].mean()

    aid_df = type_change(aid_df, f'aid_mean_session_action_count_{i}', 0, 'float32')
    aid_df = type_change(aid_df, f'aid_mean_session_click_count_{i}', 0, 'float32')
    aid_df = type_change(aid_df, f'aid_mean_session_cart_count_{i}', 0, 'float32')
    aid_df = type_change(aid_df, f'aid_mean_session_order_count_{i}', 0, 'float32')
    aid_df = type_change(aid_df, f'aid_mean_session_type_mean_{i}', 0, 'float32')


i= 4weeks *******
(170483914, 4)
get aid feature by count....
get aid feature by uu....
get aid feature by ranking....
get aid feature by session info....
i= 2weeks *******
(59689604, 9)
get aid feature by count....
get aid feature by uu....
get aid feature by ranking....
get aid feature by session info....
i= 1week *******
(6928666, 14)
get aid feature by count....
get aid feature by uu....
get aid feature by ranking....
get aid feature by session info....


In [None]:
# weekごとの比のfeature
for i in ['clicks', 'carts', 'orders']:
    for j in [2,4]:
        aid_df[f'aid_{i}_count_rate_1_{j}'] = aid_df[f'aid_{i}_count_1week'] / (aid_df[f'aid_{i}_count_{j}weeks'] + 0.000001)
        aid_df[f'aid_{i}_count_rate_1_{j}'] = aid_df[f'aid_{i}_count_rate_1_{j}'].astype('float32')

        aid_df[f'aid_{i}_uu_rate_1_{j}'] = aid_df[f'aid_{i}_uu_1week'] / (aid_df[f'aid_{i}_uu_{j}weeks'] + 0.000001)
        aid_df[f'aid_{i}_uu_rate_1_{j}'] = aid_df[f'aid_{i}_uu_rate_1_{j}'].astype('float32')
        # i loopの一回だけ実行
        if i == 'clicks':
            aid_df[f'aid_total_uu_rate_1_{j}'] = aid_df[f'aid_total_uu_1week'] / (aid_df[f'aid_total_uu_{j}weeks'] + 0.000001)
            aid_df[f'aid_total_uu_rate_1_{j}'] = aid_df[f'aid_total_uu_rate_1_{j}'].astype('float32')

In [None]:
pd.set_option('display.max_columns', 100)
aid_df

Unnamed: 0,aid,aid_clicks_count_4weeks,aid_carts_count_4weeks,aid_orders_count_4weeks,aid_total_count_4weeks,aid_total_uu_4weeks,aid_clicks_uu_4weeks,aid_carts_uu_4weeks,aid_orders_uu_4weeks,aid_total_uu_action_ratio_4weeks,aid_clicks_uu_action_ratio_4weeks,aid_carts_uu_action_ratio_4weeks,aid_orders_uu_action_ratio_4weeks,aid_clicks_rank_4weeks,aid_carts_rank_4weeks,aid_orders_rank_4weeks,aid_mean_session_action_count_4weeks,aid_mean_session_click_count_4weeks,aid_mean_session_cart_count_4weeks,aid_mean_session_order_count_4weeks,aid_mean_session_type_mean_4weeks,aid_clicks_count_2weeks,aid_carts_count_2weeks,aid_orders_count_2weeks,aid_total_count_2weeks,aid_total_uu_2weeks,aid_clicks_uu_2weeks,aid_carts_uu_2weeks,aid_orders_uu_2weeks,aid_total_uu_action_ratio_2weeks,aid_clicks_uu_action_ratio_2weeks,aid_carts_uu_action_ratio_2weeks,aid_orders_uu_action_ratio_2weeks,aid_clicks_rank_2weeks,aid_carts_rank_2weeks,aid_orders_rank_2weeks,aid_mean_session_action_count_2weeks,aid_mean_session_click_count_2weeks,aid_mean_session_cart_count_2weeks,aid_mean_session_order_count_2weeks,aid_mean_session_type_mean_2weeks,aid_clicks_count_1week,aid_carts_count_1week,aid_orders_count_1week,aid_total_count_1week,aid_total_uu_1week,aid_clicks_uu_1week,aid_carts_uu_1week,aid_orders_uu_1week,aid_total_uu_action_ratio_1week,aid_clicks_uu_action_ratio_1week,aid_carts_uu_action_ratio_1week,aid_orders_uu_action_ratio_1week,aid_clicks_rank_1week,aid_carts_rank_1week,aid_orders_rank_1week,aid_mean_session_action_count_1week,aid_mean_session_click_count_1week,aid_mean_session_cart_count_1week,aid_mean_session_order_count_1week,aid_mean_session_type_mean_1week,aid_clicks_count_rate_1_2,aid_clicks_uu_rate_1_2,aid_total_uu_rate_1_2,aid_clicks_count_rate_1_4,aid_clicks_uu_rate_1_4,aid_total_uu_rate_1_4,aid_carts_count_rate_1_2,aid_carts_uu_rate_1_2,aid_carts_count_rate_1_4,aid_carts_uu_rate_1_4,aid_orders_count_rate_1_2,aid_orders_uu_rate_1_2,aid_orders_count_rate_1_4,aid_orders_uu_rate_1_4
0,1517085,73,10,2,85,49,49,8,1,0.576471,0.671233,0.800000,0.500000,292881,209861,286255,74.783783,66.162163,6.081081,2.540540,0.161161,32,2,0,34,25,25,2,0,0.735294,0.781250,1.000000,0.000000,244739,308739,-1,63.866665,56.066666,5.933333,1.866667,0.159817,6,1,0,7,6,6,1,0,0.857143,1.000000,0.999999,0.000000,171362,183024,-1,7.250000,7.000000,0.250000,0.000000,0.050000,0.187500,0.240000,0.240000,0.082192,0.122449,0.122449,0.500000,0.500000,0.100000,0.125000,0.000000,0.00,0.000000,0.000000
1,1563459,58,0,0,58,54,54,0,0,0.931034,0.931034,0.000000,0.000000,349800,-1,-1,70.354836,65.741936,4.064516,0.548387,0.061483,21,0,0,21,17,17,0,0,0.809524,0.809524,0.000000,0.000000,338973,-1,-1,3.000000,3.000000,0.000000,0.000000,0.000000,1,0,0,1,1,1,0,0,0.999999,0.999999,0.000000,0.000000,665418,-1,-1,0.000000,0.000000,0.000000,0.000000,0.000000,0.047619,0.058824,0.058824,0.017241,0.018519,0.018519,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
2,1309446,3719,393,79,4191,2136,2131,311,70,0.509664,0.573003,0.791349,0.886076,3806,2779,6235,31.687500,29.562500,1.937500,0.187500,0.119104,1219,112,22,1353,788,787,101,20,0.582409,0.645611,0.901786,0.909091,4319,4061,8214,21.200001,19.799999,1.000000,0.400000,0.290000,216,20,1,237,163,163,19,1,0.687764,0.754630,0.950000,0.999999,2626,2746,21226,0.000000,0.000000,0.000000,0.000000,0.000000,0.177194,0.207116,0.206853,0.058080,0.076490,0.076311,0.178571,0.188119,0.050891,0.061093,0.045455,0.05,0.012658,0.014286
3,16246,1150,110,35,1295,651,650,88,35,0.502703,0.565217,0.800000,1.000000,19438,17584,18379,37.984318,34.801605,2.551058,0.631656,0.103055,427,25,6,458,235,235,22,6,0.513100,0.550351,0.880000,1.000000,18105,31549,38643,22.883825,20.607830,1.887035,0.388960,0.119659,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,-1,-1,-1,8.054456,7.183168,0.742574,0.128713,0.122097,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
4,1781822,30,3,1,34,25,25,3,1,0.735294,0.833333,1.000000,0.999999,565281,477826,374295,99.169594,91.339180,7.222222,0.608187,0.062201,9,1,0,10,8,8,1,0,0.800000,0.888889,0.999999,0.000000,617910,541929,-1,53.246914,49.049381,4.148148,0.049383,0.065673,2,0,0,2,2,2,0,0,1.000000,1.000000,0.000000,0.000000,425805,-1,-1,18.375000,17.750000,0.625000,0.000000,0.029762,0.222222,0.250000,0.250000,0.066667,0.080000,0.080000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1855598,1084367,5,1,0,6,1,1,1,0,0.166667,0.200000,0.999999,0.000000,-1,806179,-1,48.333332,44.333332,2.166667,1.833333,0.078344,5,1,0,6,1,1,1,0,0.166667,0.200000,0.999999,0.000000,862227,492452,-1,6.000000,6.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,-1,-1,-1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
1855599,1193701,9,0,0,9,1,1,0,0,0.111111,0.111111,0.000000,0.000000,-1,-1,-1,104.090912,100.818184,2.454545,0.818182,0.082022,9,0,0,9,1,1,0,0,0.111111,0.111111,0.000000,0.000000,624947,-1,-1,78.599998,75.800003,2.000000,0.800000,0.093197,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,-1,-1,-1,2.000000,2.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
1855600,276646,5,0,0,5,1,1,0,0,0.200000,0.200000,0.000000,0.000000,-1,-1,-1,88.126762,80.028168,6.056338,2.042253,0.124816,5,0,0,5,1,1,0,0,0.200000,0.200000,0.000000,0.000000,863774,-1,-1,28.434782,23.695652,3.565217,1.173913,0.176429,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,-1,-1,-1,7.000000,6.000000,1.000000,0.000000,0.142857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000
1855601,1832342,6,0,0,6,1,1,0,0,0.166667,0.166667,0.000000,0.000000,-1,-1,-1,105.021736,94.152176,8.282609,2.586957,0.149751,6,0,0,6,1,1,0,0,0.166667,0.166667,0.000000,0.000000,806460,-1,-1,0.000000,0.000000,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0.000000,0.000000,0.000000,0.000000,-1,-1,-1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00,0.000000,0.000000


## Save aid features

In [None]:
if valid_flag:
    aid_df.to_parquet(f'{output_path}/valid_aid_features.parquet')
else:
    aid_df.to_parquet(f'{output_path}/test_aid_features.parquet')

## Add session features

In [None]:
df = all_df.copy()
#del all_df
#gc.collect()

# session情報として使うのは最後の1週間のみ
i = week_list[-1]
t = ts_list[-1]

df = df[all_period_ts_max - df['ts'] < t]
df['ts_diff'] = all_period_ts_max - df['ts'] 
print(df.shape)

# sessionごとにtype別、全actions数を求める
# total action
session_df = df.groupby('session').size().reset_index(name='session_action_count')
session_df['session'] = session_df['session'].astype('int32')
session_df = type_change(session_df, 'session_action_count', 0, 'int16')
'''
# click
tmp = df.query('type==0').groupby('session').size().reset_index(name='session_click_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_click_count', 0, 'int16')
# cart
tmp = df.query('type==1').groupby('session').size().reset_index(name='session_cart_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_cart_count', 0, 'int16')
# order
tmp = df.query('type==2').groupby('session').size().reset_index(name='session_order_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_order_count', 0, 'int16')
# type_mean
tmp = df.groupby('session').mean()['type'].reset_index(name='session_type_mean')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_type_mean', -1, 'float32')
# last action type
tmp = df.groupby('session').last()['type'].reset_index(name='session_last_type')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_type_mean', -1, 'int8') # 実際には欠損値ないはず

# time diff, 直近のtsから遡る (diff)
tmp = df.groupby('session').max()['ts_diff'].reset_index(name='session_first_action_ts_diff')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_first_action_ts_diff', -1, 'int32') # 実際には欠損値ないはず

tmp = df.groupby('session').min()['ts_diff'].reset_index(name='session_last_action_ts_diff')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_last_action_ts_diff', -1, 'int32') # 実際には欠損値ないはず

session_df['session_ts_period'] = session_df['session_first_action_ts_diff'] - session_df['session_last_action_ts_diff']

tmp = df.groupby('session').mean()['ts_diff'].reset_index(name='session_mean_action_ts_diff')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_mean_action_ts_diff', -1, 'float32') # 実際には欠損値ないはず


# sessionごとにuniqueなaidのtotal/click/cart/order のaction数
# unique aid click
tmp = df.groupby('session')['aid'].nunique().reset_index(name='session_unique_aid_action_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_unique_aid_action_count', 0, 'int16')
# unique aid click
tmp = df[df['type']== 0].groupby('session')['aid'].nunique().reset_index(name='session_unique_aid_click_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_unique_aid_click_count', 0, 'int16')
# unique aid cart
tmp = df[df['type']== 1].groupby('session')['aid'].nunique().reset_index(name='session_unique_aid_cart_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_unique_aid_cart_count', 0, 'int16')
# unique aid order
tmp = df[df['type']== 2].groupby('session')['aid'].nunique().reset_index(name='session_unique_aid_order_count')
session_df = session_df.merge(tmp, how = 'left', on = 'session')
session_df = type_change(session_df, 'session_unique_aid_order_count', 0, 'int16')

# count ratio, (click/cart/order) / total action count
session_df['session_click_rate'] = session_df['session_click_count'] / (session_df['session_action_count'] + 0.000001)
session_df['session_cart_rate'] = session_df['session_cart_count'] / (session_df['session_action_count'] + 0.000001)
session_df['session_order_rate'] = session_df['session_order_count'] / (session_df['session_action_count'] + 0.000001)
    
session_df['session_click_rate'] = session_df['session_click_rate'].astype('float32')
session_df['session_cart_rate'] = session_df['session_cart_rate'].astype('float32')
session_df['session_order_rate'] = session_df['session_order_rate'].astype('float32')

# unique count ratio, (click/cart/order) / total action count
session_df['session_unique_aid_click_rate'] = session_df['session_unique_aid_click_count'] / (session_df['session_unique_aid_action_count'] + 0.000001)
session_df['session_unique_aid_cart_rate'] = session_df['session_unique_aid_cart_count'] / (session_df['session_unique_aid_action_count'] + 0.000001)
session_df['session_unique_aid_order_rate'] = session_df['session_unique_aid_order_count'] / (session_df['session_unique_aid_action_count'] + 0.000001)
    
session_df['session_unique_aid_click_rate'] = session_df['session_unique_aid_click_rate'].astype('float32')
session_df['session_unique_aid_cart_rate'] = session_df['session_unique_aid_cart_rate'].astype('float32')
session_df['session_unique_aid_order_rate'] = session_df['session_unique_aid_order_rate'].astype('float32')

# uu/(action count) ratio, この値が小さいほど一部のaidにactionが集中している
session_df['session_total_uu_action_ratio'] = session_df['session_unique_aid_action_count'] / (session_df['session_action_count'] + 0.000001)
session_df['session_clicks_uu_action_ratio'] = session_df['session_unique_aid_click_count'] / (session_df['session_click_count'] + 0.000001)
session_df['session_carts_uu_action_ratio'] = session_df['session_unique_aid_cart_count'] / (session_df['session_cart_count'] + 0.000001)
session_df['session_orders_uu_action_ratio'] = session_df['session_unique_aid_order_count'] / (session_df['session_order_count'] + 0.000001)

session_df['session_total_uu_action_ratio'] = session_df['session_total_uu_action_ratio'].astype('float32')
session_df['session_clicks_uu_action_ratio'] = session_df['session_clicks_uu_action_ratio'].astype('float32')
session_df['session_carts_uu_action_ratio'] = session_df['session_carts_uu_action_ratio'].astype('float32')
session_df['session_orders_uu_action_ratio'] = session_df['session_orders_uu_action_ratio'].astype('float32')

del tmp
gc.collect()
'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ts_diff'] = all_period_ts_max - df['ts']


(7683780, 5)


"\n# click\ntmp = df.query('type==0').groupby('session').size().reset_index(name='session_click_count')\nsession_df = session_df.merge(tmp, how = 'left', on = 'session')\nsession_df = type_change(session_df, 'session_click_count', 0, 'int16')\n# cart\ntmp = df.query('type==1').groupby('session').size().reset_index(name='session_cart_count')\nsession_df = session_df.merge(tmp, how = 'left', on = 'session')\nsession_df = type_change(session_df, 'session_cart_count', 0, 'int16')\n# order\ntmp = df.query('type==2').groupby('session').size().reset_index(name='session_order_count')\nsession_df = session_df.merge(tmp, how = 'left', on = 'session')\nsession_df = type_change(session_df, 'session_order_count', 0, 'int16')\n# type_mean\ntmp = df.groupby('session').mean()['type'].reset_index(name='session_type_mean')\nsession_df = session_df.merge(tmp, how = 'left', on = 'session')\nsession_df = type_change(session_df, 'session_type_mean', -1, 'float32')\n# last action type\ntmp = df.groupby('sess

In [None]:
df.query('session==12539534')

Unnamed: 0,session,aid,ts,type,ts_diff
171638753,12539534,938933,1661616688,0,107308
171638754,12539534,430024,1661616716,0,107280
171638755,12539534,938933,1661616724,0,107272


In [None]:
session_df.shape

(1672287, 25)

In [None]:
# session単位でのaid素性
session_aid = df.merge(aid_df, 'left', 'aid')
for i in week_list:
    print(i, "*******")
    lis = [f'aid_clicks_count_{i}', 
           f'aid_carts_count_{i}', 
           f'aid_orders_count_{i}', 
           f'aid_total_count_{i}', 
           f'aid_total_uu_{i}', 
           f'aid_clicks_uu_{i}', 
           f'aid_carts_uu_{i}', 
           f'aid_orders_uu_{i}',
           f'aid_total_uu_action_ratio_{i}',
           f'aid_clicks_uu_action_ratio_{i}',
           f'aid_carts_uu_action_ratio_{i}',
           f'aid_orders_uu_action_ratio_{i}']
    for l in lis:
        print(l)
        tmp = session_aid.groupby('session').mean()[l].reset_index(name=f'session_mean_{l}')
        session_df = session_df.merge(tmp, how = 'left', on = 'session')
        session_df[f'session_mean_{l}'] = session_df[f'session_mean_{l}'].astype('float32')

4weeks *******
aid_clicks_count_4weeks
aid_carts_count_4weeks
aid_orders_count_4weeks
aid_total_count_4weeks
aid_total_uu_4weeks
aid_clicks_uu_4weeks
aid_carts_uu_4weeks
aid_orders_uu_4weeks
aid_total_uu_action_ratio_4weeks
aid_clicks_uu_action_ratio_4weeks
aid_carts_uu_action_ratio_4weeks
aid_orders_uu_action_ratio_4weeks
2weeks *******
aid_clicks_count_2weeks
aid_carts_count_2weeks
aid_orders_count_2weeks
aid_total_count_2weeks
aid_total_uu_2weeks
aid_clicks_uu_2weeks
aid_carts_uu_2weeks
aid_orders_uu_2weeks
aid_total_uu_action_ratio_2weeks
aid_clicks_uu_action_ratio_2weeks
aid_carts_uu_action_ratio_2weeks
aid_orders_uu_action_ratio_2weeks
1week *******
aid_clicks_count_1week
aid_carts_count_1week
aid_orders_count_1week
aid_total_count_1week
aid_total_uu_1week
aid_clicks_uu_1week
aid_carts_uu_1week
aid_orders_uu_1week
aid_total_uu_action_ratio_1week
aid_clicks_uu_action_ratio_1week
aid_carts_uu_action_ratio_1week
aid_orders_uu_action_ratio_1week


In [None]:
pd.set_option('display.max_columns', 100)
session_df

Unnamed: 0,session,session_action_count,session_click_count,session_cart_count,session_order_count,session_type_mean,session_last_type,session_first_action_ts_diff,session_last_action_ts_diff,session_ts_period,session_mean_action_ts_diff,session_unique_aid_action_count,session_unique_aid_click_count,session_unique_aid_cart_count,session_unique_aid_order_count,session_click_rate,session_cart_rate,session_order_rate,session_unique_aid_click_rate,session_unique_aid_cart_rate,session_unique_aid_order_rate,session_total_uu_action_ratio,session_clicks_uu_action_ratio,session_carts_uu_action_ratio,session_orders_uu_action_ratio,session_mean_aid_clicks_count_4weeks,session_mean_aid_carts_count_4weeks,session_mean_aid_orders_count_4weeks,session_mean_aid_total_count_4weeks,session_mean_aid_total_uu_4weeks,session_mean_aid_clicks_uu_4weeks,session_mean_aid_carts_uu_4weeks,session_mean_aid_orders_uu_4weeks,session_mean_aid_total_uu_action_ratio_4weeks,session_mean_aid_clicks_uu_action_ratio_4weeks,session_mean_aid_carts_uu_action_ratio_4weeks,session_mean_aid_orders_uu_action_ratio_4weeks,session_mean_aid_clicks_count_2weeks,session_mean_aid_carts_count_2weeks,session_mean_aid_orders_count_2weeks,session_mean_aid_total_count_2weeks,session_mean_aid_total_uu_2weeks,session_mean_aid_clicks_uu_2weeks,session_mean_aid_carts_uu_2weeks,session_mean_aid_orders_uu_2weeks,session_mean_aid_total_uu_action_ratio_2weeks,session_mean_aid_clicks_uu_action_ratio_2weeks,session_mean_aid_carts_uu_action_ratio_2weeks,session_mean_aid_orders_uu_action_ratio_2weeks,session_mean_aid_clicks_count_1week,session_mean_aid_carts_count_1week,session_mean_aid_orders_count_1week,session_mean_aid_total_count_1week,session_mean_aid_total_uu_1week,session_mean_aid_clicks_uu_1week,session_mean_aid_carts_uu_1week,session_mean_aid_orders_uu_1week,session_mean_aid_total_uu_action_ratio_1week,session_mean_aid_clicks_uu_action_ratio_1week,session_mean_aid_carts_uu_action_ratio_1week,session_mean_aid_orders_uu_action_ratio_1week
0,1527,1,1,0,0,0,0,604795,604795,0,604795.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,9.0,0.0,0.0,9.0,6.0,6.0,0.0,0.0,0.666667,0.666667,0.000000,0.000000,2.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.500000,0.500000,0.000000,0.000000,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.999999,0.999999,0.000000,0.000000
1,2681,1,1,0,0,0,0,604796,604796,0,604796.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,1505.0,75.0,9.0,1589.0,992.0,988.0,60.0,9.0,0.624292,0.656478,0.800000,1.000000,535.0,30.0,5.0,570.0,385.0,382.0,29.0,5.0,0.675439,0.714019,0.966667,1.000000,55.0,1.0,0.0,56.0,51.0,50.0,1.0,0.0,0.910714,0.909091,0.999999,0.000000
2,2799,1,1,0,0,0,0,604794,604794,0,604794.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,12044.0,248.0,27.0,12319.0,7750.0,7742.0,208.0,24.0,0.629110,0.642810,0.838710,0.888889,3495.0,82.0,6.0,3583.0,2359.0,2354.0,73.0,6.0,0.658387,0.673534,0.890244,1.000000,351.0,8.0,1.0,360.0,265.0,265.0,6.0,1.0,0.736111,0.754986,0.750000,0.999999
3,2904,1,1,0,0,0,0,604798,604798,0,604798.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,10000.0,536.0,84.0,10620.0,6800.0,6792.0,466.0,82.0,0.640301,0.679200,0.869403,0.976190,4635.0,250.0,40.0,4925.0,3243.0,3234.0,214.0,39.0,0.658477,0.697735,0.856000,0.975000,802.0,47.0,6.0,855.0,636.0,634.0,38.0,5.0,0.743860,0.790524,0.808511,0.833333
4,3184,1,1,0,0,0,0,604796,604796,0,604796.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,52.0,5.0,3.0,60.0,48.0,48.0,4.0,2.0,0.800000,0.923077,0.800000,0.666666,20.0,2.0,1.0,23.0,20.0,20.0,2.0,1.0,0.869565,1.000000,1.000000,0.999999,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.999999,0.999999,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1672282,14571577,1,1,0,0,0,0,17,17,0,17.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,77.0,2.0,0.0,79.0,62.0,62.0,2.0,0.0,0.784810,0.805195,1.000000,0.000000,44.0,0.0,0.0,44.0,34.0,34.0,0.0,0.0,0.772727,0.772727,0.000000,0.000000,6.0,0.0,0.0,6.0,5.0,5.0,0.0,0.0,0.833333,0.833333,0.000000,0.000000
1672283,14571578,1,1,0,0,0,0,16,16,0,16.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,237.0,2.0,0.0,239.0,173.0,173.0,2.0,0.0,0.723849,0.729958,1.000000,0.000000,68.0,1.0,0.0,69.0,54.0,54.0,1.0,0.0,0.782609,0.794118,0.999999,0.000000,8.0,0.0,0.0,8.0,7.0,7.0,0.0,0.0,0.875000,0.875000,0.000000,0.000000
1672284,14571579,1,1,0,0,0,0,16,16,0,16.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,252.0,30.0,11.0,293.0,180.0,179.0,23.0,9.0,0.614334,0.710317,0.766667,0.818182,162.0,25.0,9.0,196.0,115.0,115.0,20.0,7.0,0.586735,0.709877,0.800000,0.777778,24.0,5.0,0.0,29.0,23.0,23.0,5.0,0.0,0.793103,0.958333,1.000000,0.000000
1672285,14571580,1,1,0,0,0,0,10,10,0,10.0,1,1,0,0,0.999999,0.0,0.0,0.999999,0.0,0.0,0.999999,0.999999,0.0,0.0,115.0,13.0,5.0,133.0,88.0,88.0,11.0,4.0,0.661654,0.765217,0.846154,0.800000,59.0,8.0,3.0,70.0,42.0,42.0,6.0,2.0,0.600000,0.711864,0.750000,0.666666,19.0,1.0,0.0,20.0,18.0,18.0,1.0,0.0,0.900000,0.947368,0.999999,0.000000


## Save session features

In [None]:
if valid_flag:
    session_df.to_parquet(f'{output_path}/valid_session_features.parquet')
else:
    session_df.to_parquet(f'{output_path}/test_session_features.parquet')

In [None]:
# check
'''
print(origin_train['ts'].min(), origin_train['ts'].max()) #8/1, 7:00 - 8/29, 6:59 4 weeks
print(origin_test['ts'].min(), origin_test['ts'].max()) #8/29, 7:00 - 9/5 7:00 1 week
print(valid_train['ts'].min(), valid_train['ts'].max()) #8/1, 7:00 - 8/22, 6:59 3 weeks
print(valid_test['ts'].min(), valid_test['ts'].max()) #8/22, 7:00 - 8/29, 6:59 1 week
'''

"\nprint(origin_train['ts'].min(), origin_train['ts'].max()) #8/1, 7:00 - 8/29, 6:59 4 weeks\nprint(origin_test['ts'].min(), origin_test['ts'].max()) #8/29, 7:00 - 9/5 7:00 1 week\nprint(valid_train['ts'].min(), valid_train['ts'].max()) #8/1, 7:00 - 8/22, 6:59 3 weeks\nprint(valid_test['ts'].min(), valid_test['ts'].max()) #8/22, 7:00 - 8/29, 6:59 1 week\n"