In [1]:
# Runtime switch that selects where the data directory lives.
# True: Google Colab Notebook
# False: My local PC
colab = False
if colab:
    # On Colab the data is read from Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive')
    # List the output directory on the mounted drive.
    !ls /content/drive/MyDrive/output/otto/
    base_path = '/content/drive/MyDrive'
else:
    # Local run: relative data directory.
    base_path = '../data'

In [2]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

## Parameter settings

In [3]:
# True: build candidates for local CV (validation data); False: build them for prediction
valid_flag = False

# Maximum number of candidates kept per session by each suggest_* function
Ntop = 50
input_path = f'{base_path}/input/otto'
# Score multiplier per encoded event type (0 = clicks, 1 = carts, 2 = orders)
type_weight = {0:1, 1:6, 2:3}
output_path = f'{base_path}/output/otto'
# The full result does not fit in memory, so the output can be written in splits.
# There is no loop over the splits: the notebook has to be run once per split.
OUTPUT_SPLIT_NUM = 0 # index of the split handled by this run (0-based)
#output_session_splits = 2 # total number of splits
output_session_splits = 1 # total number of splits
# Cut-offs (entries taken per seed aid from a co-visitation list) tried for each candidate source
click_topn_list = [5, 10, 20, 30]
cart_topn_list = [5, 15, 20]
buy_topn_list = [5, 15, 20]


# Per-mode settings: data sub-directory plus the version / list length / piece count
# that appear in the co-visitation file names.
if valid_flag:
  input_path = input_path + '/otto-validation'
  VER = 7
  Ntop_buy2buy = 30
  Ntop_carts = 30
  Ntop_clicks = 40
  DISK_PIECES = 4
else:
  input_path = input_path + '/otto-origin'
  VER = 7
  Ntop_buy2buy = 30
  Ntop_carts = 30
  Ntop_clicks = 40
  DISK_PIECES = 4

## Read input data

In [4]:
!ls {input_path}

top_30_buy2buy_v7_0_ver0.pqt	   top_30_carts_orders_v7_2_ver1.pqt
top_30_buy2buy_v7_0_ver1.pqt	   top_30_carts_orders_v7_3_ver0.pqt
top_30_carts_orders_v7_0_ver0.pqt  top_30_carts_orders_v7_3_ver1.pqt
top_30_carts_orders_v7_0_ver1.pqt  top_40_clicks_v7_0.pqt
top_30_carts_orders_v7_1_ver0.pqt  top_40_clicks_v7_1.pqt
top_30_carts_orders_v7_1_ver1.pqt  top_40_clicks_v7_2.pqt
top_30_carts_orders_v7_2_ver0.pqt  top_40_clicks_v7_3.pqt


In [6]:
# Integer encoding of the event type strings
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test():
    """Load every file under ``{input_path}/test_parquet`` into one DataFrame.

    Per chunk, ``ts`` is divided by 1000 and stored as int32 (presumably
    milliseconds -> seconds; confirm against the data source) and ``type`` is
    mapped through ``type_labels`` and stored as int8.

    Returns:
        The concatenated chunks with a fresh 0..n-1 index.

    Raises:
        ValueError: if no chunk file is found.
    """
    # glob() returns files in file-system dependent order; sort so that the row
    # order of the result is reproducible across machines and runs.
    chunk_files = sorted(glob.glob(f'{input_path}/test_parquet/*'))
    if not chunk_files:
        raise ValueError(f'No test chunks found under {input_path}/test_parquet')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True)

test_df = load_test()
# Downcast the id columns to reduce memory
for id_column in ('session', 'aid'):
    test_df[id_column] = test_df[id_column].astype('int32')

print('Test data has shape',test_df.shape)
test_df.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,13599779,1346074,1661958067,0
1,13599780,78453,1661958067,0
2,13599781,208146,1661958067,0
3,13599781,125499,1661958074,0
4,13599782,1649873,1661958068,0


In [7]:
%%time
# LOAD THREE CO-VISITATION MATRICES
def pqt_to_dict(df):
    """Turn a long (aid_x, aid_y) table into ``{aid_x: [aid_y, ...]}``.

    Within each aid_x the aid_y values keep the row order they have in ``df``.
    """
    neighbours_by_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_by_aid.to_dict()


def _read_covisitation(stem, n_pieces, suffix=''):
    """Merge the on-disk pieces ``{stem}_v{VER}_{k}{suffix}.pqt`` into one dict.

    Piece 0 is always read; pieces 1 .. n_pieces-1 are read when n_pieces > 1.
    """
    merged = {}
    for k in [0, *range(1, n_pieces)]:
        piece = pd.read_parquet(f'{input_path}/{stem}_v{VER}_{k}{suffix}.pqt')
        merged.update(pqt_to_dict(piece))
    return merged


top_n_clicks = _read_covisitation(f'top_{Ntop_clicks}_clicks', DISK_PIECES)

# weight
top_n_buys = _read_covisitation(f'top_{Ntop_carts}_carts_orders', DISK_PIECES, '_ver0')
# weight2
top_n_buys2 = _read_covisitation(f'top_{Ntop_carts}_carts_orders', DISK_PIECES, '_ver1')

# buy2buy matrices are read with DISK_PIECES // 4 as the piece count
# 14 days
top_n_buy2buy = _read_covisitation(f'top_{Ntop_buy2buy}_buy2buy', DISK_PIECES // 4, '_ver0')
# 7 days
top_n_buy2buy2 = _read_covisitation(f'top_{Ntop_buy2buy}_buy2buy', DISK_PIECES // 4, '_ver1')

print('Here are size of our 3 co-visitation matrices:')
print( len( top_n_clicks ), len( top_n_buy2buy ), len( top_n_buy2buy2 ), len( top_n_buys ),  len( top_n_buys2 ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1158900 1837166 1837166
CPU times: user 1min 10s, sys: 7.54 s, total: 1min 18s
Wall time: 1min 15s


## Popular item preparation

In [8]:
# Upper bound on the length of each popularity ranking
Ntop_all = 1000000

def _popular_aids(type_code):
    """Return aids of one encoded event type in test_df, most frequent first."""
    of_type = test_df['type'] == type_code
    return test_df.loc[of_type, 'aid'].value_counts().index.values[:Ntop_all]

top_clicks = _popular_aids(0)
top_carts = _popular_aids(1)
top_orders = _popular_aids(2)

## Settings for output file split

In [9]:
min_val = test_df['session'].min()
max_val = test_df['session'].max()
print('OUTPUT_SPLIT_NUM=', OUTPUT_SPLIT_NUM)
print('original min session:', min_val, 'max_session:', max_val, 'shape:', test_df.shape)

# Lower session-id bound of each split, evenly spaced between min_val and max_val
session_span = max_val - min_val
output_session_splits_list = []
for i in range(output_session_splits):
    output_session_splits_list.append(int(min_val) + int(i * session_span / output_session_splits))
print('output_session_splits=', output_session_splits)
print('output_session_splits_list=', output_session_splits_list)

# Keep only the sessions of the requested split; the last split has no upper bound
in_split = test_df['session'] >= output_session_splits_list[OUTPUT_SPLIT_NUM]
if OUTPUT_SPLIT_NUM != output_session_splits - 1:
    in_split = in_split & (test_df['session'] < output_session_splits_list[OUTPUT_SPLIT_NUM + 1])
test_df = test_df[in_split]
print('after min session:', test_df['session'].min(), 'max_session:', test_df['session'].max(), 'shape:', test_df.shape)

OUTPUT_SPLIT_NUM= 0
original min session: 12899779 max_session: 14571581 shape: (6928123, 4)
output_session_splits= 1
output_session_splits_list= [12899779]
after min session: 12899779 max_session: 14571581 shape: (6928123, 4)


## User history

In [10]:
# Sort chronologically within each session and number the events 0, 1, 2, ... per session
test_df = test_df.sort_values(['session','ts']).reset_index(drop=True)
test_df['ts_rank_inv'] = test_df.groupby('session').cumcount()

# Total number of actions in each session
session_df = (
    test_df.groupby('session')
    .size()
    .reset_index(name='session_action_count')
)
# The action count is needed on every event row for the weight computation that follows
test_df = test_df.merge(session_df, how = 'inner', on = 'session')

In [11]:
type_weight_multipliers = type_weight

def func(df):
    """Score a single event row: recency weight times event-type weight.

    Row-wise reference implementation (kept so existing callers still work).
    ``df`` is one row providing ``session_action_count``, ``ts_rank_inv`` and
    ``type``. The recency weight is ``2**e - 1`` where ``e`` runs linearly from
    0.1 to 1 over the events of the session.
    """
    weights = np.logspace(0.1, 1, df['session_action_count'], base=2, endpoint=True) - 1
    # Events are sorted by ts ascending, so ts_rank_inv == 0 is the oldest event
    # of the session and receives the smallest weight.
    return weights[df['ts_rank_inv']] * type_weight_multipliers[df['type']]


def _event_scores(events, multipliers):
    """Vectorized equivalent of applying ``func`` to every row of ``events``.

    ``DataFrame.apply(func, axis=1)`` rebuilds a complete logspace ramp for each
    row; here the single needed point of the ramp is computed for all rows at
    once. The arithmetic mirrors ``np.linspace``: ``rank * step + start`` with
    the last point pinned to ``stop``; a one-event session only has ``start``.

    Args:
        events: frame with ``session_action_count``, ``ts_rank_inv`` and ``type``.
        multipliers: mapping from encoded event type to its weight.

    Returns:
        float64 array of scores aligned with the rows of ``events``.
    """
    start, stop = 0.1, 1.0
    n_actions = events['session_action_count'].to_numpy()
    rank = events['ts_rank_inv'].to_numpy()
    # max(..., 1) only avoids a division by zero for one-event sessions, whose
    # rank is always 0 and therefore ignores the step.
    step = (stop - start) / np.maximum(n_actions - 1, 1)
    exponent = rank * step + start
    is_last_of_many = (n_actions > 1) & (rank == n_actions - 1)
    exponent = np.where(is_last_of_many, stop, exponent)
    recency = np.power(2.0, exponent) - 1
    return recency * events['type'].map(multipliers).to_numpy()


test_df['score'] = _event_scores(test_df, type_weight_multipliers)

In [12]:
test_df.head()

Unnamed: 0,session,aid,ts,type,ts_rank_inv,session_action_count,score
0,12899779,59625,1661724000,0,0,1,0.071773
1,12899780,1142000,1661724000,0,0,5,0.071773
2,12899780,582732,1661724058,0,1,5,0.252664
3,12899780,973453,1661724109,0,2,5,0.464086
4,12899780,736515,1661724136,0,3,5,0.71119


In [13]:
def _score_sum(events, column):
    """Sum ``score`` per (session, aid) of ``events`` and name the result ``column``."""
    return events.groupby(['session','aid'])['score'].sum().reset_index(name = column)

# One score column per event-type subset (None = all event types)
score_sources = {
    'score_click': None,
    'score_cart': [0, 1],
    'score_buy': [1, 2],
    'score_click_only': [0],
    'score_cart_only': [1],
    'score_buy_only': [2],
}

test_group = None
for score_column, kept_types in score_sources.items():
    if kept_types is None:
        events = test_df
    else:
        events = test_df[test_df['type'].isin(kept_types)]
    summed = _score_sum(events, score_column)
    if test_group is None:
        test_group = summed
    else:
        test_group = test_group.merge(summed, how = 'left', on = ['session','aid'])
del events, summed

# Pairs without any event in a subset get score 0 for that subset
test_group = test_group.fillna(0)
# Attach the per-session action count
test_group = test_group.merge(session_df, how = 'inner', on = 'session')

del session_df
gc.collect()

0

In [14]:
test_group.groupby('session').size().describe()

count    1.671803e+06
mean     2.994705e+00
std      5.113723e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      4.330000e+02
dtype: float64

In [15]:
print(test_group.shape)
test_group.head()

(5006557, 9)


Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5
2,12899780,736515,0.71119,0.71119,0.0,0.71119,0.0,0.0,5
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5


In [16]:
# Memory reduction (1): store scores as float32 and the action count as int16
for score_column in ['score_click', 'score_cart', 'score_buy',
                     'score_click_only', 'score_cart_only', 'score_buy_only']:
    test_group[score_column] = test_group[score_column].astype('float32')
test_group['session_action_count'] = test_group['session_action_count'].astype('int16')

## Candidate Generation by Co-Visitation Matrix

In [17]:
def get_expand_df(df, name):
    """Explode per-session candidate lists into one row per candidate.

    ``df`` holds one list per session (index level ``session``). The result has
    the columns ``session``, ``lis`` (one list element per row) and ``name``,
    the int8 position of the element inside its session's list.
    """
    per_session = pd.DataFrame(df, columns=["lis"])
    per_session = per_session.reset_index()
    ex_df = per_session.explode("lis")
    ex_df = ex_df.reset_index(drop=True)
    position = ex_df.groupby('session').cumcount()
    ex_df[name] = position.astype('int8')
    return ex_df

In [18]:
# Nclicks: how many leading entries of each top_n_clicks list are used (baseline: 20)
def suggest_clicks(df, Nclicks):
    """Rank click candidates for one session.

    Takes the distinct aids of ``df`` starting from its last row (at most Ntop
    of them), collects the first ``Nclicks`` entries of each aid's
    ``top_n_clicks`` list and returns up to Ntop candidates ordered by how
    often they were collected.
    """
    latest_first = df.aid.tolist()[::-1]
    # Cap the number of seed aids for very long sessions
    seeds = list(dict.fromkeys(latest_first))[:Ntop]

    votes = Counter()
    for seed in seeds:
        if seed in top_n_clicks:
            votes.update(top_n_clicks[seed][:Nclicks])

    # Re-rank candidates by vote count
    return [candidate for candidate, _ in votes.most_common(Ntop)]

In [19]:
# Click candidates: one ranking per cut-off in click_topn_list (4 patterns),
# outer-joined on (session, lis) so each candidate keeps its rank under every cut-off.
# The sort + groupby does not depend on the cut-off, so it is built once.
click_sessions = test_df.sort_values(["session", "ts"]).groupby(["session"])
for i, val in enumerate(click_topn_list):
    print('i, val=', i, val,'.........')

    df_clicks_tmp = click_sessions.apply(lambda x: suggest_clicks(x, val))
    df_clicks_tmp = get_expand_df(df_clicks_tmp, f'n_clicks_{val}')
    # A session with an empty candidate list explodes to a single NaN row; drop it
    df_clicks_tmp = df_clicks_tmp[df_clicks_tmp['lis'].notnull()]
    # explode() leaves `lis` as an object column. Casting the join keys before
    # merging keeps the merges on integer keys (no object-key merge warnings,
    # far less memory) and the keys never need re-casting afterwards.
    df_clicks_tmp = df_clicks_tmp.astype({'session': 'int32', 'lis': 'int32'})
    if i == 0:
        df_clicks = df_clicks_tmp
    else:
        df_clicks = df_clicks.merge(df_clicks_tmp, how = 'outer', on = ['session','lis'])
    del df_clicks_tmp
    gc.collect()
del click_sessions
gc.collect()

# The outer joins turn the rank columns into float (NaN = not suggested under
# that cut-off); encode missing as -1 and go back to int8.
for val in click_topn_list:
    df_clicks[f'n_clicks_{val}'] = df_clicks[f'n_clicks_{val}'].fillna(-1).astype('int8')
print('mean size:', df_clicks.groupby('session').size().mean())
print('shape:', df_clicks.shape)

i, val= 0 5 .........
i, val= 1 10 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i, val= 2 20 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i, val= 3 30 .........


  key_col = Index(lvals).where(~mask_left, rvals)


mean size: 44.444440189990615
shape: (74286704, 6)


In [20]:
# Nbuys: how many leading entries of each cart/order co-visitation list are used (baseline: 15)
# ver=0 uses top_n_buys ("weight"); any other value uses top_n_buys2 ("weight2")
def suggest_carts(df, Nbuys, ver=0):
    """Rank cart candidates for one session from a cart/order co-visitation matrix.

    Seeds are the distinct aids of ``df`` starting from its last row (at most
    Ntop); the result holds up to Ntop candidates ordered by how often they
    occur among the first ``Nbuys`` neighbours of the seeds.
    """
    matrix = top_n_buys if ver == 0 else top_n_buys2

    # Cap the number of seed aids for very long sessions
    seeds = list(dict.fromkeys(reversed(df.aid.tolist())))[:Ntop]

    neighbour_lists = (matrix[seed][:Nbuys] for seed in seeds if seed in matrix)
    votes = Counter(itertools.chain.from_iterable(neighbour_lists))
    # Re-rank candidates by vote count
    return [candidate for candidate, _ in votes.most_common(Ntop)]


# Cart candidates: 3 cut-offs x 2 matrix versions, outer-joined on (session, lis).
# The sort + groupby does not depend on the cut-off or version, so it is built once.
cart_sessions = test_df.sort_values(["session", "ts"]).groupby(["session"])
for i, val in enumerate(cart_topn_list):
    for j in range(2):
        print('i=', i, 'val=', val, 'ver=', j, '.........')
        df_carts_tmp = cart_sessions.apply(lambda x: suggest_carts(x, val, j))
        df_carts_tmp = get_expand_df(df_carts_tmp, f'n_carts_{val}_ver{j}')
        # A session with an empty candidate list explodes to a single NaN row; drop it
        df_carts_tmp = df_carts_tmp[df_carts_tmp['lis'].notnull()]
        # explode() leaves `lis` as an object column. Casting the join keys
        # before merging keeps the merges on integer keys (no object-key merge
        # warnings, far less memory).
        df_carts_tmp = df_carts_tmp.astype({'session': 'int32', 'lis': 'int32'})
        if i == 0 and j == 0:
            df_carts = df_carts_tmp
        else:
            df_carts = df_carts.merge(df_carts_tmp, how = 'outer', on = ['session','lis'])
        del df_carts_tmp
        gc.collect()

        # Shrink the freshly joined rank column right away (NaN -> -1, int8)
        df_carts[f'n_carts_{val}_ver{j}'] = df_carts[f'n_carts_{val}_ver{j}'].fillna(-1).astype('int8')
del cart_sessions
gc.collect()

# Later outer joins re-introduce NaN into earlier rank columns; fix all of them at the end
for val in cart_topn_list:
    for j in range(2):
        df_carts[f'n_carts_{val}_ver{j}'] = df_carts[f'n_carts_{val}_ver{j}'].fillna(-1).astype('int8')
print('mean size:', df_carts.groupby('session').size().mean())
print('shape:', df_carts.shape)

i= 0 val= 5 ver= 0 .........
i= 0 val= 5 ver= 1 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 1 val= 15 ver= 0 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 1 val= 15 ver= 1 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 2 val= 20 ver= 0 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 2 val= 20 ver= 1 .........


  key_col = Index(lvals).where(~mask_left, rvals)


mean size: 39.05817639882952
shape: (65283828, 8)


In [21]:
# Nbuy2buy: how many leading entries of each buy2buy co-visitation list are used (baseline: 15)
def suggest_buys(df, Nbuy2buy, ver=0):
    """Rank buy candidates for one session from a buy2buy co-visitation matrix.

    ``ver == 0`` reads ``top_n_buy2buy``; any other value reads
    ``top_n_buy2buy2``. Seeds are the distinct aids of ``df`` starting from its
    last row (at most Ntop); the result holds up to Ntop candidates ordered by
    how often they occur among the first ``Nbuy2buy`` neighbours of the seeds.
    """
    if ver == 0:
        matrix = top_n_buy2buy
    else:
        matrix = top_n_buy2buy2

    # Cap the number of seed aids for very long sessions
    seeds = list(dict.fromkeys(df.aid.tolist()[::-1]))[:Ntop]

    pooled = []
    for seed in seeds:
        if seed in matrix:
            pooled.extend(matrix[seed][:Nbuy2buy])

    # Re-rank candidates by how often they were pooled
    ranked = Counter(pooled).most_common(Ntop)
    return [candidate for candidate, _count in ranked]


# Buy candidates: 3 cut-offs x 2 matrix versions, outer-joined on (session, lis).
# Only cart (1) and order (2) events are used as seeds. The filter, sort and
# groupby do not depend on the cut-off or version, so they are built once.
is_cart_or_order = (test_df['type']==1)|(test_df['type']==2)
buy_sessions = test_df.loc[is_cart_or_order].sort_values(["session", "ts"]).groupby(["session"])
del is_cart_or_order
for i, val in enumerate(buy_topn_list):
    for j in range(2):
        print('i=', i, 'val=', val, 'ver=', j, '.........')
        df_buys_tmp = buy_sessions.apply(lambda x: suggest_buys(x, val, j))
        df_buys_tmp = get_expand_df(df_buys_tmp, f'n_buys_{val}_ver{j}')
        # A session with an empty candidate list explodes to a single NaN row; drop it
        df_buys_tmp = df_buys_tmp[df_buys_tmp['lis'].notnull()]
        # explode() leaves `lis` as an object column. Casting the join keys
        # before merging keeps the merges on integer keys (no object-key merge
        # warnings, far less memory).
        df_buys_tmp = df_buys_tmp.astype({'session': 'int32', 'lis': 'int32'})
        if i == 0 and j == 0:
            df_buys = df_buys_tmp
        else:
            df_buys = df_buys.merge(df_buys_tmp, how = 'outer', on = ['session','lis'])
        del df_buys_tmp
        gc.collect()

        # Shrink the freshly joined rank column right away (NaN -> -1, int8)
        df_buys[f'n_buys_{val}_ver{j}'] = df_buys[f'n_buys_{val}_ver{j}'].fillna(-1).astype('int8')
del buy_sessions
gc.collect()

# Later outer joins re-introduce NaN into earlier rank columns; fix all of them at the end
for val in buy_topn_list:
    for j in range(2):
        df_buys[f'n_buys_{val}_ver{j}'] = df_buys[f'n_buys_{val}_ver{j}'].fillna(-1).astype('int8')
print('mean size:', df_buys.groupby('session').size().mean())
print('shape:', df_buys.shape)

i= 0 val= 5 ver= 0 .........
i= 0 val= 5 ver= 1 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 1 val= 15 ver= 0 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 1 val= 15 ver= 1 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 2 val= 20 ver= 0 .........


  key_col = Index(lvals).where(~mask_left, rvals)


i= 2 val= 20 ver= 1 .........


  key_col = Index(lvals).where(~mask_left, rvals)


mean size: 33.34943251425463
shape: (8030510, 8)


In [22]:
# Union of the click, cart and buy candidates, keyed by (session, candidate aid)
ex_df_all = df_clicks.merge(df_carts, how = 'outer', on = ['session','lis'])
ex_df_all = ex_df_all.merge(df_buys, how = 'outer', on = ['session','lis'])
del df_clicks, df_carts, df_buys
gc.collect()
# From here on the candidate column carries the same name as the history column
ex_df_all = ex_df_all.rename(columns={'lis': 'aid'})

In [23]:
# Union of session history rows and co-visitation candidates, keyed by (session, aid)
test_group = pd.merge(test_group, ex_df_all, how='outer', on=['session', 'aid'])
del ex_df_all
gc.collect()

0

In [24]:
# Memory reduction. Joins turn integer columns with NaN into float, so the
# missing values are filled explicitly before casting back to small integers.
def reduce_memory(df):
    """Downcast the key, action-count and rank columns of ``df`` in place.

    ``session`` / ``aid`` become int32, ``session_action_count`` becomes int16
    (missing -> 0) and every ``n_clicks_*`` / ``n_carts_*_ver*`` /
    ``n_buys_*_ver*`` rank column becomes int8 (missing -> -1). Other columns
    are left untouched. Returns the same ``df`` object.
    """
    for key_column in ('session', 'aid'):
        df[key_column] = df[key_column].astype('int32')
    df['session_action_count'] = df['session_action_count'].fillna(0).astype('int16')

    rank_columns = [f'n_clicks_{n}' for n in click_topn_list]
    rank_columns += [f'n_carts_{n}_ver{v}' for n in cart_topn_list for v in range(2)]
    rank_columns += [f'n_buys_{n}_ver{v}' for n in buy_topn_list for v in range(2)]
    for rank_column in rank_columns:
        df[rank_column] = df[rank_column].fillna(-1).astype('int8')
    return df

In [25]:
test_group = reduce_memory(test_group)
print(test_group.shape)
# Average number of candidate rows per session
rows_per_session = test_group.groupby('session').size()
print('mean size:', rows_per_session.mean())
test_group.head()

(100987139, 25)
mean size: 60.406123807649585


Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,n_clicks_5,...,n_carts_15_ver0,n_carts_15_ver1,n_carts_20_ver0,n_carts_20_ver1,n_buys_5_ver0,n_buys_5_ver1,n_buys_15_ver0,n_buys_15_ver1,n_buys_20_ver0,n_buys_20_ver1
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,8,...,4,4,7,6,-1,-1,-1,-1,-1,-1
2,12899780,736515,0.71119,0.71119,0.0,0.71119,0.0,0.0,5,13,...,33,32,39,40,-1,-1,-1,-1,-1,-1
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,0,...,6,7,10,11,-1,-1,-1,-1,-1,-1


In [26]:
test_group.groupby('session').size().describe()

count    1.671803e+06
mean     6.040612e+01
std      3.357716e+01
min      1.000000e+00
25%      3.600000e+01
50%      4.600000e+01
75%      7.600000e+01
max      4.720000e+02
dtype: float64

## popular items

In [27]:
# Checkpoint: store the candidates before the popular items are added
if not valid_flag:
    checkpoint_path = f'{output_path}/test_{Ntop}_{OUTPUT_SPLIT_NUM}_wo_rank.parquet'
else:
    checkpoint_path = f'{output_path}/train_{Ntop}_{OUTPUT_SPLIT_NUM}_wo_rank.parquet'
test_group.to_parquet(checkpoint_path)

In [28]:
# Read the intermediate candidate frame back from its checkpoint file
if not valid_flag:
    checkpoint_path = f'{output_path}/test_{Ntop}_{OUTPUT_SPLIT_NUM}_wo_rank.parquet'
else:
    checkpoint_path = f'{output_path}/train_{Ntop}_{OUTPUT_SPLIT_NUM}_wo_rank.parquet'
test_group = pd.read_parquet(checkpoint_path)

In [29]:
# Distinct aids among the 40 most frequent clicked, carted and ordered items
popular_aids = set(list(top_clicks[:40]) + list(top_carts[:40]) + list(top_orders[:40]))
top_df = pd.DataFrame(popular_aids, columns=['aid']).assign(dammy=1)

# Cross join (every popular aid x every session) through a constant helper key
session_keys = pd.DataFrame({'session': test_group['session'].unique()})
session_keys['dammy'] = 1
top_df = top_df.merge(session_keys, on='dammy')
top_df = top_df.drop('dammy', axis = 1)
top_df['aid'] = top_df['aid'].astype('int32')
del session_keys
gc.collect()

0

In [30]:
# Add the popular items as candidates of every session, then restore the compact dtypes
test_group = pd.merge(test_group, top_df, how='outer', on=['session', 'aid'])
test_group = reduce_memory(test_group)
del top_df
gc.collect()

0

In [31]:
print(test_group.shape)
test_group.head()

(216601197, 25)


Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,n_clicks_5,...,n_carts_15_ver0,n_carts_15_ver1,n_carts_20_ver0,n_carts_20_ver1,n_buys_5_ver0,n_buys_5_ver1,n_buys_15_ver0,n_buys_15_ver1,n_buys_20_ver0,n_buys_20_ver1
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,8,...,4,4,7,6,-1,-1,-1,-1,-1,-1
2,12899780,736515,0.71119,0.71119,0.0,0.71119,0.0,0.0,5,13,...,33,32,39,40,-1,-1,-1,-1,-1,-1
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,0,...,6,7,10,11,-1,-1,-1,-1,-1,-1


## Ground-truth label definition

In [32]:
# Attach one boolean target column (y_clicks / y_carts / y_orders) per event type.
if valid_flag:
    # The label file is the same for all three event types: read it once
    # instead of once per loop iteration.
    all_labels = pd.read_parquet(f'{input_path}/test_labels.parquet')
    for t in ['clicks','carts','orders']:
        print(t, '*******')
        test_labels = all_labels.loc[all_labels['type']==t].rename(columns={'ground_truth': 'aid'})
        # One row per (session, ground-truth aid)
        test_labels = test_labels.explode('aid')[['session', 'aid']].reset_index(drop=True)
        test_labels['session'] = test_labels['session'].astype('int32')
        test_labels['aid'] = test_labels['aid'].astype('int32')
        # A repeated (session, aid) label would duplicate candidate rows in the
        # left join below, so keep each pair once.
        test_labels = test_labels.drop_duplicates().reset_index(drop=True)
        test_labels[f'y_{t}'] = 1
        test_labels[f'y_{t}'] = test_labels[f'y_{t}'].astype('bool')

        test_group = test_group.merge(test_labels, how='left', on=['session', 'aid'])
        # Candidates without a matching label are negatives
        test_group[f'y_{t}'] = test_group[f'y_{t}'].fillna(0).astype('bool')
    del test_labels, all_labels
    gc.collect()

In [33]:
# Sessions that contain only negative rows cannot be used for training, so they are dropped (training only)
if valid_flag:
    label_hits = test_group.groupby('session')[['y_clicks', 'y_carts', 'y_orders']].sum()
    # True for sessions with at least one positive in any of the three targets
    true_df = (label_hits > 0).any(axis=1)
    session = pd.Series(true_df.index[true_df.to_numpy()], name='session')
    test_group = test_group.merge(session, how = 'inner', on = 'session')
    del label_hits, true_df, session
    gc.collect()

In [34]:
test_group

Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,n_clicks_5,...,n_carts_15_ver0,n_carts_15_ver1,n_carts_20_ver0,n_carts_20_ver1,n_buys_5_ver0,n_buys_5_ver1,n_buys_15_ver0,n_buys_15_ver1,n_buys_20_ver0,n_buys_20_ver1
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,8,...,4,4,7,6,-1,-1,-1,-1,-1,-1
2,12899780,736515,0.711190,0.711190,0.0,0.711190,0.0,0.0,5,13,...,33,32,39,40,-1,-1,-1,-1,-1,-1
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,0,...,6,7,10,11,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216601192,14571577,329725,,,,,,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
216601193,14571578,329725,,,,,,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
216601194,14571579,329725,,,,,,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
216601195,14571580,329725,,,,,,,0,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


## Save

In [35]:
# Final output: `train_*` for the validation run, `test_*` for the prediction run
if not valid_flag:
    result_path = f'{output_path}/test_{Ntop}_{OUTPUT_SPLIT_NUM}.parquet'
else:
    result_path = f'{output_path}/train_{Ntop}_{OUTPUT_SPLIT_NUM}.parquet'
test_group.to_parquet(result_path)

In [36]:
test_group.groupby('session').size().describe()

count    1.671803e+06
mean     1.295614e+02
std      3.355708e+01
min      7.100000e+01
25%      1.050000e+02
50%      1.150000e+02
75%      1.460000e+02
max      5.320000e+02
dtype: float64

In [20]:
# Label information (hard-coded counts for the validation labels;
# "gt" is presumably the average number of ground-truth aids per row - confirm)
# all: 2,212,692
# click: 1,755,534 gt:1
# cart: 306,341 gt: 1.89
# order: 150,817 gt: 2.08

if valid_flag:
    # (printed name, target column, total number of ground-truth aids)
    label_totals = [
        ('click', 'y_clicks', 1755534),
        ('cart', 'y_carts', 306341*1.89),
        ('order', 'y_orders', 150817*2.08),
    ]

    limits = []
    for label_name, target_column, n_truth in label_totals:
        print(f'{label_name} summary............')
        # Each aggregate is a full pass over the candidate frame: compute it once
        positive_rate = test_group[target_column].mean()
        n_positive = test_group[target_column].sum()
        # Share of the ground truth that is present among the candidates,
        # i.e. the recall a perfect ranker could reach with these candidates
        limits.append(n_positive / n_truth)
        print('mean:', positive_rate, 'sum:', n_positive,'limit:', limits[-1])

    l1, l2, l3 = limits
    print('Total limit recall............')
    # Weighted sum with weights 0.1 (clicks), 0.3 (carts), 0.6 (orders)
    print(0.1 * l1 + 0.3 * l2 + 0.6 * l3)

click summary............
mean: 0.007220023154005536 sum: 1073848 limit: 0.611693080282125
cart summary............
mean: 0.001964480620311712 sum: 292181 limit: 0.5046439154181833
order summary............
mean: 0.001495005110973714 sum: 222355 limit: 0.7088155997513034
Total limit recall............
0.6378518425044495
