In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used below live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
import itertools

## 生成データの読み込み

In [None]:
# True  -> build features for local CV; False -> build features for prediction.
#valid_flag = True
valid_flag = False

# Number of candidates kept per session by each candidate generator.
Ntop = 50
#Ntop = 20
base_path = '/content/drive/MyDrive/input/otto'
# Per-type weights; in the visible cells they are only used to spell the
# optional type-weighted co-visitation file names.
type_weight = {0:1, 1:6, 2:3}

# Co-visitation settings (baseline), identical for both modes. They select the
# files that are read: top_{Ntop_*}_{kind}_v{VER}_{piece}.pqt
Ntop_buy2buy = 15
Ntop_carts = 15
Ntop_clicks = 20
DISK_PIECES = 4

# Alternative setting that was tried for both modes (kept for reference):
#   VER = 7; Ntop_buy2buy = 50; Ntop_carts = 50; Ntop_clicks = 50; DISK_PIECES = 16

if valid_flag:
  input_path = base_path + '/otto-validation'
  VER = 6
else:
  input_path = base_path + '/otto-origin'
  VER = 5


In [None]:
# List the selected input directory to see which co-visitation files are available.
!ls {input_path}

test_parquet		       top_50_carts_orders_v7_4.pqt
top_15_buy2buy_v5_0.pqt        top_50_carts_orders_v7_5.pqt
top_15_carts_orders_v5_0.pqt   top_50_carts_orders_v7_6.pqt
top_15_carts_orders_v5_1.pqt   top_50_carts_orders_v7_7.pqt
top_15_carts_orders_v5_2.pqt   top_50_carts_orders_v7_8.pqt
top_15_carts_orders_v5_3.pqt   top_50_carts_orders_v7_9.pqt
top_20_clicks_v5_0.pqt	       top_50_clicks_v7_0.pqt
top_20_clicks_v5_1.pqt	       top_50_clicks_v7_10.pqt
top_20_clicks_v5_2.pqt	       top_50_clicks_v7_11.pqt
top_20_clicks_v5_3.pqt	       top_50_clicks_v7_12.pqt
top_50_buy2buy_v7_0.pqt        top_50_clicks_v7_13.pqt
top_50_buy2buy_v7_1.pqt        top_50_clicks_v7_14.pqt
top_50_buy2buy_v7_2.pqt        top_50_clicks_v7_15.pqt
top_50_buy2buy_v7_3.pqt        top_50_clicks_v7_1.pqt
top_50_carts_orders_v7_0.pqt   top_50_clicks_v7_2.pqt
top_50_carts_orders_v7_10.pqt  top_50_clicks_v7_3.pqt
top_50_carts_orders_v7_11.pqt  top_50_clicks_v7_4.pqt
top_50_carts_orders_v7_12.pqt  top_50_clicks_v7_5.pqt


In [None]:
# Event type name -> compact integer code used throughout the notebook.
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test(path=None):
    """Load every parquet chunk of the test split into one DataFrame.

    Parameters
    ----------
    path : str, optional
        Directory that contains a ``test_parquet`` folder. Defaults to the
        notebook-level ``input_path``.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated with a fresh RangeIndex. ``ts`` is divided by
        1000 and stored as int32 (presumably milliseconds -> seconds);
        ``type`` is mapped through ``type_labels`` and stored as int8.

    Raises
    ------
    FileNotFoundError
        If no chunk file is found under ``<path>/test_parquet``.
    """
    root = input_path if path is None else path
    # glob order depends on the filesystem; sorting makes the row order reproducible.
    chunk_files = sorted(glob.glob(f'{root}/test_parquet/*'))
    if not chunk_files:
        raise FileNotFoundError(f'no test chunks found under {root}/test_parquet')

    dfs = []
    for chunk_file in chunk_files:
        chunk = pd.read_parquet(chunk_file)
        chunk['ts'] = (chunk['ts'] / 1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

# Read all chunks, then shrink the two id columns to int32 to reduce memory.
test_df = load_test().astype({'session': 'int32', 'aid': 'int32'})

print('Test data has shape', test_df.shape)
test_df.head()

Test data has shape (6928123, 4)


Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


In [None]:
%%time
#DISK_PIECES = 4

# LOAD THREE CO-VISITATION MATRICES
def pqt_to_dict(df):
    """Turn a co-visitation table into ``{aid_x: [aid_y, ...]}``.

    Within each ``aid_x`` group the ``aid_y`` values keep the row order they
    have in ``df``.
    """
    neighbours_per_aid = df.groupby('aid_x')['aid_y'].apply(list)
    return neighbours_per_aid.to_dict()


def _load_covisit(kind, n_top, n_pieces, suffix=''):
    """Read all disk pieces of one co-visitation matrix into a single dict.

    Files are named ``top_{n_top}_{kind}_v{VER}_{piece}{suffix}.pqt`` under
    ``input_path``. Piece 0 is always read, even when ``n_pieces`` is below 1.
    Later pieces overwrite earlier ones on duplicate keys (``dict.update``).
    """
    merged = {}
    for piece in range(max(n_pieces, 1)):
        merged.update(pqt_to_dict(
            pd.read_parquet(f'{input_path}/top_{n_top}_{kind}_v{VER}_{piece}{suffix}.pqt')
        ))
    return merged


# Set to f'_{type_weight[0]}_{type_weight[1]}_{type_weight[2]}' to read the
# type-weighted variants of the files instead of the plain ones.
covisit_suffix = ''

top_n_clicks = _load_covisit('clicks', Ntop_clicks, DISK_PIECES, covisit_suffix)
top_n_buys = _load_covisit('carts_orders', Ntop_carts, DISK_PIECES, covisit_suffix)
# The buy2buy matrix is stored in a quarter as many pieces as the other two.
top_n_buy2buy = _load_covisit('buy2buy', Ntop_buy2buy, DISK_PIECES // 4, covisit_suffix)

print('Here are size of our 3 co-visitation matrices:')
print( len( top_n_clicks ), len( top_n_buy2buy ), len( top_n_buys ) )

Here are size of our 3 co-visitation matrices:
1837166 1168768 1837166
CPU times: user 1min 37s, sys: 14.2 s, total: 1min 51s
Wall time: 2min


In [None]:
# Keep the global rankings generously long.
Ntop_all = 1000000


def _aids_by_frequency(type_id):
    """aids having events of the given type code, most frequent first, capped at Ntop_all."""
    of_type = test_df['type'] == type_id
    return test_df.loc[of_type, 'aid'].value_counts().index.values[:Ntop_all]


top_clicks = _aids_by_frequency(0)
top_carts = _aids_by_frequency(1)
top_orders = _aids_by_frequency(2)

In [None]:
# Event counts per aid, one column per event type.
count_frames = []
for type_id, count_col in ((0, 'clicks_count'), (1, 'carts_count'), (2, 'orders_count')):
    counts = test_df.loc[test_df['type'] == type_id, 'aid'].value_counts()
    count_frames.append(pd.DataFrame({'aid': counts.index, count_col: counts.values}))

# Outer-join the three tables; an aid missing for a type gets a count of 0.
top_counts_df = count_frames[0]
for frame in count_frames[1:]:
    top_counts_df = top_counts_df.merge(frame, how='outer', on='aid')
top_counts_df = top_counts_df.fillna(0)

del count_frames, counts, frame
gc.collect()

# Compact dtypes (the fill above leaves float columns behind).
top_counts_df = top_counts_df.astype({
    'aid': 'int32',
    'clicks_count': 'int32',
    'carts_count': 'int16',
    'orders_count': 'int16',
})

In [None]:
print(top_counts_df.shape)
top_counts_df.head()

(783486, 4)


Unnamed: 0,aid,clicks_count,carts_count,orders_count
0,1460571,8318,482,70
1,485256,7055,2001,0
2,108125,6115,221,25
3,986164,4014,481,74
4,1551213,3574,18,0


## 履歴ベース

In [None]:
# Sort chronologically inside each session and number the events (0 = oldest).
#test_df = test_df.sort_values(['session','ts'],ascending=[True,False])
test_df = test_df.sort_values(['session','ts']).reset_index(drop=True)
test_df['ts_rank_inv'] = test_df.groupby('session').cumcount()

# Gap between each event and the latest event of its session.
# transform('max') broadcasts the per-session maximum without a self-merge.
test_df['diff_ts'] = test_df.groupby('session')['ts'].transform('max') - test_df['ts']

# Per-session number of events: overall and per event type.
tmp = test_df.groupby('session').size().reset_index(name='session_action_count')
session_df = tmp
for type_id, count_col in ((0, 'session_click_count'),
                           (1, 'session_cart_count'),
                           (2, 'session_order_count')):
    per_type = (test_df.loc[test_df['type'] == type_id]
                .groupby('session').size().reset_index(name=count_col))
    session_df = session_df.merge(per_type, how='left', on='session')
# A session without events of some type gets 0 for that count.
session_df = session_df.fillna(0)

# Mean type code per session; aggregate only the column that is needed.
type_mean = test_df.groupby('session')['type'].mean().reset_index(name='session_type_mean')
session_df = session_df.merge(type_mean, how='left', on='session')

# The session length is used later when the recency weights are computed.
test_df['session_action_count'] = test_df.groupby('session')['aid'].transform('size')

del tmp, per_type, type_mean
gc.collect()

45

In [None]:
#type_weight_multipliers = {0: 1, 1: 5, 2: 4}
#type_weight_multipliers = {0:0.5, 1:9, 2:0.5}
# Score multiplier per event type code.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

# session length -> weight ramp. The ramp depends only on the length, so it is
# computed once per distinct length instead of once per row.
_recency_weights = {}


def func(df):
    """Score one event row: recency weight times the multiplier of its type.

    Parameters
    ----------
    df : mapping / pandas.Series
        One row providing ``session_action_count`` (number of events in the
        session), ``ts_rank_inv`` (0-based position in the session) and
        ``type`` (event type code).

    Returns
    -------
    float
        ``weights[ts_rank_inv] * type_weight_multipliers[type]`` where
        ``weights`` rises from ``2**0.1 - 1`` at position 0 to ``1`` at the
        last position of the session.
    """
    # Row-wise apply may hand the values over as floats; logspace and indexing need ints.
    n_actions = int(df['session_action_count'])
    weights = _recency_weights.get(n_actions)
    if weights is None:
        weights = np.logspace(0.1, 1, n_actions, base=2, endpoint=True) - 1
        _recency_weights[n_actions] = weights
    # Higher positions get larger weights.
    return weights[int(df['ts_rank_inv'])] * type_weight_multipliers[int(df['type'])]

# Row-wise apply: func receives one event row at a time and returns its score.
test_df['score'] = test_df.apply(func, axis=1)

In [None]:
test_df.head()

Unnamed: 0,session,aid,ts,type,ts_rank_inv,diff_ts,session_action_count,score
0,12899779,59625,1661724000,0,0,0,1,0.071773
1,12899780,1142000,1661724000,0,0,155,5,0.071773
2,12899780,582732,1661724058,0,1,97,5,0.252664
3,12899780,973453,1661724109,0,2,46,5,0.464086
4,12899780,736515,1661724136,0,3,19,5,0.71119


In [None]:
# Sum the event scores per (session, aid) over several subsets of event types.
# Column name -> row filter (None keeps every event, whatever its type).
score_subsets = (
    ('score_click', None),
    ('score_cart', 'type==0 or type==1'),
    ('score_buy', 'type==1 or type==2'),
    ('score_click_only', 'type==0'),
    ('score_cart_only', 'type==1'),
    ('score_buy_only', 'type==2'),
)

test_group = None
for score_col, row_filter in score_subsets:
    events = test_df if row_filter is None else test_df.query(row_filter)
    summed = events.groupby(['session','aid'])['score'].sum().reset_index(name=score_col)
    if test_group is None:
        test_group = summed
    else:
        test_group = test_group.merge(summed, how='left', on=['session','aid'])

# A pair that is absent from a subset has no score there: fill with 0.
test_group = test_group.fillna(0)
# Attach the per-session statistics.
test_group = test_group.merge(session_df, how='inner', on='session')

del events, summed, score_subsets, session_df
gc.collect()

0

In [None]:
test_group.groupby('session').size().describe()

count    1.671803e+06
mean     2.994705e+00
std      5.113723e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      4.330000e+02
dtype: float64

In [None]:
print(test_group.shape)
test_group.head()

(5006557, 13)


Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,session_click_count,session_cart_count,session_order_count,session_type_mean
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,1.0,0.0,0.0,0.0
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,5.0,0.0,0.0,0.0
2,12899780,736515,0.71119,0.71119,0.0,0.71119,0.0,0.0,5,5.0,0.0,0.0,0.0
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,5.0,0.0,0.0,0.0
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,5.0,0.0,0.0,0.0


In [None]:
# Memory reduction, step 1: narrower dtypes for scores and session statistics.
float32_cols = [
    'score_click', 'score_cart', 'score_buy',
    'score_click_only', 'score_cart_only', 'score_buy_only',
    'session_type_mean',
]
int16_cols = [
    'session_action_count', 'session_click_count',
    'session_cart_count', 'session_order_count',
]
for col_name in float32_cols:
    test_group[col_name] = test_group[col_name].astype('float32')
for col_name in int16_cols:
    test_group[col_name] = test_group[col_name].astype('int16')

## 共起ベース

In [None]:
def get_expand_df(df, name):
    """Explode per-session candidate lists into one row per candidate.

    Parameters
    ----------
    df : pandas.Series
        Indexed by ``session``; each value is a list of candidates.
    name : str
        Name of the rank column to add.

    Returns
    -------
    pandas.DataFrame
        Columns ``session``, ``lis`` (a single candidate) and ``name``
        (0-based position of the candidate inside its list, int8).
    """
    if isinstance(df, pd.Series):
        # to_frame names the column explicitly. DataFrame(series, columns=['lis'])
        # would give an all-NaN column whenever the Series carries a name.
        frame = df.to_frame('lis')
    else:
        frame = pd.DataFrame(df, columns=["lis"])
    ex_df = frame.reset_index().explode("lis").reset_index(drop=True)
    # NOTE: int8 holds positions up to 127 only, i.e. lists of at most 128 items.
    ex_df[name] = ex_df.groupby('session').cumcount().astype('int8')
    return ex_df

In [None]:
# Fallback list used to pad short candidate lists: the first Ntop entries of the click ranking.
small_top_clicks = top_clicks[:Ntop]
# Nclicks: how many of the top entries of top_n_clicks to consider per aid (baseline: 20)
def suggest_clicks(df, Nclicks):
    """Return Ntop click candidates for the events of one session.

    The session's aids are taken last row first, de-duplicated and capped at
    Ntop. Their first ``Nclicks`` neighbours in ``top_n_clicks`` are pooled and
    ranked by how often they occur. A list shorter than Ntop is padded with
    entries of ``small_top_clicks`` that are not already present.
    """
    # Cap the number of source aids when the session is very long.
    recent_aids = list(dict.fromkeys(reversed(df.aid.tolist())))[:Ntop]

    neighbours = itertools.chain.from_iterable(
        top_n_clicks[aid][:Nclicks] for aid in recent_aids if aid in top_n_clicks
    )
    # Re-rank the pooled neighbours by frequency.
    ranked = [aid for aid, _ in Counter(neighbours).most_common(Ntop)]

    missing = Ntop - len(ranked)
    if missing == 0:
        return ranked
    seen = set(ranked)
    fillers = [aid for aid in small_top_clicks if aid not in seen]
    return ranked + fillers[:missing]

In [None]:
# Build one rank column per co-visitation depth listed in click_topn_list.
# (The original note mentions five depths, 5/10/20/30/50; only the depths in
# the list below are actually generated.) The backfill is left unchanged.

click_topn_list = [10, 20]

# The time-ordered per-session grouping does not depend on the depth: build it once.
sessions_by_time = test_df.sort_values(["session", "ts"]).groupby(["session"])

for i, val in enumerate(click_topn_list):
    print('i, val=', i, val,'.........')

    df_clicks_tmp = sessions_by_time.apply(lambda x: suggest_clicks(x, val))
    df_clicks_tmp = get_expand_df(df_clicks_tmp, f'n_clicks_{val}')
    if i == 0:
        df_clicks = df_clicks_tmp
    else:
        # Outer join keeps a candidate that shows up at any depth.
        df_clicks = df_clicks.merge(df_clicks_tmp, how = 'outer', on = ['session','lis'])
    del df_clicks_tmp
    gc.collect()

    # -1 marks "not a candidate at this depth".
    df_clicks[f'n_clicks_{val}'] = df_clicks[f'n_clicks_{val}'].fillna(-1).astype('int8')

del small_top_clicks, sessions_by_time
gc.collect()

i, val= 0 10 .........
i, val= 1 20 .........


0

In [None]:
# (Removed: an inlined copy of get_expand_df for the click candidates. It was kept
# as a bare string literal, which made the cell echo the dead code as its output.)

'\ndf = pd.DataFrame(df_clicks, columns=["lis"]).reset_index()\nex_df_clicks = df.explode("lis").reset_index(drop=True)\nex_df_clicks[\'n_clicks\'] = ex_df_clicks.groupby(\'session\').cumcount().astype(\'int8\')\ndel df_clicks\ngc.collect()\n'

In [None]:
# Fallback list used to pad short candidate lists: the first Ntop entries of the cart ranking.
small_top_carts = top_carts[:Ntop]
# Nclicks: how many of the top entries of top_n_clicks to consider per aid (baseline: 20)
# Nbuys: how many of the top entries of top_n_buys to consider per aid (baseline: 15)
def suggest_carts(df,Nclicks, Nbuys):
    """Return Ntop cart candidates for the events of one session.

    Neighbours of the session's aids are pooled from both the "clicks" and the
    "cart order" co-visitation matrices and ranked by how often they occur.
    A list shorter than Ntop is padded with entries of ``small_top_carts``
    that are not already present.
    """
    history = df.aid.tolist()
    history.reverse()
    # Last row first, de-duplicated; capped when the session is very long.
    unique_aids = list(dict.fromkeys(history))[:Ntop]

    from_clicks = []
    from_buys = []
    for aid in unique_aids:
        if aid in top_n_clicks:
            from_clicks.extend(top_n_clicks[aid][:Nclicks])
    for aid in unique_aids:
        if aid in top_n_buys:
            from_buys.extend(top_n_buys[aid][:Nbuys])

    # Re-rank the pooled candidates by frequency.
    tally = Counter(from_clicks + from_buys)
    result = [aid for aid, _ in tally.most_common(Ntop)]

    if len(result) < Ntop:
        taken = set(result)
        padding = [aid for aid in small_top_carts if aid not in taken]
        return result + padding[:Ntop - len(result)]
    return result


# One candidate list per session; 20 and 15 are passed as Nclicks and Nbuys.
df_carts = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_carts(x,20,15)
)

# The fallback list is only read inside suggest_carts.
del small_top_carts
gc.collect()

0

In [None]:
# Long form: one row per (session, candidate) with its position in column n_carts.
df_carts = get_expand_df(df_carts, 'n_carts')
# The click co-visitation matrix is not read by the remaining candidate generator.
del top_n_clicks
gc.collect()

'\ndf = pd.DataFrame(df_carts, columns=["lis"]).reset_index()\nex_df_carts = df.explode("lis").reset_index(drop=True)\nex_df_carts[\'n_carts\'] = ex_df_carts.groupby(\'session\').cumcount().astype(\'int8\')\ndel df_carts, top_n_clicks\ngc.collect()\n'

In [None]:
# Fallback list used to pad short candidate lists: the first Ntop entries of the order ranking.
small_top_orders = top_orders[:Ntop]
# Nbuys: how many of the top entries of top_n_buys to consider per aid (baseline: 15)
# Nbuy2buy: how many of the top entries of top_n_buy2buy to consider per aid (baseline: 15)
def suggest_buys(df, Nbuys, Nbuy2buy):
    """Return Ntop order candidates for the events of one session.

    Neighbours come from the "cart order" matrix for every aid of the session
    and from the "buy2buy" matrix for aids whose event type is 1 or 2. They
    are pooled and ranked by how often they occur. A list shorter than Ntop is
    padded with entries of ``small_top_orders`` that are not already present.
    """
    def newest_first_unique(aids):
        # Last row first, de-duplicated; capped when the session is very long.
        return list(dict.fromkeys(reversed(aids)))[:Ntop]

    unique_aids = newest_first_unique(df.aid.tolist())
    purchases = df[df['type'].isin([1, 2])]
    unique_buys = newest_first_unique(purchases.aid.tolist())

    # "cart order" co-visitation matrix
    from_carts_orders = [
        neighbour
        for aid in unique_aids if aid in top_n_buys
        for neighbour in top_n_buys[aid][:Nbuys]
    ]
    # "buy2buy" co-visitation matrix
    from_buy2buy = [
        neighbour
        for aid in unique_buys if aid in top_n_buy2buy
        for neighbour in top_n_buy2buy[aid][:Nbuy2buy]
    ]

    # Re-rank the pooled candidates by frequency.
    ranked = [aid for aid, _ in Counter(from_carts_orders + from_buy2buy).most_common(Ntop)]

    shortfall = Ntop - len(ranked)
    if shortfall:
        known = set(ranked)
        ranked = ranked + [aid for aid in small_top_orders if aid not in known][:shortfall]
    return ranked


# One candidate list per session; 15 and 15 are passed as Nbuys and Nbuy2buy.
df_buys = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(
    lambda x: suggest_buys(x,15,15)
)
# The fallback list is only read inside suggest_buys.
del small_top_orders
gc.collect()

0

In [None]:
# Long form: one row per (session, candidate) with its position in column n_buys.
df_buys = get_expand_df(df_buys, 'n_buys')
# All candidate lists are built: release the event table and the remaining matrices.
del test_df, top_n_buys, top_n_buy2buy
gc.collect()

'\ndf = pd.DataFrame(df_buys, columns=["lis"]).reset_index()\nex_df_buys = df.explode("lis").reset_index(drop=True)\nex_df_buys[\'n_buys\'] = ex_df_buys.groupby(\'session\').cumcount().astype(\'int8\')\ndel df_buys, df, test_df, top_n_buys, top_n_buy2buy\ngc.collect()\n'

In [None]:
# Union of the click, cart and order candidates; a rank column is NaN where the
# pair was not proposed by that generator.
ex_df_all = df_clicks.merge(df_carts, how = 'outer', on = ['session','lis']).merge(df_buys, how = 'outer', on = ['session','lis'])
del df_clicks, df_carts, df_buys
gc.collect()
# Candidate column gets the same name as the aid column of the history table.
ex_df_all = ex_df_all.rename(columns={'lis': 'aid'})

"\nex_df_all = ex_df_clicks.merge(ex_df_carts, how = 'outer', on = ['session','lis']).merge(ex_df_buys, how = 'outer', on = ['session','lis'])\ndel ex_df_clicks, ex_df_carts, ex_df_buys\ngc.collect()\nex_df_all = ex_df_all.rename(columns={'lis': 'aid'})\n"

In [None]:
# Outer join: keeps history-only pairs, candidate-only pairs and pairs present in both.
test_group = test_group.merge(ex_df_all, how='outer', on = ['session', 'aid'])
del ex_df_all
gc.collect()

0

In [None]:
# Memory reduction. NaN introduced by joins turns columns into floats, so the
# missing values are filled before casting back to integer types.
# Columns are cast one at a time (no whole-frame astype) to avoid copying the frame.
for key_col in ('session', 'aid'):
    test_group[key_col] = test_group[key_col].astype('int32')

# NOTE(review): rows that exist only as candidates get 0 for these session-level
# counts, because the session statistics were joined before the outer merge with
# the candidates. Confirm this is intended.
for count_col in ('session_action_count', 'session_click_count',
                  'session_cart_count', 'session_order_count'):
    test_group[count_col] = test_group[count_col].fillna(0).astype('int16')

# -1 marks "not proposed by this generator".
for rank_col in [f'n_clicks_{depth}' for depth in click_topn_list] + ['n_carts', 'n_buys']:
    test_group[rank_col] = test_group[rank_col].fillna(-1).astype('int8')

In [None]:
print(test_group.shape)
test_group.head()

(143574745, 17)


Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,session_click_count,session_cart_count,session_order_count,session_type_mean,n_clicks_10,n_clicks_20,n_carts,n_buys
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,1,0,0,0.0,-1,-1,-1,-1
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,5,0,0,0.0,3,6,4,4
2,12899780,736515,0.71119,0.71119,0.0,0.71119,0.0,0.0,5,5,0,0,0.0,22,41,36,33
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,5,0,0,0.0,-1,-1,-1,-1
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,5,0,0,0.0,4,12,7,6


### popular items

In [None]:
# aid -> 0-based position in each global ranking.
dic_clicks = {aid: position for position, aid in enumerate(top_clicks)}
dic_carts = {aid: position for position, aid in enumerate(top_carts)}
dic_orders = {aid: position for position, aid in enumerate(top_orders)}

In [None]:
# Position of each row's aid in the global rankings; -1 when the aid is not ranked.
# Series.map performs the dictionary lookup in bulk instead of calling a Python
# lambda once per row.
test_group['clicks_rank'] = test_group['aid'].map(dic_clicks).fillna(-1).astype('int32')
test_group['carts_rank'] = test_group['aid'].map(dic_carts).fillna(-1).astype('int32')
test_group['orders_rank'] = test_group['aid'].map(dic_orders).fillna(-1).astype('int32')

del dic_clicks, dic_carts, dic_orders, top_clicks, top_carts, top_orders
gc.collect()

In [None]:
# Attach the per-aid event counts; aids absent from top_counts_df get NaN here.
test_group = test_group.merge(top_counts_df, how = 'left', on = 'aid')
del top_counts_df
gc.collect()

0

In [None]:
# Missing counts become 0, then go back to compact integer dtypes.
for count_col, count_dtype in (('clicks_count', 'int32'),
                               ('carts_count', 'int16'),
                               ('orders_count', 'int16')):
    test_group[count_col] = test_group[count_col].fillna(0).astype(count_dtype)

In [None]:
test_group

Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,session_click_count,...,n_clicks_10,n_clicks_20,n_carts,n_buys,clicks_rank,carts_rank,orders_rank,clicks_count,carts_count,orders_count
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,1,...,-1,-1,-1,-1,601483,-1,-1,1,0,0
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,5,...,3,6,4,4,1356,1157,1301,333,33,5
2,12899780,736515,0.711190,0.711190,0.0,0.711190,0.0,0.0,5,5,...,22,41,36,33,711,297,459,488,67,8
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,5,...,-1,-1,-1,-1,9547,13316,-1,87,7,0
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,5,...,4,12,7,6,750,1114,1276,473,33,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143574740,14571581,145332,,,,,,,0,0,...,-1,-1,-1,45,162,62,30,1040,142,26
143574741,14571581,1336175,,,,,,,0,0,...,-1,-1,-1,46,298,179,31,775,86,25
143574742,14571581,714524,,,,,,,0,0,...,-1,-1,-1,47,104,56,32,1233,148,25
143574743,14571581,1359971,,,,,,,0,0,...,-1,-1,-1,48,110,60,33,1205,146,25


In [None]:
test_group.dtypes

# 正解ラベル定義

In [None]:
if valid_flag:
    # The label file is the same for every event type: read it once, not per type.
    all_labels = pd.read_parquet(f'{input_path}/test_labels.parquet')
    for t in ['clicks','carts','orders']:
        print(t, '*******')
        # One row per (session, ground-truth aid) for this event type.
        test_labels = all_labels.loc[all_labels['type']==t].rename(columns={'ground_truth': 'aid'})
        test_labels = test_labels.explode('aid')[['session', 'aid']].reset_index(drop=True)
        test_labels['session'] = test_labels['session'].astype('int32')
        test_labels['aid'] = test_labels['aid'].astype('int32')
        test_labels[f'y_{t}'] = True

        # Left join: pairs without a matching label become negatives.
        test_group = test_group.merge(test_labels, how='left', on=['session', 'aid'])
        test_group[f'y_{t}'] = test_group[f'y_{t}'].fillna(0).astype('bool')
    del all_labels, test_labels
    gc.collect()

In [None]:
# Sessions with only negative rows cannot be used for training, so drop them (training only).
if valid_flag:
    label_cols = ['y_clicks', 'y_carts', 'y_orders']
    hits_per_session = test_group.groupby('session')[label_cols].sum()
    # True for a session with at least one positive row for any label.
    true_df = (hits_per_session > 0).any(axis=1)
    session = pd.Series(true_df.index[true_df.values], name='session')
    test_group = test_group.merge(session, how = 'inner', on = 'session')
    del hits_per_session, true_df, session
    gc.collect()

In [None]:
test_group

Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,session_click_count,...,n_clicks_10,n_clicks_20,n_carts,n_buys,clicks_rank,carts_rank,orders_rank,clicks_count,carts_count,orders_count
0,12899779,59625,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,1,...,-1,-1,-1,-1,601483,-1,-1,1,0,0
1,12899780,582732,0.252664,0.252664,0.0,0.252664,0.0,0.0,5,5,...,3,6,4,4,1356,1157,1301,333,33,5
2,12899780,736515,0.711190,0.711190,0.0,0.711190,0.0,0.0,5,5,...,22,41,36,33,711,297,459,488,67,8
3,12899780,973453,0.464086,0.464086,0.0,0.464086,0.0,0.0,5,5,...,-1,-1,-1,-1,9547,13316,-1,87,7,0
4,12899780,1142000,1.071773,1.071773,0.0,1.071773,0.0,0.0,5,5,...,4,12,7,6,750,1114,1276,473,33,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143574740,14571581,145332,,,,,,,0,0,...,-1,-1,-1,45,162,62,30,1040,142,26
143574741,14571581,1336175,,,,,,,0,0,...,-1,-1,-1,46,298,179,31,775,86,25
143574742,14571581,714524,,,,,,,0,0,...,-1,-1,-1,47,104,56,32,1233,148,25
143574743,14571581,1359971,,,,,,,0,0,...,-1,-1,-1,48,110,60,33,1205,146,25


## Save

In [None]:
output_path = '/content/drive/MyDrive/output/otto'
# Create the target directory when it is missing so the write below cannot fail on it.
os.makedirs(output_path, exist_ok=True)

# Validation runs write train_*, prediction runs write test_*.
# File names used by earlier runs (kept for reference):
#   train_{Ntop}.parquet
#   train_{Ntop}_old2.parquet  (Ntop set to 20 while reading the 50-wide i2i files)
split_name = 'train' if valid_flag else 'test'
test_group.to_parquet(f'{output_path}/{split_name}_{Ntop}_tmp.parquet')

In [None]:
test_group.groupby('session').size().describe()

count    1.671803e+06
mean     8.588018e+01
std      6.162662e+00
min      5.500000e+01
25%      8.300000e+01
50%      8.500000e+01
75%      8.800000e+01
max      4.500000e+02
dtype: float64

In [None]:
# Label statistics
# all: 2,212,692
# click: 1,755,534 gt:1
# cart: 306,341 gt: 1.89
# order: 150,817 gt: 2.08

if valid_flag:
    # (name used in the printed lines, label column, total ground-truth items, weight in the total)
    label_specs = (
        ('click', 'y_clicks', 1755534, 0.1),
        ('cart', 'y_carts', 306341*1.89, 0.3),
        ('order', 'y_orders', 150817*2.08, 0.6),
    )

    # Rows with a history score, or ranked below 20 by one of the generators.
    # The filter is identical for the three labels, so it is evaluated once.
    near_top = test_group.eval('score_click >= -1 or score_cart >= -1 or score_buy >= -1 or (-1 < n_clicks_20 and n_clicks_20<20) or (-1 < n_carts and n_carts<20) or (-1 < n_buys and n_buys<20)')

    limits = {}
    for label_name, label_col, n_truth, _ in label_specs:
        print(f'{label_name} summary............')
        n_found = test_group[label_col].sum()
        n_found_near_top = test_group.loc[near_top, label_col].sum()
        # Share of all ground-truth items that appear among the generated rows.
        limits[label_col] = n_found / n_truth
        print('mean:', test_group[label_col].mean(), 'sum:', n_found,'limit:', limits[label_col])
        print(f'{label_name}<20 sum:',n_found_near_top, 'limit:', n_found_near_top / n_truth)

    print('Total limit recall............')
    print(0.1 * limits['y_clicks'] + 0.3 * limits['y_carts'] + 0.6 * limits['y_orders'])

click summary............
mean: 0.010768325956732374 sum: 1016195 limit: 0.5788523605922756
click<20 sum: 964628 limit: 0.5494783923296274
cart summary............
mean: 0.002913830937912984 sum: 274975 limit: 0.47492636633496005
cart<20 sum: 253067 limit: 0.4370877016066527
order summary............
mean: 0.0022968903534769662 sum: 216755 limit: 0.6909641129009636
order<20 sum: 209508 limit: 0.6678623762573185
Total limit recall............
0.6149416137002938


In [None]:
# The y_* label columns are only created when valid_flag is True; without this
# guard the cell raises KeyError in prediction mode.
if valid_flag:
    print(test_group.groupby('session')['y_clicks'].agg('sum').describe())
    #test_group.groupby('session')['y_carts'].agg('mean').describe()
    #test_group.groupby('session')['y_orders'].agg('mean').describe()