In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
#baseline ver
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
#import cudf, 
import itertools
#print('We will use RAPIDS version',cudf.__version__)

# test data

In [3]:
INPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/input/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/output/candidate/popular/'

In [4]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(INPUT_DIR+'cris_baseline/otto-validation/test_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

#test_df
test_df = load_test()
print('Test data has shape',test_df.shape)
test_df

Test data has shape (7683577, 4)


Unnamed: 0,session,aid,ts,type
0,11188591,1485015,1661165822,0
1,11188591,1562739,1661165866,0
2,11188591,1485015,1661165882,0
3,11188591,1441634,1661165891,0
4,11188591,1485015,1661165901,0
...,...,...,...,...
7683572,12629597,744187,1661646518,0
7683573,12629597,714524,1661646521,0
7683574,12629597,1844379,1661646612,0
7683575,12629598,894387,1661646273,0


#test_labels

In [5]:

test_labels = pd.read_parquet('/content/drive/MyDrive/kaggle/2022/OTTO/input/cris_baseline/otto-validation/test_labels.parquet')
test_labels

Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


# test top 20clicks

In [6]:
%%time
top_clicks = test_df.loc[test_df['type']==0,'aid'].value_counts().index.values[:20]
top_clicks

#test top 20clicksは全てのセッションで同じ商品コード予測するだけなので、groupby周りの処理は不要だけど一応baselineのcodeに合わせる。
pred_df = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(lambda x: top_clicks)
pred_df = pd.DataFrame(pred_df, columns=["labels"]).reset_index()
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))

CPU times: user 33 s, sys: 821 ms, total: 33.8 s
Wall time: 33.7 s


# recall

In [7]:
%%time

#clicks,cart,orderのracall計算
def recall_k_evaluate(test_labels, pred_df, session_types, top_k):
    sub = pred_df.copy()
    #print(sub)
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')][:top_k])
    test_labels = test_labels.loc[test_labels['type']==session_types]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1) 
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,top_k)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print(f'{session_types} recall =',recall)

#clicksの場合
#test label(CV変わるときのため), pred_df_clicks, type, top_kは変わるので引数にする。
recall_k_evaluate(test_labels, pred_df, 'clicks', 20)

clicks recall = 0.017962625617048717
CPU times: user 53.6 s, sys: 1.41 s, total: 55 s
Wall time: 54.8 s


# save

In [9]:
with open(OUTPUT_DIR + 'top_clicks.pkl', 'wb') as f:
  pickle.dump(top_clicks, f)