In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
#baseline ver
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
import os, sys, pickle, glob, gc
from collections import Counter
#import cudf, 
import itertools
#print('We will use RAPIDS version',cudf.__version__)

# test data

In [3]:
INPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/input/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle/2022/OTTO/output/candidate/popular/'

In [4]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

def load_test():    
    dfs = []
    for e, chunk_file in enumerate(glob.glob(INPUT_DIR+'cris_baseline/otto-validation/test_parquet/*')):
        chunk = pd.read_parquet(chunk_file)
        chunk.ts = (chunk.ts/1000).astype('int32')
        chunk['type'] = chunk['type'].map(type_labels).astype('int8')
        dfs.append(chunk)
    return pd.concat(dfs).reset_index(drop=True) #.astype({"ts": "datetime64[ms]"})

#test_df
test_df = load_test()
print('Test data has shape',test_df.shape)
test_df

Test data has shape (7683577, 4)


Unnamed: 0,session,aid,ts,type
0,11188591,1485015,1661165822,0
1,11188591,1562739,1661165866,0
2,11188591,1485015,1661165882,0
3,11188591,1441634,1661165891,0
4,11188591,1485015,1661165901,0
...,...,...,...,...
7683572,12629597,744187,1661646518,0
7683573,12629597,714524,1661646521,0
7683574,12629597,1844379,1661646612,0
7683575,12629598,894387,1661646273,0


#test_labels

In [5]:
test_labels = pd.read_parquet('/content/drive/MyDrive/kaggle/2022/OTTO/input/cris_baseline/otto-validation/test_labels.parquet')
test_labels

Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


# clicks candidate

## covisi matrix

In [6]:
%%time

# LOAD THREE CO-VISITATION MATRICES
def pqt_to_dict(df):
    return df.groupby('aid_x').aid_y.apply(list).to_dict()

#読み込み
DISK_PIECES = 4
top_20_clicks = pqt_to_dict( pd.read_parquet(INPUT_DIR+f'cris_baseline/output/top_20_clicks_v{VER}_0.pqt') )
for k in range(1,DISK_PIECES): 
    top_20_clicks.update( pqt_to_dict( pd.read_parquet(INPUT_DIR+f'cris_baseline/output/top_20_clicks_v{VER}_{k}.pqt') ) )

CPU times: user 38.3 s, sys: 3.07 s, total: 41.4 s
Wall time: 49 s


In [7]:
%%time
def covis_matrix_click_cart_order_to_clicks_timeweighting(df):
    aids = df.aid.tolist()
    unique_aids = list(dict.fromkeys(aids[::-1] ))
    results =itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks])
    results = list(itertools.chain(*[top_20_clicks[aid] for aid in unique_aids if aid in top_20_clicks]))
    return results

pred_df = test_df.sort_values(["session", "ts"]).groupby(["session"]).apply(lambda x: covis_matrix_click_cart_order_to_clicks_timeweighting(x))
pred_df = pd.DataFrame(pred_df, columns=["labels"]).reset_index()
pred_df["labels"] = pred_df.labels.apply(lambda x: " ".join(map(str,x)))
pred_df


#pred_df_clicks['labels']が空のレコードはrecall_evaluateの[int(i) for i in x.split(' ')]　でintにできなくてエラーになるので適当なリストで埋めておく。
for i in range(len(pred_df)):
  if len(pred_df['labels'][i]) == 0:
    pred_df['labels'][i] = '1 1' #２個以上の数字で空白を作らないとrecall_evaluateのx.split(' ')できなくてエラーになる。

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 4min 10s, sys: 2.96 s, total: 4min 13s
Wall time: 4min 13s


# recall

In [8]:
%%time

#clicks,cart,orderのracall計算
def recall_k_evaluate(test_labels, pred_df, session_types, top_k):
    sub = pred_df.copy()
    #print(sub)
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')][:top_k])
    test_labels = test_labels.loc[test_labels['type']==session_types]
    test_labels = test_labels.merge(sub, how='left', on=['session'])
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1) 
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,top_k)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print(f'{session_types} recall =',recall)

#clicksの場合
#test label(CV変わるときのため), pred_df_clicks, type, top_kは変わるので引数にする。
recall_k_evaluate(test_labels, pred_df, 'clicks', 20)

clicks recall = 0.2786132310738499
CPU times: user 1min 14s, sys: 1.79 s, total: 1min 16s
Wall time: 1min 16s


In [9]:
test_labels

Unnamed: 0,session,type,ground_truth
0,11098528,clicks,[1679529]
1,11098528,carts,[1199737]
2,11098528,orders,"[990658, 950341, 1462506, 1561739, 907564, 369..."
3,11098529,clicks,[1105029]
4,11098530,orders,[409236]
...,...,...,...
2212687,12899774,clicks,[1399483]
2212688,12899775,clicks,[1760714]
2212689,12899776,clicks,[1737908]
2212690,12899777,clicks,[384045]


In [11]:
pred_df

Unnamed: 0,session,labels
0,11098528,588923 1732105 571762 884502 1157882 876129 11...
1,11098529,459126 1339838 1544564 217742 1694360 1383767 ...
2,11098530,1603001 963957 254154 583026 167895 364155 752...
3,11098531,452188 1365569 698990 1136142 1553691 653835 6...
4,11098532,1202618 1159379 77906 1704066 1212859 669555 7...
...,...,...
1801246,12899774,1539309 819288 95488 771913 270852 743977 3149...
1801247,12899775,1760714 1255910 1163166 832192 29735 1498443 1...
1801248,12899776,1401030 1440959 1607333 1144446 364185 1150130...
1801249,12899777,1308634 1688215 703474 395762 1486067 211171 9...
