In [1]:
import glob
import itertools
import os
import pickle
from collections import Counter
from datetime import datetime

import cudf
import numpy as np
import pandas as pd
from IPython.display import display
pd.options.display.max_rows = 999


def read_file(f):
    """Read one parquet file with cuDF and rescale its ``ts`` column.

    Args:
        f: Path of the parquet file to read.

    Returns:
        The frame returned by ``cudf.read_parquet`` with ``ts`` divided by
        1000 and cast to ``int32`` (presumably milliseconds -> seconds;
        confirm against the data source).
    """
    frame = cudf.read_parquet(f)
    rescaled_ts = frame["ts"] / 1000
    frame["ts"] = rescaled_ts.astype("int32")
    return frame

def load_files(file_path):
    """Load every parquet file matching a glob pattern into one pandas frame.

    Args:
        file_path: Glob pattern selecting the parquet chunk files,
            e.g. ``"../input/otto-validation/test_parquet/*"``.

    Returns:
        A single ``pandas.DataFrame`` with the chunks concatenated in sorted
        path order, a fresh 0..n-1 index, and the ``ts`` column cast to
        ``datetime64[ms]``.

    Raises:
        ValueError: If the pattern matches no files.
    """
    # glob returns matches in arbitrary (filesystem) order; sort so the row
    # order of the concatenated frame is reproducible between runs.
    chunk_files = sorted(glob.glob(file_path))
    if not chunk_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but name the offending pattern.
        raise ValueError(f"No parquet files match pattern: {file_path!r}")

    chunks = [pd.read_parquet(chunk_file) for chunk_file in chunk_files]
    return pd.concat(chunks, ignore_index=True).astype({"ts": "datetime64[ms]"})

In [2]:
# Load the train and test parquet chunks (train first, then test).
train_df, test_df = map(
    load_files,
    (
        "../input/otto-validation/train_parquet/*",
        "../input/otto-validation/test_parquet/*",
    ),
)

In [3]:
# Load the pickled per-session evaluation tables, one per event type.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files produced by a trusted pipeline.
with open("../output/cv/test_labels_clicks.pkl", "rb") as fh:
    click_df = pickle.load(fh)
with open("../output/cv/test_labels_carts.pkl", "rb") as fh:
    cart_df = pickle.load(fh)
with open("../output/cv/test_labels_orders.pkl", "rb") as fh:
    order_df = pickle.load(fh)

print(click_df.shape)
print(cart_df.shape)
print(order_df.shape)

(1755534, 10)
(306341, 10)
(150817, 10)


In [4]:
# Read the label table and record how many ground-truth items each row holds.
test_labels = pd.read_parquet("../input/otto-validation/test_labels.parquet").assign(
    gt_len=lambda labels: labels["ground_truth"].str.len()
)

In [5]:
# Load every parquet chunk matched by this glob into a single frame;
# `all_df` is filtered per session in the In [53] inspection cell.
file_path = "../input/otto-chunk-data-inparquet-format/*_parquet/*"
all_df = load_files(file_path)

In [6]:
# Preview the cart evaluation table loaded from test_labels_carts.pkl.
cart_df

Unnamed: 0,session,type,ground_truth,session_type,top_n,top,labels,hits,gt_count,recall
0,11098528,carts,[1199737],11098528_carts,11830 1732105 588923 884502 876129 1157882 571...,876493 1406660 1236775 166037,"[11830, 1732105, 588923, 884502, 876129, 11578...",0,1,0.000000
1,11098533,carts,"[108676, 1406660, 988295, 1118792, 1366413, 15...",11098533_carts,1165015 385390 978918 1074173 765030 833149 16...,,"[1165015, 385390, 978918, 1074173, 765030, 833...",0,20,0.000000
2,11098534,carts,[223062],11098534_carts,908024 223062 1342293 1607945 1449202 530377 1...,,"[908024, 223062, 1342293, 1607945, 1449202, 53...",1,1,1.000000
3,11098536,carts,[649909],11098536_carts,1320019 1808377 442293 1539309 743977 986954 7...,,"[1320019, 1808377, 442293, 1539309, 743977, 98...",0,1,0.000000
4,11098538,carts,"[1263747, 668757, 1550143]",11098538_carts,1711586 1550143 1263747 703265 717871 1289587 ...,,"[1711586, 1550143, 1263747, 703265, 717871, 12...",2,3,0.666667
...,...,...,...,...,...,...,...,...,...,...
306336,12899644,carts,[822934],12899644_carts,436912 822934 371484 1032986 24547 231487 1260...,,"[436912, 822934, 371484, 1032986, 24547, 23148...",1,1,1.000000
306337,12899676,carts,[35328],12899676_carts,35328 1780088 182264 1784638 888801 980008 890...,,"[35328, 1780088, 182264, 1784638, 888801, 9800...",1,1,1.000000
306338,12899686,carts,[1627951],12899686_carts,1627951 1626731 348028 1238179 53600 447841 91...,876493 1406660 1236775 166037,"[1627951, 1626731, 348028, 1238179, 53600, 447...",1,1,1.000000
306339,12899713,carts,[1097818],12899713_carts,692474 213517 1085134 1511908 690107 788635 25...,876493 1406660 1236775 166037,"[692474, 213517, 1085134, 1511908, 690107, 788...",0,1,0.000000


In [7]:
# Aggregate cart hit ratio: total `hits` divided by total `gt_count` over all rows.
cart_df["hits"].sum() / cart_df["gt_count"].sum()

0.41176480792114933

In [8]:
# For every row, collect the ground-truth aids that do not appear in `labels`.
cart_df["miss"] = [
    list(set(truth) - set(predicted))
    for truth, predicted in zip(cart_df["ground_truth"], cart_df["labels"])
]

In [9]:
# Display the table again to check the `miss` column added in the previous cell.
cart_df

Unnamed: 0,session,type,ground_truth,session_type,top_n,top,labels,hits,gt_count,recall,miss
0,11098528,carts,[1199737],11098528_carts,11830 1732105 588923 884502 876129 1157882 571...,876493 1406660 1236775 166037,"[11830, 1732105, 588923, 884502, 876129, 11578...",0,1,0.000000,[1199737]
1,11098533,carts,"[108676, 1406660, 988295, 1118792, 1366413, 15...",11098533_carts,1165015 385390 978918 1074173 765030 833149 16...,,"[1165015, 385390, 978918, 1074173, 765030, 833...",0,20,0.000000,"[108676, 988295, 1366413, 1233050, 1189919, 14..."
2,11098534,carts,[223062],11098534_carts,908024 223062 1342293 1607945 1449202 530377 1...,,"[908024, 223062, 1342293, 1607945, 1449202, 53...",1,1,1.000000,[]
3,11098536,carts,[649909],11098536_carts,1320019 1808377 442293 1539309 743977 986954 7...,,"[1320019, 1808377, 442293, 1539309, 743977, 98...",0,1,0.000000,[649909]
4,11098538,carts,"[1263747, 668757, 1550143]",11098538_carts,1711586 1550143 1263747 703265 717871 1289587 ...,,"[1711586, 1550143, 1263747, 703265, 717871, 12...",2,3,0.666667,[668757]
...,...,...,...,...,...,...,...,...,...,...,...
306336,12899644,carts,[822934],12899644_carts,436912 822934 371484 1032986 24547 231487 1260...,,"[436912, 822934, 371484, 1032986, 24547, 23148...",1,1,1.000000,[]
306337,12899676,carts,[35328],12899676_carts,35328 1780088 182264 1784638 888801 980008 890...,,"[35328, 1780088, 182264, 1784638, 888801, 9800...",1,1,1.000000,[]
306338,12899686,carts,[1627951],12899686_carts,1627951 1626731 348028 1238179 53600 447841 91...,876493 1406660 1236775 166037,"[1627951, 1626731, 348028, 1238179, 53600, 447...",1,1,1.000000,[]
306339,12899713,carts,[1097818],12899713_carts,692474 213517 1085134 1511908 690107 788635 25...,876493 1406660 1236775 166037,"[692474, 213517, 1085134, 1511908, 690107, 788...",0,1,0.000000,[1097818]


In [15]:
# NOTE(review): `session_id` is not assigned anywhere above this cell in the
# visible notebook; it is set in the In [53] cell further down, so this cell
# only runs after that one has been executed.
# Fetch the `labels` entry (predicted aids) for the currently selected session.
preds = cart_df.loc[cart_df["session"] == session_id, "labels"].item()

In [17]:
# Predicted aids that occur among the session's events.
# NOTE(review): `target_all` is only built in the In [53] cell further down.
set(preds) & set(target_all["aid"].unique())

{11830, 588923}

In [22]:
# Rows of test_df for the selected session.
# NOTE(review): `target_test` is only assigned in the In [53] cell further down.
target_test

Unnamed: 0,session,aid,ts,type
4219258,12899739,1379999,2022-08-28 21:58:45,clicks
4219259,12899739,1379999,2022-08-28 21:59:28,carts
4219260,12899739,992635,2022-08-28 21:59:31,clicks


In [53]:
# Inspect one randomly chosen cart session: line its events in `all_df` up
# against the rows present in `test_df`, the ground-truth carts and the predictions.
# NOTE: `sample()` is deliberately unseeded so each run shows a different
# session; pass `random_state=` to revisit a specific draw.
session_id = cart_df.sample()["session"].item()

# Look the session's evaluation row up once; `.item()` raises if the session
# is missing from, or duplicated in, cart_df.
session_row = cart_df.loc[cart_df["session"] == session_id]
gt = session_row["ground_truth"].item()
predicted_aids = session_row["labels"].item()

target_test = test_df[test_df["session"] == session_id]
# Order the events chronologically (stable sort keeps the source order of
# ties) so the "first len(target_test) rows are visible" rule below does not
# depend on the order in which chunk files were concatenated.
target_all = (
    all_df[all_df["session"] == session_id]
    .sort_values("ts", kind="mergesort")
    .reset_index(drop=True)
)

# The first len(target_test) events are treated as the visible part of the session.
is_visible = pd.Series(target_all.index < len(target_test), index=target_all.index)
# Ground-truth rows: cart events on a ground-truth aid that happen after the
# visible part (a visible cart row is model input, not a label).
is_gt = ~is_visible & (target_all["type"] == "carts") & target_all["aid"].isin(gt)
is_predicted = target_all["aid"].isin(predicted_aids)

# 0/1 indicator columns, computed vectorised instead of row-by-row `apply`
# (which also fails on an empty frame).
target_all = target_all.assign(
    visible=is_visible.astype("int64"),
    gt=is_gt.astype("int64"),
    predict=is_predicted.astype("int64"),
    correct=(is_gt & is_predicted).astype("int64"),
    missed=(is_gt & ~is_predicted).astype("int64"),
)
display(target_all)
display(session_row)

Unnamed: 0,session,aid,ts,type,visible,gt,predict,correct,missed
0,11713989,504361,2022-08-24 08:08:30.688,clicks,1,0,1,0,0
1,11713989,504361,2022-08-24 08:08:46.334,carts,1,0,1,0,0
2,11713989,1021135,2022-08-24 08:08:49.091,clicks,1,0,1,0,0
3,11713989,46906,2022-08-24 08:09:04.970,carts,0,1,1,1,0
4,11713989,1150262,2022-08-24 08:10:13.144,clicks,0,0,0,0,0
5,11713989,1150262,2022-08-24 08:10:25.046,carts,0,1,0,0,1
6,11713989,493911,2022-08-24 08:10:53.570,clicks,0,0,0,0,0
7,11713989,814357,2022-08-24 08:16:54.799,clicks,0,0,0,0,0
8,11713989,814357,2022-08-24 08:17:22.588,carts,0,1,0,0,1
9,11713989,1081705,2022-08-24 08:21:46.670,clicks,0,0,0,0,0


Unnamed: 0,session,type,ground_truth,session_type,top_n,top,labels,hits,gt_count,recall,miss
108395,11713989,carts,"[46906, 814357, 1150262]",11713989_carts,1021135 504361 559816 1457603 281151 24822 155...,,"[1021135, 504361, 559816, 1457603, 281151, 248...",1,3,0.333333,"[814357, 1150262]"
