In [6]:
import os
import pickle
import pandas as pd
import glob
import cudf
from datetime import datetime
import numpy as np
pd.options.display.max_rows = 999


def read_file(f):
    df = cudf.read_parquet(f)
    df.ts = (df.ts / 1000).astype("int32")
    return df

def load_files(file_path):
    dfs = []
    # file_path = "./input/otto-validation/test_parquet/*"
    for e, chunk_file in enumerate(glob.glob(file_path)):
        chunk = pd.read_parquet(chunk_file)
        dfs.append(chunk)
    df = pd.concat(dfs).reset_index(drop=True).astype({"ts": "datetime64[ms]"})
    return df

In [7]:
train_df = load_files("../input/otto-validation/train_parquet/*")
test_df = load_files("../input/otto-validation/test_parquet/*")

In [9]:
click_df = pickle.load(open("../output/cv/test_labels_clicks.pkl", "rb"))
cart_df = pickle.load(open("../output/cv/test_labels_carts.pkl", "rb"))
order_df = pickle.load(open("../output/cv/test_labels_orders.pkl", "rb"))

print(click_df.shape)
print(cart_df.shape)
print(order_df.shape)

(1755534, 10)
(306341, 10)
(150817, 10)


In [11]:
test_labels = pd.read_parquet("../input/otto-validation/test_labels.parquet")
test_labels["gt_len"] = test_labels["ground_truth"].str.len()

In [None]:
file_path = "../input/otto-chunk-data-inparquet-format/*_parquet/*"
all_df = load_files(file_path)

In [12]:
cart_df

Unnamed: 0,session,type,ground_truth,session_type,top_n,top,labels,hits,gt_count,recall
0,11098528,carts,[1199737],11098528_carts,11830 1732105 588923 884502 876129 1157882 571...,876493 1406660 1236775 166037,"[11830, 1732105, 588923, 884502, 876129, 11578...",0,1,0.000000
1,11098533,carts,"[108676, 1406660, 988295, 1118792, 1366413, 15...",11098533_carts,1165015 385390 978918 1074173 765030 833149 16...,,"[1165015, 385390, 978918, 1074173, 765030, 833...",0,20,0.000000
2,11098534,carts,[223062],11098534_carts,908024 223062 1342293 1607945 1449202 530377 1...,,"[908024, 223062, 1342293, 1607945, 1449202, 53...",1,1,1.000000
3,11098536,carts,[649909],11098536_carts,1320019 1808377 442293 1539309 743977 986954 7...,,"[1320019, 1808377, 442293, 1539309, 743977, 98...",0,1,0.000000
4,11098538,carts,"[1263747, 668757, 1550143]",11098538_carts,1711586 1550143 1263747 703265 717871 1289587 ...,,"[1711586, 1550143, 1263747, 703265, 717871, 12...",2,3,0.666667
...,...,...,...,...,...,...,...,...,...,...
306336,12899644,carts,[822934],12899644_carts,436912 822934 371484 1032986 24547 231487 1260...,,"[436912, 822934, 371484, 1032986, 24547, 23148...",1,1,1.000000
306337,12899676,carts,[35328],12899676_carts,35328 1780088 182264 1784638 888801 980008 890...,,"[35328, 1780088, 182264, 1784638, 888801, 9800...",1,1,1.000000
306338,12899686,carts,[1627951],12899686_carts,1627951 1626731 348028 1238179 53600 447841 91...,876493 1406660 1236775 166037,"[1627951, 1626731, 348028, 1238179, 53600, 447...",1,1,1.000000
306339,12899713,carts,[1097818],12899713_carts,692474 213517 1085134 1511908 690107 788635 25...,876493 1406660 1236775 166037,"[692474, 213517, 1085134, 1511908, 690107, 788...",0,1,0.000000


In [14]:
cart_df["hits"].sum() / cart_df["gt_count"].sum()

0.41176480792114933

In [46]:
cart_df["miss"] = cart_df.apply(lambda x: list(set(x["ground_truth"]) - set(x["labels"])), axis=1)

In [53]:
from collections import Counter
import itertools

In [56]:
c = Counter(list(itertools.chain.from_iterable(cart_df["miss"].values)))

In [59]:
c.most_common(20)

[(122983, 135),
 (1497089, 113),
 (1406660, 106),
 (1022566, 104),
 (332654, 90),
 (1462420, 89),
 (1445562, 88),
 (322370, 84),
 (485256, 76),
 (923948, 76),
 (1006198, 75),
 (162064, 74),
 (1531805, 73),
 (801774, 73),
 (165160, 73),
 (756588, 72),
 (231487, 72),
 (1025795, 72),
 (544144, 72),
 (776937, 66)]

In [40]:
session_id = 11098528

gt = cart_df.loc[cart_df["session"] == session_id, "ground_truth"].item()
target_test = test_df[test_df["session"] == session_id]
target_all = all_df[all_df["session"] == session_id]
target_all = target_all.reset_index(drop=True)
target_all.loc[:len(target_test), "visible"] = 1
target_all.loc[(target_all["type"] == "carts") & (target_all["aid"].isin(gt)), "gt"] = 1
# target_all[:len(target_test)]["aid"].values
# target_all[len(target_test):]
target_all

Unnamed: 0,session,aid,ts,type,visible,gt
0,11098528,11830,2022-08-21 22:00:00.060,clicks,1.0,
1,11098528,1679529,2022-08-21 22:03:37.928,clicks,1.0,
2,11098528,92401,2022-08-21 22:04:34.036,clicks,,
3,11098528,1055218,2022-08-21 22:06:38.121,clicks,,
4,11098528,1561739,2022-08-21 22:07:24.692,clicks,,
5,11098528,1679529,2022-08-21 22:08:28.739,clicks,,
6,11098528,1033148,2022-08-21 22:09:14.935,clicks,,
7,11098528,440367,2022-08-21 22:09:46.475,clicks,,
8,11098528,11830,2022-08-21 22:10:44.461,clicks,,
9,11098528,1033148,2022-08-21 22:12:12.068,clicks,,
