In [1]:
import os
import pickle
import pandas as pd
import glob
import cudf
from datetime import datetime
import numpy as np
# Let pandas print up to 999 rows before truncating displayed output.
pd.set_option("display.max_rows", 999)


def read_file(f):
    """Read a single parquet file with cuDF and rescale its ``ts`` column.

    The ``ts`` values are divided by 1000 and cast to ``int32``
    (presumably milliseconds -> seconds; confirm against the data source).

    Parameters
    ----------
    f : path of the parquet file, passed straight to ``cudf.read_parquet``.

    Returns
    -------
    The loaded frame with ``ts`` replaced by the rescaled int32 values.
    """
    frame = cudf.read_parquet(f)
    scaled_ts = frame["ts"] / 1000
    frame["ts"] = scaled_ts.astype("int32")
    return frame

def load_files(file_path):
    """Load every parquet file matching a glob pattern into one pandas DataFrame.

    Parameters
    ----------
    file_path : str
        Glob pattern, e.g. "./input/otto-validation/test_parquet/*".

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matched files with a fresh 0..n-1
        index and the ``ts`` column cast to ``datetime64[ms]``.

    Raises
    ------
    ValueError
        If the pattern matches no files.
    """
    # glob.glob returns matches in arbitrary order; sort so the row order of
    # the combined frame is reproducible across runs and machines.
    chunk_files = sorted(glob.glob(file_path))
    if not chunk_files:
        # pd.concat would raise a ValueError here anyway; say which pattern
        # was responsible instead of "No objects to concatenate".
        raise ValueError(f"No files match pattern: {file_path!r}")
    chunks = [pd.read_parquet(chunk_file) for chunk_file in chunk_files]
    df = pd.concat(chunks, ignore_index=True).astype({"ts": "datetime64[ms]"})
    return df

In [2]:
# Load the train and test splits of the otto-validation data (train first).
train_df, test_df = (
    load_files(pattern)
    for pattern in (
        "./input/otto-validation/train_parquet/*",
        "./input/otto-validation/test_parquet/*",
    )
)

In [3]:
# Load the pickled per-type label frames produced by the CV run.
# Each file is opened in a `with` block so its handle is closed right after
# loading (the bare `pickle.load(open(...))` form leaks the handle).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("output/cv/test_labels_clicks.pkl", "rb") as pkl_file:
    click_df = pickle.load(pkl_file)
with open("output/cv/test_labels_carts.pkl", "rb") as pkl_file:
    cart_df = pickle.load(pkl_file)
with open("output/cv/test_labels_orders.pkl", "rb") as pkl_file:
    order_df = pickle.load(pkl_file)

print(click_df.shape)
print(cart_df.shape)
print(order_df.shape)

(1755534, 10)
(306341, 10)
(150817, 10)


In [8]:
# Read the validation labels and record how many entries each row's
# ground_truth holds.
test_labels = pd.read_parquet("./input/otto-validation/test_labels.parquet").assign(
    gt_len=lambda labels: labels["ground_truth"].str.len()
)

In [9]:
# First 20 aids, ordered by value_counts, for the click and order events of test_df.
top_clicks, top_orders = (
    test_df.loc[test_df["type"] == event_type, "aid"].value_counts().index.values[:20]
    for event_type in ("clicks", "orders")
)

In [10]:
# Length of each row's `top` list.
cart_df["top_len"] = cart_df["top"].str.len()
# Row by row, the values present in both `ground_truth` and `top`.
cart_df["intersection_top_gt"] = pd.Series(
    [
        list(set(truth) & set(top_items))
        for truth, top_items in zip(cart_df["ground_truth"], cart_df["top"])
    ],
    index=cart_df.index,
)
cart_df["intersection_top_gt_len"] = cart_df["intersection_top_gt"].str.len()
# Distribution of overlap sizes, restricted to rows with a non-empty `top`.
cart_df.loc[cart_df["top_len"] > 0, "intersection_top_gt_len"].value_counts()

0    85085
1      768
2        1
Name: intersection_top_gt_len, dtype: int64

In [11]:
# Length of each row's `top` list.
click_df["top_len"] = click_df["top"].str.len()
# Row by row, the values present in both `ground_truth` and `top`.
click_df["intersection_top_gt"] = pd.Series(
    [
        list(set(truth) & set(top_items))
        for truth, top_items in zip(click_df["ground_truth"], click_df["top"])
    ],
    index=click_df.index,
)
click_df["intersection_top_gt_len"] = click_df["intersection_top_gt"].str.len()
# Distribution of overlap sizes, restricted to rows with a non-empty `top`.
click_df.loc[click_df["top_len"] > 0, "intersection_top_gt_len"].value_counts()

0    25988
1       28
Name: intersection_top_gt_len, dtype: int64

In [12]:
# Length of each row's `top` list.
order_df["top_len"] = order_df["top"].str.len()
# Row by row, the values present in both `ground_truth` and `top`.
order_df["intersection_top_gt"] = pd.Series(
    [
        list(set(truth) & set(top_items))
        for truth, top_items in zip(order_df["ground_truth"], order_df["top"])
    ],
    index=order_df.index,
)
order_df["intersection_top_gt_len"] = order_df["intersection_top_gt"].str.len()
# Distribution of overlap sizes, restricted to rows with a non-empty `top`.
order_df.loc[order_df["top_len"] > 0, "intersection_top_gt_len"].value_counts()

0    28299
1      289
Name: intersection_top_gt_len, dtype: int64

In [13]:
# Peek at the first cart rows whose `top` list is non-empty.
cart_df.loc[cart_df["top_len"] > 0].head()

Unnamed: 0,session,type,ground_truth,session_type,top_n,top,labels,hits,gt_count,recall,top_len,intersection_top_gt,intersection_top_gt_len
0,11098528,carts,[1199737],11098528_carts,"[11830, 1732105, 588923, 884502, 876129, 11578...","[876493, 1406660, 1236775, 166037]","[11830, 1732105, 588923, 884502, 876129, 11578...",0,1,0.0,4,[],0
11,11098555,carts,[1849620],11098555_carts,"[1849620, 205767, 161054, 1699061, 66403, 3297...","[876493, 1406660, 1236775, 166037]","[1849620, 205767, 161054, 1699061, 66403, 3297...",1,1,1.0,4,[],0
17,11098568,carts,[751895],11098568_carts,"[1808304, 519330, 1399038, 308483, 1619978, 10...","[876493, 1406660, 1236775, 166037]","[1808304, 519330, 1399038, 308483, 1619978, 10...",1,1,1.0,4,[],0
22,11098580,carts,[906875],11098580_carts,"[255676, 126566, 550852, 1142902, 216819, 1219...","[876493, 1406660, 1236775, 166037]","[255676, 126566, 550852, 1142902, 216819, 1219...",0,1,0.0,4,[],0
33,11098606,carts,[1629491],11098606_carts,"[1216616, 385065, 1782099, 184976, 1160293, 11...","[876493, 1406660, 1236775, 166037]","[1216616, 385065, 1782099, 184976, 1160293, 11...",0,1,0.0,4,[],0


In [15]:
# 1199737 is the ground-truth aid of session 11098528 (first cart row shown
# above); check whether it is among the top-20 order aids taken from test_df.
1199737 in top_orders

False

In [16]:
# Same aid (1199737), checked against the top-20 click aids taken from test_df.
1199737 in top_clicks

False

In [17]:
# First 20 aids, ordered by value_counts, among the cart events of test_df.
top_carts = (
    test_df["aid"][test_df["type"] == "carts"]
    .value_counts()
    .index
    .values[:20]
)

In [18]:
# Same aid (1199737), checked against the top-20 cart aids taken from test_df.
1199737 in top_carts

False

In [26]:
# Rebinds top_orders: it is now the value_counts Series (aid -> count) for the
# first 30 order aids of train_df, not the earlier array built from test_df.
top_orders = (
    train_df.loc[train_df["type"] == "orders", "aid"]
    .value_counts()
    .head(30)
)

In [27]:
# top_orders is the value_counts Series built from train_df in the previous
# cell; `in` on a Series tests its index labels (the aids), not the counts.
1199737 in top_orders

False

In [None]:
# Rows of train_df belonging to session 11098528.
train_df.loc[train_df["session"] == 11098528]

In [None]:
# Rows of test_df belonging to session 11098528.
test_df.loc[test_df["session"] == 11098528]

In [None]:
# Label rows of test_labels for session 11098528.
test_labels.loc[test_labels["session"] == 11098528]

In [None]:
# Rows of click_df for session 11098528.
click_df.loc[click_df["session"] == 11098528]

In [None]:
# Display top_orders as currently bound (the train_df value_counts Series if
# the cell that rebuilds it from train_df was the last one to assign it).
top_orders

In [None]:
# Load every file matched by the glob below into a single frame.
# Alternative input location (currently disabled):
# file_path = "./input/otto-chunk-data-inparquet-format/*_parquet/*"
file_path = "./input/otto-validation/*_parquet/*"
all_df = load_files(file_path)

In [None]:
# (rows, columns) of the combined frame.
all_df.shape

In [None]:
# NOTE(review): repeats the shape check from the previous cell; candidate for removal.
all_df.shape

In [None]:
# Rows of the combined frame belonging to session 11098528.
all_df.loc[all_df["session"] == 11098528]