In [1]:
import pandas as pd
import glob

def calc_metrics(pred_df, output_dir):
    score_potential = 0
    weights = {"clicks": 0.10, "carts": 0.30, "orders": 0.60}
    for t in ["clicks", "carts", "orders"]:
        sub = pred_df.loc[pred_df["type"] == t].copy()
        sub = sub.groupby("session")["aid"].apply(list)
        test_labels = pd.read_parquet("../input/otto-validation/test_labels.parquet")
        test_labels = test_labels.loc[test_labels["type"] == t]
        test_labels = test_labels.merge(sub, how="left", on=["session"])
        test_labels = test_labels[test_labels["aid"].notnull()]
        # potential recall
        test_labels["hits"] = test_labels.apply(lambda df: len(set(df["ground_truth"]).intersection(set(df["aid"]))), axis=1)
        test_labels["gt_count"] = test_labels.ground_truth.str.len().clip(0, 20)
        test_labels["recall"] = test_labels["hits"] / test_labels["gt_count"]
        recall = test_labels["hits"].sum() / test_labels["gt_count"].sum()
        score_potential += weights[t] * recall
        print(f"{t} recall={recall}")
    print(f"total recall={score_potential}")
    
    
def get_w2v_pred_df(exp_name):
    pred_df = pd.read_csv(f"../output/word2vec/{exp_name}/cv/pred_df.csv")
    prediction_dfs = []
    for st in ["clicks", "carts", "orders"]:
        modified_predictions = pred_df.copy()
        modified_predictions["type"] = st
        prediction_dfs.append(modified_predictions)
    prediction_dfs = pd.concat(prediction_dfs).reset_index(drop=True)
    return prediction_dfs

def load_test():
    dfs = []
    file_path = "../input/otto-validation/test_parquet/*"
    for e, chunk_file in enumerate(glob.glob(file_path)):
        chunk = pd.read_parquet(chunk_file)
        dfs.append(chunk)
    df = pd.concat(dfs).reset_index(drop=True)
    df = df[["session", "aid"]]
    df.drop_duplicates(inplace=True)
    prediction_dfs = []
    for st in ["clicks", "carts", "orders"]:
        modified_predictions = df.copy()
        modified_predictions["type"] = st
        prediction_dfs.append(modified_predictions)
    prediction_dfs = pd.concat(prediction_dfs).reset_index(drop=True)
    return prediction_dfs

In [3]:
test = load_test()

In [80]:
w2v = get_w2v_pred_df("daily-frog-198")
covisit = pd.read_csv("../output/covisit/warm-forest-181/cv/pred_df.csv")
print(w2v.shape)
print(covisit.shape)
pred_df = pd.concat([w2v, covisit, test])
pred_df = pred_df[["session", "aid", "type"]]
pred_df.drop_duplicates(inplace=True)
print(pred_df.shape)

(40540983, 4)
(116958096, 4)
(145703275, 3)


In [81]:
calc_metrics(pred_df, ".")

clicks recall=0.5416363340157467
carts recall=0.4439774355487249
orders recall=0.6727959834409502
total recall=0.5910344541307623


In [83]:
w2v = get_w2v_pred_df("jolly-snowball-180")
covisit = pd.read_csv("../output/covisit/warm-forest-181/cv/pred_df.csv")
print(w2v.shape)
print(covisit.shape)
pred_df = pd.concat([w2v, covisit, test])
pred_df = pred_df[["session", "aid", "type"]]
pred_df.drop_duplicates(inplace=True)
print(pred_df.shape)
calc_metrics(pred_df, ".")

(134264607, 4)
(116958096, 4)
(226857437, 3)
clicks recall=0.5584944524002383
carts recall=0.4540783580406674
orders recall=0.6777017775125038
total recall=0.5986940191597263


In [86]:
w2v = get_w2v_pred_df("decent-universe-186")
covisit = pd.read_csv("../output/covisit/warm-forest-181/cv/pred_df.csv")
print(w2v.shape)
print(covisit.shape)
pred_df = pd.concat([w2v, covisit, test])
pred_df = pred_df[["session", "aid", "type"]]
pred_df.drop_duplicates(inplace=True)
print(pred_df.shape)
calc_metrics(pred_df, ".")

(134179533, 4)
(116958096, 4)
(224243697, 3)
clicks recall=0.559971495852544
carts recall=0.45486936279016515
orders recall=0.6779475459858348
total recall=0.5992264860138048


In [87]:
w2v = get_w2v_pred_df("gallant-music-193")
covisit = pd.read_csv("../output/covisit/warm-forest-181/cv/pred_df.csv")
print(w2v.shape)
print(covisit.shape)
pred_df = pd.concat([w2v, covisit, test])
pred_df = pred_df[["session", "aid", "type"]]
pred_df.drop_duplicates(inplace=True)
print(pred_df.shape)
calc_metrics(pred_df, ".")

(118659354, 4)
(116958096, 4)
(209836310, 3)
clicks recall=0.5563611983590179
carts recall=0.4536845903254568
orders recall=0.6774145156605587
total recall=0.5981902063298741


In [4]:
w2v = get_w2v_pred_df("likely-oath-207")
covisit = pd.read_csv("../output/covisit/warm-forest-181/cv/pred_df.csv")
print(w2v.shape)
print(covisit.shape)
pred_df = pd.concat([w2v, covisit, test])
pred_df = pred_df[["session", "aid", "type"]]
pred_df.drop_duplicates(inplace=True)
print(pred_df.shape)
calc_metrics(pred_df, ".")

(213345204, 4)
(116958096, 4)
(295626960, 3)
clicks recall=0.5714449278681016
carts recall=0.4636692906283284
orders recall=0.6825054340366993
total recall=0.6057485403973283
