# exp013
latest+pairのナイーブ予測

In [1]:
import os
import sys
import subprocess
from dotenv import load_dotenv
load_dotenv
sys.path.append(os.getenv('UTILS_PATH'))
from tqdm import tqdm
import multiprocessing

import pandas as pd
import numpy as np
import itertools

In [2]:
EXP_NAME = "exp013"

In [3]:
INPUT_DIR = os.getenv('INPUT_DIR')
OUTPUT_DIR = os.getenv('OUTPUT_DIR')
PREP_DIR = os.getenv("PREP_DIR")

# os.makedirs(os.path.join(OUTPUT_DIR, EXP_NAME), exist_ok=True)

In [4]:
def pred(sessions, pair_dict):
    session_ids = sessions["session"].unique().tolist()

    # sessionごとの最頻値
    session_latest_aid = sessions.sort_values(["session", "ts"], ascending=(True, False)).drop_duplicates(subset=["session", "aid"])
    session_latest_aid = session_latest_aid.groupby("session").head(20)

    # sessionごとの最頻値のペア
    session_latest_aid_pair = session_latest_aid.copy()
    session_latest_aid_pair["aid"] = session_latest_aid_pair["aid"].map(pair_dict)
    session_latest_aid_pair = session_latest_aid_pair.dropna(subset=["aid"])

    # 全体の最頻値
    total_aid_cnt = sessions.groupby("aid").agg(cnt=("ts", "count")).reset_index()
    total_aid_cnt = total_aid_cnt.sort_values("cnt", ascending=False)
    total_aid_cnt = total_aid_cnt.head(20)
    most_freq_aid = total_aid_cnt["aid"].tolist()
    total_frequent = pd.DataFrame(list(itertools.product(session_ids, most_freq_aid)), columns=["session", "aid"])
    
    # 予測作成
    recom = pd.concat([session_latest_aid[["session", "aid"]],
                       session_latest_aid_pair[["session", "aid"]],
                       total_frequent[["session", "aid"]]])

    recom = recom.groupby("session").head(20)
    recom = recom.groupby("session")["aid"].apply(list).reset_index()

    return recom

In [5]:
def evaluate(click_labels, carts_labels, orders_labels, 
             click_preds, carts_preds, orders_preds, k=20):

    num_clicks = 0
    num_carts = 0
    num_orders = 0
    hit_clicks = 0
    hit_carts = 0
    hit_orders = 0

    for i in range(len(labels)):
        click_label = click_labels[i]
        carts_label = carts_labels[i]
        orders_label = orders_labels[i]
        click_pred = click_preds[i][:k]
        carts_pred = carts_preds[i][:k]
        orders_pred = orders_preds[i][:k]

        if not np.isnan(click_label):
            num_clicks += 1
            hit_clicks += int(click_label in click_pred)

        if type(carts_label) == list:
            num_carts += min(len(carts_label), k)
            hit_carts += len(set(carts_pred) & set(carts_label))
            
        if type(orders_label) == list:
            num_orders += min(len(orders_label), k)
            hit_orders += len(set(orders_pred) & set(orders_label))


    recall_clicks = hit_clicks / num_clicks
    recall_carts = hit_carts / num_carts
    recall_orders = hit_orders / num_orders
    score = (recall_clicks * 0.10) + (recall_carts * 0.30) + (recall_orders * 0.60)

    results = {}
    results["num_clicks"] = num_clicks
    results["hit_clicks"] = hit_clicks
    results["recall_clicks"] = recall_clicks
    results["num_carts"] = num_carts
    results["hit_carts"] = hit_carts
    results["recall_carts"] = recall_carts
    results["num_orders"] = num_orders
    results["hit_orders"] = hit_orders
    results["recall_orders"] = recall_orders
    results["score"] = score

    return results

In [6]:
# データ読み込み
train_sessions = pd.read_pickle(PREP_DIR + "train_sessions_week1.pkl")
labels = pd.read_pickle(PREP_DIR + "labels_week1.pkl")

In [7]:
train_sessions = train_sessions.drop(columns=["type"])

In [8]:
pair_df = pd.read_pickle(PREP_DIR + "co_visitation_matrix.pkl")
pair_df = pair_df[pair_df["cnt"] > 3].copy()
pair_df = pair_df.sort_values(["aid_x", "cnt"], ascending=(True, False))
pair_df = pair_df.groupby("aid_x").head(1)[["aid_x", "aid_y"]]
pair_dict = {k: v for k, v in zip(pair_df["aid_x"].tolist(), pair_df["aid_y"].tolist())}

In [9]:
recom = pred(train_sessions, pair_dict)

In [10]:
click_preds = recom["aid"].tolist()
carts_preds = recom["aid"].tolist()
orders_preds = recom["aid"].tolist()

In [11]:
click_labels = labels["clicks_labels"].tolist()
carts_labels = labels["carts_labels"].tolist()
orders_labels = labels["orders_labels"].tolist()

In [12]:
evaluate(click_labels=labels["clicks_labels"].tolist(),
         carts_labels=labels["carts_labels"].tolist(),
         orders_labels=labels["orders_labels"].tolist(),
         click_preds=recom["aid"].tolist(),
         carts_preds=recom["aid"].tolist(),
         orders_preds=recom["aid"].tolist())

{'num_clicks': 4320369,
 'hit_clicks': 1508346,
 'recall_clicks': 0.3491243456288109,
 'num_carts': 1831983,
 'hit_carts': 525125,
 'recall_carts': 0.28664294373910676,
 'num_orders': 885029,
 'hit_orders': 471016,
 'recall_orders': 0.5322040294724806,
 'score': 0.44022773536810145}