# exp002
予測モデルのベースライン

In [1]:
import os
import sys
import gc
import subprocess
from dotenv import load_dotenv
# Load the variables defined in the .env file into the process environment.
# The function has to be *called*; a bare reference is a no-op and would leave
# the os.getenv() lookups below without the values from .env.
load_dotenv()
# Make the project's utility modules importable. Skip the append when
# UTILS_PATH is not configured so that None is never pushed onto sys.path.
_utils_path = os.getenv('UTILS_PATH')
if _utils_path:
    sys.path.append(_utils_path)
from tqdm import tqdm
import multiprocessing
import inspect

import pandas as pd
import numpy as np
import itertools
import cudf

In [2]:
import builtins
import types

def imports():
    """Yield ``(name, value)`` pairs for the module globals worth keeping.

    A global is kept when it is an imported module or when it exposes
    ``__call__`` (functions, classes and other callables). Plain data globals
    such as DataFrames, strings or numbers are not yielded.
    """
    for name, val in globals().items():
        # Modules and callables are the only globals a @noglobal function may
        # still see; each qualifying name is yielded exactly once.
        if isinstance(val, types.ModuleType) or hasattr(val, '__call__'):
            yield name, val


def noglobal(f):
    '''
    Decorator that rebuilds ``f`` with a restricted global namespace.

    The returned function runs the same code object as ``f`` but can only
    resolve globals that are modules or callables (see ``imports``). The
    namespace is a snapshot taken when the decorator is applied, so names
    defined afterwards are not visible to the decorated function.

    Besides the name, positional defaults and closure, the keyword-only
    defaults, qualified name, docstring, module and attribute dict of ``f``
    are carried over so the rebuilt function can be called like the original.

    ref: https://gist.github.com/raven38/4e4c3c7a179283c441f575d6e375510c
    '''
    isolated = types.FunctionType(f.__code__,
                                  dict(imports()),
                                  f.__name__,
                                  f.__defaults__,
                                  f.__closure__
                                  )
    # types.FunctionType() has no parameter for keyword-only defaults; without
    # this copy a call that omits such an argument raises TypeError.
    if f.__kwdefaults__ is not None:
        isolated.__kwdefaults__ = dict(f.__kwdefaults__)
    isolated.__qualname__ = f.__qualname__
    isolated.__doc__ = f.__doc__
    isolated.__module__ = f.__module__
    isolated.__dict__.update(f.__dict__)
    return isolated

In [3]:
# Experiment identifier; also used as the name of this experiment's
# sub-directory under OUTPUT_DIR.
EXP_NAME = "exp002"

In [4]:
# Data locations are read from environment variables; each is None when the
# corresponding variable is not set.
INPUT_DIR = os.environ.get('INPUT_DIR')
OUTPUT_DIR = os.environ.get('OUTPUT_DIR')
PREP_DIR = os.environ.get("PREP_DIR")

# Ensure the per-experiment output directory exists (no error if it already does).
os.makedirs(
    os.path.join(OUTPUT_DIR, EXP_NAME),
    exist_ok=True,
)

In [5]:
@noglobal
def evaluate(clicks_labels, carts_labels, orders_labels, 
             clicks_preds, carts_preds, orders_preds, k=20):
    """Compute recall@k per event type and their weighted sum.

    All six arguments are parallel sequences with one entry per session.

    Args:
        clicks_labels: per session, the ground-truth clicked aid (a number),
            or NaN / None when the session has no click label.
        carts_labels: per session, a list of ground-truth aids; any non-list
            value (e.g. NaN) means "no label".
        orders_labels: same convention as ``carts_labels``.
        clicks_preds, carts_preds, orders_preds: per session, a list of
            predicted aids; any non-list value is treated as no prediction.
        k: only the first ``k`` predictions of each list are scored.

    Returns:
        dict with the raw counters (``num_*``, ``hit_*``) as ints and the
        recalls, weighted recalls and ``score`` as strings formatted with
        three decimals. A recall whose denominator is zero is reported as 0.
    """
    num_clicks = num_carts = num_orders = 0
    hit_clicks = hit_carts = hit_orders = 0

    # Iterate positionally so the result does not depend on how the inputs
    # are indexed.
    rows = zip(clicks_labels, carts_labels, orders_labels,
               clicks_preds, carts_preds, orders_preds)
    for (clicks_label, carts_label, orders_label,
         clicks_pred, carts_pred, orders_pred) in rows:

        # Anything that is not a list (typically NaN from a left merge)
        # counts as an empty prediction.
        clicks_pred = clicks_pred[:k] if isinstance(clicks_pred, list) else []
        carts_pred = carts_pred[:k] if isinstance(carts_pred, list) else []
        orders_pred = orders_pred[:k] if isinstance(orders_pred, list) else []

        if clicks_label is not None and not np.isnan(clicks_label):
            num_clicks += 1
            hit_clicks += int(clicks_label in clicks_pred)

        if isinstance(carts_label, list):
            # At most k labels can be recovered with k predictions.
            num_carts += min(len(carts_label), k)
            hit_carts += len(set(carts_pred) & set(carts_label))

        if isinstance(orders_label, list):
            num_orders += min(len(orders_label), k)
            hit_orders += len(set(orders_pred) & set(orders_label))

    # Guard the divisions: a type without any label contributes a recall of 0
    # instead of raising ZeroDivisionError.
    recall_clicks = hit_clicks / num_clicks if num_clicks else 0.0
    recall_carts = hit_carts / num_carts if num_carts else 0.0
    recall_orders = hit_orders / num_orders if num_orders else 0.0
    w_recall_clicks = recall_clicks * 0.10
    w_recall_carts = recall_carts * 0.30
    w_recall_orders = recall_orders * 0.6
    score = w_recall_clicks + w_recall_carts + w_recall_orders

    # Key order is part of the contract: callers build table rows from
    # results.keys() / results.values().
    results = {
        "num_clicks": num_clicks,
        "hit_clicks": hit_clicks,
        "num_carts": num_carts,
        "hit_carts": hit_carts,
        "num_orders": num_orders,
        "hit_orders": hit_orders,
        "recall_clicks": format(recall_clicks, ".3f"),
        "recall_carts": format(recall_carts, ".3f"),
        "recall_orders": format(recall_orders, ".3f"),
        "w_recall_clicks": format(w_recall_clicks, ".3f"),
        "w_recall_carts": format(w_recall_carts, ".3f"),
        "w_recall_orders": format(w_recall_orders, ".3f"),
        "score": format(score, ".3f"),
    }

    return results

In [6]:
class Candidate:
    """Accumulates (session, aid) candidate pairs produced by several strategies.

    Attributes:
        df: event log with at least the columns ``session``, ``aid``, ``ts``
            and ``type``.
        target_sessions: unique session ids of ``df`` in order of appearance.
        output: de-duplicated union of all candidates added so far
            (columns ``session``, ``aid``).
        labels: optional frame with the columns ``session``, ``clicks_labels``,
            ``carts_labels`` and ``orders_labels``; when given, every added
            strategy is scored with ``evaluate``.
        results: one row of evaluation metrics per scored strategy.
    """

    def __init__(self, df, labels=None):
        self.df = df
        self.target_sessions = df["session"].unique().tolist()
        self.results = pd.DataFrame(columns=["name", "num_clicks", "hit_clicks", "num_carts", "hit_carts", "num_orders", "hit_orders", 
                                             "recall_clicks", "recall_carts", "recall_orders", "w_recall_clicks", "w_recall_carts", "w_recall_orders", "score"])
        self.output = pd.DataFrame(columns=["session", "aid"], dtype=int)
        self.labels = labels

    def _entry(self, new_candidate_df, name, k):
        """Merge new candidates into ``self.output`` and score them if labels exist."""
        self.output = pd.concat([self.output, new_candidate_df]).drop_duplicates()

        if self.labels is not None:
            self._eval(new_candidate_df, name, k)

    def _eval(self, new_candidate_df, name, k):
        """Score one strategy's candidates, print the metrics and log them.

        The same candidate list is evaluated against the click, cart and
        order labels.
        """
        new_candidate_df = new_candidate_df.groupby("session")["aid"].apply(list).reset_index()
        eval_df = pd.DataFrame(self.target_sessions, columns=["session"])
        # Sessions without any candidate get NaN in "aid" after the left merge.
        eval_df = eval_df.merge(new_candidate_df, on=["session"], how="left")
        # evaluate() pairs labels and predictions by position, so both frames
        # must list the sessions in the same order.
        assert eval_df["session"].tolist() == self.labels["session"].tolist(), \
            "labels must list the same sessions, in the same order, as the input events"
        preds = eval_df["aid"].tolist()
        eval_result = evaluate(self.labels["clicks_labels"].tolist(), self.labels["carts_labels"].tolist(), self.labels["orders_labels"].tolist(),
                               preds, preds, preds, k)
        print(name)
        print(eval_result)
        result_row = pd.DataFrame([[name] + list(eval_result.values())], columns=["name"] + list(eval_result.keys()))
        # ignore_index keeps the log's index unique (0, 1, 2, ...).
        self.results = pd.concat([self.results, result_row], ignore_index=True)

    def add(self, strategy, name, usetypes=('clicks', 'carts', 'orders'), trans_map=None, k=20):
        """Generate candidates with ``strategy`` and add them to the pool.

        Args:
            strategy: ``"session_frequent"``, ``"session_latest"`` or
                ``"total_frequent"``.
            name: label used when printing / logging the evaluation.
            usetypes: event types of ``self.df`` the strategy may use.
            trans_map: optional mapping passed to ``Series.replace`` on the
                generated ``aid`` column.
            k: maximum number of candidates per session for this strategy.

        Raises:
            ValueError: if ``strategy`` is not one of the supported names.
        """
        strategies = {
            "session_frequent": self._session_frequent,
            "session_latest": self._session_latest,
            "total_frequent": self._total_frequent,
        }
        # Fail loudly on a typo; otherwise the unprocessed event rows would be
        # appended to the candidate pool.
        if strategy not in strategies:
            raise ValueError(
                f"unknown strategy {strategy!r}; expected one of {sorted(strategies)}"
            )

        candidate_df = self.df[self.df["type"].isin(list(usetypes))].copy()
        candidate_df = strategies[strategy](candidate_df, k)

        if trans_map is not None:
            candidate_df["aid"] = candidate_df["aid"].replace(trans_map)

        self._entry(candidate_df, name, k)

    def _session_frequent(self, candidate_df, k):
        """Return, per session, up to ``k`` aids with the highest event count."""
        candidate_df = candidate_df.groupby(["session", "aid"]).agg(aid_count=pd.NamedAgg(column="ts", aggfunc="count")).reset_index()
        candidate_df = candidate_df.sort_values(["session", "aid_count"], ascending=[True, False])
        candidate_df = candidate_df.groupby("session").head(k)
        return candidate_df[["session", "aid"]].copy()

    def _session_latest(self, candidate_df, k):
        """Return, per session, up to ``k`` distinct aids ordered by most recent ``ts``."""
        candidate_df = candidate_df.sort_values(["session", "ts"], ascending=[True, False])
        # After the sort the first occurrence of an aid is its latest event.
        candidate_df = candidate_df.drop_duplicates(subset=["session", "aid"])
        candidate_df = candidate_df.groupby("session").head(k)
        return candidate_df[["session", "aid"]]

    def _total_frequent(self, candidate_df, k):
        """Pair every target session with the ``k`` globally most frequent aids."""
        candidate_df = candidate_df.groupby("aid").agg(aid_count=pd.NamedAgg(column="ts", aggfunc="count")).reset_index()
        candidate_df = candidate_df.sort_values("aid_count", ascending=False)
        topk_freq_aids = candidate_df["aid"].tolist()[:k]
        # Build the sessions x aids cross product inside pandas rather than
        # appending to Python lists one pair at a time; rows come out in the
        # same session-major order.
        pairs = pd.MultiIndex.from_product(
            [self.target_sessions, topk_freq_aids], names=["session", "aid"]
        )
        return pairs.to_frame(index=False)

In [7]:
class DataSet:
    """Attaches session-level and (session, aid)-level features to candidates.

    Attributes:
        sessions: event log with the columns ``session``, ``aid``, ``ts`` and
            ``type``.
        output: candidate frame (at least ``session`` and ``aid``); every
            ``add_*`` call replaces it with a left-merged, wider frame.
        labels: frame with ``session`` and the ``{type}_labels`` columns, used
            only by ``add_labels``.
        type_dict: integer code for each event type.
    """

    # Event types for which separate count features / labels are produced.
    _TYPES = ("clicks", "carts", "orders")

    def __init__(self, sessions, candidate, labels):
        self.sessions = sessions
        self.output = candidate
        self.labels = labels
        self.type_dict = {"clicks":0, "carts":1, "orders":2}

    def add_features(self, features_name):
        """Add one feature group to ``self.output``.

        Args:
            features_name: ``"session_cnt"``, ``"session_aid_nunique"``,
                ``"session_last_type"`` or ``"aid_cnt"``.

        Raises:
            ValueError: if ``features_name`` is not a known feature group.
        """
        builders = {
            "session_cnt": self._session_cnt,
            "session_aid_nunique": self._session_aid_nunique,
            "session_last_type": self._session_last_type,
            "aid_cnt": self._aid_cnt,
        }
        # A misspelled name should not be silently ignored.
        if features_name not in builders:
            raise ValueError(
                f"unknown features_name {features_name!r}; expected one of {sorted(builders)}"
            )
        builders[features_name]()

    def add_labels(self):
        """Add a binary ``labels`` column (1 = pair is a ground-truth label).

        NOTE(review): the merge keys include ``type``, so ``self.output`` must
        already carry a ``type`` column; otherwise the merge raises KeyError.
        """
        per_type = []
        for _type in self._TYPES:
            type_labels = self.labels[["session", f"{_type}_labels"]].dropna().copy()
            type_labels.columns = ["session", "aid"]
            # List-valued labels become one row per aid.
            type_labels = type_labels.explode("aid")
            type_labels["type"] = _type
            type_labels["labels"] = 1
            per_type.append(type_labels)
        labels_explode = pd.concat(per_type)
        self.output = self.output.merge(labels_explode, on=["session", "aid", "type"], how="left")
        self.output["labels"] = self.output["labels"].fillna(0)

    def _add_count_features(self, keys, prefix):
        """Merge ``{prefix}_total_cnt`` and ``{prefix}_{type}_cnt`` grouped by ``keys``.

        Counts are numbers of events in ``self.sessions``; candidate rows
        without any matching event get 0.
        """
        agg_df = self.sessions.groupby(keys + ["type"]).agg(cnt=pd.NamedAgg(column="ts", aggfunc="count")).reset_index()

        # {prefix}_total_cnt
        total_col = f"{prefix}_total_cnt"
        features = agg_df.groupby(keys)["cnt"].sum().reset_index(name=total_col)
        self.output = self.output.merge(features, on=keys, how="left")
        self.output[total_col] = self.output[total_col].fillna(0)

        # {prefix}_{type}_cnt
        for _type in self._TYPES:
            col_name = f"{prefix}_{_type}_cnt"
            features = agg_df[agg_df["type"] == _type].rename(columns={"cnt": col_name})
            features = features[keys + [col_name]]
            self.output = self.output.merge(features, on=keys, how="left")
            self.output[col_name] = self.output[col_name].fillna(0)

    def _session_cnt(self):
        """Add the event counts of each session (total and per type)."""
        self._add_count_features(["session"], "session")

    def _session_aid_nunique(self):
        """Add the number of distinct aids seen in each session."""
        features = self.sessions.groupby(["session"]).agg(session_aid_nunique=pd.NamedAgg(column="aid", aggfunc="nunique")).reset_index()
        self.output = self.output.merge(features, on=["session"], how="left")
        self.output["session_aid_nunique"] = self.output["session_aid_nunique"].fillna(0)

    def _session_last_type(self):
        """Add the encoded type of the last row of each session.

        "Last" means last in row order of ``self.sessions`` - presumably the
        log is sorted by ``ts`` within a session; verify against the loader.
        Sessions absent from the log are left as NaN.
        """
        last_rows = self.sessions.groupby("session").tail(1)[["session", "type"]]
        features = last_rows.rename(columns={"type": "session_last_type"})
        features["session_last_type"] = features["session_last_type"].replace(self.type_dict)
        self.output = self.output.merge(features, on=["session"], how="left")

    def _aid_cnt(self):
        """Add the event counts of each (session, aid) pair (total and per type)."""
        self._add_count_features(["session", "aid"], "aid")

In [8]:
# Load data: the preprocessed week-1 event log and its labels.
# os.path.join is used (as for OUTPUT_DIR) so the paths are correct whether or
# not PREP_DIR ends with a path separator.
train_sessions = pd.read_pickle(os.path.join(PREP_DIR, "train_sessions_week1.pkl"))
train_labels = pd.read_pickle(os.path.join(PREP_DIR, "labels_week1.pkl"))

In [9]:
# Build the candidate pool from the training events. No labels are passed, so
# the strategies are added without being evaluated.
cand = Candidate(train_sessions)
for _strategy in ("session_frequent", "total_frequent"):
    # Each strategy is logged under its own name.
    cand.add(strategy=_strategy, name=_strategy)

In [10]:
# Attach every feature group to the generated candidates, in a fixed order
# (the order determines the column order of dataset.output).
dataset = DataSet(train_sessions, cand.output, train_labels)
for _feature in ("session_cnt", "session_aid_nunique", "session_last_type", "aid_cnt"):
    dataset.add_features(_feature)
# Label attachment is currently disabled.
#dataset.add_labels()

In [11]:
# Display the assembled candidate/feature table as the cell output.
dataset.output

Unnamed: 0,session,aid,session_total_cnt,session_clicks_cnt,session_carts_cnt,session_orders_cnt,session_aid_nunique,session_last_type,aid_total_cnt,aid_clicks_cnt,aid_carts_cnt,aid_orders_cnt
0,0,1649869,41,37.0,2.0,2.0,31,0,4.0,3.0,1.0,0.0
1,0,305831,41,37.0,2.0,2.0,31,0,3.0,2.0,0.0,1.0
2,0,461689,41,37.0,2.0,2.0,31,0,3.0,1.0,1.0,1.0
3,0,1110548,41,37.0,2.0,2.0,31,0,2.0,2.0,0.0,0.0
4,0,1190046,41,37.0,2.0,2.0,31,0,2.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
104942511,5348719,95488,1,1.0,0.0,0.0,1,0,0.0,0.0,0.0,0.0
104942512,5348719,1615582,1,1.0,0.0,0.0,1,0,0.0,0.0,0.0,0.0
104942513,5348719,80222,1,1.0,0.0,0.0,1,0,0.0,0.0,0.0,0.0
104942514,5348719,554660,1,1.0,0.0,0.0,1,0,0.0,0.0,0.0,0.0
