In [1]:
import gc
import os
import pickle
import random
from collections import defaultdict
from pathlib import Path
from time import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.metrics import (accuracy_score, average_precision_score,
                             cohen_kappa_score, explained_variance_score,
                             f1_score, log_loss, mean_absolute_error,
                             mean_squared_error, mean_squared_log_error,
                             median_absolute_error, precision_score, r2_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from tqdm import tqdm

plt.style.use("ggplot")
tqdm.pandas()

In [2]:
class CFG:
    name = "exp007"
    seed = 42
    n_fold = 5

    path_input = Path("../input")
    path_output = Path("../output")

    candidate_name_list = [
        "past_view_yado",
        "top_popular_yado",
        "top_latest_next_booking",
        "top_sml_popular_yado",
        "top_lrg_popular_yado",
    ]
    feature_name_list = [
        "top_latest_next_booking",
        "past_view_yado",
        "top_popular_yado",
        "top_wid_popular_yado",
        "top_ken_popular_yado",
        "top_lrg_popular_yado",
        "top_sml_popular_yado",
    ]

    cat_features = ["yad_no", "latest_yad_no", "wid_cd", "ken_cd", "lrg_cd", "sml_cd"]

    topn_top_popular_yado = 15
    topn_top_latest_next_booking = 20
    topn_top_area_popular_yado = 20

    lgb_model_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "learning_rate": 0.1,
        "random_state": seed,
    }

    lgb_train_params = {
        "num_boost_round": 999999,
    }

In [3]:
def set_seed(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True


def setup(CFG):
    # expフォルダを作成
    CFG.path_exp = CFG.path_output / CFG.name
    CFG.path_exp.mkdir(parents=True, exist_ok=True)

    # seedの設定
    set_seed(CFG.seed)

    return CFG

In [4]:
CFG = setup(CFG)

In [5]:
# データの読み込み
train_log_df = pl.read_csv(CFG.path_input / "train_log.csv")
train_label_df = pl.read_csv(CFG.path_input / "train_label.csv")

test_log_df = pl.read_csv(CFG.path_input / "test_log.csv")
test_session_df = pl.read_csv(CFG.path_input / "test_session.csv")

yado_df = pl.read_csv(CFG.path_input / "yado.csv")
yado_embedding_df = pl.read_parquet(CFG.path_input / "image_embeddings.parquet")

In [6]:
flag_column = [
    "wireless_lan_flg",
    "onsen_flg",
    "kd_stn_5min",
    "kd_bch_5min",
    "kd_slp_5min",
    "kd_conv_walk_5min",
]
# 欠損を0で埋める
yado_df[flag_column] = yado_df[flag_column].fill_null(0)

# make candidate

In [7]:
# Group Kfold, labelにfold情報を付けておく。
kf = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
fold_assignments = np.full(train_label_df.height, -1, dtype=int)  # heightは行数
for i, (_, valid_index) in enumerate(kf.split(train_label_df)):
    fold_assignments[valid_index] = i
# foldの情報を付与
train_label_df = train_label_df.with_columns(pl.Series("fold", fold_assignments))

In [8]:
def create_past_view_yado_candidates(log: pl.DataFrame):
    """
    過去に見た宿の候補を作成する。直近のものは除外することに注意
    """
    # session_idの最大値
    max_seq_no = log.group_by("session_id").agg(pl.max("seq_no").alias("max_seq_no"))
    log = log.join(max_seq_no, on="session_id")

    # 最大値に該当する行を除外
    past_yado_candidates = log.filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_candidates = past_yado_candidates.select(
        ["session_id", "yad_no"]
    ).unique()

    # 簡易的な特徴量
    # 何個前に見たか
    past_yado_feature = log.with_columns(
        (pl.col("max_seq_no") - pl.col("seq_no")).alias("max_seq_no_diff")
    ).filter(pl.col("seq_no") != pl.col("max_seq_no"))
    # 同じやつを見ていたら、最大値を取る
    past_yado_feature = past_yado_feature.join(
        past_yado_feature.group_by(["session_id", "yad_no"]).agg(
            pl.col("max_seq_no_diff").max().alias("max_seq_no_diff")
        ),
        on=["session_id", "yad_no", "max_seq_no_diff"],
    )
    # max_seq_no_diffが2で割り切れるかどうかの0/1
    # past_yado_feature = past_yado_feature.with_columns(
    #     (pl.col("max_seq_no_diff") % 2 == 0).cast(pl.Int8).alias("max_seq_no_diff_even")
    # )

    # 何回見たか
    session_view_count = (
        log.group_by(["session_id", "yad_no"])
        .count()
        .rename({"count": "session_view_count"})
    )
    past_yado_feature = past_yado_feature.join(
        session_view_count, how="left", on=["session_id", "yad_no"]
    ).drop("seq_no")

    return past_yado_candidates, past_yado_feature

In [9]:
def create_topN_popular_yado_candidates(train_label_df, train_test="train", top=10):
    """
    予約された人気宿を候補として作成。リークに注意
    """
    top_yado_candidate = pl.DataFrame()
    popular_yado_feature = pl.DataFrame()
    if train_test == "train":
        for fold in range(CFG.n_fold):
            # valでない部分を取得
            train_label = train_label_df.filter(pl.col("fold") != fold)
            # yad_noでグループ化して、予約数をカウント
            popular_yado_sort = (
                train_label["yad_no"].value_counts().sort(by="counts", descending=True)
            )

            # 候補作成
            top_yado_candidate_fold = (
                popular_yado_sort.head(top)
                .with_columns(pl.lit(fold).alias("fold"))
                .select(["yad_no", "fold"])
            )
            top_yado_candidate = pl.concat(
                [top_yado_candidate, top_yado_candidate_fold]
            )

            # 簡易的な特徴量
            popular_yado_feature_fold = popular_yado_sort.with_columns(
                pl.lit(fold).alias("fold")
            )
            # 予約回数の順位
            popular_yado_feature_fold = popular_yado_feature_fold.with_columns(
                pl.arange(1, len(popular_yado_feature_fold) + 1).alias("popular_rank")
            )
            popular_yado_feature = pl.concat(
                [popular_yado_feature, popular_yado_feature_fold]
            )
    else:  # testは全体で作成
        # yad_noでグループ化して、予約数をカウント
        popular_yado_sort = (
            train_label_df["yad_no"].value_counts().sort(by="counts", descending=True)
        )

        # 候補作成
        top_yado_candidate = popular_yado_sort.head(top).select(["yad_no"])
        # 予約回数の順位
        popular_yado_feature = popular_yado_sort.with_columns(
            pl.arange(1, len(popular_yado_sort) + 1).alias("popular_rank")
        )

    popular_yado_feature = popular_yado_feature.rename({"counts": "reservation_counts"})

    return top_yado_candidate, popular_yado_feature

In [10]:
def create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="train", area="wid_cd", top=10
):
    """
    エリア単位での人気宿を候補として作成。リークに注意
    """
    # yad_noでマージ
    label_yado = train_label_df.join(yado_df, on="yad_no")
    top_yado_area_candidate = pl.DataFrame()
    popular_yado_area_feature = pl.DataFrame()
    if train_test == "train":
        for fold in range(CFG.n_fold):
            # valでない部分を取得
            train_label = label_yado.filter(pl.col("fold") != fold)
            # areaとyad_noでグループ化して、予約数をカウント
            popular_yado_sort = (
                train_label.group_by([area, "yad_no"])
                .count()
                .sort(by=[area, "count"], descending=[False, True])
            )

            # 候補作成
            top_yado_area_candidate_fold = (
                popular_yado_sort.group_by(area)
                .head(top)
                .with_columns(pl.lit(fold).alias("fold"))
                .select([area, "yad_no", "fold"])
            )
            top_yado_area_candidate = pl.concat(
                [top_yado_area_candidate, top_yado_area_candidate_fold]
            )

            # 簡易的な特徴量
            popular_yado_area_feature_fold = popular_yado_sort.with_columns(
                pl.lit(fold).alias("fold")
            )
            # 予約回数の順位
            popular_yado_area_feature_fold = popular_yado_area_feature_fold.group_by(
                area
            ).map_groups(
                lambda group: group.with_columns(
                    pl.col("count")
                    .rank(method="dense", descending=True)
                    .over(area)
                    .alias(f"popular_{area}_rank")
                )
            )
            popular_yado_area_feature = pl.concat(
                [popular_yado_area_feature, popular_yado_area_feature_fold]
            )

    else:  # testは全体で作成
        # areaとyad_noでグループ化して、予約数をカウント
        popular_yado_sort = (
            label_yado.group_by([area, "yad_no"])
            .count()
            .sort(by=[area, "count"], descending=[False, True])
        )

        # 候補作成
        top_yado_area_candidate = (
            popular_yado_sort.group_by(area).head(top).select([area, "yad_no"])
        )
        # 簡易的な特徴量
        popular_yado_area_feature = popular_yado_sort.group_by(area).map_groups(
            lambda group: group.with_columns(
                pl.col("count")
                .rank(method="dense", descending=True)
                .over(area)
                .alias(f"popular_{area}_rank")
            )
        )

    popular_yado_area_feature = popular_yado_area_feature.drop("count")

    return top_yado_area_candidate, popular_yado_area_feature

In [11]:
def create_latest_next_booking_topN_candidate(
    log_df, train_label_df, train_test="train", top=10
):
    """
    直近見た宿からどこが予約しやすいか
    """
    # 最後に見た宿
    log_latest = log_df.group_by("session_id").tail(1)
    # rename
    log_latest = log_latest.rename({"yad_no": "latest_yad_no"})
    log_latest = log_latest.join(train_label_df, on="session_id")

    # 学習データはtrain/validで分ける
    top_latest_next_booking_candidate = pl.DataFrame()
    top_latest_next_booking_feature = pl.DataFrame()
    if train_test == "train":
        for fold in range(CFG.n_fold):
            # 学習データのみ
            train_log_latest = log_latest.filter(pl.col("fold") != fold)
            # 最後に見た宿と予約した宿の情報を取得
            train_log_latest = (
                train_log_latest.group_by(["latest_yad_no", "yad_no"])
                .count()
                .sort(by=["latest_yad_no", "count"], descending=[False, True])
            )

            # 候補作成
            top_latest_next_booking_candidate_fold = (
                train_log_latest.group_by("latest_yad_no")
                .head(top)
                .with_columns(pl.lit(fold).alias("fold"))
                .select(["yad_no", "latest_yad_no", "fold"])
            )
            top_latest_next_booking_candidate = pl.concat(
                [
                    top_latest_next_booking_candidate,
                    top_latest_next_booking_candidate_fold,
                ]
            )

            # 簡易的な特徴量
            top_latest_next_booking_feature_fold = train_log_latest.with_columns(
                pl.lit(fold).alias("fold")
            )
            top_latest_next_booking_feature_fold = (
                top_latest_next_booking_feature_fold.group_by(
                    "latest_yad_no"
                ).map_groups(
                    lambda group: group.with_columns(
                        pl.col("count")
                        .rank(method="dense", descending=True)
                        .over("latest_yad_no")
                        .alias(f"latest_next_booking_rank")
                    )
                )
            )
            top_latest_next_booking_feature = pl.concat(
                [top_latest_next_booking_feature, top_latest_next_booking_feature_fold]
            )

    else:  # testは全体で作成
        # 最後に見た宿と予約した宿の情報を取得
        log_latest = (
            log_latest.group_by(["latest_yad_no", "yad_no"])
            .count()
            .sort(by=["latest_yad_no", "count"], descending=[False, True])
        )

        # 候補作成
        top_latest_next_booking_candidate = (
            log_latest.group_by("latest_yad_no")
            .head(top)
            .select(["yad_no", "latest_yad_no"])
        )
        # 簡易的な特徴量
        top_latest_next_booking_feature = log_latest.group_by(
            "latest_yad_no"
        ).map_groups(
            lambda group: group.with_columns(
                pl.col("count")
                .rank(method="dense", descending=True)
                .over("latest_yad_no")
                .alias(f"latest_next_booking_rank")
            )
        )

    top_latest_next_booking_feature = top_latest_next_booking_feature.drop("count")

    return top_latest_next_booking_candidate, top_latest_next_booking_feature

In [12]:
# 過去に見た宿の候補を作成
(
    train_past_view_yado_candidates,
    train_past_view_yado_feature,
) = create_past_view_yado_candidates(train_log_df)
(
    test_past_view_yado_candidates,
    test_past_view_yado_feature,
) = create_past_view_yado_candidates(test_log_df)

In [13]:
# 人気宿を候補として作成
(
    train_top_popular_yado_candidates,
    train_top_popular_yado_feature,
) = create_topN_popular_yado_candidates(
    train_label_df, train_test="train", top=CFG.topn_top_popular_yado
)
(
    test_top_popular_yado_candidates,
    test_top_popular_yado_feature,
) = create_topN_popular_yado_candidates(
    train_label_df, train_test="test", top=CFG.topn_top_popular_yado
)

In [14]:
# エリア単位での人気宿を候補として作成
(
    train_top_wid_popular_yado_candidates,
    train_top_wid_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="train", area="wid_cd", top=CFG.topn_top_area_popular_yado
)
(
    test_top_wid_popular_yado_candidates,
    test_top_wid_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="test", area="wid_cd", top=CFG.topn_top_area_popular_yado
)

# 県単位での人気宿を候補として作成
(
    train_top_ken_popular_yado_candidates,
    train_top_ken_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="train", area="ken_cd", top=CFG.topn_top_area_popular_yado
)
(
    test_top_ken_popular_yado_candidates,
    test_top_ken_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="test", area="ken_cd", top=CFG.topn_top_area_popular_yado
)

# 大エリア単位での人気宿を候補として作成
(
    train_top_lrg_popular_yado_candidates,
    train_top_lrg_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="train", area="lrg_cd", top=CFG.topn_top_area_popular_yado
)
(
    test_top_lrg_popular_yado_candidates,
    test_top_lrg_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="test", area="lrg_cd", top=CFG.topn_top_area_popular_yado
)

# 小エリア単位での人気宿を候補として作成
(
    train_top_sml_popular_yado_candidates,
    train_top_sml_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="train", area="sml_cd", top=CFG.topn_top_area_popular_yado
)
(
    test_top_sml_popular_yado_candidates,
    test_top_sml_popular_yado_feature,
) = create_topN_area_popular_yado_candidates(
    train_label_df, yado_df, train_test="test", area="sml_cd", top=CFG.topn_top_area_popular_yado
)

In [15]:
# 直近見た宿からどこが予約しやすいかを候補として作成
(
    train_top_latest_next_booking_candidates,
    train_top_latest_next_booking_feature,
) = create_latest_next_booking_topN_candidate(
    train_log_df,
    train_label_df,
    train_test="train",
    top=CFG.topn_top_latest_next_booking,
)
(
    test_top_latest_next_booking_candidates,
    test_top_latest_next_booking_feature,
) = create_latest_next_booking_topN_candidate(
    train_log_df,
    train_label_df,
    train_test="test",
    top=CFG.topn_top_latest_next_booking,
)

In [16]:
# parquet形式で保存
train_past_view_yado_candidates.write_parquet(
    CFG.path_exp / "train_past_view_yado_candidates.parquet"
)
train_past_view_yado_feature.write_parquet(
    CFG.path_exp / "train_past_view_yado_feature.parquet"
)
test_past_view_yado_candidates.write_parquet(
    CFG.path_exp / "test_past_view_yado_candidates.parquet"
)
test_past_view_yado_feature.write_parquet(
    CFG.path_exp / "test_past_view_yado_feature.parquet"
)

train_top_popular_yado_candidates.write_parquet(
    CFG.path_exp / "train_top_popular_yado_candidates.parquet"
)
train_top_popular_yado_feature.write_parquet(
    CFG.path_exp / "train_top_popular_yado_feature.parquet"
)
test_top_popular_yado_candidates.write_parquet(
    CFG.path_exp / "test_top_popular_yado_candidates.parquet"
)
test_top_popular_yado_feature.write_parquet(
    CFG.path_exp / "test_top_popular_yado_feature.parquet"
)

train_top_wid_popular_yado_candidates.write_parquet(
    CFG.path_exp / "train_top_wid_popular_yado_candidates.parquet"
)
train_top_wid_popular_yado_feature.write_parquet(
    CFG.path_exp / "train_top_wid_popular_yado_feature.parquet"
)
test_top_wid_popular_yado_candidates.write_parquet(
    CFG.path_exp / "test_top_wid_popular_yado_candidates.parquet"
)
test_top_wid_popular_yado_feature.write_parquet(
    CFG.path_exp / "test_top_wid_popular_yado_feature.parquet"
)

train_top_ken_popular_yado_candidates.write_parquet(
    CFG.path_exp / "train_top_ken_popular_yado_candidates.parquet"
)
train_top_ken_popular_yado_feature.write_parquet(
    CFG.path_exp / "train_top_ken_popular_yado_feature.parquet"
)
test_top_ken_popular_yado_candidates.write_parquet(
    CFG.path_exp / "test_top_ken_popular_yado_candidates.parquet"
)
test_top_ken_popular_yado_feature.write_parquet(
    CFG.path_exp / "test_top_ken_popular_yado_feature.parquet"
)

train_top_lrg_popular_yado_candidates.write_parquet(
    CFG.path_exp / "train_top_lrg_popular_yado_candidates.parquet"
)
train_top_lrg_popular_yado_feature.write_parquet(
    CFG.path_exp / "train_top_lrg_popular_yado_feature.parquet"
)
test_top_lrg_popular_yado_candidates.write_parquet(
    CFG.path_exp / "test_top_lrg_popular_yado_candidates.parquet"
)
test_top_lrg_popular_yado_feature.write_parquet(
    CFG.path_exp / "test_top_lrg_popular_yado_feature.parquet"
)

train_top_sml_popular_yado_candidates.write_parquet(
    CFG.path_exp / "train_top_sml_popular_yado_candidates.parquet"
)
train_top_sml_popular_yado_feature.write_parquet(
    CFG.path_exp / "train_top_sml_popular_yado_feature.parquet"
)
test_top_sml_popular_yado_candidates.write_parquet(
    CFG.path_exp / "test_top_sml_popular_yado_candidates.parquet"
)
test_top_sml_popular_yado_feature.write_parquet(
    CFG.path_exp / "test_top_sml_popular_yado_feature.parquet"
)

train_top_latest_next_booking_candidates.write_parquet(
    CFG.path_exp / "train_top_latest_next_booking_candidates.parquet"
)
train_top_latest_next_booking_feature.write_parquet(
    CFG.path_exp / "train_top_latest_next_booking_feature.parquet"
)
test_top_latest_next_booking_candidates.write_parquet(
    CFG.path_exp / "test_top_latest_next_booking_candidates.parquet"
)
test_top_latest_next_booking_feature.write_parquet(
    CFG.path_exp / "test_top_latest_next_booking_feature.parquet"
)

In [17]:
def get_session_id_list(log_df):
    return log_df.group_by("session_id").head(1).select(["session_id"])  # セッションidを取得

In [18]:
train_session_id = get_session_id_list(train_log_df)
train_session_id = train_session_id.join(
    train_label_df.select(["fold", "session_id"]), how="left", on="session_id"
)

test_session_id = get_session_id_list(test_log_df)

In [19]:
candidate_list = {}
candidate_list["train"] = []
candidate_list["test"] = []

for train_test in ["train", "test"]:
    for candidate_name in tqdm(CFG.candidate_name_list):
        candidate = pl.read_parquet(
            CFG.path_exp / f"{train_test}_{candidate_name}_candidates.parquet"
        )
        # セッションに関連する候補を追加（今の所はpast_view_yadoだけ？）
        if "session_id" in candidate.columns:
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # 直近のやつ
        elif "latest_yad_no" in candidate.columns:
            if train_test == "train":
                # 最後に見た宿番号をセッションごとに取得
                latest_yad_no = (
                    train_log_df.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                    .rename({"yad_no": "latest_yad_no"})
                )
                latest_yad_no = latest_yad_no.join(
                    train_label_df.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["latest_yad_no", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log_df.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                    .rename({"yad_no": "latest_yad_no"})
                )
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["latest_yad_no"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "lrg" in candidate_name:
            print("lrg in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log_df.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    train_label_df.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado_df, how="left", on="yad_no"
                )
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["fold", "lrg_cd"]
                )
            else:
                latest_yad_no = (
                    test_log_df.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado_df, how="left", on="yad_no"
                )
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
                # エリア系のやつ
        elif "sml" in candidate_name:
            print("sml in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log_df.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    train_label_df.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado_df, how="left", on="yad_no"
                )
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["fold", "sml_cd"]
                )
            else:
                latest_yad_no = (
                    test_log_df.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado_df, how="left", on="yad_no"
                )
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # それ以外（人気宿系はここに入る。target見てるから全てfoldがカラムに入ってるはず）
        else:
            if train_test == "train":
                if "fold" in candidate.columns:
                    candidate_all = pl.DataFrame()
                    for fold in range(CFG.n_fold):
                        candidate_fold = train_session_id.filter(
                            pl.col("fold") == fold
                        ).join(
                            candidate.filter(pl.col("fold") == fold).select(["yad_no"]),
                            how="cross",
                        )
                        candidate_all = pl.concat([candidate_all, candidate_fold])
            else:
                candidate_all = test_session_id.join(
                    candidate.select(["yad_no"]), how="cross"
                )
            candidate_list[train_test].append(
                candidate_all.select(["session_id", "yad_no"])
            )

 60%|██████    | 3/5 [00:00<00:00, 17.83it/s]

sml in candidate_name
lrg in candidate_name


100%|██████████| 5/5 [00:00<00:00,  6.43it/s]
 80%|████████  | 4/5 [00:00<00:00, 14.76it/s]

sml in candidate_name
lrg in candidate_name


100%|██████████| 5/5 [00:00<00:00, 10.54it/s]


In [20]:
# candidate_list = {}
# candidate_list["train"] = []
# candidate_list["test"] = []

# for train_test in ["train", "test"]:
#     for candidate_name in tqdm(CFG.candidate_name_list):
#         candidate = pl.read_parquet(
#             CFG.path_exp / f"{train_test}_{candidate_name}_candidates.parquet"
#         )
#         # セッションに関連する候補を追加（今の所はpast_view_yadoだけ？）
#         if "session_id" in candidate.columns:
#             candidate_list[train_test].append(
#                 candidate.select(["session_id", "yad_no"])
#             )
#         # 直近のやつ
#         elif "latest_yad_no" in candidate.columns:
#             if train_test == "train":
#                 # 最後に見た宿番号をセッションごとに取得
#                 latest_yad_no = (
#                     train_log_df.group_by("session_id")
#                     .tail(1)
#                     .select(["session_id", "yad_no"])
#                     .rename({"yad_no": "latest_yad_no"})
#                 )
#                 latest_yad_no = latest_yad_no.join(
#                     train_label_df.select(["session_id", "fold"]),
#                     how="left",
#                     on="session_id",
#                 )
#                 latest_yad_no = latest_yad_no.with_columns(
#                     pl.col("fold").cast(pl.Int32)
#                 )
#                 candidate = latest_yad_no.join(
#                     candidate, how="inner", on=["latest_yad_no", "fold"]
#                 )
#             else:
#                 latest_yad_no = (
#                     test_log_df.group_by("session_id")
#                     .tail(1)
#                     .select(["session_id", "yad_no"])
#                     .rename({"yad_no": "latest_yad_no"})
#                 )
#                 candidate = latest_yad_no.join(
#                     candidate, how="inner", on=["latest_yad_no"]
#                 )
#             candidate_list[train_test].append(
#                 candidate.select(["session_id", "yad_no"])
#             )
#         # それ以外（人気宿系はここに入る。target見てるから全てfoldがカラムに入ってるはず）
#         else:
#             if train_test == "train":
#                 if "fold" in candidate.columns:
#                     candidate_all = pl.DataFrame()
#                     for fold in range(CFG.n_fold):
#                         candidate_fold = train_session_id.filter(
#                             pl.col("fold") == fold
#                         ).join(
#                             candidate.filter(pl.col("fold") == fold).select(["yad_no"]),
#                             how="cross",
#                         )
#                         candidate_all = pl.concat([candidate_all, candidate_fold])
#             else:
#                 candidate_all = test_session_id.join(
#                     candidate.select(["yad_no"]), how="cross"
#                 )
#             candidate_list[train_test].append(
#                 candidate_all.select(["session_id", "yad_no"])
#             )

In [21]:
train_candidate = pl.concat(candidate_list["train"]).unique()
test_candidate = pl.concat(candidate_list["test"]).unique()

In [22]:
del candidate_list
gc.collect()

0

In [23]:
# trainとtestの件数
print(f"train_candidate: {len(train_candidate)}")
print(f"test_candidate: {len(test_candidate)}")

train_candidate: 6332704
test_candidate: 3724023


# Make feature

In [24]:
# 実際に予約した宿を結合
train_candidate = train_candidate.join(
    train_label_df.rename({"yad_no": "target"}), how="left", on="session_id"
)
train_candidate = train_candidate.with_columns(pl.col("fold").cast(pl.Int32))

# targetの作成
train_candidate = train_candidate.with_columns(
    (pl.col("yad_no") == pl.col("target")).alias("target").cast(pl.Int8)
)

In [25]:
train_latest_yad_no = (
    train_log_df.group_by("session_id")
    .tail(1)
    .select(["session_id", "yad_no"])
    .rename({"yad_no": "latest_yad_no"})
)
test_latest_yad_no = (
    test_log_df.group_by("session_id")
    .tail(1)
    .select(["session_id", "yad_no"])
    .rename({"yad_no": "latest_yad_no"})
)

In [26]:
# 直近に見た宿を結合
train_candidate = train_candidate.join(train_latest_yad_no, how="left", on="session_id")
test_candidate = test_candidate.join(test_latest_yad_no, how="left", on="session_id")

In [27]:
CFG.feature_name_list

['top_latest_next_booking',
 'past_view_yado',
 'top_popular_yado',
 'top_wid_popular_yado',
 'top_ken_popular_yado',
 'top_lrg_popular_yado',
 'top_sml_popular_yado']

In [28]:
for train_test in ["train", "test"]:
    for feature_name in tqdm(CFG.feature_name_list):
        feature = pl.read_parquet(
            CFG.path_exp / f"{train_test}_{feature_name}_feature.parquet"
        )
        if train_test == "train":
            # for fold in range(CFG.fold_num):
            if "session_id" in feature.columns:
                train_candidate = train_candidate.join(
                    feature, how="left", on=["session_id", "yad_no"]
                )
            elif "latest_yad_no" in feature.columns:
                train_candidate = train_candidate.join(
                    feature, how="left", on=["fold", "latest_yad_no", "yad_no"]
                )
            else:
                train_candidate = train_candidate.join(
                    feature, how="left", on=["fold", "yad_no"]
                )
        else:
            if "session_id" in feature.columns:
                test_candidate = test_candidate.join(
                    feature, how="left", on=["session_id", "yad_no"]
                )
            elif "latest_yad_no" in feature.columns:
                test_candidate = test_candidate.join(
                    feature, how="left", on=["latest_yad_no", "yad_no"]
                )
            else:
                test_candidate = test_candidate.join(feature, how="left", on=["yad_no"])

100%|██████████| 7/7 [00:00<00:00,  7.33it/s]
100%|██████████| 7/7 [00:00<00:00, 14.17it/s]


In [29]:
# 欠損を0で埋める
train_candidate = train_candidate.fill_null(0)
test_candidate = test_candidate.fill_null(0)

In [30]:
train_candidate = train_candidate.join(
    yado_df.select(
        [
            "yad_no",
            "yad_type",
            "total_room_cnt",
            "wireless_lan_flg",
            "onsen_flg",
            "kd_stn_5min",
            "kd_bch_5min",
            "kd_slp_5min",
            "kd_conv_walk_5min",
        ]
    ),
    how="left",
    on="yad_no",
)
test_candidate = test_candidate.join(
    yado_df.select(
        [
            "yad_no",
            "yad_type",
            "total_room_cnt",
            "wireless_lan_flg",
            "onsen_flg",
            "kd_stn_5min",
            "kd_bch_5min",
            "kd_slp_5min",
            "kd_conv_walk_5min",
        ]
    ),
    how="left",
    on="yad_no",
)

In [31]:
def create_num_picture_df(yado_df):
    # yad_noとcategoryごとのデータ件数
    _df = yado_df.group_by(["yad_no", "category"]).count()

    # ピボットテーブルに変換
    num_picture_df = _df.pivot("count", "yad_no", "category", "sum").sort("yad_no")

    # 欠損を0で埋める
    num_picture_df = num_picture_df.fill_null(0)

    return num_picture_df

In [32]:
num_picture_df = create_num_picture_df(yado_embedding_df)
num_picture_df

yad_no,food,exterior,facility,room,others
i64,u32,u32,u32,u32,u32
1,3,3,3,3,2
2,3,3,3,1,2
3,3,3,1,3,2
4,3,3,3,1,2
5,3,3,3,3,3
6,3,3,3,2,2
7,0,1,3,3,3
8,1,3,3,3,3
9,3,3,3,3,1
10,3,3,3,3,2


In [33]:
train_candidate = train_candidate.join(num_picture_df, how="left", on="yad_no")
test_candidate = test_candidate.join(num_picture_df, how="left", on="yad_no")

In [34]:
train_candidate

session_id,yad_no,target,fold,latest_yad_no,latest_next_booking_rank,max_seq_no,max_seq_no_diff,session_view_count,reservation_counts,popular_rank,wid_cd,popular_wid_cd_rank,ken_cd,popular_ken_cd_rank,lrg_cd,popular_lrg_cd_rank,sml_cd,popular_sml_cd_rank,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,food,exterior,facility,room,others
str,i64,i8,i32,i64,u32,i64,i64,u32,u32,i64,str,u32,str,u32,str,u32,str,u32,i64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32
"""008460149fa4c8…",7657,0,4,11099,1,2,1,1,36,1953,"""f0112abf369fb0…",89,"""fec19ba0016c01…",3,"""9b6b8d7f6b726a…",2,"""98d6ff1031df76…",2,0,94.0,1.0,0.0,1.0,0.0,0.0,1.0,3,3,3,2,2
"""00977bec57db43…",8801,1,3,4308,1,1,1,1,11,6025,"""e9316013ee1b03…",106,"""517061b8165aa6…",38,"""081c2ea377394c…",1,"""ab448b05846cb9…",1,0,64.0,1.0,0.0,0.0,0.0,0.0,1.0,3,3,3,3,3
"""00bb9c330e2b69…",5999,1,0,672,3,1,1,1,32,2332,"""e9316013ee1b03…",80,"""66c4d01ad8e301…",71,"""41e20110b38f12…",18,"""0aa9c9e83b1666…",17,0,106.0,0.0,0.0,1.0,0.0,0.0,1.0,3,3,3,3,3
"""00c2e4635c90f6…",11013,1,1,12337,3,1,1,1,79,345,"""c312e07b7a5d45…",18,"""6692a692f80687…",18,"""8cf750072f8520…",18,"""ddd5616ecb2d2c…",16,0,294.0,1.0,0.0,1.0,0.0,0.0,1.0,3,3,3,3,0
"""00e332817763ce…",3662,1,1,10561,1,1,1,1,15,4972,"""e9316013ee1b03…",99,"""21a8fca4573868…",43,"""6dbb1784988eef…",6,"""5d39f5b063b7af…",6,0,85.0,0.0,1.0,0.0,1.0,0.0,1.0,3,3,3,3,3
"""01bd0a690f9fb5…",10095,0,0,10095,0,2,2,2,484,1,"""46e33861f921c3…",1,"""572d60f0f5212a…",1,"""8a623b960557e8…",1,"""f7b42d92528e7a…",1,0,2007.0,1.0,0.0,0.0,0.0,0.0,1.0,3,3,3,3,3
"""01dfb5ff25ed14…",6517,1,4,4353,6,1,1,1,42,1524,"""e9316013ee1b03…",74,"""517061b8165aa6…",15,"""7e5ebb4d5e3cdf…",8,"""840f2157ec5bb4…",5,0,95.0,1.0,1.0,1.0,0.0,0.0,1.0,3,3,3,3,3
"""021c3f4b04345c…",12964,1,2,6409,0,1,1,1,14,5032,"""e9316013ee1b03…",105,"""517061b8165aa6…",38,"""2416a9ec6f5f34…",5,"""bd693a415c407d…",4,0,31.0,0.0,1.0,0.0,0.0,0.0,0.0,3,3,3,2,3
"""02e134a22b1bea…",5489,1,3,7475,3,1,1,1,13,5409,"""dc414a17890cfc…",88,"""d78f53d0856617…",79,"""b9c5be10de6118…",26,"""7ab46688e3643f…",12,0,196.0,1.0,0.0,0.0,0.0,0.0,1.0,1,2,3,3,3
"""03120d540745a6…",1012,1,4,1040,2,1,1,1,31,2357,"""f0112abf369fb0…",94,"""ce3aaf25e7e38a…",45,"""de53fe77d909fd…",2,"""9d9e1d08bd15f1…",2,0,29.0,1.0,1.0,1.0,0.0,0.0,0.0,3,3,2,3,3


In [35]:
use_session_ids = (
    train_candidate.group_by("session_id")
    .agg(pl.col("target").sum())
    .filter(pl.col("target") == 1)["session_id"]
)

In [36]:
train_candidate.get_column("target").sum() / len(train_label_df)

0.4951541056744418

# Train model

In [35]:
lgb_model_list = []
pred = np.zeros(len(test_candidate))
for fold in range(CFG.n_fold):
    X_train = train_candidate.filter(
        (pl.col("session_id").is_in(use_session_ids)) & (pl.col("fold") != fold)
    ).drop(["fold", "target", "session_id"])
    Y_train = train_candidate.filter(
        (pl.col("session_id").is_in(use_session_ids)) & (pl.col("fold") != fold)
    )["target"].to_numpy()
    X_valid = train_candidate.filter(
        (pl.col("session_id").is_in(use_session_ids)) & (pl.col("fold") == fold)
    ).drop(["fold", "target", "session_id"])
    Y_valid = train_candidate.filter(
        (pl.col("session_id").is_in(use_session_ids)) & (pl.col("fold") == fold)
    )["target"].to_numpy()

    # pandasの方が扱いやすいので変換
    X_train = X_train.to_pandas()
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_train[feature] = X_train[feature].astype("category")
        X_valid[feature] = X_valid[feature].astype("category")

    train_dataset = lgb.Dataset(X_train, Y_train)
    valid_dataset = lgb.Dataset(X_valid, Y_valid)
    lgb_model = lgb.train(
        params=CFG.lgb_model_params,
        train_set=train_dataset,
        valid_sets=[train_dataset, valid_dataset],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)],
        **CFG.lgb_train_params,
    )

    lgb_model_list.append(lgb_model)

    del X_train, Y_train, X_valid, Y_valid, train_dataset, valid_dataset
    gc.collect()

Training until validation scores don't improve for 100 rounds
[1000]	training's auc: 0.978657	valid_1's auc: 0.928625
[2000]	training's auc: 0.983986	valid_1's auc: 0.931508


KeyboardInterrupt: 

In [36]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0


def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

In [37]:
def create_top_10_yad_predict(_df):
    # セッションごとに予測確率の高い順に yad_no の配列を作成
    _agg = (
        _df.sort_values("predict", ascending=False)
        .groupby("session_id")["yad_no"]
        .apply(list)
    )

    out_df = pd.DataFrame(index=_agg.index, data=_agg.values.tolist()).iloc[:, :10]

    return out_df

In [38]:
oof = pd.DataFrame()
test = test_candidate.with_columns(pl.lit(0).alias("predict"))
X_test = test.drop(["session_id"])
X_test = X_test.to_pandas()
for feature in CFG.cat_features:
    X_test[feature] = X_test[feature].astype("category")

for fold in range(CFG.n_fold):
    X_valid = train_candidate.filter((pl.col("fold") == fold)).drop(
        ["fold", "target", "session_id"]
    )
    Y_valid = train_candidate.filter((pl.col("fold") == fold))["target"].to_numpy()

    # pandasの方が扱いやすいので変換
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_valid[feature] = X_valid[feature].astype("category")

    X_valid["predict"] = lgb_model_list[fold].predict(X_valid)
    X_valid["session_id"] = train_candidate.filter((pl.col("fold") == fold))[
        "session_id"
    ].to_numpy()
    X_test["predict"] += (
        lgb_model_list[fold].predict(X_test.drop("predict", axis=1)) / CFG.n_fold
    )
    oof = pd.concat([oof, X_valid[["session_id", "predict", "yad_no"]]])
X_test["session_id"] = test["session_id"].to_numpy()

In [None]:
oof = oof.sort_values(["session_id", "predict"], ascending=False)
oof_ = create_top_10_yad_predict(oof)

In [None]:
oof_.to_csv(CFG.path_exp / "oof.csv")

In [None]:
pd_train_label_df = train_label_df.to_pandas()

In [None]:
mapk(
    actual=pd_train_label_df[
        pd_train_label_df["session_id"].isin(oof_.reset_index()["session_id"])
    ]
    .sort_values("session_id", ascending=True)["yad_no"]
    .to_list(),
    predicted=oof_.values.tolist(),
    k=10,
)

0.35779682360437065

In [43]:
sub = create_top_10_yad_predict(X_test)
sub.columns = [f"predict_{c}" for c in sub.columns]
sub = sub.reset_index(drop=True)

In [44]:
sub.to_csv(CFG.path_exp / "submission.csv", index=False)