## Candidate

In [2]:
import os
from tqdm import tqdm
import gc
import polars as pl
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold,KFold
import lightgbm as lgb

In [3]:
class CFG:
    name = "exp016"

    path_input = Path("../input")
    path_output = Path("../output")
    seed = 127
    fold_num = 5

    cat_features = ['yad_no','latest_yad_no','wid_cd','ken_cd','lrg_cd','sml_cd']

    lgb_train_params = {
        "num_boost_round": 999999,
    }

CFG.path_exp = CFG.path_output / CFG.name
CFG.path_exp.mkdir(parents=True, exist_ok=True)

In [4]:
train_log = pl.read_csv(CFG.path_input / "train_log.csv")
label = pl.read_csv(CFG.path_input / "train_label.csv")
test_log = pl.read_csv(CFG.path_input / "test_log.csv")
yado = pl.read_csv(CFG.path_input / "yado.csv")
yado_embedding = pl.read_parquet(CFG.path_input / "image_embeddings.parquet")

In [5]:
# Group Kfold, labelにfold情報を付けておく。
kf = KFold(n_splits=CFG.fold_num,shuffle=True,random_state=CFG.seed)
fold_assignments = np.full(label.height, -1, dtype=int)
for i, (_, valid_index) in enumerate(kf.split(label)):
    fold_assignments[valid_index] = i
label = label.with_columns(pl.Series("fold", fold_assignments))

In [6]:
def create_past_view_yado_candidates(log):
    """
    アクセスした宿をcandidateとして作成。ただし、直近の宿は予約しないので除外する。
    """
    max_seq_no = log.group_by("session_id").agg(pl.max("seq_no").alias("max_seq_no"))
    log = log.join(max_seq_no, on="session_id")
    # 最大値に該当する行を除外する
    past_yado_candidates = log.filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_candidates = past_yado_candidates.select(['session_id','yad_no']).unique()

    # 簡易的な特徴量も作成しておく。
    # 何個前に見たか 複数回見た時は、直近のみ残す。
    past_yado_feature = log.with_columns((pl.col('max_seq_no') - pl.col('seq_no')).alias('max_seq_no_diff')).filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_feature = past_yado_feature.join(past_yado_feature.group_by(["session_id", "yad_no"]).agg(pl.col("max_seq_no_diff").max().alias("max_seq_no_diff")), on=["session_id", "yad_no", "max_seq_no_diff"])
    # 何回見たか
    session_view_count = log.group_by(['session_id','yad_no']).count().rename({'count':'session_view_count'})
    past_yado_feature = past_yado_feature.join(session_view_count,how='left',on=['session_id','yad_no']).drop('seq_no')
    
    return past_yado_candidates,past_yado_feature

In [7]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    # 共起ペアの作成
    df = df.join(df, on="session_id")
    # yad_noが同じものは除外する
    df = df.filter(pl.col("yad_no") != pl.col("yad_no_right"))
    # yad_noのペアごとに共起回数を計算
    df = df.group_by(["yad_no", "yad_no_right"]).count()
    # 整形
    df = df.rename(
        {
            "yad_no_right":"candidate_yad_no",
            "count":"co_visit_count",
        }
    )[["yad_no", "candidate_yad_no", "co_visit_count"]]

    return df

def create_topN_covisit_candidates(log_df: pl.DataFrame, top = 10):
    """
    共起行列から候補を作成する
    """
    # 共起行列の作成
    co_visit_matrix = generate_co_visit_matrix(log_df)

    # 最後に見た宿と紐づけてcandidateを作成するために、renameしておく
    co_visit_matrix = co_visit_matrix.rename({"yad_no":"latest_yad_no", "candidate_yad_no":"yad_no"})
    # 上からtop個の候補を取得
    topn_co_visit_matrix = co_visit_matrix.sort(['latest_yad_no','co_visit_count'],descending=[False,True]).group_by('latest_yad_no').head(top)

    # log_dfで最後に見た宿を取得
    log_latest = log_df.group_by("session_id").tail(1)
    log_latest = log_latest.rename({"yad_no": "latest_yad_no"})

    # 結合
    top_yado_co_candidate = log_latest.join(topn_co_visit_matrix, on="latest_yad_no")
    
    return top_yado_co_candidate.select(["session_id", "yad_no"])

In [8]:
def create_topN_popular_yado_candidates(label,train_test='train',top=10):
    """
    予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_candidate = pl.DataFrame()
    popular_yado_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label['yad_no'].value_counts().sort(by='counts',descending=True)

            # candidateの作成
            top10_yado_candidate_fold = popular_yado_sort.head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','fold'])
            top10_yado_candidate = pl.concat([top10_yado_candidate,top10_yado_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_feature_fold = popular_yado_feature_fold.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))
            popular_yado_feature = pl.concat([popular_yado_feature,popular_yado_feature_fold])
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label['yad_no'].value_counts().sort(by='counts',descending=True)
        top10_yado_candidate = popular_yado_sort.head(top).select(['yad_no'])

        # 簡易的な特徴量も作成しておく。
        popular_yado_feature = popular_yado_sort.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))

    popular_yado_feature = popular_yado_feature.rename({'counts':'reservation_counts'})
    
    return top10_yado_candidate,popular_yado_feature

In [9]:
def create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10):
    """
    エリア単位で予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    label_yado = label.join(yado,how='left',on='yad_no')
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_area_candidate = pl.DataFrame()
    popular_yado_area_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label_yado.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])

            # candidateの作成
            top10_yado_area_candidate_fold = popular_yado_sort.group_by(area).head(top).with_columns(pl.lit(fold).alias('fold')).select([area,'yad_no','fold'])
            top10_yado_area_candidate = pl.concat([top10_yado_area_candidate,top10_yado_area_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_area_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_area_feature_fold = (popular_yado_area_feature_fold
                                            .group_by(area)
                                            .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))
            popular_yado_area_feature = pl.concat([popular_yado_area_feature,popular_yado_area_feature_fold])
            
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label_yado.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])
        top10_yado_area_candidate = popular_yado_sort.group_by(area).head(top).select([area,'yad_no'])
        
        # 簡易的な特徴量も作成しておく。
        popular_yado_area_feature = (popular_yado_sort
                                    .group_by(area)
                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))

    popular_yado_area_feature = popular_yado_area_feature.drop('count')
    
    return top10_yado_area_candidate,popular_yado_area_feature

In [10]:
def create_latest_next_booking_tonN_candidate(log,label,train_test='train',top=10):
    """
    直近見た宿で、次にどこを予約しやすいか。
    """
    log_latest = train_log.group_by('session_id').tail(1)
    log_latest = log_latest.rename({'yad_no':'latest_yad_no'})
    log_latest = log_latest.join(label,how='left',on='session_id')

    # labelデータを使うので、学習データはtrain/validで分割して作成。
    latest_next_booking_tonN_candidate = pl.DataFrame()
    latest_next_booking_tonN_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_log_latest = log_latest.filter(pl.col('fold') != fold)
            train_log_latest = train_log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])
    
            # candidateの作成
            latest_next_booking_tonN_candidate_fold = train_log_latest.group_by('latest_yad_no').head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','latest_yad_no','fold'])
            latest_next_booking_tonN_candidate = pl.concat([latest_next_booking_tonN_candidate,latest_next_booking_tonN_candidate_fold])
    
            # 簡易的な特徴量も作成しておく。
            latest_next_booking_tonN_feature_fold = train_log_latest.with_columns(pl.lit(fold).alias('fold'))
            latest_next_booking_tonN_feature_fold = (latest_next_booking_tonN_feature_fold
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
            latest_next_booking_tonN_feature = pl.concat([latest_next_booking_tonN_feature,latest_next_booking_tonN_feature_fold])
    else:
        log_latest = log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])

        # candidateの作成
        latest_next_booking_tonN_candidate = log_latest.group_by('latest_yad_no').head(top).select(['yad_no','latest_yad_no'])

        # 簡易的な特徴量も作成しておく。
        latest_next_booking_tonN_feature = (log_latest
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
    latest_next_booking_tonN_feature = latest_next_booking_tonN_feature.drop('count')
    return latest_next_booking_tonN_candidate,latest_next_booking_tonN_feature

In [11]:
train_past_view_yado_candidates,train_past_view_yado_feature = create_past_view_yado_candidates(train_log)
test_past_view_yado_candidates,test_past_view_yado_feature = create_past_view_yado_candidates(test_log)

In [12]:
train_top20_popular_yado_candidates,train_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='train',top=10)
test_top20_popular_yado_candidates,test_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='test',top=10)

In [13]:
train_top10_wid_popular_yado_candidates,train_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10)
test_top10_wid_popular_yado_candidates,test_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='wid_cd',top=10)

train_top10_ken_popular_yado_candidates,train_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='ken_cd',top=10)
test_top10_ken_popular_yado_candidates,test_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='ken_cd',top=10)

train_top10_lrg_popular_yado_candidates,train_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='lrg_cd',top=10)
test_top10_lrg_popular_yado_candidates,test_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='lrg_cd',top=10)

train_top10_sml_popular_yado_candidates,train_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='sml_cd',top=10)
test_top10_sml_popular_yado_candidates,test_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='sml_cd',top=10)

In [14]:
train_latest_next_booking_top20_candidate,train_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='train',top=10)
test_latest_next_booking_top20_candidate,test_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='test',top=10)

In [15]:
train_top10_covisit_candidates = create_topN_covisit_candidates(train_log,top=10)
test_top10_covisit_candidates = create_topN_covisit_candidates(test_log,top=10)

In [16]:
# parquet形式で保存
train_past_view_yado_candidates.write_parquet(CFG.path_exp / "train_past_view_yado_candidates.parquet")
test_past_view_yado_candidates.write_parquet(CFG.path_exp / "test_past_view_yado_candidates.parquet")

train_past_view_yado_feature.write_parquet(CFG.path_exp / "train_past_view_yado_feature.parquet")
test_past_view_yado_feature.write_parquet(CFG.path_exp / "test_past_view_yado_feature.parquet")

train_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top20_popular_yado_candidates.parquet")
test_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top20_popular_yado_candidates.parquet")

train_top20_popular_yado_feature.write_parquet(CFG.path_exp / "train_top20_popular_yado_feature.parquet")
test_top20_popular_yado_feature.write_parquet(CFG.path_exp / "test_top20_popular_yado_feature.parquet")

train_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_candidates.parquet")
test_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_candidates.parquet")

train_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_feature.parquet")
test_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_feature.parquet")

train_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_candidates.parquet")
test_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_candidates.parquet")

train_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_feature.parquet")
test_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_feature.parquet")

train_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_candidates.parquet")
test_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_candidates.parquet")

train_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_feature.parquet")
test_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_feature.parquet")

train_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_candidates.parquet")
test_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_candidates.parquet")

train_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_feature.parquet")
test_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_feature.parquet")

train_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_candidates.parquet")
test_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_candidates.parquet")

train_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_feature.parquet")
test_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_feature.parquet")

train_top10_covisit_candidates.write_parquet(CFG.path_exp / "train_top10_covisit_candidates.parquet")
test_top10_covisit_candidates.write_parquet(CFG.path_exp / "test_top10_covisit_candidates.parquet")

## Feature

### candidate結合

In [17]:
# area単位のは多すぎるので、今回は除外。
candidate_name_list = ['past_view_yado',
                    #   'top20_popular_yado',
                      'top10_wid_popular_yado',
                    #   'top10_ken_popular_yado',
                    #   'top10_lrg_popular_yado',
                    #   'top10_sml_popular_yado',
                       "top10_covisit",
                       'latest_next_booking_top20']

In [18]:
def get_session_id_list(log):
    return log.group_by('session_id').head(1).select(['session_id'])

In [19]:
train_session_id = get_session_id_list(train_log)
train_session_id = train_session_id.join(label.select(['fold','session_id']),how='left',on='session_id')

test_session_id = get_session_id_list(test_log)

In [20]:
# 各candidateを結合
candidate_list = {}
candidate_list['train'] = []
candidate_list['test'] = []

for train_test in ['train','test']:
    for candidate_name in tqdm(candidate_name_list):
        candidate = pl.read_parquet(CFG.path_exp / f"{train_test}_{candidate_name}_candidates.parquet")
        if 'session_id' in candidate.columns:
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
        elif 'latest_yad_no' in candidate.columns:
            if train_test == 'train':
                latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                latest_yad_no = latest_yad_no.join(label.select(['session_id','fold']),how='left',on='session_id')
                latest_yad_no = latest_yad_no.with_columns(pl.col('fold').cast(pl.Int32))
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no','fold'])
            else:
                latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no'])
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
                # エリア系のやつ
        elif "wid" in candidate_name:
            print("wid in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                display(latest_yad_no)
                display(candidate)
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd", "fold"]
                )
                display(candidate)
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "ken" in candidate_name:
            print("ken in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "lrg" in candidate_name:
            print("lrg in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "sml" in candidate_name:
            print("sml in candidate_name")
            if train_test == "train":
                # 最後に見た宿を取得
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                # foldを付与
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )       
        else:
            if train_test == 'train':
                if 'fold' in candidate.columns:
                    candidate_all = pl.DataFrame()
                    for fold in range(CFG.fold_num):
                        candidate_fold = train_session_id.filter(pl.col('fold') == fold).join(candidate.filter(pl.col('fold') == fold).select(['yad_no']),how='cross')
                        candidate_all = pl.concat([candidate_all,candidate_fold])
            else:
                candidate_all = test_session_id.join(candidate.select(['yad_no']),how='cross')
            candidate_list[train_test].append(candidate_all.select(['session_id','yad_no']))

  0%|          | 0/4 [00:00<?, ?it/s]

wid in candidate_name


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…"
"""baf602f090b7b7…",10876,0,0,247.0,1.0,0,1.0,,,1.0,"""e9316013ee1b03…","""21a8fca4573868…","""d20208936fbd81…","""394a5403d8c5d0…"
"""7a0055ccb4892f…",1082,2,0,51.0,,0,,,,1.0,"""8a1c0d3243bba1…","""94d4f7dc1971d3…","""a52a65a7a9bf31…","""760ccad1c5b1ef…"
"""e5091753e0ef5e…",3650,4,0,70.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""c86352f5b57e80…","""9d6a46da05976c…","""4a4c8d9d06f383…"
"""4196fe431ea861…",2902,2,0,96.0,1.0,0,1.0,,,1.0,"""e9316013ee1b03…","""66c4d01ad8e301…","""7763c74e2efa67…","""084c46af580a48…"
"""0b72cc9c7fd89e…",8953,1,0,51.0,,1,,1.0,,,"""e9316013ee1b03…","""84efa50e52f9b4…","""6991a9d2e7fe40…","""51450d88ac85e2…"
"""a6341b5e8937f6…",4138,1,0,25.0,1.0,1,,,,1.0,"""d86102dd9c232b…","""3831f43bb997a3…","""3671baf0a3a421…","""5bc1063a432b3b…"
"""90ddbe0fc110d9…",5149,0,0,205.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""572d60f0f5212a…","""8748b5a626046f…","""8a3f01bdf9eb39…"
"""b7d050fbaa087c…",830,3,0,85.0,1.0,0,1.0,,,1.0,"""89e181a4091476…","""2b99151dba9558…","""a9009f29bc3252…","""9dedfc2a42ed15…"
"""e6c3de1212f80b…",6863,3,0,51.0,1.0,1,,,,1.0,"""f0112abf369fb0…","""bd054cc265d68a…","""c3e3c099b1eec7…","""7ef7ae126238f8…"


wid_cd,yad_no,fold
str,i64,i32
"""43875109d1dab9…",6731,0
"""43875109d1dab9…",6556,0
"""43875109d1dab9…",9981,0
"""43875109d1dab9…",8602,0
"""43875109d1dab9…",2848,0
"""43875109d1dab9…",441,0
"""43875109d1dab9…",1143,0
"""43875109d1dab9…",9308,0
"""43875109d1dab9…",11636,0
"""43875109d1dab9…",10749,0


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,yad_no
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str,i64
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",1882
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",8140
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",13717
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",4856
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",8677
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",3184
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",6602
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",825
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",2259
"""d0889cb9ca408d…",3853,2,0,48.0,1.0,0,,,,,"""b07b75d367ebec…","""0a66f6ab9c0507…","""333bcf3094c1ca…","""4a70d8767ca3a1…",12279


100%|██████████| 4/4 [00:00<00:00,  6.58it/s]
100%|██████████| 4/4 [00:00<00:00, 25.14it/s]

wid in candidate_name





In [21]:
train_candidate = pl.concat(candidate_list['train']).unique()
test_candidate = pl.concat(candidate_list['test']).unique()

In [22]:
del candidate_list
gc.collect()

0

### 特徴量作成

In [23]:
train_candidate = train_candidate.join(label.rename({'yad_no':'target'}),how='left',on='session_id')
train_candidate = train_candidate.with_columns(pl.col('fold').cast(pl.Int32))
train_candidate = train_candidate.with_columns((pl.col('yad_no') == pl.col('target')).alias('target').cast(pl.Int8))

In [24]:
train_latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
test_latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})

In [25]:
train_candidate = train_candidate.join(train_latest_yad_no,how='left',on='session_id')
test_candidate = test_candidate.join(test_latest_yad_no,how='left',on='session_id')

In [26]:
feature_name_list = ['latest_next_booking_top20',
                     'past_view_yado',
                      'top20_popular_yado',
                      'top10_wid_popular_yado',
                      'top10_ken_popular_yado',
                      'top10_lrg_popular_yado',
                      'top10_sml_popular_yado']

In [27]:
for train_test in ['train','test']:
    for feature_name in tqdm(feature_name_list):
        feature = pl.read_parquet(CFG.path_exp / f"{train_test}_{feature_name}_feature.parquet")
        if train_test == 'train':
            # for fold in range(CFG.fold_num):
            if 'session_id' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','latest_yad_no','yad_no'])
            else:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','yad_no'])
        else:
            if 'session_id' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['latest_yad_no','yad_no'])
            else:
                test_candidate = test_candidate.join(feature,how='left',on=['yad_no'])

100%|██████████| 7/7 [00:01<00:00,  5.74it/s]
100%|██████████| 7/7 [00:00<00:00, 11.02it/s]


In [28]:
train_candidate = train_candidate.fill_null(0)
test_candidate = test_candidate.fill_null(0)

In [29]:
train_candidate = train_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')
test_candidate = test_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')

In [30]:
# 各seqで見た宿を特徴量にする
for seq in range(8):
    seq_yad_no = train_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    train_candidate = train_candidate.join(seq_yad_no,how='left',on='session_id')

    seq_yad_no = test_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    test_candidate = test_candidate.join(seq_yad_no,how='left',on='session_id')

In [31]:
def create_num_picture_df(yado_df):
    # yad_noとcategoryごとのデータ件数
    _df = yado_df.group_by(["yad_no", "category"]).count()

    # ピボットテーブルに変換
    num_picture_df = _df.pivot("count", "yad_no", "category", "sum").sort("yad_no")

    # 欠損を0で埋める
    num_picture_df = num_picture_df.fill_null(0)

    return num_picture_df

In [32]:
# 各カテゴリの画像枚数
num_picture_df = create_num_picture_df(yado_embedding)

# 画像枚数を結合
train_candidate = train_candidate.join(num_picture_df, how="left", on="yad_no")
test_candidate = test_candidate.join(num_picture_df, how="left", on="yad_no")

In [33]:
train_candidate

session_id,yad_no,target,fold,latest_yad_no,latest_next_booking_rank,max_seq_no,max_seq_no_diff,session_view_count,reservation_counts,popular_rank,wid_cd,popular_wid_cd_rank,ken_cd,popular_ken_cd_rank,lrg_cd,popular_lrg_cd_rank,sml_cd,popular_sml_cd_rank,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,seq_0_yad_no,seq_1_yad_no,seq_2_yad_no,seq_3_yad_no,seq_4_yad_no,seq_5_yad_no,seq_6_yad_no,seq_7_yad_no,room,food,exterior,others,facility
str,i64,i8,i32,i64,u32,i64,i64,u32,u32,i64,str,u32,str,u32,str,u32,str,u32,i64,f64,f64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,u32,u32,u32,u32,u32
"""0024b67ec2147a…",11800,0,2,11800,0,2,2,2,16,4559,"""f0112abf369fb0…",103,"""bd054cc265d68a…",55,"""5b1499dd1a709e…",12,"""0189f42d884153…",4,0,113.0,1.0,0,1.0,,,1.0,11800,4634,11800,,,,,,3,3,2,3,3
"""002ba2bac67ccc…",1960,0,1,1960,0,2,2,2,35,2035,"""3300cf6f774b7c…",31,"""689cf8289e7ea0…",7,"""8b712435430a68…",1,"""2dfac18c3e1cec…",1,0,190.0,1.0,0,1.0,,,1.0,1960,13104,1960,,,,,,3,3,3,3,3
"""006cb399fa8d5e…",10287,1,4,3002,3,1,1,1,130,88,"""46e33861f921c3…",37,"""107c7305a74c8d…",16,"""d153c8fd78bfad…",3,"""93bb8a3bdcfb29…",3,0,226.0,1.0,0,1.0,,,1.0,10287,3002,,,,,,,2,3,3,3,3
"""0083b231f2f05d…",3724,0,3,4372,4,1,1,1,54,900,"""b07b75d367ebec…",48,"""0a66f6ab9c0507…",48,"""4713062d683b3b…",5,"""23a40c053a6e6b…",5,0,137.0,1.0,0,1.0,,,1.0,3724,4372,,,,,,,3,3,3,3,3
"""0092fe66fbea8e…",8725,0,4,8725,0,2,2,2,8,6990,"""46e33861f921c3…",142,"""c86352f5b57e80…",87,"""49c95c1add03ed…",14,"""7a1147e7618fb7…",14,0,70.0,1.0,0,,,,1.0,8725,13703,8725,,,,,,3,3,3,3,3
"""00a94a702bbe4f…",10095,0,3,2974,1,2,2,1,480,1,"""46e33861f921c3…",1,"""572d60f0f5212a…",1,"""8a623b960557e8…",1,"""f7b42d92528e7a…",1,0,2007.0,1.0,0,,,,1.0,10095,12707,2974,,,,,,3,3,3,3,3
"""011ca817cdac5b…",10415,1,2,1372,1,2,1,1,49,1079,"""46e33861f921c3…",102,"""572d60f0f5212a…",34,"""8a623b960557e8…",30,"""ab9480fd72a44d…",3,0,114.0,1.0,0,1.0,,,1.0,1372,10415,1372,,,,,,3,3,3,2,3
"""015855ebd9d6c5…",4367,0,2,39,1,2,2,1,73,417,"""f0112abf369fb0…",48,"""072c85e1653e10…",40,"""810a15fb99b13c…",2,"""bacd9adfafe9ef…",1,0,339.0,1.0,0,1.0,,,1.0,4367,211,39,,,,,,3,2,3,1,3
"""0209c523fd35ac…",5889,1,2,5510,4,1,1,1,13,5491,"""321b69d5eec98f…",53,"""39c3eb151762dd…",27,"""d9c4641b191036…",20,"""4a22094c55f841…",20,0,204.0,1.0,0,1.0,,,1.0,5889,5510,,,,,,,3,0,2,0,1
"""02642affffa79d…",9802,1,2,8223,1,2,1,1,41,1527,"""dc414a17890cfc…",60,"""6920865be128aa…",24,"""25ba24115c14a3…",10,"""bab5738ccde47c…",9,0,80.0,1.0,0,1.0,,,1.0,8223,9802,8223,,,,,,3,3,3,3,3


In [34]:
# 特徴量保存
train_candidate.write_parquet(CFG.path_exp / "train_candidate.parquet")
test_candidate.write_parquet(CFG.path_exp / "test_candidate.parquet")

In [35]:
test_candidate.shape

(3690374, 38)

In [36]:
train_candidate.get_column("target").sum() / len(label)

0.5769038926490658

## ReRankモデル

In [45]:
train = pl.read_parquet(CFG.path_exp / "train_candidate.parquet")
test = pl.read_parquet(CFG.path_exp / "test_candidate.parquet")

In [46]:
# session_id単位でtarge=1がなければ、session_idごと削除
use_session_ids = train.group_by('session_id').agg(pl.col('target').sum()).filter(pl.col('target') == 1)['session_id']

### Lightgbmで学習

In [47]:
# Lightgbmで学習
lgbm_params = {
        "objective": "lambdarank",
        "metric": "map",
        "eval_at": 10,
        'boosting_type' : 'gbdt',
        'seed' : CFG.seed,
        'learning_rate': 0.2, 
}

lgb_model_list = []
pred = np.zeros(len(test))
for fold in range(CFG.fold_num):
    _train = train_candidate.filter(
        (pl.col("session_id").is_in(use_session_ids)) & (pl.col("fold") != fold)
    ).drop(["fold"])
    _valid = train_candidate.filter(
        (pl.col("session_id").is_in(use_session_ids)) & (pl.col("fold") == fold)
    ).drop(["fold"])

    # pandasの方が扱いやすいので変換
    _train = _train.to_pandas()
    _valid = _valid.to_pandas()

    # train, validそれぞれsession_idでソート
    _train = _train.sort_values("session_id").reset_index(drop=True)
    _valid = _valid.sort_values("session_id").reset_index(drop=True)

    # train, validのsession_idの数を取得
    train_query = _train.groupby("session_id")["session_id"].count().to_list()
    valid_query = _valid.groupby("session_id")["session_id"].count().to_list()

    # train, validのtargetを取得
    Y_train = _train["target"]
    Y_valid = _valid["target"]

    # train, validのsession_id, targetを削除
    X_train = _train.drop(["session_id", "target"], axis=1)
    X_valid = _valid.drop(["session_id", "target"], axis=1)

    for feature in CFG.cat_features:
        X_train[feature] = X_train[feature].astype("category")
        X_valid[feature] = X_valid[feature].astype("category")

    train_dataset = lgb.Dataset(X_train, Y_train, group=train_query)
    valid_dataset = lgb.Dataset(X_valid, Y_valid, group=valid_query)
    lgb_model = lgb.train(
        params=lgbm_params,
        train_set=train_dataset,
        valid_sets=[train_dataset, valid_dataset],
        callbacks=[lgb.early_stopping(100), lgb.log_evaluation(1000)],
        **CFG.lgb_train_params,
    )

    lgb_model_list.append(lgb_model)

    del X_train, Y_train, X_valid, Y_valid, train_dataset, valid_dataset
    gc.collect()

[LightGBM] [Info] Total groups: 133051, total data: 2810673
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.149017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19221
[LightGBM] [Info] Number of data points in the train set: 2810673, number of used features: 37
[LightGBM] [Info] Total groups: 33500, total data: 707092
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	training's map@10: 0.744541	valid_1's map@10: 0.703156
[LightGBM] [Info] Total groups: 133303, total data: 2814790
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19235
[LightGBM] [Info] Number of data points in the t

### 推論

In [48]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

In [49]:
def create_top_10_yad_predict(_df):

    # セッションごとに予測確率の高い順に yad_no の配列を作成
    _agg = _df.sort_values("predict", ascending=False).groupby("session_id")["yad_no"].apply(list)

    out_df = pd.DataFrame(index=_agg.index, data=_agg.values.tolist()).iloc[:, :10]

    return out_df

In [50]:
oof = pd.DataFrame()
test = test.with_columns(pl.lit(0).alias('predict'))
X_test = test.drop(['session_id'])
X_test = X_test.to_pandas()
for feature in CFG.cat_features:
    X_test[feature] = X_test[feature].astype('category')

for fold in range(CFG.fold_num):
    X_valid = train.filter((pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('fold') == fold))['target'].to_numpy()

    # pandasの方が扱いやすいので変換
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_valid[feature] = X_valid[feature].astype('category')

    X_valid['predict'] = lgb_model_list[fold].predict(X_valid)
    X_valid['session_id'] = train.filter((pl.col('fold') == fold))['session_id'].to_numpy()
    X_test['predict'] += lgb_model_list[fold].predict(X_test.drop('predict',axis=1))/CFG.fold_num
    oof = pd.concat([oof,X_valid[['session_id','predict','yad_no']]])
X_test['session_id'] = test['session_id'].to_numpy()

In [51]:
oof = oof.sort_values(['session_id','predict'],ascending=False)
oof_ = create_top_10_yad_predict(oof)

In [52]:
label = pd.read_csv(CFG.path_input / "train_label.csv")

mapk(actual=label[label['session_id'].isin(oof_.reset_index()['session_id'])].sort_values('session_id',ascending=True)['yad_no'].to_list(),
     predicted=oof_.values.tolist(), k=10)

0.40549688904320075

In [53]:
oof_.to_csv(CFG.path_exp / "oof.csv")

In [54]:
# CV = 0.35497319410061673

In [55]:
sub = create_top_10_yad_predict(X_test)
sub.columns = [f'predict_{c}' for c in sub.columns]
sub = sub.reset_index(drop=True)

In [56]:
sub.to_csv(CFG.path_exp / "submission.csv", index=False)

In [57]:
sub

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,2986,4714,9830,11561,2680,5466,2305,4420,10233
1,2439,9323,11923,12862,613,11237,8108,6555,7014,7913
2,757,9190,410,9910,1774,10485,7710,6721,6730,3400
3,12341,13610,3359,6991,5657,7049,277,9319,4180,10746
4,607,763,11480,5650,1448,7246,12524,3187,6576,5106
...,...,...,...,...,...,...,...,...,...,...
174695,1997,11123,2278,7888,5744,7062,9543,10997,9743,11848
174696,6874,2232,1227,2164,9113,13702,4014,9723,3644,13220
174697,11037,2087,12240,7379,7308,13797,5810,13719,8143,12939
174698,13672,1687,4483,5515,2407,11496,2373,3002,12281,5513
