## Candidate

In [1]:
import os
from tqdm import tqdm
import gc
import polars as pl
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold,KFold
import lightgbm as lgb

In [2]:
class CFG:
    name = "exp018"

    path_input = Path("../input")
    path_output = Path("../output")
    seed = 127
    fold_num = 5

    cat_features = ['yad_no','latest_yad_no','wid_cd','ken_cd','lrg_cd','sml_cd']

    lgb_train_params = {
        "num_boost_round": 999999,
    }

CFG.path_exp = CFG.path_output / CFG.name
CFG.path_exp.mkdir(parents=True, exist_ok=True)

In [3]:
train_log = pl.read_csv(CFG.path_input / "train_log.csv")
label = pl.read_csv(CFG.path_input / "train_label.csv")
test_log = pl.read_csv(CFG.path_input / "test_log.csv")
yado = pl.read_csv(CFG.path_input / "yado.csv")
yado_embedding = pl.read_parquet(CFG.path_input / "image_embeddings.parquet")

In [4]:
# Group Kfold, labelにfold情報を付けておく。
kf = KFold(n_splits=CFG.fold_num,shuffle=True,random_state=CFG.seed)
fold_assignments = np.full(label.height, -1, dtype=int)
for i, (_, valid_index) in enumerate(kf.split(label)):
    fold_assignments[valid_index] = i
label = label.with_columns(pl.Series("fold", fold_assignments))

In [5]:
def create_past_view_yado_candidates(log):
    """
    アクセスした宿をcandidateとして作成。ただし、直近の宿は予約しないので除外する。
    """
    max_seq_no = log.group_by("session_id").agg(pl.max("seq_no").alias("max_seq_no"))
    log = log.join(max_seq_no, on="session_id")
    # 最大値に該当する行を除外する
    past_yado_candidates = log.filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_candidates = past_yado_candidates.select(['session_id','yad_no']).unique()

    # 簡易的な特徴量も作成しておく。
    # 何個前に見たか 複数回見た時は、直近のみ残す。
    past_yado_feature = log.with_columns((pl.col('max_seq_no') - pl.col('seq_no')).alias('max_seq_no_diff')).filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_feature = past_yado_feature.join(past_yado_feature.group_by(["session_id", "yad_no"]).agg(pl.col("max_seq_no_diff").max().alias("max_seq_no_diff")), on=["session_id", "yad_no", "max_seq_no_diff"])
    # 何回見たか
    session_view_count = log.group_by(['session_id','yad_no']).count().rename({'count':'session_view_count'})
    past_yado_feature = past_yado_feature.join(session_view_count,how='left',on=['session_id','yad_no']).drop('seq_no')
    
    return past_yado_candidates,past_yado_feature

In [6]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    # 共起ペアの作成
    df = df.join(df, on="session_id")
    # yad_noが同じものは除外する
    df = df.filter(pl.col("yad_no") != pl.col("yad_no_right"))
    # yad_noのペアごとに共起回数を計算
    df = df.group_by(["yad_no", "yad_no_right"]).count()
    # 整形
    df = df.rename(
        {
            "yad_no_right":"candidate_yad_no",
            "count":"co_visit_count",
        }
    )[["yad_no", "candidate_yad_no", "co_visit_count"]]

    return df

def create_topN_covisit_candidates(log_df: pl.DataFrame, top = 10):
    """
    共起行列から候補を作成する
    """
    # 共起行列の作成
    co_visit_matrix = generate_co_visit_matrix(log_df)

    # 最後に見た宿と紐づけてcandidateを作成するために、renameしておく
    co_visit_matrix = co_visit_matrix.rename({"yad_no":"latest_yad_no", "candidate_yad_no":"yad_no"})
    # 上からtop個の候補を取得
    topn_co_visit_matrix = co_visit_matrix.sort(['latest_yad_no','co_visit_count'],descending=[False,True]).group_by('latest_yad_no').head(top)

    # log_dfで最後に見た宿を取得
    log_latest = log_df.group_by("session_id").tail(1)
    log_latest = log_latest.rename({"yad_no": "latest_yad_no"})

    # 結合
    top_yado_co_candidate = log_latest.join(topn_co_visit_matrix, on="latest_yad_no")
    
    return top_yado_co_candidate.select(["session_id", "yad_no"])

In [7]:
def create_topN_popular_yado_candidates(label,train_test='train',top=10):
    """
    予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_candidate = pl.DataFrame()
    popular_yado_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label['yad_no'].value_counts().sort(by='counts',descending=True)

            # candidateの作成
            top10_yado_candidate_fold = popular_yado_sort.head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','fold'])
            top10_yado_candidate = pl.concat([top10_yado_candidate,top10_yado_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_feature_fold = popular_yado_feature_fold.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))
            popular_yado_feature = pl.concat([popular_yado_feature,popular_yado_feature_fold])
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label['yad_no'].value_counts().sort(by='counts',descending=True)
        top10_yado_candidate = popular_yado_sort.head(top).select(['yad_no'])

        # 簡易的な特徴量も作成しておく。
        popular_yado_feature = popular_yado_sort.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))

    popular_yado_feature = popular_yado_feature.rename({'counts':'reservation_counts'})
    
    return top10_yado_candidate,popular_yado_feature

In [8]:
def create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10):
    """
    エリア単位で予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    label_yado = label.join(yado,how='left',on='yad_no')
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_area_candidate = pl.DataFrame()
    popular_yado_area_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label_yado.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])

            # candidateの作成
            top10_yado_area_candidate_fold = popular_yado_sort.group_by(area).head(top).with_columns(pl.lit(fold).alias('fold')).select([area,'yad_no','fold'])
            top10_yado_area_candidate = pl.concat([top10_yado_area_candidate,top10_yado_area_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_area_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_area_feature_fold = (popular_yado_area_feature_fold
                                            .group_by(area)
                                            .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))
            popular_yado_area_feature = pl.concat([popular_yado_area_feature,popular_yado_area_feature_fold])
            
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label_yado.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])
        top10_yado_area_candidate = popular_yado_sort.group_by(area).head(top).select([area,'yad_no'])
        
        # 簡易的な特徴量も作成しておく。
        popular_yado_area_feature = (popular_yado_sort
                                    .group_by(area)
                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))

    popular_yado_area_feature = popular_yado_area_feature.drop('count')
    
    return top10_yado_area_candidate,popular_yado_area_feature

In [9]:
def create_latest_next_booking_tonN_candidate(log,label,train_test='train',top=10):
    """
    直近見た宿で、次にどこを予約しやすいか。
    """
    log_latest = train_log.group_by('session_id').tail(1)
    log_latest = log_latest.rename({'yad_no':'latest_yad_no'})
    log_latest = log_latest.join(label,how='left',on='session_id')

    # labelデータを使うので、学習データはtrain/validで分割して作成。
    latest_next_booking_tonN_candidate = pl.DataFrame()
    latest_next_booking_tonN_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_log_latest = log_latest.filter(pl.col('fold') != fold)
            train_log_latest = train_log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])
    
            # candidateの作成
            latest_next_booking_tonN_candidate_fold = train_log_latest.group_by('latest_yad_no').head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','latest_yad_no','fold'])
            latest_next_booking_tonN_candidate = pl.concat([latest_next_booking_tonN_candidate,latest_next_booking_tonN_candidate_fold])
    
            # 簡易的な特徴量も作成しておく。
            latest_next_booking_tonN_feature_fold = train_log_latest.with_columns(pl.lit(fold).alias('fold'))
            latest_next_booking_tonN_feature_fold = (latest_next_booking_tonN_feature_fold
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
            latest_next_booking_tonN_feature = pl.concat([latest_next_booking_tonN_feature,latest_next_booking_tonN_feature_fold])
    else:
        log_latest = log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])

        # candidateの作成
        latest_next_booking_tonN_candidate = log_latest.group_by('latest_yad_no').head(top).select(['yad_no','latest_yad_no'])

        # 簡易的な特徴量も作成しておく。
        latest_next_booking_tonN_feature = (log_latest
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
    latest_next_booking_tonN_feature = latest_next_booking_tonN_feature.drop('count')
    return latest_next_booking_tonN_candidate,latest_next_booking_tonN_feature

In [10]:
train_past_view_yado_candidates,train_past_view_yado_feature = create_past_view_yado_candidates(train_log)
test_past_view_yado_candidates,test_past_view_yado_feature = create_past_view_yado_candidates(test_log)

In [11]:
train_top20_popular_yado_candidates,train_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='train',top=10)
test_top20_popular_yado_candidates,test_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='test',top=10)

In [12]:
train_top10_wid_popular_yado_candidates,train_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10)
test_top10_wid_popular_yado_candidates,test_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='wid_cd',top=10)

train_top10_ken_popular_yado_candidates,train_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='ken_cd',top=10)
test_top10_ken_popular_yado_candidates,test_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='ken_cd',top=10)

train_top10_lrg_popular_yado_candidates,train_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='lrg_cd',top=10)
test_top10_lrg_popular_yado_candidates,test_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='lrg_cd',top=10)

train_top10_sml_popular_yado_candidates,train_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='sml_cd',top=10)
test_top10_sml_popular_yado_candidates,test_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='sml_cd',top=10)

In [13]:
train_latest_next_booking_top20_candidate,train_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='train',top=10)
test_latest_next_booking_top20_candidate,test_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='test',top=10)

In [14]:
train_top10_covisit_candidates = create_topN_covisit_candidates(train_log,top=10)
test_top10_covisit_candidates = create_topN_covisit_candidates(test_log,top=10)

In [16]:
# parquet形式で保存
train_past_view_yado_candidates.write_parquet(CFG.path_exp / "train_past_view_yado_candidates.parquet")
test_past_view_yado_candidates.write_parquet(CFG.path_exp / "test_past_view_yado_candidates.parquet")

train_past_view_yado_feature.write_parquet(CFG.path_exp / "train_past_view_yado_feature.parquet")
test_past_view_yado_feature.write_parquet(CFG.path_exp / "test_past_view_yado_feature.parquet")

train_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top20_popular_yado_candidates.parquet")
test_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top20_popular_yado_candidates.parquet")

train_top20_popular_yado_feature.write_parquet(CFG.path_exp / "train_top20_popular_yado_feature.parquet")
test_top20_popular_yado_feature.write_parquet(CFG.path_exp / "test_top20_popular_yado_feature.parquet")

train_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_candidates.parquet")
test_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_candidates.parquet")

train_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_feature.parquet")
test_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_feature.parquet")

train_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_candidates.parquet")
test_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_candidates.parquet")

train_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_feature.parquet")
test_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_feature.parquet")

train_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_candidates.parquet")
test_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_candidates.parquet")

train_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_feature.parquet")
test_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_feature.parquet")

train_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_candidates.parquet")
test_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_candidates.parquet")

train_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_feature.parquet")
test_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_feature.parquet")

train_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_candidates.parquet")
test_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_candidates.parquet")

train_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_feature.parquet")
test_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_feature.parquet")

train_top10_covisit_candidates.write_parquet(CFG.path_exp / "train_top10_covisit_candidates.parquet")
test_top10_covisit_candidates.write_parquet(CFG.path_exp / "test_top10_covisit_candidates.parquet")

## Feature

### candidate結合

In [17]:
# area単位のは多すぎるので、今回は除外。
candidate_name_list = ['past_view_yado',
                    #   'top20_popular_yado',
                      'top10_wid_popular_yado',
                    #   'top10_ken_popular_yado',
                    #   'top10_lrg_popular_yado',
                    #   'top10_sml_popular_yado',
                       "top10_covisit",
                       'latest_next_booking_top20']

In [18]:
def get_session_id_list(log):
    return log.group_by('session_id').head(1).select(['session_id'])

In [19]:
train_session_id = get_session_id_list(train_log)
train_session_id = train_session_id.join(label.select(['fold','session_id']),how='left',on='session_id')

test_session_id = get_session_id_list(test_log)

In [20]:
# 各candidateを結合
candidate_list = {}
candidate_list['train'] = []
candidate_list['test'] = []

for train_test in ['train','test']:
    for candidate_name in tqdm(candidate_name_list):
        candidate = pl.read_parquet(CFG.path_exp / f"{train_test}_{candidate_name}_candidates.parquet")
        if 'session_id' in candidate.columns:
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
        elif 'latest_yad_no' in candidate.columns:
            if train_test == 'train':
                latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                latest_yad_no = latest_yad_no.join(label.select(['session_id','fold']),how='left',on='session_id')
                latest_yad_no = latest_yad_no.with_columns(pl.col('fold').cast(pl.Int32))
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no','fold'])
            else:
                latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no'])
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
                # エリア系のやつ
        elif "wid" in candidate_name:
            print("wid in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                display(latest_yad_no)
                display(candidate)
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd", "fold"]
                )
                display(candidate)
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "ken" in candidate_name:
            print("ken in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "lrg" in candidate_name:
            print("lrg in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "sml" in candidate_name:
            print("sml in candidate_name")
            if train_test == "train":
                # 最後に見た宿を取得
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                # foldを付与
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )       
        else:
            if train_test == 'train':
                if 'fold' in candidate.columns:
                    candidate_all = pl.DataFrame()
                    for fold in range(CFG.fold_num):
                        candidate_fold = train_session_id.filter(pl.col('fold') == fold).join(candidate.filter(pl.col('fold') == fold).select(['yad_no']),how='cross')
                        candidate_all = pl.concat([candidate_all,candidate_fold])
            else:
                candidate_all = test_session_id.join(candidate.select(['yad_no']),how='cross')
            candidate_list[train_test].append(candidate_all.select(['session_id','yad_no']))

  0%|          | 0/4 [00:00<?, ?it/s]

wid in candidate_name


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…"
"""7368cee28c3d41…",4055,0,1,,1.0,0,,,,,"""c312e07b7a5d45…","""3acaaea4dab889…","""84e1a778ed49e7…","""de605258b7d5a1…"
"""6de5e94845f820…",13463,3,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…"
"""c7835f4ec554b1…",11700,0,0,151.0,1.0,1,,,1.0,1.0,"""b07b75d367ebec…","""0a66f6ab9c0507…","""b7bc640833cc20…","""637c7ce61d3838…"
"""e254e633feb043…",11930,0,0,88.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""52ca3d2824fc3c…","""1cdef7807dc6bc…"
"""6cd5f1fa95f545…",1219,3,0,,,1,,,,,"""dc414a17890cfc…","""31a0f630d36db5…","""f11d0a982fcea0…","""199073cb3739d7…"
"""ac5b4d7a29ca6a…",2370,3,0,52.0,1.0,0,,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…"
"""e29fced6a5d3e6…",5492,3,0,123.0,1.0,0,1.0,,,1.0,"""e9316013ee1b03…","""66c4d01ad8e301…","""41e20110b38f12…","""0aa9c9e83b1666…"
"""f151726226cc37…",4859,4,0,14.0,1.0,1,1.0,,,,"""8a1c0d3243bba1…","""ce83563814cff3…","""7c9ac287ff3567…","""5b3bd3d71adc0b…"
"""bef9f1baf4e780…",13608,4,0,144.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""ce3aaf25e7e38a…","""d5033a3a12e84a…","""748d9b1f0f1b7d…"


wid_cd,yad_no,fold
str,i64,i32
"""dc414a17890cfc…",2445,0
"""dc414a17890cfc…",11407,0
"""dc414a17890cfc…",13106,0
"""dc414a17890cfc…",4744,0
"""dc414a17890cfc…",12946,0
"""dc414a17890cfc…",12962,0
"""dc414a17890cfc…",10868,0
"""dc414a17890cfc…",5566,0
"""dc414a17890cfc…",11499,0
"""dc414a17890cfc…",3521,0


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,yad_no
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str,i64
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",13017
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",719
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",1818
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",8567
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",9971
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",2201
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",693
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",3988
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",1799
"""d87745166ba696…",13463,1,0,217.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…",6418


100%|██████████| 4/4 [00:00<00:00, 10.60it/s]
100%|██████████| 4/4 [00:00<00:00, 24.47it/s]

wid in candidate_name





In [21]:
train_candidate = pl.concat(candidate_list['train']).unique()
test_candidate = pl.concat(candidate_list['test']).unique()

In [22]:
del candidate_list
gc.collect()

0

### 特徴量作成

In [23]:
train_candidate = train_candidate.join(label.rename({'yad_no':'target'}),how='left',on='session_id')
train_candidate = train_candidate.with_columns(pl.col('fold').cast(pl.Int32))
train_candidate = train_candidate.with_columns((pl.col('yad_no') == pl.col('target')).alias('target').cast(pl.Int8))

In [24]:
train_latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
test_latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})

In [25]:
train_candidate = train_candidate.join(train_latest_yad_no,how='left',on='session_id')
test_candidate = test_candidate.join(test_latest_yad_no,how='left',on='session_id')

In [26]:
feature_name_list = ['latest_next_booking_top20',
                     'past_view_yado',
                      'top20_popular_yado',
                      'top10_wid_popular_yado',
                      'top10_ken_popular_yado',
                      'top10_lrg_popular_yado',
                      'top10_sml_popular_yado']

In [27]:
for train_test in ['train','test']:
    for feature_name in tqdm(feature_name_list):
        feature = pl.read_parquet(CFG.path_exp / f"{train_test}_{feature_name}_feature.parquet")
        if train_test == 'train':
            # for fold in range(CFG.fold_num):
            if 'session_id' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','latest_yad_no','yad_no'])
            else:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','yad_no'])
        else:
            if 'session_id' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['latest_yad_no','yad_no'])
            else:
                test_candidate = test_candidate.join(feature,how='left',on=['yad_no'])

100%|██████████| 7/7 [00:00<00:00,  7.60it/s]
100%|██████████| 7/7 [00:00<00:00, 18.04it/s]


In [28]:
train_candidate = train_candidate.fill_null(0)
test_candidate = test_candidate.fill_null(0)

In [29]:
train_candidate = train_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')
test_candidate = test_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')

In [30]:
# 各seqで見た宿を特徴量にする
for seq in range(8):
    seq_yad_no = train_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    train_candidate = train_candidate.join(seq_yad_no,how='left',on='session_id')

    seq_yad_no = test_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    test_candidate = test_candidate.join(seq_yad_no,how='left',on='session_id')

In [31]:
def create_num_picture_df(yado_df):
    # yad_noとcategoryごとのデータ件数
    _df = yado_df.group_by(["yad_no", "category"]).count()

    # ピボットテーブルに変換
    num_picture_df = _df.pivot("count", "yad_no", "category", "sum").sort("yad_no")

    # 欠損を0で埋める
    num_picture_df = num_picture_df.fill_null(0)

    return num_picture_df

In [32]:
# 各カテゴリの画像枚数
num_picture_df = create_num_picture_df(yado_embedding)

# 画像枚数を結合
train_candidate = train_candidate.join(num_picture_df, how="left", on="yad_no")
test_candidate = test_candidate.join(num_picture_df, how="left", on="yad_no")

In [33]:
train_candidate

session_id,yad_no,target,fold,latest_yad_no,latest_next_booking_rank,max_seq_no,max_seq_no_diff,session_view_count,reservation_counts,popular_rank,wid_cd,popular_wid_cd_rank,ken_cd,popular_ken_cd_rank,lrg_cd,popular_lrg_cd_rank,sml_cd,popular_sml_cd_rank,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,seq_0_yad_no,seq_1_yad_no,seq_2_yad_no,seq_3_yad_no,seq_4_yad_no,seq_5_yad_no,seq_6_yad_no,seq_7_yad_no,exterior,food,facility,others,room
str,i64,i8,i32,i64,u32,i64,i64,u32,u32,i64,str,u32,str,u32,str,u32,str,u32,i64,f64,f64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,u32,u32,u32,u32,u32
"""0003948318658b…",569,1,2,7246,0,1,1,1,89,245,"""e9316013ee1b03…",30,"""66c4d01ad8e301…",28,"""7763c74e2efa67…",28,"""084c46af580a48…",18,0,103.0,1.0,0,,,,1.0,569,7246,,,,,,,3,3,3,3,2
"""010341130f151b…",4277,0,0,4277,0,2,2,2,11,6020,"""e9316013ee1b03…",100,"""c71d257a01a809…",21,"""19caa95d568040…",12,"""b021781d1d2b93…",11,0,21.0,1.0,1,,,,,4277,7083,4277,,,,,,3,3,3,3,3
"""01345adc38c50e…",2769,1,0,13159,0,1,1,1,41,1549,"""46e33861f921c3…",107,"""c86352f5b57e80…",51,"""7974c7e841b95b…",9,"""e0dd9b1cf3be1d…",2,0,97.0,1.0,0,,,,,2769,13159,,,,,,,3,3,3,2,3
"""01d8236032ed45…",7896,1,2,12034,0,1,1,1,6,7612,"""dc414a17890cfc…",95,"""6920865be128aa…",59,"""056a9c29dccbdf…",9,"""76765dcbfb9ab1…",9,0,39.0,1.0,1,,,,,7896,12034,,,,,,,3,2,3,3,1
"""01dbef024fc1ae…",6958,1,1,4905,5,1,1,1,30,2480,"""dc414a17890cfc…",72,"""6920865be128aa…",33,"""029040f3b54732…",11,"""93c4520208aef9…",8,0,85.0,1.0,0,,,,,6958,4905,,,,,,,2,3,3,3,2
"""025094b783402f…",3421,1,3,7795,3,1,1,1,33,2209,"""321b69d5eec98f…",33,"""0745a2107686fc…",26,"""bd9ca9b95bfc52…",4,"""b731123b3a8076…",4,0,108.0,,1,,,,,3421,7795,,,,,,,3,1,2,1,3
"""025e0fe1b421b0…",9292,1,1,6471,4,1,1,1,93,226,"""d86102dd9c232b…",7,"""b4d2fb4e51ea7b…",6,"""29b680542fd396…",6,"""b4b4c63ff87803…",6,0,256.0,1.0,0,1.0,,,1.0,9292,6471,,,,,,,3,3,3,3,3
"""0267a259c295b6…",2598,0,1,6776,3,2,1,1,75,394,"""c312e07b7a5d45…",17,"""6692a692f80687…",17,"""8cf750072f8520…",17,"""ddd5616ecb2d2c…",16,0,224.0,1.0,0,1.0,,,1.0,1390,2598,6776,,,,,,3,2,3,3,3
"""026e4f7fbbbf9f…",4744,0,3,1755,1,1,1,1,159,48,"""dc414a17890cfc…",6,"""d78f53d0856617…",6,"""e5cfcc0a43c820…",6,"""e2f51242791849…",3,0,110.0,1.0,0,1.0,,,1.0,4744,1755,,,,,,,3,3,3,1,2
"""02ca34eb17b05f…",3214,0,3,12797,1,1,1,1,16,4522,"""8a1c0d3243bba1…",51,"""ce83563814cff3…",41,"""7c9ac287ff3567…",32,"""5b3bd3d71adc0b…",2,0,110.0,,1,,,,,3214,12797,,,,,,,3,3,3,2,2


In [34]:
# 特徴量保存
train_candidate.write_parquet(CFG.path_exp / "train_candidate.parquet")
test_candidate.write_parquet(CFG.path_exp / "test_candidate.parquet")

In [35]:
test_candidate.shape

(3690856, 38)

In [36]:
train_candidate.get_column("target").sum() / len(label)

0.5772156371017465

## ReRankモデル

In [37]:
train = pl.read_parquet(CFG.path_exp / "train_candidate.parquet")
test = pl.read_parquet(CFG.path_exp / "test_candidate.parquet")

In [38]:
# session_id単位でtarge=1がなければ、session_idごと削除
use_session_ids = train.group_by('session_id').agg(pl.col('target').sum()).filter(pl.col('target') == 1)['session_id']

### Lightgbmで学習

In [40]:
# Lightgbmで学習
lgbm_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type' : 'gbdt',
        'seed' : CFG.seed,
        'learning_rate': 0.1, 
}

lgb_model_list = []
pred = np.zeros(len(test))
for fold in range(CFG.fold_num):
    X_train = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') != fold)).drop(['fold','target','session_id'])
    Y_train = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') != fold))['target'].to_numpy()
    X_valid = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') == fold))['target'].to_numpy()


    # pandasの方が扱いやすいので変換
    X_train = X_train.to_pandas()
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_train[feature] = X_train[feature].astype('category')
        X_valid[feature] = X_valid[feature].astype('category')

    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_valid = lgb.Dataset(X_valid, Y_valid)
    lgb_model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid], 
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(200), lgb.log_evaluation(1000)],
                      **CFG.lgb_train_params
                    )

    lgb_model_list.append(lgb_model)

    del X_train,Y_train,X_valid,Y_valid,lgb_train,lgb_valid
    gc.collect()

[LightGBM] [Info] Number of positive: 133156, number of negative: 2681764
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.121517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19240
[LightGBM] [Info] Number of data points in the train set: 2814920, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047304 -> initscore=-3.002709
[LightGBM] [Info] Start training from score -3.002709
Training until validation scores don't improve for 200 rounds
[1000]	train's auc: 0.976349	valid's auc: 0.937737
Early stopping, best iteration is:
[830]	train's auc: 0.974468	valid's auc: 0.937755
[LightGBM] [Info] Number of positive: 133353, number of negative: 2682902
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144876 seconds.
You can set `force_row_wise=true` to remove the overhead.
An

### 推論

In [41]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

In [42]:
def create_top_10_yad_predict(_df):

    # セッションごとに予測確率の高い順に yad_no の配列を作成
    _agg = _df.sort_values("predict", ascending=False).groupby("session_id")["yad_no"].apply(list)

    out_df = pd.DataFrame(index=_agg.index, data=_agg.values.tolist()).iloc[:, :10]

    return out_df

In [43]:
oof = pd.DataFrame()
test = test.with_columns(pl.lit(0).alias('predict'))
X_test = test.drop(['session_id'])
X_test = X_test.to_pandas()
for feature in CFG.cat_features:
    X_test[feature] = X_test[feature].astype('category')

for fold in range(CFG.fold_num):
    X_valid = train.filter((pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('fold') == fold))['target'].to_numpy()

    # pandasの方が扱いやすいので変換
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_valid[feature] = X_valid[feature].astype('category')

    X_valid['predict'] = lgb_model_list[fold].predict(X_valid)
    X_valid['session_id'] = train.filter((pl.col('fold') == fold))['session_id'].to_numpy()
    X_test['predict'] += lgb_model_list[fold].predict(X_test.drop('predict',axis=1))/CFG.fold_num
    oof = pd.concat([oof,X_valid[['session_id','predict','yad_no']]])
X_test['session_id'] = test['session_id'].to_numpy()

In [44]:
oof = oof.sort_values(['session_id','predict'],ascending=False)
oof_ = create_top_10_yad_predict(oof)

In [45]:
label = pd.read_csv(CFG.path_input / "train_label.csv")

mapk(actual=label[label['session_id'].isin(oof_.reset_index()['session_id'])].sort_values('session_id',ascending=True)['yad_no'].to_list(),
     predicted=oof_.values.tolist(), k=10)

0.4010051600578836

In [46]:
oof_.to_csv(CFG.path_exp / "oof.csv")

In [47]:
# CV = 0.35497319410061673

In [48]:
sub = create_top_10_yad_predict(X_test)
sub.columns = [f'predict_{c}' for c in sub.columns]
sub = sub.reset_index(drop=True)

In [49]:
sub.to_csv(CFG.path_exp / "submission.csv", index=False)

In [50]:
sub

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4420,9830,4714,5466,11561,2680,9534,2911,13192
1,2439,9323,11923,613,10095,12350,6129,11237,12862,8108
2,757,9190,410,9910,10485,6721,6730,1774,7710,3400
3,12341,3359,13521,10746,6991,5657,9319,2047,1542,5080
4,607,763,11480,5650,1448,6576,6161,3476,2862,12029
...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,11123,2278,5744,7062,10997,9743,9543,11848
174696,2164,5331,4014,2232,13702,3644,9723,13220,3802,899
174697,5810,11037,2087,7379,8143,844,7308,11796,13719,12939
174698,13672,2407,6034,5515,1687,2373,5513,3002,4976,2305
