## Candidate

In [1]:
import os
from tqdm import tqdm
import gc
import polars as pl
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold,KFold
import lightgbm as lgb

In [2]:
class CFG:
    name = "exp014"

    path_input = Path("../input")
    path_output = Path("../output")
    seed = 127
    fold_num = 5

    cat_features = ['yad_no','latest_yad_no','wid_cd','ken_cd','lrg_cd','sml_cd']

    lgb_train_params = {
        "num_boost_round": 999999,
    }

CFG.path_exp = CFG.path_output / CFG.name
CFG.path_exp.mkdir(parents=True, exist_ok=True)

In [3]:
train_log = pl.read_csv(CFG.path_input / "train_log.csv")
label = pl.read_csv(CFG.path_input / "train_label.csv")
test_log = pl.read_csv(CFG.path_input / "test_log.csv")
yado = pl.read_csv(CFG.path_input / "yado.csv")
yado_embedding = pl.read_parquet(CFG.path_input / "image_embeddings.parquet")

In [4]:
# Group Kfold, labelにfold情報を付けておく。
kf = KFold(n_splits=CFG.fold_num,shuffle=True,random_state=CFG.seed)
fold_assignments = np.full(label.height, -1, dtype=int)
for i, (_, valid_index) in enumerate(kf.split(label)):
    fold_assignments[valid_index] = i
label = label.with_columns(pl.Series("fold", fold_assignments))

In [5]:
def create_past_view_yado_candidates(log):
    """
    アクセスした宿をcandidateとして作成。ただし、直近の宿は予約しないので除外する。
    """
    max_seq_no = log.group_by("session_id").agg(pl.max("seq_no").alias("max_seq_no"))
    log = log.join(max_seq_no, on="session_id")
    # 最大値に該当する行を除外する
    past_yado_candidates = log.filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_candidates = past_yado_candidates.select(['session_id','yad_no']).unique()

    # 簡易的な特徴量も作成しておく。
    # 何個前に見たか 複数回見た時は、直近のみ残す。
    past_yado_feature = log.with_columns((pl.col('max_seq_no') - pl.col('seq_no')).alias('max_seq_no_diff')).filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_feature = past_yado_feature.join(past_yado_feature.group_by(["session_id", "yad_no"]).agg(pl.col("max_seq_no_diff").max().alias("max_seq_no_diff")), on=["session_id", "yad_no", "max_seq_no_diff"])
    # 何回見たか
    session_view_count = log.group_by(['session_id','yad_no']).count().rename({'count':'session_view_count'})
    past_yado_feature = past_yado_feature.join(session_view_count,how='left',on=['session_id','yad_no']).drop('seq_no')
    
    return past_yado_candidates,past_yado_feature

In [6]:
def create_topN_popular_yado_candidates(label,train_test='train',top=10):
    """
    予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_candidate = pl.DataFrame()
    popular_yado_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label['yad_no'].value_counts().sort(by='counts',descending=True)

            # candidateの作成
            top10_yado_candidate_fold = popular_yado_sort.head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','fold'])
            top10_yado_candidate = pl.concat([top10_yado_candidate,top10_yado_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_feature_fold = popular_yado_feature_fold.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))
            popular_yado_feature = pl.concat([popular_yado_feature,popular_yado_feature_fold])
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label['yad_no'].value_counts().sort(by='counts',descending=True)
        top10_yado_candidate = popular_yado_sort.head(top).select(['yad_no'])

        # 簡易的な特徴量も作成しておく。
        popular_yado_feature = popular_yado_sort.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))

    popular_yado_feature = popular_yado_feature.rename({'counts':'reservation_counts'})
    
    return top10_yado_candidate,popular_yado_feature

In [7]:
def create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10):
    """
    エリア単位で予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    label_yado = label.join(yado,how='left',on='yad_no')
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_area_candidate = pl.DataFrame()
    popular_yado_area_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label_yado.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])

            # candidateの作成
            top10_yado_area_candidate_fold = popular_yado_sort.group_by(area).head(top).with_columns(pl.lit(fold).alias('fold')).select([area,'yad_no','fold'])
            top10_yado_area_candidate = pl.concat([top10_yado_area_candidate,top10_yado_area_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_area_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_area_feature_fold = (popular_yado_area_feature_fold
                                            .group_by(area)
                                            .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))
            popular_yado_area_feature = pl.concat([popular_yado_area_feature,popular_yado_area_feature_fold])
            
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label_yado.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])
        top10_yado_area_candidate = popular_yado_sort.group_by(area).head(top).select([area,'yad_no'])
        
        # 簡易的な特徴量も作成しておく。
        popular_yado_area_feature = (popular_yado_sort
                                    .group_by(area)
                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))

    popular_yado_area_feature = popular_yado_area_feature.drop('count')
    
    return top10_yado_area_candidate,popular_yado_area_feature

In [8]:
def create_latest_next_booking_tonN_candidate(log,label,train_test='train',top=10):
    """
    直近見た宿で、次にどこを予約しやすいか。
    """
    log_latest = train_log.group_by('session_id').tail(1)
    log_latest = log_latest.rename({'yad_no':'latest_yad_no'})
    log_latest = log_latest.join(label,how='left',on='session_id')

    # labelデータを使うので、学習データはtrain/validで分割して作成。
    latest_next_booking_tonN_candidate = pl.DataFrame()
    latest_next_booking_tonN_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_log_latest = log_latest.filter(pl.col('fold') != fold)
            train_log_latest = train_log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])
    
            # candidateの作成
            latest_next_booking_tonN_candidate_fold = train_log_latest.group_by('latest_yad_no').head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','latest_yad_no','fold'])
            latest_next_booking_tonN_candidate = pl.concat([latest_next_booking_tonN_candidate,latest_next_booking_tonN_candidate_fold])
    
            # 簡易的な特徴量も作成しておく。
            latest_next_booking_tonN_feature_fold = train_log_latest.with_columns(pl.lit(fold).alias('fold'))
            latest_next_booking_tonN_feature_fold = (latest_next_booking_tonN_feature_fold
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
            latest_next_booking_tonN_feature = pl.concat([latest_next_booking_tonN_feature,latest_next_booking_tonN_feature_fold])
    else:
        log_latest = log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])

        # candidateの作成
        latest_next_booking_tonN_candidate = log_latest.group_by('latest_yad_no').head(top).select(['yad_no','latest_yad_no'])

        # 簡易的な特徴量も作成しておく。
        latest_next_booking_tonN_feature = (log_latest
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
    latest_next_booking_tonN_feature = latest_next_booking_tonN_feature.drop('count')
    return latest_next_booking_tonN_candidate,latest_next_booking_tonN_feature

In [9]:
train_past_view_yado_candidates,train_past_view_yado_feature = create_past_view_yado_candidates(train_log)
test_past_view_yado_candidates,test_past_view_yado_feature = create_past_view_yado_candidates(test_log)

In [10]:
train_top20_popular_yado_candidates,train_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='train',top=20)
test_top20_popular_yado_candidates,test_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='test',top=20)

In [11]:
train_top10_wid_popular_yado_candidates,train_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10)
test_top10_wid_popular_yado_candidates,test_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='wid_cd',top=10)

train_top10_ken_popular_yado_candidates,train_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='ken_cd',top=10)
test_top10_ken_popular_yado_candidates,test_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='ken_cd',top=10)

train_top10_lrg_popular_yado_candidates,train_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='lrg_cd',top=10)
test_top10_lrg_popular_yado_candidates,test_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='lrg_cd',top=10)

train_top10_sml_popular_yado_candidates,train_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='sml_cd',top=10)
test_top10_sml_popular_yado_candidates,test_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='sml_cd',top=10)

In [12]:
train_latest_next_booking_top20_candidate,train_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='train',top=20)
test_latest_next_booking_top20_candidate,test_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='test',top=20)

In [13]:
# parquet形式で保存
train_past_view_yado_candidates.write_parquet(CFG.path_exp / "train_past_view_yado_candidates.parquet")
test_past_view_yado_candidates.write_parquet(CFG.path_exp / "test_past_view_yado_candidates.parquet")

train_past_view_yado_feature.write_parquet(CFG.path_exp / "train_past_view_yado_feature.parquet")
test_past_view_yado_feature.write_parquet(CFG.path_exp / "test_past_view_yado_feature.parquet")

train_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top20_popular_yado_candidates.parquet")
test_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top20_popular_yado_candidates.parquet")

train_top20_popular_yado_feature.write_parquet(CFG.path_exp / "train_top20_popular_yado_feature.parquet")
test_top20_popular_yado_feature.write_parquet(CFG.path_exp / "test_top20_popular_yado_feature.parquet")

train_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_candidates.parquet")
test_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_candidates.parquet")

train_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_feature.parquet")
test_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_feature.parquet")

train_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_candidates.parquet")
test_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_candidates.parquet")

train_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_feature.parquet")
test_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_feature.parquet")

train_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_candidates.parquet")
test_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_candidates.parquet")

train_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_feature.parquet")
test_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_feature.parquet")

train_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_candidates.parquet")
test_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_candidates.parquet")

train_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_feature.parquet")
test_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_feature.parquet")

train_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_candidates.parquet")
test_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_candidates.parquet")

train_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_feature.parquet")
test_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_feature.parquet")


## Feature

### candidate結合

In [14]:
# area単位のは多すぎるので、今回は除外。
candidate_name_list = ['past_view_yado',
                      # 'top20_popular_yado',
                      'top10_wid_popular_yado',
                      'top10_ken_popular_yado',
                      'top10_lrg_popular_yado',
                      'top10_sml_popular_yado',
                       'latest_next_booking_top20']

In [15]:
def get_session_id_list(log):
    return log.group_by('session_id').head(1).select(['session_id'])

In [16]:
train_session_id = get_session_id_list(train_log)
train_session_id = train_session_id.join(label.select(['fold','session_id']),how='left',on='session_id')

test_session_id = get_session_id_list(test_log)

In [17]:
# 各candidateを結合
candidate_list = {}
candidate_list['train'] = []
candidate_list['test'] = []

for train_test in ['train','test']:
    for candidate_name in tqdm(candidate_name_list):
        candidate = pl.read_parquet(CFG.path_exp / f"{train_test}_{candidate_name}_candidates.parquet")
        if 'session_id' in candidate.columns:
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
        elif 'latest_yad_no' in candidate.columns:
            if train_test == 'train':
                latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                latest_yad_no = latest_yad_no.join(label.select(['session_id','fold']),how='left',on='session_id')
                latest_yad_no = latest_yad_no.with_columns(pl.col('fold').cast(pl.Int32))
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no','fold'])
            else:
                latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no'])
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
                # エリア系のやつ
        elif "wid" in candidate_name:
            print("wid in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                display(latest_yad_no)
                display(candidate)
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd", "fold"]
                )
                display(candidate)
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "ken" in candidate_name:
            print("ken in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "lrg" in candidate_name:
            print("lrg in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "sml" in candidate_name:
            print("sml in candidate_name")
            if train_test == "train":
                # 最後に見た宿を取得
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                # foldを付与
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )       
        else:
            if train_test == 'train':
                if 'fold' in candidate.columns:
                    candidate_all = pl.DataFrame()
                    for fold in range(CFG.fold_num):
                        candidate_fold = train_session_id.filter(pl.col('fold') == fold).join(candidate.filter(pl.col('fold') == fold).select(['yad_no']),how='cross')
                        candidate_all = pl.concat([candidate_all,candidate_fold])
            else:
                candidate_all = test_session_id.join(candidate.select(['yad_no']),how='cross')
            candidate_list[train_test].append(candidate_all.select(['session_id','yad_no']))

  0%|          | 0/6 [00:00<?, ?it/s]

wid in candidate_name


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…"
"""50e08b00e74e19…",12350,1,0,696.0,1.0,0,,,,,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""1d9f09b9e2bd43…"
"""65831520bc8ee9…",12012,0,0,40.0,1.0,0,,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""52d0a7d917cc19…","""5423b90b9624bb…"
"""ef110707b95521…",7810,3,0,234.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""ce3aaf25e7e38a…","""b94b1624f29ace…","""8cb854e17cd42e…"
"""8f2ad79bf186ca…",10461,1,0,196.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""eb147deb12cb67…","""247ac4e90c8df7…"
"""10306a8f4f1eb8…",11788,4,0,143.0,1.0,0,1.0,,,1.0,"""b07b75d367ebec…","""0a66f6ab9c0507…","""0dbb24594ba15f…","""7d4cba769884ac…"
"""8e4586405f8904…",7071,1,0,104.0,1.0,0,1.0,,,1.0,"""b07b75d367ebec…","""0a66f6ab9c0507…","""9ab5718fd88c6e…","""7aff71bb47acb7…"
"""6eb346648b83c4…",544,1,0,170.0,1.0,0,1.0,,,1.0,"""dc414a17890cfc…","""d78f53d0856617…","""e5cfcc0a43c820…","""e2f51242791849…"
"""dc3cbef952eec4…",705,1,0,59.0,1.0,0,,,,1.0,"""f0112abf369fb0…","""ce3aaf25e7e38a…","""b94b1624f29ace…","""9dff180c5e5089…"
"""a0952b14536782…",334,1,0,194.0,,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""810a15fb99b13c…","""bacd9adfafe9ef…"


wid_cd,yad_no,fold
str,i64,i32
"""d86102dd9c232b…",6407,0
"""d86102dd9c232b…",5135,0
"""d86102dd9c232b…",3789,0
"""d86102dd9c232b…",13740,0
"""d86102dd9c232b…",11777,0
"""d86102dd9c232b…",6471,0
"""d86102dd9c232b…",1510,0
"""d86102dd9c232b…",9292,0
"""d86102dd9c232b…",8734,0
"""d86102dd9c232b…",5937,0


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,yad_no
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str,i64
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",2445
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",11407
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",13106
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",12946
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",4744
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",12962
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",10868
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",5566
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",10515
"""d406e959dfab74…",10985,4,0,26.0,,1,,,,,"""dc414a17890cfc…","""6920865be128aa…","""ed31bbb4d1a6dd…","""2aefae5b4f7ee1…",11499


 50%|█████     | 3/6 [00:00<00:00,  7.02it/s]

ken in candidate_name
lrg in candidate_name


 67%|██████▋   | 4/6 [00:00<00:00,  6.32it/s]

sml in candidate_name


100%|██████████| 6/6 [00:00<00:00,  6.38it/s]
  0%|          | 0/6 [00:00<?, ?it/s]

wid in candidate_name
ken in candidate_name


 50%|█████     | 3/6 [00:00<00:00, 14.02it/s]

lrg in candidate_name
sml in candidate_name


100%|██████████| 6/6 [00:00<00:00, 11.80it/s]


In [18]:
train_candidate = pl.concat(candidate_list['train']).unique()
test_candidate = pl.concat(candidate_list['test']).unique()

In [19]:
del candidate_list
gc.collect()

0

### 特徴量作成

In [20]:
train_candidate = train_candidate.join(label.rename({'yad_no':'target'}),how='left',on='session_id')
train_candidate = train_candidate.with_columns(pl.col('fold').cast(pl.Int32))
train_candidate = train_candidate.with_columns((pl.col('yad_no') == pl.col('target')).alias('target').cast(pl.Int8))

In [21]:
train_latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
test_latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})

In [22]:
train_candidate = train_candidate.join(train_latest_yad_no,how='left',on='session_id')
test_candidate = test_candidate.join(test_latest_yad_no,how='left',on='session_id')

In [23]:
feature_name_list = ['latest_next_booking_top20',
                     'past_view_yado',
                      'top20_popular_yado',
                      'top10_wid_popular_yado',
                      'top10_ken_popular_yado',
                      'top10_lrg_popular_yado',
                      'top10_sml_popular_yado']

In [24]:
for train_test in ['train','test']:
    for feature_name in tqdm(feature_name_list):
        feature = pl.read_parquet(CFG.path_exp / f"{train_test}_{feature_name}_feature.parquet")
        if train_test == 'train':
            # for fold in range(CFG.fold_num):
            if 'session_id' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','latest_yad_no','yad_no'])
            else:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','yad_no'])
        else:
            if 'session_id' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['latest_yad_no','yad_no'])
            else:
                test_candidate = test_candidate.join(feature,how='left',on=['yad_no'])

100%|██████████| 7/7 [00:01<00:00,  5.06it/s]
100%|██████████| 7/7 [00:00<00:00, 11.31it/s]


In [25]:
train_candidate = train_candidate.fill_null(0)
test_candidate = test_candidate.fill_null(0)

In [26]:
train_candidate = train_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')
test_candidate = test_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')

In [27]:
# 各seqで見た宿を特徴量にする
for seq in range(8):
    seq_yad_no = train_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    train_candidate = train_candidate.join(seq_yad_no,how='left',on='session_id')

    seq_yad_no = test_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    test_candidate = test_candidate.join(seq_yad_no,how='left',on='session_id')

In [28]:
def create_num_picture_df(yado_df):
    # yad_noとcategoryごとのデータ件数
    _df = yado_df.group_by(["yad_no", "category"]).count()

    # ピボットテーブルに変換
    num_picture_df = _df.pivot("count", "yad_no", "category", "sum").sort("yad_no")

    # 欠損を0で埋める
    num_picture_df = num_picture_df.fill_null(0)

    return num_picture_df

In [29]:
# 各カテゴリの画像枚数
num_picture_df = create_num_picture_df(yado_embedding)

# 画像枚数を結合
train_candidate = train_candidate.join(num_picture_df, how="left", on="yad_no")
test_candidate = test_candidate.join(num_picture_df, how="left", on="yad_no")

In [30]:
train_candidate

session_id,yad_no,target,fold,latest_yad_no,latest_next_booking_rank,max_seq_no,max_seq_no_diff,session_view_count,reservation_counts,popular_rank,wid_cd,popular_wid_cd_rank,ken_cd,popular_ken_cd_rank,lrg_cd,popular_lrg_cd_rank,sml_cd,popular_sml_cd_rank,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,seq_0_yad_no,seq_1_yad_no,seq_2_yad_no,seq_3_yad_no,seq_4_yad_no,seq_5_yad_no,seq_6_yad_no,seq_7_yad_no,room,facility,food,exterior,others
str,i64,i8,i32,i64,u32,i64,i64,u32,u32,i64,str,u32,str,u32,str,u32,str,u32,i64,f64,f64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,u32,u32,u32,u32,u32
"""005d9c6b7d2b81…",13757,1,3,6378,6,1,1,1,44,1368,"""46e33861f921c3…",117,"""107c7305a74c8d…",83,"""e2034d4f2fbe08…",40,"""086904b20a91b5…",33,0,130.0,1.0,0,1.0,,,1.0,13757,6378,,,,,,,3,3,0,3,0
"""00b7614e6f6dbb…",5169,1,1,990,2,1,1,1,28,2749,"""f0112abf369fb0…",113,"""fec19ba0016c01…",11,"""9cb0bdd21ca9db…",3,"""4e1d384ad60758…",3,0,113.0,1.0,0,1.0,,,1.0,5169,990,,,,,,,3,3,3,3,3
"""00bc5b5004cec6…",532,0,4,13468,3,1,1,1,202,21,"""46e33861f921c3…",13,"""572d60f0f5212a…",8,"""8a623b960557e8…",8,"""1d9f09b9e2bd43…",7,0,427.0,1.0,0,1.0,,,,532,13468,,,,,,,2,2,2,3,2
"""00c364a9d0297c…",7500,1,2,8387,4,1,1,1,33,2150,"""321b69d5eec98f…",37,"""39c3eb151762dd…",11,"""d9c4641b191036…",11,"""4a22094c55f841…",11,0,165.0,1.0,0,1.0,,,1.0,7500,8387,,,,,,,3,3,3,3,0
"""01105b9be97cda…",5944,1,0,6199,0,1,1,1,65,574,"""46e33861f921c3…",84,"""c86352f5b57e80…",35,"""9d6a46da05976c…",30,"""568887ea1e1d8c…",23,0,2311.0,1.0,0,1.0,,,1.0,5944,6199,,,,,,,2,3,1,3,3
"""0122e6a77b1d1d…",6874,1,0,899,3,1,1,1,55,859,"""3300cf6f774b7c…",20,"""013592a15b9a68…",7,"""989ce3ae2fc5f1…",2,"""ed85e7b17b271d…",2,0,212.0,1.0,0,1.0,,,1.0,6874,899,,,,,,,3,3,3,2,3
"""013f4c724e4859…",3684,0,0,3684,0,2,2,2,27,2898,"""8a1c0d3243bba1…",47,"""ce83563814cff3…",39,"""bd351c18d73441…",7,"""6b2f1db7e5052f…",7,0,110.0,1.0,0,1.0,,,1.0,3684,4724,3684,,,,,,3,2,3,3,3
"""0162c0cd66b027…",6867,1,0,301,2,1,1,1,26,3007,"""43875109d1dab9…",36,"""3b09a7ce9934c0…",19,"""c0e203c17b3a41…",19,"""991bd5d1842a49…",6,0,43.0,1.0,1,1.0,,,1.0,6867,301,,,,,,,3,3,3,3,2
"""017dd1525b8161…",5905,0,4,1571,2,1,1,1,16,4728,"""f0112abf369fb0…",128,"""ade0e32ad9713b…",28,"""c7e01c55c5c28c…",1,"""0ea610f6ce9d4b…",1,0,82.0,1.0,0,,,,1.0,5905,1571,,,,,,,1,2,1,3,3
"""018799b8512f88…",9121,1,4,2198,2,1,1,1,13,5353,"""321b69d5eec98f…",62,"""0745a2107686fc…",49,"""bd9ca9b95bfc52…",17,"""b731123b3a8076…",15,0,116.0,1.0,1,,,,1.0,9121,2198,,,,,,,3,2,3,3,3


In [31]:
# 特徴量保存
train_candidate.write_parquet(CFG.path_exp / "train_candidate.parquet")
test_candidate.write_parquet(CFG.path_exp / "test_candidate.parquet")

In [32]:
train_candidate.shape

(9623201, 40)

In [33]:
train_candidate.get_column("target").sum() / len(label)

0.6588753645678183

## ReRankモデル

In [34]:
train = pl.read_parquet(CFG.path_exp / "train_candidate.parquet")
test = pl.read_parquet(CFG.path_exp / "test_candidate.parquet")

In [35]:
# session_id単位でtarge=1がなければ、session_idごと削除
use_session_ids = train.group_by('session_id').agg(pl.col('target').sum()).filter(pl.col('target') == 1)['session_id']

### Lightgbmで学習

In [36]:
# Lightgbmで学習
lgbm_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type' : 'gbdt',
        'seed' : CFG.seed,
        'learning_rate': 0.2, 
}

lgb_model_list = []
pred = np.zeros(len(test))
for fold in range(CFG.fold_num):
    X_train = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') != fold)).drop(['fold','target','session_id'])
    Y_train = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') != fold))['target'].to_numpy()
    X_valid = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') == fold))['target'].to_numpy()


    # pandasの方が扱いやすいので変換
    X_train = X_train.to_pandas()
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_train[feature] = X_train[feature].astype('category')
        X_valid[feature] = X_valid[feature].astype('category')

    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_valid = lgb.Dataset(X_valid, Y_valid)
    lgb_model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid], 
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(200), lgb.log_evaluation(1000)],
                      **CFG.lgb_train_params
                    )

    lgb_model_list.append(lgb_model)

    del X_train,Y_train,X_valid,Y_valid,lgb_train,lgb_valid
    gc.collect()

[LightGBM] [Info] Number of positive: 152000, number of negative: 4926141
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.242204 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21140
[LightGBM] [Info] Number of data points in the train set: 5078141, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.029932 -> initscore=-3.478431
[LightGBM] [Info] Start training from score -3.478431
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[493]	train's auc: 0.970618	valid's auc: 0.915065
[LightGBM] [Info] Number of positive: 152261, number of negative: 4934961
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.161178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_w

### 推論

In [37]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

In [38]:
def create_top_10_yad_predict(_df):

    # セッションごとに予測確率の高い順に yad_no の配列を作成
    _agg = _df.sort_values("predict", ascending=False).groupby("session_id")["yad_no"].apply(list)

    out_df = pd.DataFrame(index=_agg.index, data=_agg.values.tolist()).iloc[:, :10]

    return out_df

In [39]:
oof = pd.DataFrame()
test = test.with_columns(pl.lit(0).alias('predict'))
X_test = test.drop(['session_id'])
X_test = X_test.to_pandas()
for feature in CFG.cat_features:
    X_test[feature] = X_test[feature].astype('category')

for fold in range(CFG.fold_num):
    X_valid = train.filter((pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('fold') == fold))['target'].to_numpy()

    # pandasの方が扱いやすいので変換
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_valid[feature] = X_valid[feature].astype('category')

    X_valid['predict'] = lgb_model_list[fold].predict(X_valid)
    X_valid['session_id'] = train.filter((pl.col('fold') == fold))['session_id'].to_numpy()
    X_test['predict'] += lgb_model_list[fold].predict(X_test.drop('predict',axis=1))/CFG.fold_num
    oof = pd.concat([oof,X_valid[['session_id','predict','yad_no']]])
X_test['session_id'] = test['session_id'].to_numpy()

In [40]:
oof = oof.sort_values(['session_id','predict'],ascending=False)
oof_ = create_top_10_yad_predict(oof)

In [41]:
label = pd.read_csv(CFG.path_input / "train_label.csv")

mapk(actual=label[label['session_id'].isin(oof_.reset_index()['session_id'])].sort_values('session_id',ascending=True)['yad_no'].to_list(),
     predicted=oof_.values.tolist(), k=10)

0.3842769403562316

In [42]:
oof_.to_csv(CFG.path_exp / "oof.csv")

In [43]:
# CV = 0.35497319410061673

In [44]:
sub = create_top_10_yad_predict(X_test)
sub.columns = [f'predict_{c}' for c in sub.columns]
sub = sub.reset_index(drop=True)

In [45]:
sub.to_csv(CFG.path_exp / "submission.csv", index=False)