## Candidate

In [1]:
import os
from tqdm import tqdm
import gc
import polars as pl
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GroupKFold,KFold
import lightgbm as lgb

In [2]:
class CFG:
    name = "exp015"

    path_input = Path("../input")
    path_output = Path("../output")
    seed = 127
    fold_num = 5

    cat_features = ['yad_no','latest_yad_no','wid_cd','ken_cd','lrg_cd','sml_cd']

    lgb_train_params = {
        "num_boost_round": 999999,
    }

CFG.path_exp = CFG.path_output / CFG.name
CFG.path_exp.mkdir(parents=True, exist_ok=True)

In [3]:
train_log = pl.read_csv(CFG.path_input / "train_log.csv")
label = pl.read_csv(CFG.path_input / "train_label.csv")
test_log = pl.read_csv(CFG.path_input / "test_log.csv")
yado = pl.read_csv(CFG.path_input / "yado.csv")
yado_embedding = pl.read_parquet(CFG.path_input / "image_embeddings.parquet")

In [4]:
# Group Kfold, labelにfold情報を付けておく。
kf = KFold(n_splits=CFG.fold_num,shuffle=True,random_state=CFG.seed)
fold_assignments = np.full(label.height, -1, dtype=int)
for i, (_, valid_index) in enumerate(kf.split(label)):
    fold_assignments[valid_index] = i
label = label.with_columns(pl.Series("fold", fold_assignments))

In [5]:
def create_past_view_yado_candidates(log):
    """
    アクセスした宿をcandidateとして作成。ただし、直近の宿は予約しないので除外する。
    """
    max_seq_no = log.group_by("session_id").agg(pl.max("seq_no").alias("max_seq_no"))
    log = log.join(max_seq_no, on="session_id")
    # 最大値に該当する行を除外する
    past_yado_candidates = log.filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_candidates = past_yado_candidates.select(['session_id','yad_no']).unique()

    # 簡易的な特徴量も作成しておく。
    # 何個前に見たか 複数回見た時は、直近のみ残す。
    past_yado_feature = log.with_columns((pl.col('max_seq_no') - pl.col('seq_no')).alias('max_seq_no_diff')).filter(pl.col("seq_no") != pl.col("max_seq_no"))
    past_yado_feature = past_yado_feature.join(past_yado_feature.group_by(["session_id", "yad_no"]).agg(pl.col("max_seq_no_diff").max().alias("max_seq_no_diff")), on=["session_id", "yad_no", "max_seq_no_diff"])
    # 何回見たか
    session_view_count = log.group_by(['session_id','yad_no']).count().rename({'count':'session_view_count'})
    past_yado_feature = past_yado_feature.join(session_view_count,how='left',on=['session_id','yad_no']).drop('seq_no')
    
    return past_yado_candidates,past_yado_feature

In [6]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    # 共起ペアの作成
    df = df.join(df, on="session_id")
    # yad_noが同じものは除外する
    df = df.filter(pl.col("yad_no") != pl.col("yad_no_right"))
    # yad_noのペアごとに共起回数を計算
    df = df.group_by(["yad_no", "yad_no_right"]).count()
    # 整形
    df = df.rename(
        {
            "yad_no_right":"candidate_yad_no",
            "count":"co_visit_count",
        }
    )[["yad_no", "candidate_yad_no", "co_visit_count"]]

    return df

def create_topN_covisit_candidates(log_df: pl.DataFrame, top = 10):
    """
    共起行列から候補を作成する
    """
    # 共起行列の作成
    co_visit_matrix = generate_co_visit_matrix(log_df)

    # 最後に見た宿と紐づけてcandidateを作成するために、renameしておく
    co_visit_matrix = co_visit_matrix.rename({"yad_no":"latest_yad_no", "candidate_yad_no":"yad_no"})
    # 上からtop個の候補を取得
    topn_co_visit_matrix = co_visit_matrix.sort(['latest_yad_no','co_visit_count'],descending=[False,True]).group_by('latest_yad_no').head(top)

    # log_dfで最後に見た宿を取得
    log_latest = log_df.group_by("session_id").tail(1)
    log_latest = log_latest.rename({"yad_no": "latest_yad_no"})

    # 結合
    top_yado_co_candidate = log_latest.join(topn_co_visit_matrix, on="latest_yad_no")
    
    return top_yado_co_candidate.select(["session_id", "yad_no"])

In [7]:
def create_topN_popular_yado_candidates(label,train_test='train',top=10):
    """
    予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_candidate = pl.DataFrame()
    popular_yado_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label['yad_no'].value_counts().sort(by='counts',descending=True)

            # candidateの作成
            top10_yado_candidate_fold = popular_yado_sort.head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','fold'])
            top10_yado_candidate = pl.concat([top10_yado_candidate,top10_yado_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_feature_fold = popular_yado_feature_fold.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))
            popular_yado_feature = pl.concat([popular_yado_feature,popular_yado_feature_fold])
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label['yad_no'].value_counts().sort(by='counts',descending=True)
        top10_yado_candidate = popular_yado_sort.head(top).select(['yad_no'])

        # 簡易的な特徴量も作成しておく。
        popular_yado_feature = popular_yado_sort.with_columns(pl.arange(1,len(popular_yado_sort)+1).alias('popular_rank'))

    popular_yado_feature = popular_yado_feature.rename({'counts':'reservation_counts'})
    
    return top10_yado_candidate,popular_yado_feature

In [8]:
def create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10):
    """
    エリア単位で予約された人気宿をcandidateとして作成。train/validでリークしないように注意。
    """
    label_yado = label.join(yado,how='left',on='yad_no')
    # labelデータを使うので、学習データはtrain/validで分割して作成。
    top10_yado_area_candidate = pl.DataFrame()
    popular_yado_area_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_label = label_yado.filter(pl.col('fold') != fold)
            popular_yado_sort = train_label.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])

            # candidateの作成
            top10_yado_area_candidate_fold = popular_yado_sort.group_by(area).head(top).with_columns(pl.lit(fold).alias('fold')).select([area,'yad_no','fold'])
            top10_yado_area_candidate = pl.concat([top10_yado_area_candidate,top10_yado_area_candidate_fold])

            # 簡易的な特徴量も作成しておく。
            popular_yado_area_feature_fold = popular_yado_sort.with_columns(pl.lit(fold).alias('fold'))
            popular_yado_area_feature_fold = (popular_yado_area_feature_fold
                                            .group_by(area)
                                            .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))
            popular_yado_area_feature = pl.concat([popular_yado_area_feature,popular_yado_area_feature_fold])
            
    else: # testデータはtrainデータ全体で作成する。
        # candidateの作成
        popular_yado_sort = label_yado.group_by([area,'yad_no']).count().sort(by=[area,'count'],descending=[False,True])
        top10_yado_area_candidate = popular_yado_sort.group_by(area).head(top).select([area,'yad_no'])
        
        # 簡易的な特徴量も作成しておく。
        popular_yado_area_feature = (popular_yado_sort
                                    .group_by(area)
                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over(area).alias(f'popular_{area}_rank'))))

    popular_yado_area_feature = popular_yado_area_feature.drop('count')
    
    return top10_yado_area_candidate,popular_yado_area_feature

In [9]:
def create_latest_next_booking_tonN_candidate(log,label,train_test='train',top=10):
    """
    直近見た宿で、次にどこを予約しやすいか。
    """
    log_latest = train_log.group_by('session_id').tail(1)
    log_latest = log_latest.rename({'yad_no':'latest_yad_no'})
    log_latest = log_latest.join(label,how='left',on='session_id')

    # labelデータを使うので、学習データはtrain/validで分割して作成。
    latest_next_booking_tonN_candidate = pl.DataFrame()
    latest_next_booking_tonN_feature = pl.DataFrame()
    if train_test == 'train':
        for fold in range(CFG.fold_num):
            train_log_latest = log_latest.filter(pl.col('fold') != fold)
            train_log_latest = train_log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])
    
            # candidateの作成
            latest_next_booking_tonN_candidate_fold = train_log_latest.group_by('latest_yad_no').head(top).with_columns(pl.lit(fold).alias('fold')).select(['yad_no','latest_yad_no','fold'])
            latest_next_booking_tonN_candidate = pl.concat([latest_next_booking_tonN_candidate,latest_next_booking_tonN_candidate_fold])
    
            # 簡易的な特徴量も作成しておく。
            latest_next_booking_tonN_feature_fold = train_log_latest.with_columns(pl.lit(fold).alias('fold'))
            latest_next_booking_tonN_feature_fold = (latest_next_booking_tonN_feature_fold
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
            latest_next_booking_tonN_feature = pl.concat([latest_next_booking_tonN_feature,latest_next_booking_tonN_feature_fold])
    else:
        log_latest = log_latest.group_by(['latest_yad_no','yad_no']).count().sort(by=['latest_yad_no','count'],descending=[False,True])

        # candidateの作成
        latest_next_booking_tonN_candidate = log_latest.group_by('latest_yad_no').head(top).select(['yad_no','latest_yad_no'])

        # 簡易的な特徴量も作成しておく。
        latest_next_booking_tonN_feature = (log_latest
                                                    .group_by('latest_yad_no')
                                                    .map_groups(lambda group: group.with_columns(pl.col('count').rank(method='dense',descending=True).over('latest_yad_no').alias(f'latest_next_booking_rank'))))
    latest_next_booking_tonN_feature = latest_next_booking_tonN_feature.drop('count')
    return latest_next_booking_tonN_candidate,latest_next_booking_tonN_feature

In [10]:
train_past_view_yado_candidates,train_past_view_yado_feature = create_past_view_yado_candidates(train_log)
test_past_view_yado_candidates,test_past_view_yado_feature = create_past_view_yado_candidates(test_log)

In [11]:
train_top20_popular_yado_candidates,train_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='train',top=10)
test_top20_popular_yado_candidates,test_top20_popular_yado_feature = create_topN_popular_yado_candidates(label,train_test='test',top=10)

In [12]:
train_top10_wid_popular_yado_candidates,train_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='wid_cd',top=10)
test_top10_wid_popular_yado_candidates,test_top10_wid_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='wid_cd',top=10)

train_top10_ken_popular_yado_candidates,train_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='ken_cd',top=10)
test_top10_ken_popular_yado_candidates,test_top10_ken_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='ken_cd',top=10)

train_top10_lrg_popular_yado_candidates,train_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='lrg_cd',top=10)
test_top10_lrg_popular_yado_candidates,test_top10_lrg_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='lrg_cd',top=10)

train_top10_sml_popular_yado_candidates,train_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='train',area='sml_cd',top=10)
test_top10_sml_popular_yado_candidates,test_top10_sml_popular_yado_feature = create_topN_area_popular_yado_candidates(label,yado,train_test='test',area='sml_cd',top=10)

In [17]:
train_latest_next_booking_top20_candidate,train_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='train',top=10)
test_latest_next_booking_top20_candidate,test_latest_next_booking_top20_feature = create_latest_next_booking_tonN_candidate(train_log,label,train_test='test',top=10)

In [18]:
train_top10_covisit_candidates = create_topN_covisit_candidates(train_log,top=10)
test_top10_covisit_candidates = create_topN_covisit_candidates(test_log,top=10)

In [19]:
# parquet形式で保存
train_past_view_yado_candidates.write_parquet(CFG.path_exp / "train_past_view_yado_candidates.parquet")
test_past_view_yado_candidates.write_parquet(CFG.path_exp / "test_past_view_yado_candidates.parquet")

train_past_view_yado_feature.write_parquet(CFG.path_exp / "train_past_view_yado_feature.parquet")
test_past_view_yado_feature.write_parquet(CFG.path_exp / "test_past_view_yado_feature.parquet")

train_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top20_popular_yado_candidates.parquet")
test_top20_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top20_popular_yado_candidates.parquet")

train_top20_popular_yado_feature.write_parquet(CFG.path_exp / "train_top20_popular_yado_feature.parquet")
test_top20_popular_yado_feature.write_parquet(CFG.path_exp / "test_top20_popular_yado_feature.parquet")

train_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_candidates.parquet")
test_top10_wid_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_candidates.parquet")

train_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_wid_popular_yado_feature.parquet")
test_top10_wid_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_wid_popular_yado_feature.parquet")

train_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_candidates.parquet")
test_top10_ken_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_candidates.parquet")

train_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_ken_popular_yado_feature.parquet")
test_top10_ken_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_ken_popular_yado_feature.parquet")

train_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_candidates.parquet")
test_top10_lrg_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_candidates.parquet")

train_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_lrg_popular_yado_feature.parquet")
test_top10_lrg_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_lrg_popular_yado_feature.parquet")

train_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_candidates.parquet")
test_top10_sml_popular_yado_candidates.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_candidates.parquet")

train_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "train_top10_sml_popular_yado_feature.parquet")
test_top10_sml_popular_yado_feature.write_parquet(CFG.path_exp / "test_top10_sml_popular_yado_feature.parquet")

train_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_candidates.parquet")
test_latest_next_booking_top20_candidate.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_candidates.parquet")

train_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "train_latest_next_booking_top20_feature.parquet")
test_latest_next_booking_top20_feature.write_parquet(CFG.path_exp / "test_latest_next_booking_top20_feature.parquet")

train_top10_covisit_candidates.write_parquet(CFG.path_exp / "train_top10_covisit_candidates.parquet")
test_top10_covisit_candidates.write_parquet(CFG.path_exp / "test_top10_covisit_candidates.parquet")

## Feature

### candidate結合

In [20]:
# area単位のは多すぎるので、今回は除外。
candidate_name_list = ['past_view_yado',
                    #   'top20_popular_yado',
                      'top10_wid_popular_yado',
                    #   'top10_ken_popular_yado',
                    #   'top10_lrg_popular_yado',
                    #   'top10_sml_popular_yado',
                       "top10_covisit",
                       'latest_next_booking_top20']

In [21]:
def get_session_id_list(log):
    return log.group_by('session_id').head(1).select(['session_id'])

In [22]:
train_session_id = get_session_id_list(train_log)
train_session_id = train_session_id.join(label.select(['fold','session_id']),how='left',on='session_id')

test_session_id = get_session_id_list(test_log)

In [23]:
# 各candidateを結合
candidate_list = {}
candidate_list['train'] = []
candidate_list['test'] = []

for train_test in ['train','test']:
    for candidate_name in tqdm(candidate_name_list):
        candidate = pl.read_parquet(CFG.path_exp / f"{train_test}_{candidate_name}_candidates.parquet")
        if 'session_id' in candidate.columns:
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
        elif 'latest_yad_no' in candidate.columns:
            if train_test == 'train':
                latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                latest_yad_no = latest_yad_no.join(label.select(['session_id','fold']),how='left',on='session_id')
                latest_yad_no = latest_yad_no.with_columns(pl.col('fold').cast(pl.Int32))
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no','fold'])
            else:
                latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(candidate,how='inner',on=['latest_yad_no'])
            candidate_list[train_test].append(candidate.select(['session_id','yad_no']))
                # エリア系のやつ
        elif "wid" in candidate_name:
            print("wid in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                display(latest_yad_no)
                display(candidate)
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd", "fold"]
                )
                display(candidate)
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["wid_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "ken" in candidate_name:
            print("ken in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["ken_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "lrg" in candidate_name:
            print("lrg in candidate_name")
            if train_test == "train":
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["lrg_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )
        # エリア系のやつ
        elif "sml" in candidate_name:
            print("sml in candidate_name")
            if train_test == "train":
                # 最後に見た宿を取得
                latest_yad_no = (
                    train_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                # foldを付与
                latest_yad_no = latest_yad_no.join(
                    label.select(["session_id", "fold"]),
                    how="left",
                    on="session_id",
                )
                latest_yad_no = latest_yad_no.with_columns(
                    pl.col("fold").cast(pl.Int32)
                )
                # yado_dfとマージ
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd", "fold"]
                )
            else:
                latest_yad_no = (
                    test_log.group_by("session_id")
                    .tail(1)
                    .select(["session_id", "yad_no"])
                )
                latest_yad_no = latest_yad_no.join(
                    yado, how="left", on="yad_no"
                )
                latest_yad_no = latest_yad_no.rename({'yad_no':'latest_yad_no'})
                candidate = latest_yad_no.join(
                    candidate, how="inner", on=["sml_cd"]
                )
            candidate_list[train_test].append(
                candidate.select(["session_id", "yad_no"])
            )       
        else:
            if train_test == 'train':
                if 'fold' in candidate.columns:
                    candidate_all = pl.DataFrame()
                    for fold in range(CFG.fold_num):
                        candidate_fold = train_session_id.filter(pl.col('fold') == fold).join(candidate.filter(pl.col('fold') == fold).select(['yad_no']),how='cross')
                        candidate_all = pl.concat([candidate_all,candidate_fold])
            else:
                candidate_all = test_session_id.join(candidate.select(['yad_no']),how='cross')
            candidate_list[train_test].append(candidate_all.select(['session_id','yad_no']))

  0%|          | 0/4 [00:00<?, ?it/s]

wid in candidate_name


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…"
"""0645904de58245…",5728,0,0,88.0,1.0,1,1.0,,,,"""e9316013ee1b03…","""84efa50e52f9b4…","""6991a9d2e7fe40…","""51450d88ac85e2…"
"""0abc4bd66918af…",2061,1,0,202.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""bd054cc265d68a…","""728d0185a651e4…","""101281084c088b…"
"""3afdee12abfecd…",12347,4,0,150.0,1.0,0,1.0,,,1.0,"""d86102dd9c232b…","""7d66a82def7849…","""49cb51139dcdc8…","""4709679ac0e2af…"
"""68ab0eef145632…",4646,3,0,961.0,1.0,0,1.0,,,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""f76e14a3f2ebd4…"
"""bf0d84981dc5eb…",12265,3,0,292.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""9ccc341413e935…"
"""7fb2aed703115f…",6622,0,0,63.0,1.0,0,,,,1.0,"""8a1c0d3243bba1…","""ea4e9c80303b41…","""fdf7c422972a58…","""0d3eda185bb0c1…"
"""8294ae45402db8…",13468,4,0,1016.0,1.0,0,1.0,,,,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""1d9f09b9e2bd43…"
"""8d6472692115c8…",2177,0,0,224.0,,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""d91d5d5ccafec0…","""ef8e39a2c0aa16…"
"""d5a2333df93ff3…",4536,0,0,32.0,1.0,0,,,,1.0,"""c312e07b7a5d45…","""6692a692f80687…","""8cf750072f8520…","""ddd5616ecb2d2c…"


wid_cd,yad_no,fold
str,i64,i32
"""f0112abf369fb0…",13017,0
"""f0112abf369fb0…",719,0
"""f0112abf369fb0…",1818,0
"""f0112abf369fb0…",8567,0
"""f0112abf369fb0…",2201,0
"""f0112abf369fb0…",9971,0
"""f0112abf369fb0…",693,0
"""f0112abf369fb0…",1799,0
"""f0112abf369fb0…",3988,0
"""f0112abf369fb0…",6418,0


session_id,latest_yad_no,fold,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,yad_no
str,i64,i32,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str,i64
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",10095
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",3338
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",12350
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",8553
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",385
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",3848
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",12785
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",13468
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",13549
"""2a3a8c06094d01…",7017,4,0,76.0,1.0,0,1.0,,,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""c9d5e891463e53…","""7cf2b4f31fb207…",13292


100%|██████████| 4/4 [00:00<00:00, 10.21it/s]
100%|██████████| 4/4 [00:00<00:00, 24.79it/s]

wid in candidate_name





In [24]:
train_candidate = pl.concat(candidate_list['train']).unique()
test_candidate = pl.concat(candidate_list['test']).unique()

In [25]:
del candidate_list
gc.collect()

0

### 特徴量作成

In [26]:
train_candidate = train_candidate.join(label.rename({'yad_no':'target'}),how='left',on='session_id')
train_candidate = train_candidate.with_columns(pl.col('fold').cast(pl.Int32))
train_candidate = train_candidate.with_columns((pl.col('yad_no') == pl.col('target')).alias('target').cast(pl.Int8))

In [27]:
train_latest_yad_no = train_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})
test_latest_yad_no = test_log.group_by('session_id').tail(1).select(['session_id','yad_no']).rename({'yad_no':'latest_yad_no'})

In [28]:
train_candidate = train_candidate.join(train_latest_yad_no,how='left',on='session_id')
test_candidate = test_candidate.join(test_latest_yad_no,how='left',on='session_id')

In [29]:
feature_name_list = ['latest_next_booking_top20',
                     'past_view_yado',
                      'top20_popular_yado',
                      'top10_wid_popular_yado',
                      'top10_ken_popular_yado',
                      'top10_lrg_popular_yado',
                      'top10_sml_popular_yado']

In [30]:
for train_test in ['train','test']:
    for feature_name in tqdm(feature_name_list):
        feature = pl.read_parquet(CFG.path_exp / f"{train_test}_{feature_name}_feature.parquet")
        if train_test == 'train':
            # for fold in range(CFG.fold_num):
            if 'session_id' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','latest_yad_no','yad_no'])
            else:
                train_candidate = train_candidate.join(feature,how='left',on=['fold','yad_no'])
        else:
            if 'session_id' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['session_id','yad_no'])
            elif 'latest_yad_no' in feature.columns:
                test_candidate = test_candidate.join(feature,how='left',on=['latest_yad_no','yad_no'])
            else:
                test_candidate = test_candidate.join(feature,how='left',on=['yad_no'])

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:01<00:00,  6.77it/s]
100%|██████████| 7/7 [00:00<00:00, 16.05it/s]


In [31]:
train_candidate = train_candidate.fill_null(0)
test_candidate = test_candidate.fill_null(0)

In [32]:
train_candidate = train_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')
test_candidate = test_candidate.join(yado.select(['yad_no','yad_type','total_room_cnt','wireless_lan_flg','onsen_flg','kd_stn_5min','kd_bch_5min','kd_slp_5min','kd_conv_walk_5min']),how='left',on='yad_no')

In [33]:
# 各seqで見た宿を特徴量にする
for seq in range(8):
    seq_yad_no = train_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    train_candidate = train_candidate.join(seq_yad_no,how='left',on='session_id')

    seq_yad_no = test_log.filter(pl.col('seq_no') == seq).select(['session_id','yad_no']).rename({'yad_no':f'seq_{seq}_yad_no'})
    test_candidate = test_candidate.join(seq_yad_no,how='left',on='session_id')

In [34]:
def create_num_picture_df(yado_df):
    # yad_noとcategoryごとのデータ件数
    _df = yado_df.group_by(["yad_no", "category"]).count()

    # ピボットテーブルに変換
    num_picture_df = _df.pivot("count", "yad_no", "category", "sum").sort("yad_no")

    # 欠損を0で埋める
    num_picture_df = num_picture_df.fill_null(0)

    return num_picture_df

In [35]:
# 各カテゴリの画像枚数
num_picture_df = create_num_picture_df(yado_embedding)

# 画像枚数を結合
train_candidate = train_candidate.join(num_picture_df, how="left", on="yad_no")
test_candidate = test_candidate.join(num_picture_df, how="left", on="yad_no")

In [36]:
train_candidate

session_id,yad_no,target,fold,latest_yad_no,latest_next_booking_rank,max_seq_no,max_seq_no_diff,session_view_count,reservation_counts,popular_rank,wid_cd,popular_wid_cd_rank,ken_cd,popular_ken_cd_rank,lrg_cd,popular_lrg_cd_rank,sml_cd,popular_sml_cd_rank,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,seq_0_yad_no,seq_1_yad_no,seq_2_yad_no,seq_3_yad_no,seq_4_yad_no,seq_5_yad_no,seq_6_yad_no,seq_7_yad_no,others,food,facility,room,exterior
str,i64,i8,i32,i64,u32,i64,i64,u32,u32,i64,str,u32,str,u32,str,u32,str,u32,i64,f64,f64,i64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,u32,u32,u32,u32,u32
"""0048fe22798057…",11699,1,4,8134,4,1,1,1,59,724,"""46e33861f921c3…",91,"""572d60f0f5212a…",30,"""8748b5a626046f…",2,"""8a3f01bdf9eb39…",2,0,307.0,1.0,0,,,,,11699,8134,,,,,,,1,3,3,3,3
"""0114b8faf47010…",8524,1,1,2464,2,1,1,1,27,2895,"""c312e07b7a5d45…",60,"""a738e672656974…",21,"""ee5518ed106a6e…",6,"""2a8ec22a10c934…",6,0,182.0,1.0,1,,,,,8524,2464,,,,,,,3,3,3,3,3
"""015932589a1088…",9330,1,2,8460,1,1,1,1,19,3958,"""dc414a17890cfc…",82,"""6920865be128aa…",46,"""1033c846fb31b0…",9,"""69916a1fb9c5b4…",8,0,,1.0,0,,,,1.0,9330,8460,,,,,,,2,3,3,3,2
"""01ae976591f738…",3077,1,0,8445,2,2,1,1,150,53,"""46e33861f921c3…",26,"""c86352f5b57e80…",7,"""2e35ef11a402c4…",3,"""1e011cd8090011…",3,0,198.0,1.0,1,,,,1.0,589,3077,8445,,,,,,3,3,2,3,3
"""021a59a086caba…",8632,1,1,3739,2,1,1,1,47,1166,"""46e33861f921c3…",103,"""107c7305a74c8d…",73,"""c9d5e891463e53…",10,"""9ccc341413e935…",6,0,106.0,1.0,0,1.0,,,1.0,8632,3739,,,,,,,3,1,1,3,3
"""022e0578fff4b4…",12252,1,3,1500,2,1,1,1,13,5240,"""46e33861f921c3…",132,"""107c7305a74c8d…",106,"""d153c8fd78bfad…",49,"""93bb8a3bdcfb29…",43,0,24.0,1.0,0,1.0,,,1.0,12252,1500,,,,,,,2,0,3,3,3
"""0256dae23cd3a5…",4020,1,2,7806,2,1,1,1,76,371,"""f0112abf369fb0…",45,"""072c85e1653e10…",37,"""810a15fb99b13c…",1,"""5640dd2cb3b5fa…",1,0,214.0,1.0,0,1.0,,,1.0,4020,7806,,,,,,,3,2,3,3,3
"""026e83e1e75100…",10732,0,2,10732,0,2,2,2,83,299,"""f0112abf369fb0…",39,"""072c85e1653e10…",32,"""52d0a7d917cc19…",5,"""5423b90b9624bb…",5,0,368.0,1.0,0,1.0,,,1.0,10732,2531,10732,,,,,,3,3,3,3,3
"""027e377dad7dbb…",5542,1,3,9880,2,1,1,1,84,289,"""3300cf6f774b7c…",6,"""7bc5fba93082f0…",5,"""2e74bf64d33bf4…",2,"""30afdbf62c5f98…",2,0,192.0,1.0,1,,,,,5542,9880,,,,,,,3,3,3,3,0
"""028fcaef9e146a…",6091,1,2,3228,4,1,1,1,43,1436,"""46e33861f921c3…",108,"""107c7305a74c8d…",75,"""3a6cd37aa9e38f…",29,"""4683b842facadc…",11,0,603.0,1.0,1,1.0,,,1.0,6091,3228,,,,,,,2,3,3,3,3


In [37]:
# 特徴量保存
train_candidate.write_parquet(CFG.path_exp / "train_candidate.parquet")
test_candidate.write_parquet(CFG.path_exp / "test_candidate.parquet")

In [38]:
test_candidate.shape

(3691102, 38)

In [39]:
train_candidate.get_column("target").sum() / len(label)

0.5769281394398299

## ReRankモデル

In [40]:
train = pl.read_parquet(CFG.path_exp / "train_candidate.parquet")
test = pl.read_parquet(CFG.path_exp / "test_candidate.parquet")

In [41]:
# session_id単位でtarge=1がなければ、session_idごと削除
use_session_ids = train.group_by('session_id').agg(pl.col('target').sum()).filter(pl.col('target') == 1)['session_id']

### Lightgbmで学習

In [42]:
# Lightgbmで学習
lgbm_params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type' : 'gbdt',
        'seed' : CFG.seed,
        'learning_rate': 0.2, 
}

lgb_model_list = []
pred = np.zeros(len(test))
for fold in range(CFG.fold_num):
    X_train = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') != fold)).drop(['fold','target','session_id'])
    Y_train = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') != fold))['target'].to_numpy()
    X_valid = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('session_id').is_in(use_session_ids)) & (pl.col('fold') == fold))['target'].to_numpy()


    # pandasの方が扱いやすいので変換
    X_train = X_train.to_pandas()
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_train[feature] = X_train[feature].astype('category')
        X_valid[feature] = X_valid[feature].astype('category')

    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_valid = lgb.Dataset(X_valid, Y_valid)
    lgb_model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=[lgb_train, lgb_valid], 
                      valid_names=['train', 'valid'],
                      callbacks=[lgb.early_stopping(200), lgb.log_evaluation(1000)],
                      **CFG.lgb_train_params
                    )

    lgb_model_list.append(lgb_model)

    del X_train,Y_train,X_valid,Y_valid,lgb_train,lgb_valid
    gc.collect()

[LightGBM] [Info] Number of positive: 133128, number of negative: 2679539
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.186067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19209
[LightGBM] [Info] Number of data points in the train set: 2812667, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047332 -> initscore=-3.002089
[LightGBM] [Info] Start training from score -3.002089
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[567]	train's auc: 0.977318	valid's auc: 0.936724
[LightGBM] [Info] Number of positive: 133297, number of negative: 2681319
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.096931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_w

### 推論

In [43]:
def apk(actual, predicted, k=10):
    """
    Computes the average precision at k for a single actual value.

    Parameters:
    actual : int
        The actual value that is to be predicted
    predicted : list
        A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The average precision at k
    """
    if actual in predicted[:k]:
        return 1.0 / (predicted[:k].index(actual) + 1)
    return 0.0

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k for lists of actual values and predicted values.

    Parameters:
    actual : list
        A list of actual values that are to be predicted
    predicted : list
        A list of lists of predicted elements (order does matter in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns:
    float
        The mean average precision at k
    """
    return sum(apk(a, p, k) for a, p in zip(actual, predicted)) / len(actual)

In [44]:
def create_top_10_yad_predict(_df):

    # セッションごとに予測確率の高い順に yad_no の配列を作成
    _agg = _df.sort_values("predict", ascending=False).groupby("session_id")["yad_no"].apply(list)

    out_df = pd.DataFrame(index=_agg.index, data=_agg.values.tolist()).iloc[:, :10]

    return out_df

In [45]:
oof = pd.DataFrame()
test = test.with_columns(pl.lit(0).alias('predict'))
X_test = test.drop(['session_id'])
X_test = X_test.to_pandas()
for feature in CFG.cat_features:
    X_test[feature] = X_test[feature].astype('category')

for fold in range(CFG.fold_num):
    X_valid = train.filter((pl.col('fold') == fold)).drop(['fold','target','session_id'])
    Y_valid = train.filter((pl.col('fold') == fold))['target'].to_numpy()

    # pandasの方が扱いやすいので変換
    X_valid = X_valid.to_pandas()
    for feature in CFG.cat_features:
        X_valid[feature] = X_valid[feature].astype('category')

    X_valid['predict'] = lgb_model_list[fold].predict(X_valid)
    X_valid['session_id'] = train.filter((pl.col('fold') == fold))['session_id'].to_numpy()
    X_test['predict'] += lgb_model_list[fold].predict(X_test.drop('predict',axis=1))/CFG.fold_num
    oof = pd.concat([oof,X_valid[['session_id','predict','yad_no']]])
X_test['session_id'] = test['session_id'].to_numpy()

In [46]:
oof = oof.sort_values(['session_id','predict'],ascending=False)
oof_ = create_top_10_yad_predict(oof)

In [47]:
label = pd.read_csv(CFG.path_input / "train_label.csv")

mapk(actual=label[label['session_id'].isin(oof_.reset_index()['session_id'])].sort_values('session_id',ascending=True)['yad_no'].to_list(),
     predicted=oof_.values.tolist(), k=10)

0.40073673131480436

In [48]:
oof_.to_csv(CFG.path_exp / "oof.csv")

In [49]:
# CV = 0.35497319410061673

In [50]:
sub = create_top_10_yad_predict(X_test)
sub.columns = [f'predict_{c}' for c in sub.columns]
sub = sub.reset_index(drop=True)

In [51]:
sub.to_csv(CFG.path_exp / "submission.csv", index=False)

In [52]:
sub

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4420,9830,4714,2911,9534,13192,5466,2680,11561
1,5627,2439,9323,11923,12350,10095,12862,6555,7913,6129
2,757,9190,9910,410,6721,10485,7710,1774,6730,3400
3,12341,1542,10746,13521,3359,5080,6991,9319,5657,4180
4,607,763,11480,5650,6576,1448,3476,6161,3844,12029
...,...,...,...,...,...,...,...,...,...,...
174695,1997,7888,11123,2278,5744,7062,9543,10997,11848,9743
174696,6874,2164,4014,1227,13702,3644,5331,2232,13220,3802
174697,11037,12240,2087,13797,7379,7308,844,11796,8143,12939
174698,13672,4483,1687,6034,2510,3002,2373,5513,2305,4976
