# ルール
- 共起行列を作成し、最後の宿に対してその共起が上位10件の宿を推薦する

In [11]:
import gc
import os
import pickle
import random
from collections import defaultdict
from pathlib import Path
from heapq import heappush, heappop
from time import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.metrics import (accuracy_score, average_precision_score,
                             cohen_kappa_score, explained_variance_score,
                             f1_score, log_loss, mean_absolute_error,
                             mean_squared_error, mean_squared_log_error,
                             median_absolute_error, precision_score, r2_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from tqdm import tqdm

plt.style.use("ggplot")
tqdm.pandas()

In [12]:
class CFG:
    """Experiment-wide settings shared by the notebook cells."""

    # Experiment identifier; setup() also uses it as the output sub-folder name.
    name = "rule002"
    # Seed forwarded to set_seed() by setup().
    seed = 42

    # Root folders, relative to the working directory, for the input CSVs
    # and for everything this experiment writes.
    path_input = Path("..") / "input"
    path_output = Path("..") / "output"

In [13]:
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    exports ``PYTHONHASHSEED`` into the process environment.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # PyTorch is not seeded here. If torch gets added to this notebook, also
    # call torch.manual_seed / torch.cuda.manual_seed and enable
    # torch.backends.cudnn.deterministic.


def setup(CFG):
    """Prepare the experiment folder and seed the random number generators.

    Args:
        CFG: Configuration object exposing ``path_output``, ``name`` and
            ``seed``. It is modified in place.

    Returns:
        The same ``CFG`` object, now carrying ``path_exp``
        (``path_output / name``), which is created on disk if missing.
    """
    # Experiment-specific output folder.
    exp_dir = CFG.path_output / CFG.name
    CFG.path_exp = exp_dir
    exp_dir.mkdir(parents=True, exist_ok=True)

    # Seed the random number generators.
    set_seed(CFG.seed)

    return CFG

In [14]:
# Create the experiment output folder and seed the RNGs; setup() returns the
# same CFG object with `path_exp` attached.
CFG = setup(CFG)

In [15]:
# Load the five input CSVs from CFG.path_input as polars DataFrames
# (read in the order listed).
train_log_df, train_label_df, test_log_df, test_session_df, yado_df = (
    pl.read_csv(CFG.path_input / file_name)
    for file_name in (
        "train_log.csv",
        "train_label.csv",
        "test_log.csv",
        "test_session.csv",
        "yado.csv",
    )
)

In [16]:
def create_topN_area_popular_yado_candidates(label, yado, area='wid_cd', top=10):
    """Return the ``top`` most frequently labelled yado of every area.

    Args:
        label: polars DataFrame with a ``yad_no`` column; each row counts as
            one occurrence of that yado.
        yado: polars DataFrame with ``yad_no`` and the ``area`` column.
        area: Name of the area column in ``yado`` to group by.
        top: Maximum number of yado kept per area.

    Returns:
        polars DataFrame with columns ``[area, "yad_no", "count"]``, ordered
        by area and, within an area, by descending count. Ties are broken by
        ascending ``yad_no`` so the top-N cut-off is reproducible.
    """
    # Left join: labels whose yad_no is missing from `yado` get a null area.
    label_yado = label.join(yado, how='left', on='yad_no')

    # NOTE(review): newer polars releases deprecate GroupBy.count() in favour
    # of .len() (which names the column "len") — confirm against the installed
    # version before upgrading; callers rely on the "count" column.
    popularity = (
        label_yado.group_by([area, 'yad_no'])
        .count()
        # `yad_no` is a deterministic tie-breaker; without it the order of
        # equal counts depends on the unordered group_by output.
        .sort(by=[area, 'count', 'yad_no'], descending=[False, True, False])
    )

    # maintain_order keeps the areas (and the rows inside them) in sorted
    # order, so head(top) picks the most popular rows of each area.
    return (
        popularity.group_by(area, maintain_order=True)
        .head(top)
        .select([area, 'yad_no', 'count'])
    )

In [17]:
# For every `lrg_cd` area, keep the 10 yado that occur most often in the
# training labels.
top_yado_area_candidate = create_topN_area_popular_yado_candidates(
    label=train_label_df,
    yado=yado_df,
    area="lrg_cd",
    top=10,
)

In [18]:
# Keep the last logged row of every test session, i.e. the last yado seen
# (assumes test_log.csv rows are in viewing order within a session — TODO confirm).
latest_yad_no = (
    test_log_df
    .group_by('session_id')
    .tail(1)
    .select(['session_id', 'yad_no'])
)

In [19]:
# Attach the yado attributes (including the area codes) to each session's
# last yado.
latest_yad_no = latest_yad_no.join(
    yado_df,
    on='yad_no',
    how='left',
)

In [20]:
# Fan every session out to the popular yado of its `lrg_cd` area. The
# candidate id arrives as `yad_no_right` because `yad_no` already exists on
# the left-hand frame.
candidate_df = latest_yad_no.join(
    top_yado_area_candidate,
    on='lrg_cd',
    how='left',
)

In [21]:
# Rank the candidates of each session by descending popularity, then switch
# to pandas for the remaining steps.
candidate_df = (
    candidate_df
    .sort(by=['session_id', 'count'], descending=[False, True])
    .to_pandas()
)

In [22]:
# Notebook display: inspect the ranked candidate rows.
candidate_df

Unnamed: 0,session_id,yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd,yad_no_right,count
0,00001149e9c73985425197104712478c,1959,0,173.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,9830,222
1,00001149e9c73985425197104712478c,1959,0,173.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,6766,143
2,00001149e9c73985425197104712478c,1959,0,173.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,2986,85
3,00001149e9c73985425197104712478c,1959,0,173.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,4422,83
4,00001149e9c73985425197104712478c,1959,0,173.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,52ca3d2824fc3cc90bd4274423badeed,87d9490219b3778f73c41b8176cf30d0,2680,81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1743799,ffffe984aafd6127ce8e43e3ca40c79d,8250,0,326.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,11801,79
1743800,ffffe984aafd6127ce8e43e3ca40c79d,8250,0,326.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,1891,77
1743801,ffffe984aafd6127ce8e43e3ca40c79d,8250,0,326.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,3781,72
1743802,ffffe984aafd6127ce8e43e3ca40c79d,8250,0,326.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,c9d5e891463e5389c42d16f987ed30bd,7cf2b4f31fb20747f89e58981e6d9fc1,11994,70


In [23]:
# Collapse to one row per session_id, collecting its `yad_no_right` values
# into a list (row order inside a session is kept, so the list stays ranked).
candidate_df = (
    candidate_df
    .groupby('session_id')['yad_no_right']
    .apply(list)
    .reset_index()
)

In [24]:
# Notebook display: one candidate list per session.
candidate_df

Unnamed: 0,session_id,yad_no_right
0,00001149e9c73985425197104712478c,"[9830, 6766, 2986, 4422, 2680, 9955, 10965, 54..."
1,0000e02747d749a52b7736dfa751e258,"[12862, 6555, 4066, 11984, 143, 5267, 7014, 11..."
2,0000f17ae2628237d78d3a38b009d3be,"[9190, 1774, 9910, 410, 757, 2267, 4999, 2142,..."
3,000174a6f7a569b84c5575760d2e9664,"[277, 5657, 12341, 2795, 6991, 3359, 13610, 20..."
4,00017e2a527901c9c41b1acef525d016,"[9020, 5713, 12524, 6576, 7246, 13590, 5106, 3..."
...,...,...
174695,fffee3199ef94b92283239cd5e3534fa,"[7888, 1885, 2278, 11123, 5744, 7780, 7062, 83..."
174696,ffff62c6bb49bc9c0fbcf08494a4869c,"[1227, 6874, 5331, 4014, 2232, 9782, 4541, 380..."
174697,ffff9a7dcc892875c7a8b821fa436228,"[12425, 6199, 12240, 11037, 2087, 12089, 12986..."
174698,ffffb1d30300fe17f661941fd085b04b,"[6378, 10287, 11496, 3100, 2305, 2373, 2692, 8..."


In [39]:
def make_prediction(candidate_df, top=10, fill_value=0):
    """Spread each session's ranked candidate list over ``predict_*`` columns.

    Args:
        candidate_df: pandas DataFrame with a ``yad_no_right`` column holding
            one ranked list of candidate ids per row.
        top: Number of prediction columns to emit (``predict_0`` ..
            ``predict_{top-1}``).
        fill_value: Value written to slots a session has no candidate for.

    Returns:
        pandas DataFrame with one row per row of ``candidate_df`` (same
        order) and ``top`` prediction columns.
    """
    columns = [f"predict_{rank}" for rank in range(top)]

    rows = []
    # Iterate by position rather than by index label, so frames without a
    # 0..n-1 RangeIndex are handled correctly.
    for candidates in candidate_df["yad_no_right"]:
        # Skip missing ids (a left join upstream can leave NaN for sessions
        # without any candidate) and cap the list at `top` entries.
        ranked = [yad_no for yad_no in candidates if not pd.isna(yad_no)][:top]
        # Pad short lists so that every row has exactly `top` values.
        ranked.extend([fill_value] * (top - len(ranked)))
        rows.append(ranked)

    return pd.DataFrame(rows, columns=columns)

In [40]:
# Turn the per-session candidate lists into the predict_0..predict_9 table.
df_submit = make_prediction(candidate_df)

In [41]:
# Notebook display: inspect the submission table.
df_submit

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,9830,6766,2986,4422,2680,9955,10965,5466,1959,11561
1,12862,6555,4066,11984,143,5267,7014,11237,6129,1266
2,9190,1774,9910,410,757,2267,4999,2142,11001,6721
3,277,5657,12341,2795,6991,3359,13610,2047,10535,7049
4,9020,5713,12524,6576,7246,13590,5106,3187,9623,11494
...,...,...,...,...,...,...,...,...,...,...
174695,7888,1885,2278,11123,5744,7780,7062,831,1997,6719
174696,1227,6874,5331,4014,2232,9782,4541,3802,12432,13702
174697,12425,6199,12240,11037,2087,12089,12986,10155,7379,12132
174698,6378,10287,11496,3100,2305,2373,2692,8501,12688,1530


In [42]:
# Write the submission into the experiment folder created by setup(),
# without the pandas index column.
df_submit.to_csv(CFG.path_exp / "submission.csv", index=False)