In [21]:
import os
from glob import glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from contextlib import contextmanager
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import random
import shutil

%matplotlib inline


# ref: Kaggleコード遺産 https://qiita.com/kaggle_grandmaster-arai-san/items/d59b2fb7142ec7e270a5 
class Timer:
    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        if self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)


def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)

# 再現性確保!
seed_everything(510)

In [22]:
# 学習用のログデータと正解ラベル
train_log_df = pd.read_csv("./input/train_log.csv")
train_label_df = pd.read_csv("./input/train_label.csv")

# 宿のデータ
yado_df = pd.read_csv("./input/yado.csv")

# テスト期間のログデータ
test_log_df = pd.read_csv("./input/test_log.csv")

sample_submission_df = pd.read_csv("./input/sample_submission.csv")

In [23]:
# train_logに情報をmerge
train_label_df["label"] = 1
train_log_df = train_log_df.merge(train_label_df, how="left", on=["session_id", "yad_no"])
train_log_df["label"] = train_log_df["label"].fillna(0)
train_log_df =train_log_df.merge(yado_df, how="left", on="yad_no")

train_log_df.head()

Unnamed: 0,session_id,seq_no,yad_no,label,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,000007603d533d30453cc45d0f3d119f,0,2395,0.0,0,113.0,1.0,0,,,,,dc414a17890cfc17d011d5038b88ca93,d78f53d0856617bc782f02c3280dfef2,4fd631b15116098340cdb099c86a5a40,4044dac1931ddaa5a967e09506d76343
1,0000ca043ed437a1472c9d1d154eb49b,0,13535,0.0,0,40.0,1.0,0,1.0,,,1.0,b07b75d367ebece55a23ceecc939fff4,0a66f6ab9c0507059da6f22a0e1f1690,9ab5718fd88c6e5f9fec37a51827d428,7aff71bb47acb796d425c5ed5e6dfb3f
2,0000d4835cf113316fe447e2f80ba1c8,0,123,0.0,0,17.0,1.0,0,,,,,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,dac434451fe9bd50068191f41fe792e3,b7c56c5d2855b39366b4ebe9a4eded93
3,0000fcda1ae1b2f431e55a7075d1f500,0,8475,0.0,0,65.0,1.0,0,1.0,,,1.0,46e33861f921c3e38b81998fbf283f01,107c7305a74c8dcc4f143de208bf7ec2,3a6cd37aa9e38fd96d9dafc2615643d0,f2fcbd8e62872147efde0acef474e1f2
4,000104bdffaaad1a1e0a9ebacf585f33,0,96,1.0,0,228.0,1.0,0,,,,1.0,e9316013ee1b03f4525fe361c46ce9c5,84efa50e52f9b471c95bfc3b21b854ad,a1370d90ed3b80ee41311bbbab46aec9,d72674f02c5340d90f245e3177727650


In [24]:
# yadoのtrainにおける予約回数をカウントし、宿情報をmerge
yado_score = train_log_df.query("label==1")["yad_no"].value_counts().reset_index()
yado_score = yado_score.merge(yado_df, how="left", on='yad_no')

yado_score.head()

Unnamed: 0,yad_no,count,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
0,3338,426,0,703.0,1.0,0,1.0,,,,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,1d9f09b9e2bd43cebc9885a46388739a
1,12350,358,0,696.0,1.0,0,,,,,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,1d9f09b9e2bd43cebc9885a46388739a
2,10095,302,0,2007.0,1.0,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,f7b42d92528e7a88617c4b26e033d3e5
3,719,250,0,600.0,1.0,0,1.0,,,1.0,f0112abf369fb03cdc5f5309300913da,072c85e1653e10c9c7dd065ad007125a,ed62e66a5031c23c78bd03ccf9f3ef70,d3d1cf557f10fadb1fbc0b429bf14578
4,8553,247,0,550.0,1.0,0,,,,1.0,46e33861f921c3e38b81998fbf283f01,572d60f0f5212aacda515ebf81fb0a3a,8a623b960557e87bd1f4edf71b6255be,1d9f09b9e2bd43cebc9885a46388739a


In [25]:
test_pivot = test_log_df.pivot_table(index='session_id', columns='seq_no')
test_pivot = test_pivot.fillna(-1).astype(int)
test_log_df = test_log_df.merge(yado_df, how="left", on="yad_no")

test_pivot

Unnamed: 0_level_0,yad_no,yad_no,yad_no,yad_no,yad_no,yad_no,yad_no,yad_no
seq_no,0,1,2,3,4,5,6,7
session_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
00001149e9c73985425197104712478c,3560,1959,-1,-1,-1,-1,-1,-1
0000e02747d749a52b7736dfa751e258,11984,-1,-1,-1,-1,-1,-1,-1
0000f17ae2628237d78d3a38b009d3be,757,8922,-1,-1,-1,-1,-1,-1
000174a6f7a569b84c5575760d2e9664,13610,12341,13610,-1,-1,-1,-1,-1
00017e2a527901c9c41b1acef525d016,4621,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...
fffee3199ef94b92283239cd5e3534fa,1997,8336,-1,-1,-1,-1,-1,-1
ffff62c6bb49bc9c0fbcf08494a4869c,12062,-1,-1,-1,-1,-1,-1,-1
ffff9a7dcc892875c7a8b821fa436228,8989,-1,-1,-1,-1,-1,-1,-1
ffffb1d30300fe17f661941fd085b04b,6030,-1,-1,-1,-1,-1,-1,-1


In [27]:
# session_idごとに、最も出現頻度の高いsml_cdを集計しておく
sml_cd_mode_df = test_log_df.groupby('session_id')['sml_cd'].agg(lambda x: x.mode().iloc[0]).reset_index()
sml_cd_dict = dict(zip(sml_cd_mode_df['session_id'], sml_cd_mode_df['sml_cd']))

In [28]:
from tqdm import tqdm
def extract_valid_yados(record):
    """test_logのセッションに含まれているyad_noを抽出"""
    return list(filter(lambda x: isinstance(x, int) and x >= 0, record))

In [29]:
preds = []

for record in tqdm(test_pivot.itertuples()):
    session_id = record[0]
    session_yados = extract_valid_yados(record)
    session_yado_set = set(session_yados)

    # 予測値のlistを定義
    # セッション内に存在する宿が予約される割合は高そうなので、とりあえずセッション内の宿はそのまま予測値に含める
    # ただし、セッション内の最後の宿は正解にならないことがデータの作成方法の記述で明言されているので除外(pop)する
    session_yados.pop()
    tmp_yados = session_yados.copy()

    # 当該session_idにおいて最も出現頻度の高いsml_cdに該当するyadoを候補として抽出
    target_sml_cd = sml_cd_dict.get(session_id)
    candidates = yado_score.query(f"sml_cd=='{target_sml_cd}'")["yad_no"].tolist()

    # 抽出した候補を予測値のlistに追加。candidatesはtrainにおける出現頻度の降順でsortされているので、人気の宿から順にappendされる。
    for cand in candidates:
        if cand not in session_yado_set:
            tmp_yados.append(cand)
        if len(tmp_yados) == 10:
            break

    # sml_cd内の宿の数が足りない場合がある(同一sml_cd内の宿数の最小値は5)ので、予測された宿の数が足りない場合は適当に宿を足す。
    extra_num = 10 - len(tmp_yados)
    for _ in range(extra_num):
        tmp_yados.append(3338) # trainで最も人気の宿

    pred = {
        f"predict_{idx}": cand
        for idx, cand in enumerate(tmp_yados)
    }
    preds.append(pred)

pred_df = pd.DataFrame(preds)

174700it [04:51, 599.69it/s]


In [30]:
pred_df.to_csv("./output/#2_submission.csv", index=False)