# ルール
- 最終宿の閲覧に対して、実際の予約された宿の中から上位10件を出す
- 0のところは共起行列でまずは埋める
- それでも埋まらないところは宿のエリアによって埋める

In [1]:
import gc
import os
import pickle
import random
from collections import defaultdict
from pathlib import Path
from heapq import heappush, heappop
from time import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.metrics import (accuracy_score, average_precision_score,
                             cohen_kappa_score, explained_variance_score,
                             f1_score, log_loss, mean_absolute_error,
                             mean_squared_error, mean_squared_log_error,
                             median_absolute_error, precision_score, r2_score,
                             recall_score, roc_auc_score)
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
from tqdm import tqdm

plt.style.use("ggplot")
tqdm.pandas()

In [2]:
class CFG:
    name = "rule003"
    seed = 42

    path_input = Path("../input")
    path_output = Path("../output")

In [3]:
def set_seed(seed=42):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True


def setup(CFG):
    # expフォルダを作成
    CFG.path_exp = CFG.path_output / CFG.name
    CFG.path_exp.mkdir(parents=True, exist_ok=True)

    # seedの設定
    set_seed(CFG.seed)

    return CFG

In [4]:
CFG = setup(CFG)

In [5]:
# データの読み込み
train_log_df = pl.read_csv(CFG.path_input / "train_log.csv")
train_label_df = pl.read_csv(CFG.path_input / "train_label.csv")

test_log_df = pl.read_csv(CFG.path_input / "test_log.csv")
test_session_df = pl.read_csv(CFG.path_input / "test_session.csv")

yado_df = pl.read_csv(CFG.path_input / "yado.csv")

# first rule

In [6]:
# train_logで実際に予約した宿をひけるようにしておく
map_reserved = {row['session_id']: row['yad_no'] for row in train_label_df.to_dicts()}

In [7]:
# 縦持ちのセッションログを、session_id : [閲覧したyad_noのリスト] のdictに変換
def make_session_list(session_log):
    _df = (
        session_log
        .group_by('session_id')
        .agg(pl.col('yad_no').alias('yad_list'))
    ).to_pandas()

    return {row['session_id']: row['yad_list'] for row in _df.to_dict(orient='records')}

map_session_yads_train = make_session_list(train_log_df)
map_session_yads_test = make_session_list(test_log_df)

# D[v][r]:= 「最後に宿vを閲覧して、宿rを予約した」セッションの件数
D = defaultdict(lambda:defaultdict(int))
for session_id, viewed_yad_no in map_session_yads_train.items():
  last_viewed = viewed_yad_no[-1]
  reserved = map_reserved[session_id]
  D[last_viewed][reserved] += 1

In [8]:
test_session_number = len(test_session_df)
Predicted_List = [ [0]*10 for _ in range(test_session_number) ]
for idx, session_id in enumerate(test_session_df["session_id"]):
  viewed_number = len(map_session_yads_test[session_id])
  last_viewed = map_session_yads_test[session_id][-1]
  rank = 0

  if viewed_number > 1:
    Predicted_List[idx][rank] = map_session_yads_test[session_id][-2]
    rank += 1

  sorted_yad_list = []
  for yad_no, viewed_cnt in D[last_viewed].items():
    heappush(sorted_yad_list, (-viewed_cnt, yad_no))

  while rank < 10 and sorted_yad_list:
    _, predicted_yad_no = heappop(sorted_yad_list)
    Predicted_List[idx][rank] = predicted_yad_no
    rank += 1

df_submit = pd.DataFrame(Predicted_List, columns=["predict_0", "predict_1", "predict_2", "predict_3", "predict_4", "predict_5", "predict_6", "predict_7", "predict_8", "predict_9"])

In [9]:
df_submit

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
0,3560,4714,2680,4420,11561,5466,9830,2040,2305,2811
1,143,4066,6555,7014,7913,6129,8108,11237,12862,613
2,757,757,7710,9190,10485,410,1774,3400,6721,6730
3,12341,3359,12341,5080,6991,10746,13521,277,1542,2047
4,9020,2862,3476,3844,3854,4070,5372,5411,6161,6565
...,...,...,...,...,...,...,...,...,...,...
174695,1997,1997,2278,5744,7062,7888,9543,9743,10997,11123
174696,12432,1227,3802,899,2164,3644,4014,13220,13702,448
174697,13241,844,2087,5810,7308,7379,8143,11796,12240,12939
174698,2692,3100,10287,2305,2373,2510,3002,3096,4976,5079


In [15]:
(0.4199*174700 + 0.4199*2500)/174700

0.42590887235260444

In [12]:
df_submit[df_submit["predict_1"] == 0]

Unnamed: 0,predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
6,11776,0,0,0,0,0,0,0,0,0
41,8441,0,0,0,0,0,0,0,0,0
108,0,0,0,0,0,0,0,0,0,0
125,7650,0,0,0,0,0,0,0,0,0
247,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
174513,2994,0,0,0,0,0,0,0,0,0
174534,0,0,0,0,0,0,0,0,0,0
174594,8685,0,0,0,0,0,0,0,0,0
174614,11448,0,0,0,0,0,0,0,0,0


# second rule

In [46]:
def generate_co_visit_matrix(df:pl.DataFrame) -> pl.DataFrame:
    # 共起ペアの作成
    df = df.join(df, on="session_id")

    # yad_noが同じものは除外する
    df = df.filter(pl.col("yad_no") != pl.col("yad_no_right"))

    # yad_noのペアごとに共起回数を計算
    df = df.group_by(["yad_no", "yad_no_right"]).count()

    # 整形
    df = df.rename(
        {
            "yad_no_right":"candidate_yad_no",
            "count":"co_visit_count",
        }
    )[["yad_no", "candidate_yad_no", "co_visit_count"]]

    return df

In [51]:
co_visit_matrix = generate_co_visit_matrix(train_log_df)

In [52]:
# yad_noとco_visit_countでソート
co_visit_matrix = co_visit_matrix.sort(["yad_no", "co_visit_count"], descending=[False, True])
co_visit_matrix

yad_no,candidate_yad_no,co_visit_count
i64,i64,u32
2,3860,2
2,12162,1
2,13783,1
2,3847,1
2,12232,1
3,10095,41
3,846,24
3,5800,10
3,7093,8
3,10211,6


In [53]:
# yad_noで各候補と共起回数を辞書にする
def make_co_visit_dict(df:pl.DataFrame) -> dict:
    co_visit_dict = defaultdict(lambda:defaultdict(int))

    for row in df.to_dicts():
        co_visit_dict[row["yad_no"]][row["candidate_yad_no"]] = row["co_visit_count"]

    return co_visit_dict

In [54]:
co_visit_dict = make_co_visit_dict(co_visit_matrix)

In [55]:
co_visit_dict

defaultdict(<function __main__.make_co_visit_dict.<locals>.<lambda>()>,
            {2: defaultdict(int,
                         {3860: 2, 12162: 1, 13783: 1, 3847: 1, 12232: 1}),
             3: defaultdict(int,
                         {10095: 41,
                          846: 24,
                          5800: 10,
                          7093: 8,
                          10211: 6,
                          13131: 5,
                          11295: 5,
                          11822: 4,
                          13202: 4,
                          11919: 4,
                          10439: 4,
                          10556: 3,
                          10415: 2,
                          2439: 2,
                          1229: 2,
                          1372: 2,
                          12707: 2,
                          6579: 2,
                          8609: 2,
                          11273: 2,
                          1091: 2,
                          7169: 2,
  