## 後処理でうまく汎化させる方法を見つける

- seq_len==1 の上位を変更する
- 確率行列(のpower) を用いて、lastから最も遷移し易いものを候補とする

### バリエーション

- 確率行列のpowerの回数を変えてみる
- 確率行列をlabelも含めて生成してみる（oofでのスコア計算はリークしてしまいうまくいかないが、提出してどうなるかを確かめる）

In [2]:
# Move the working directory one level up; the recorded cell output shows
# the resulting directory is /kaggle/working.
%cd ..

/kaggle/working


In [23]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config from ../experiments/ensemble_002 with the
# override exp=003, then dump the composed config as YAML for inspection.
with initialize(config_path="../experiments/ensemble_002", version_base=None):
    cfg = compose(overrides=["exp=003"], config_name="config.yaml")
print(OmegaConf.to_yaml(cfg))

debug: false
seed: 42
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  other_dirs:
  - output/exp/008_split/base
  first_dirs:
  - output/exp/008_split/v025_003_first
  transision_path: output/cand_unsupervised/prob_matrix_filter/two002/yad2yad_feature.parquet
  score_col: transition_prob



In [6]:
import logging
import os
import pickle
import sys
import time
from pathlib import Path

import hydra
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from tqdm.auto import tqdm

import utils
from utils.load import load_label_data, load_log_data, load_session_data
from utils.logger import get_logger
from utils.metrics import calculate_metrics

In [12]:
# Logger used by the evaluation cells below.
# NOTE(review): the second argument is presumably the log file path —
# confirm against utils.logger.get_logger.
logger = get_logger(__name__, "./notebook/run.log")

In [9]:
# Read the out-of-fold and test predictions from the first configured
# "other" experiment directory and the first configured "first" directory.
other_oof_df, other_test_df = (
    pl.read_parquet(Path(cfg.exp.other_dirs[0]) / file_name)
    for file_name in ("oof_pred.parquet", "test_pred.parquet")
)
first_oof_df, first_test_df = (
    pl.read_parquet(Path(cfg.exp.first_dirs[0]) / file_name)
    for file_name in ("oof_pred.parquet", "test_pred.parquet")
)

In [14]:
def make_eval_df(cfg, other_oof_df: pl.DataFrame, first_oof_df: pl.DataFrame):
    """Build per-session candidate lists joined with the training labels.

    Rows of ``other_oof_df`` whose ``session_count`` is not 1 and rows of
    ``first_oof_df`` whose ``session_count`` is 1 are stacked, ordered by
    ``session_id`` and ``pred`` (both descending), and collapsed into one
    ``candidates`` list per ``session_id``.

    Args:
        cfg: Config object; only ``cfg.dir.data_dir`` is read here.
        other_oof_df: Predictions used for sessions with ``session_count != 1``.
        first_oof_df: Predictions used for sessions with ``session_count == 1``.

    Returns:
        The aggregated candidate frame left-joined on ``session_id`` with the
        frame returned by ``load_label_data``.
    """
    multi_step_rows = other_oof_df.filter(pl.col("session_count") != 1)
    single_step_rows = first_oof_df.filter(pl.col("session_count") == 1)

    # session_count is only needed for routing rows; drop it before stacking.
    stacked = pl.concat(
        [
            multi_step_rows.drop("session_count"),
            single_step_rows.drop("session_count"),
        ]
    )

    # Sorting before the aggregation is what is meant to put the highest
    # scoring candidate first inside each session's list.
    ranked = stacked.sort(by=["session_id", "pred"], descending=True)
    per_session = ranked.group_by("session_id").agg(pl.col("candidates"))

    labels = load_label_data(Path(cfg.dir.data_dir))
    return per_session.join(labels, on="session_id", how="left")

In [15]:
# Baseline OOF evaluation without any post-processing.
# NOTE(review): the "[..GB(+..GB):..sec] eval" line in the recorded output
# appears to be emitted by utils.trace — confirm in utils.
with utils.trace("eval"):
    # One row per session: ranked candidate list plus the true yad_no.
    oof_candidate_df = make_eval_df(cfg, other_oof_df, first_oof_df)
    logger.info(oof_candidate_df.head())
    # k is passed as a list; the logged result is a list with one metrics
    # dict (recall, precision, map@k, ...) for k=10.
    metrics = calculate_metrics(
        oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
    )
    logger.info(metrics)

shape: (5, 3)
┌──────────────────────────────────┬───────────────────────┬────────┐
│ session_id                       ┆ candidates            ┆ yad_no │
│ ---                              ┆ ---                   ┆ ---    │
│ str                              ┆ list[i32]             ┆ i64    │
╞══════════════════════════════════╪═══════════════════════╪════════╡
│ 186768e7232523202944561d05c561d5 ┆ [1405, 3954, … 2524]  ┆ 3954   │
│ 46f542caf1efc6c39cedf30d2119a816 ┆ [6893, 1587, … 9941]  ┆ 2118   │
│ badeb10fc0ba79786ba13913c7bbbea8 ┆ [6199, 11037, … 6178] ┆ 12986  │
│ 34de4dd4beb7e479b0c4ec4934b30723 ┆ [7215, 12122, … 8406] ┆ 3519   │
│ 85e893285ec16fb2bb64620c8596a44d ┆ [630, 6418, … 303]    ┆ 2187   │
└──────────────────────────────────┴───────────────────────┴────────┘
shape: (5, 3)
┌──────────────────────────────────┬───────────────────────┬────────┐
│ session_id                       ┆ candidates            ┆ yad_no │
│ ---                              ┆ ---                   ┆ -

k: 10
avg_num_candidates: 9.991298865942957
recall: 0.6098448898156551
precision: 0.06098448898156552


[{'k': 10, 'avg_num_candidates': 9.991298865942957, 'recall': 0.6098448898156551, 'precision': 0.06098448898156552, 'map@k': 0.40712217314583793}]
[{'k': 10, 'avg_num_candidates': 9.991298865942957, 'recall': 0.6098448898156551, 'precision': 0.06098448898156552, 'map@k': 0.40712217314583793}]


map@k: 0.40712217314583793



[5.0GB(+1.0GB):9.1sec] eval 



## 遷移確率で修正してみる


In [19]:
# Load the yad-to-yad transition table and discard self-transitions
# (rows whose source and destination yad are identical).
transition_df = pl.read_parquet(cfg.exp.transision_path)
transition_df = transition_df.filter(
    pl.col("from_yad_no") != pl.col("to_yad_no")
)

In [22]:
# Peek at the prediction frame; the recorded output shows the columns
# session_id, candidates, pred and session_count.
first_oof_df.head()

session_id,candidates,pred,session_count
str,i32,f64,i32
"""fffffa7baf3700…",2439,0.223318,2
"""fffffa7baf3700…",10095,0.203527,2
"""fffffa7baf3700…",1372,0.144034,2
"""fffffa7baf3700…",9624,0.132992,2
"""fffffa7baf3700…",12154,0.132992,2


In [26]:
def concat_label_pred(cfg, first_df, transition_df, mode, *, transition_weight=1000):
    """Add transition-based scores to the predictions of length-1 sessions.

    For every session whose log has exactly one step, the viewed yad is used
    as ``from_yad_no`` and every matching row of ``transition_df`` becomes a
    candidate whose score is the transition score times ``transition_weight``.
    These rows are stacked with ``first_df`` and scores of identical
    ``(session_id, candidates)`` pairs are summed.

    Args:
        cfg: Config object; ``cfg.dir.data_dir`` and ``cfg.exp.score_col``
            are read.
        first_df: Prediction frame with the columns ``session_id``,
            ``candidates``, ``pred`` and ``session_count``. Rows of sessions
            that are not length 1 pass through the aggregation unchanged
            unless they share a ``(session_id, candidates)`` pair.
        transition_df: Frame with ``from_yad_no``, ``to_yad_no`` and the
            score column named by ``cfg.exp.score_col``.
        mode: Forwarded to ``load_log_data`` and ``load_session_data``
            (the notebook calls this with ``"train"``).
        transition_weight: Multiplier applied to the transition score before
            it is summed with the model score. Defaults to 1000, the value
            that was previously hard-coded.
            NOTE(review): with the default, transition scores presumably
            dominate the model scores (the sample rows show ``pred`` around
            0.2) — confirm that this is the intended blend.

    Returns:
        Frame with ``session_id``, ``candidates``, summed ``pred`` and the
        maximum ``session_count`` per pair, sorted by ``session_id`` and
        ``pred`` in descending order.
    """
    output_cols = ["session_id", "candidates", "pred", "session_count"]
    data_dir = Path(cfg.dir.data_dir)

    # Compute each session's length (max seq_no + 1) and keep only the rows
    # of length-1 sessions; their single yad is the transition source.
    log_df = load_log_data(data_dir, mode)
    last_log_df = (
        log_df.with_columns(
            (pl.col("seq_no").max().over("session_id") + 1).alias("session_count")
        )
        .filter(pl.col("session_count") == 1)
        .rename({"yad_no": "from_yad_no"})
    )

    # Join with the session table (inner join on session_id).
    session_df = load_session_data(data_dir, mode)
    session_last_df = session_df.join(
        last_log_df.select(["session_id", "from_yad_no", "session_count"]),
        on="session_id",
    )

    # Join with the transition table: one candidate row per reachable yad.
    # The final select keeps only the output columns, so no separate drop of
    # from_yad_no / to_yad_no is needed.
    first_df_from_label = (
        session_last_df.join(
            transition_df.rename({cfg.exp.score_col: "pred"}), on="from_yad_no"
        )
        .with_columns(
            pl.col("to_yad_no").alias("candidates").cast(pl.Int32),
            pl.col("session_count").cast(pl.Int32),
            pl.col("pred") * transition_weight,
        )
        .select(output_cols)
    )

    # Stack with the model predictions. Selecting the same columns in the
    # same order keeps the vertical concat valid regardless of the column
    # order (or extra columns) of first_df.
    result = (
        pl.concat([first_df.select(output_cols), first_df_from_label])
        .group_by(["session_id", "candidates"])
        .agg(pl.col("pred").sum(), pl.col("session_count").max())
        .sort(by=["session_id", "pred"], descending=True)
    )
    return result

In [27]:
# Same evaluation as the baseline, but the predictions for length-1
# sessions are first blended with transition scores by concat_label_pred.
# The recorded output shows map@k 0.40764 here versus 0.40712 for the
# baseline cell.
with utils.trace("post process for eval"):
    oof_candidate_df = make_eval_df(
        cfg, other_oof_df, concat_label_pred(cfg, first_oof_df, transition_df, "train")
    )
    logger.info(oof_candidate_df.head())
    # k is passed as a list; the logged result is a list with one metrics
    # dict for k=10.
    metrics = calculate_metrics(
        oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
    )
    logger.info(metrics)

shape: (5, 3)
┌──────────────────────────────────┬────────────────────────┬────────┐
│ session_id                       ┆ candidates             ┆ yad_no │
│ ---                              ┆ ---                    ┆ ---    │
│ str                              ┆ list[i32]              ┆ i64    │
╞══════════════════════════════════╪════════════════════════╪════════╡
│ 32f17966c7468726a20cee23a7ed1a29 ┆ [11407, 1050, … 10983] ┆ 12736  │
│ 2de19706348d0401830ba078cdc43ee9 ┆ [364, 8809, … 801]     ┆ 364    │
│ 88d3cb4f009273bcfabdb5905c9a9e84 ┆ [11470, 7256, … 6629]  ┆ 5340   │
│ 9dbf3e1f0763479397540d3e103356cc ┆ [4391, 2995, … 535]    ┆ 11996  │
│ c26bffdd0ccd6fd25bea43ea9974abcd ┆ [5552, 9432, … 5828]   ┆ 5552   │
└──────────────────────────────────┴────────────────────────┴────────┘
shape: (5, 3)
┌──────────────────────────────────┬────────────────────────┬────────┐
│ session_id                       ┆ candidates             ┆ yad_no │
│ ---                              ┆ ---         

k: 10
avg_num_candidates: 9.991298865942957
recall: 0.6116114417141788
precision: 0.06116114417141788


[{'k': 10, 'avg_num_candidates': 9.991298865942957, 'recall': 0.6116114417141788, 'precision': 0.06116114417141788, 'map@k': 0.40763716728427263}]
[{'k': 10, 'avg_num_candidates': 9.991298865942957, 'recall': 0.6116114417141788, 'precision': 0.06116114417141788, 'map@k': 0.40763716728427263}]


map@k: 0.40763716728427263



[5.7GB(+0.3GB):13.4sec] post process for eval 


In [None]:
# Break the OOF metrics down by session length (seq_len = max seq_no + 1).
# The training log is loaded here because no earlier cell defines
# `train_log_df`; this is the same loader call concat_label_pred uses.
train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
seq_len_df = train_log_df.group_by("session_id").agg(
    (pl.col("seq_no").max() + 1).alias("seq_len")
)
oof_candidate_df = oof_candidate_df.join(seq_len_df, on="session_id")
for seq_len in range(1, 10):
    print(seq_len)
    # k is passed as a list, matching the other calculate_metrics calls in
    # this notebook, which return one metrics dict per k.
    metrics_list = calculate_metrics(
        oof_candidate_df.filter(pl.col("seq_len") == seq_len),
        candidates_col="candidates",
        label_col="yad_no",
        k=[10],
    )
    for seq_metrics in metrics_list:
        # Suffix every key so these per-length values cannot be confused
        # with the overall metrics, then record which length they refer to.
        tagged = {
            f"{key}/each_seq_len": value for key, value in seq_metrics.items()
        }
        tagged["seq_len"] = seq_len
        print(tagged)