# 遷移確率を用いた後処理で全データを修正する

In [1]:
%cd ..

/kaggle/working


In [62]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the ensemble_003 Hydra config with the `exp=002` override and print
# it as YAML so the settings used by this run are recorded in the cell output.
# NOTE(review): `config_path` is given relative to this notebook's location
# (Hydra convention) — confirm it still resolves after the `%cd ..` above.
with initialize(version_base=None, config_path="../experiments/ensemble_003"):
    cfg = compose(config_name="config.yaml", overrides=["exp=002"])
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 42
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  other_dirs:
  - output/exp/008_split/base
  first_dirs:
  - output/exp/008_split/v025_003_first
  transision_path: output/cand_unsupervised/prob_matrix_filter/two002/yad2yad_feature.parquet
  score_col: transition_prob



In [3]:
import logging
import os
import pickle
import sys
import time
from pathlib import Path

import hydra
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from tqdm.auto import tqdm

import utils
from utils.load import load_label_data, load_log_data, load_session_data
from utils.logger import get_logger
from utils.metrics import calculate_metrics

In [4]:
# Notebook-wide logger created by the project helper; the second argument is
# the log file path passed to it (relative to the working directory).
logger = get_logger(__name__, "./notebook/run.log")

In [63]:
# Load out-of-fold and test predictions from the first configured experiment
# directory of each model family ("other" and "first").
other_dir = Path(cfg.exp.other_dirs[0])
first_dir = Path(cfg.exp.first_dirs[0])

other_oof_df = pl.read_parquet(other_dir / "oof_pred.parquet")
other_test_df = pl.read_parquet(other_dir / "test_pred.parquet")
first_oof_df = pl.read_parquet(first_dir / "oof_pred.parquet")
first_test_df = pl.read_parquet(first_dir / "test_pred.parquet")

In [64]:
def make_eval_df(cfg, other_oof_df: pl.DataFrame, first_oof_df: pl.DataFrame):
    """Build a per-session ranked candidate list joined with the train labels.

    Rows with ``session_count != 1`` are taken from ``other_oof_df`` and rows
    with ``session_count == 1`` from ``first_oof_df``; the two parts are
    concatenated, ordered by ``session_id`` and ``pred`` (both descending) and
    collapsed into one list of ``candidates`` per session.

    Args:
        cfg: Config object; only ``cfg.dir.data_dir`` is read here.
        other_oof_df: Predictions with ``session_id``, ``candidates``, ``pred``
            and ``session_count`` columns.
        first_oof_df: Predictions with the same columns as ``other_oof_df``.

    Returns:
        A frame with one row per session holding the ``candidates`` list plus
        whatever columns ``load_label_data`` provides (left-joined on
        ``session_id``).
    """
    other_part = other_oof_df.filter(pl.col("session_count") != 1).drop(
        "session_count"
    )
    first_part = first_oof_df.filter(pl.col("session_count") == 1).drop(
        "session_count"
    )

    # Sorting before the group-by fixes the order of each session's list:
    # highest `pred` first.
    ranked_df = pl.concat([other_part, first_part]).sort(
        by=["session_id", "pred"], descending=True
    )
    candidates_per_session = ranked_df.group_by("session_id").agg(
        pl.col("candidates")
    )

    label_df = load_label_data(Path(cfg.dir.data_dir))
    return candidates_per_session.join(label_df, on="session_id", how="left")

In [65]:
# Evaluate the raw OOF predictions: overall metrics first, then a breakdown by
# session length.
with utils.trace("eval"):
    oof_candidate_df = make_eval_df(cfg, other_oof_df, first_oof_df)
    logger.info(oof_candidate_df.head())
    metrics = calculate_metrics(
        oof_candidate_df,
        candidates_col="candidates",
        label_col="yad_no",
        k=[10],
        is_print=False,
    )
    logger.info(metrics)

    # Metrics per seq_len (seq_len = max seq_no + 1 within a session).
    train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
    seq_len_df = train_log_df.group_by("session_id").agg(
        (pl.col("seq_no").max() + 1).alias("seq_len")
    )
    oof_candidate_df = oof_candidate_df.join(seq_len_df, on="session_id")
    for seq_len in range(1, 10):
        logger.info(seq_len)
        metrics_list = calculate_metrics(
            oof_candidate_df.filter(pl.col("seq_len") == seq_len),
            candidates_col="candidates",
            label_col="yad_no",
            k=10,
            is_print=False,
        )
        # Use distinct names here so the overall `metrics` computed above is
        # not overwritten by the per-seq_len results.
        for seq_len_metrics in metrics_list:
            tagged_metrics = {
                f"{name}/each_seq_len": value
                for name, value in seq_len_metrics.items()
            }
            tagged_metrics["seq_len"] = seq_len
            logger.info(tagged_metrics)

shape: (5, 3)
┌──────────────────────────────────┬─────────────────────────┬────────┐
│ session_id                       ┆ candidates              ┆ yad_no │
│ ---                              ┆ ---                     ┆ ---    │
│ str                              ┆ list[i32]               ┆ i64    │
╞══════════════════════════════════╪═════════════════════════╪════════╡
│ 076bb066678fcf6a651bc84bafafd243 ┆ [844, 6905, … 7057]     ┆ 2318   │
│ 7b21600deb0c894b46a06a064fb6946a ┆ [583, 3290, … 3851]     ┆ 7201   │
│ fc505334e7b5ae0368c4952f8e069e04 ┆ [3764, 4770, … 5937]    ┆ 3764   │
│ 0435a419e0b4803651c641dd3a86078e ┆ [7722, 7458, … 12358]   ┆ 7722   │
│ eba58c3b9f7bb4f4482ef16bf35b67e8 ┆ [12645, 10827, … 10371] ┆ 13795  │
└──────────────────────────────────┴─────────────────────────┴────────┘
[{'k': 10, 'avg_num_candidates': 9.991298865942957, 'recall': 0.6098448898156551, 'precision': 0.06098448898156552, 'map@k': 0.407122173145838}]
1
{'k/each_seq_len': 10, 'avg_num_candidates/each

## 遷移確率で修正する
- logに遷移確率をjoin
- session, to_yad_no でgroupしてprobのsumを取る
- 各sessionのlogで最後に閲覧したyad_noは答えにならないため、候補から削除する
- probで降順にソートしてcandidateを作る

In [69]:
def _merge_with_transition(model_df, transition_pred_df, session_count_dtype, keep):
    """Add transition-based scores to model predictions for one session subset.

    ``transition_pred_df`` has its ``session_count`` cast to
    ``session_count_dtype`` so it can be concatenated with ``model_df``; rows
    are then filtered with ``keep``, ``pred`` is summed per
    (``session_id``, ``candidates``) and the result is sorted by
    ``session_id`` and ``pred``, both descending.
    """
    return (
        pl.concat(
            [
                model_df,
                transition_pred_df.with_columns(
                    pl.col("session_count").cast(session_count_dtype)
                ),
            ]
        )
        .filter(keep)
        .group_by(["session_id", "candidates"])
        .agg(pl.col("pred").sum(), pl.col("session_count").max())
        .sort(by=["session_id", "pred"], descending=True)
    )


def concat_label_pred(cfg, other_df, first_df, mode):
    """Post-process model predictions with yad-to-yad transition scores.

    Steps:
      1. Join the transition table onto the session log (``yad_no`` ->
         ``from_yad_no``).
      2. Per (session, ``to_yad_no``), sum ``cfg.exp.score_col`` and turn it
         into a ``pred`` score of ``(sum + 1) * 100``.
      3. Drop the candidate equal to the session's last logged ``yad_no``.
      4. Concatenate with the model predictions and sum ``pred`` per
         (session, candidate), separately for single-row sessions
         (``first_df``) and all other sessions (``other_df``).

    Args:
        cfg: Config; reads ``cfg.dir.data_dir``, ``cfg.exp.transision_path``
            and ``cfg.exp.score_col``.
        other_df: Model predictions used for sessions with
            ``session_count != 1``. Must be concatenable with the transition
            frame after ``session_count`` is cast to ``UInt32``.
        first_df: Model predictions used for sessions with
            ``session_count == 1``. Must be concatenable with the transition
            frame after ``session_count`` is cast to ``Int32``.
        mode: Passed through to ``load_log_data`` (the notebook uses
            ``"train"``).

    Returns:
        ``(other_result, first_result)``: frames with ``session_id``,
        ``candidates``, ``pred`` and ``session_count``, sorted by
        ``session_id`` and ``pred`` descending.
    """
    # Load the log once; it feeds both the transition join and the lookup of
    # each session's last yad_no.
    raw_log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    transition_df = pl.read_parquet(cfg.exp.transision_path)

    # Join transition probabilities onto the log (inner join: log rows whose
    # yad_no has no entry in the transition table are dropped).
    log_df = raw_log_df.with_columns(
        pl.col("seq_no").count().over("session_id").alias("session_count")
    ).join(transition_df, left_on="yad_no", right_on="from_yad_no")

    # Group by (session, to_yad_no) and sum the probabilities.
    # NOTE(review): the `+ 1` and `* 100` presumably make transition-derived
    # scores dominate the model's `pred` when both are summed below — confirm.
    prob_df = (
        log_df.group_by(["session_id", "to_yad_no"])
        .agg(
            ((pl.col(cfg.exp.score_col).sum() + 1) * 100).alias("pred"),
            pl.col("session_count").max(),
        )
        .rename({"to_yad_no": "candidates"})
    )

    # The last yad_no of a session is never the answer, so remove it from the
    # candidates. Order by seq_no explicitly instead of relying on row order.
    last_df = raw_log_df.group_by("session_id").agg(
        pl.col("yad_no").sort_by("seq_no").last().alias("candidates")
    )
    removed_prob_df = prob_df.join(
        last_df, on=["session_id", "candidates"], how="anti"
    ).with_columns(pl.col("candidates").cast(pl.Int32))

    # Combine with the model predictions for each session subset.
    first_result = _merge_with_transition(
        first_df, removed_prob_df, pl.Int32, pl.col("session_count") == 1
    )
    other_result = _merge_with_transition(
        other_df, removed_prob_df, pl.UInt32, pl.col("session_count") != 1
    )
    return other_result, first_result

In [70]:
# Evaluate the OOF predictions after the transition-probability correction.
mode = "train"

other_df, first_df = concat_label_pred(cfg, other_oof_df, first_oof_df, mode)
oof_candidate_df = make_eval_df(cfg, other_df, first_df)
logger.info(oof_candidate_df.head())
metrics = calculate_metrics(
    oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
)
logger.info(metrics)

# Metrics per seq_len (seq_len = max seq_no + 1 within a session).
train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
seq_len_df = train_log_df.group_by("session_id").agg(
    (pl.col("seq_no").max() + 1).alias("seq_len")
)
oof_candidate_df = oof_candidate_df.join(seq_len_df, on="session_id")
for seq_len in range(1, 10):
    logger.info(seq_len)
    metrics_list = calculate_metrics(
        oof_candidate_df.filter(pl.col("seq_len") == seq_len),
        candidates_col="candidates",
        label_col="yad_no",
        k=10,
        is_print=False,
    )
    # Use distinct names here so the overall `metrics` computed above is not
    # overwritten by the per-seq_len results.
    for seq_len_metrics in metrics_list:
        tagged_metrics = {
            f"{name}/each_seq_len": value for name, value in seq_len_metrics.items()
        }
        tagged_metrics["seq_len"] = seq_len
        logger.info(tagged_metrics)

shape: (5, 3)
┌──────────────────────────────────┬─────────────────────────┬────────┐
│ session_id                       ┆ candidates              ┆ yad_no │
│ ---                              ┆ ---                     ┆ ---    │
│ str                              ┆ list[i32]               ┆ i64    │
╞══════════════════════════════════╪═════════════════════════╪════════╡
│ 1f45276df1cd99bb7753fc11d1fbaf89 ┆ [12148, 4792, … 12907]  ┆ 7874   │
│ af84512a0f6aad4c9ccd99395998b61b ┆ [13220, 12432, … 13542] ┆ 899    │
│ f0cff0fa13313f35d5b9420ac8588d35 ┆ [3187, 6703, … 635]     ┆ 1368   │
│ 3738887cbe7e34e0445f88510806d4e6 ┆ [7301, 6087, … 2086]    ┆ 7301   │
│ fcee292499832e5bcbf47a0321f83c48 ┆ [8798, 7512, … 6986]    ┆ 1784   │
└──────────────────────────────────┴─────────────────────────┴────────┘
[{'k': 10, 'avg_num_candidates': 9.992940720060409, 'recall': 0.6115456289963906, 'precision': 0.061154562899639074, 'map@k': 0.400604628915788}]


{'k': 10, 'avg_num_candidates': 9.992940720060409, 'recall': 0.6115456289963906, 'precision': 0.061154562899639074, 'map@k': 0.400604628915788}


1
{'k/each_seq_len': 10, 'avg_num_candidates/each_seq_len': 9.991655249047932, 'recall/each_seq_len': 0.4015082045030369, 'precision/each_seq_len': 0.040150820450303694, 'map@k/each_seq_len': 0.14036466427234875, 'seq_len': 1}
2
{'k/each_seq_len': 10, 'avg_num_candidates/each_seq_len': 9.994758010942954, 'recall/each_seq_len': 0.985747587356419, 'precision/each_seq_len': 0.09857475873564192, 'map@k/each_seq_len': 0.8606724094265312, 'seq_len': 2}
3
{'k/each_seq_len': 10, 'avg_num_candidates/each_seq_len': 9.997394136807818, 'recall/each_seq_len': 0.9992182410423452, 'precision/each_seq_len': 0.09992182410423454, 'map@k/each_seq_len': 0.8746793082053669, 'seq_len': 3}
4
{'k/each_seq_len': 10, 'avg_num_candidates/each_seq_len': 9.995776397515527, 'recall/each_seq_len': 0.9995031055900621, 'precision/each_seq_len': 0.09995031055900623, 'map@k/each_seq_len': 0.9517308488612837, 'seq_len': 4}
5
{'k/each_seq_len': 10, 'avg_num_candidates/each_seq_len': 10.0, 'recall/each_seq_len': 1.0, 'prec

In [55]:
# Join the transition probabilities onto the log.
# `session_count` is the number of log rows in each session.
transition_df = pl.read_parquet(cfg.exp.transision_path)
log_df = (
    load_log_data(Path(cfg.dir.data_dir), mode)
    .with_columns(session_count=pl.col("seq_no").count().over("session_id"))
    .join(transition_df, left_on="yad_no", right_on="from_yad_no")
)
log_df.head()

session_id,seq_no,yad_no,session_count,to_yad_no,transition_prob
str,i64,i64,u32,i64,f64
"""000007603d533d…",0,2395,1,2395,0.5
"""000007603d533d…",0,2395,1,11882,0.263158
"""000007603d533d…",0,2395,1,2808,0.184211
"""000007603d533d…",0,2395,1,5289,0.026316
"""000007603d533d…",0,2395,1,4101,0.026316


In [57]:
# Group by (session, to_yad_no) and sum the transition scores, then order each
# session's candidates by the resulting `pred` (descending).
summed_score = pl.col(cfg.exp.score_col).sum()
prob_df = (
    log_df.group_by(["session_id", "to_yad_no"])
    .agg(
        ((summed_score + 1) * 100).alias("pred"),
        pl.col("session_count").max(),
    )
    .rename({"to_yad_no": "candidates"})
    .sort(by=["session_id", "pred"], descending=True)
)
print(prob_df.shape)
prob_df.head()

(6975028, 4)


session_id,candidates,pred,session_count
str,i64,f64,u32
"""fffffa7baf3700…",11822,162.857143,2
"""fffffa7baf3700…",2439,157.894737,2
"""fffffa7baf3700…",2981,120.877193,2
"""fffffa7baf3700…",10095,111.854637,2
"""fffffa7baf3700…",3,107.243108,2


In [58]:
# The last yad_no in a session's log is never the answer, so drop it from the
# candidates: flag it per session, left-join the flag, keep unflagged rows.
last_df = (
    load_log_data(Path(cfg.dir.data_dir), mode)
    .group_by("session_id")
    .agg(candidates=pl.col("yad_no").last())
    .with_columns(last=pl.lit(True))
    .sort("session_id")
)
removed_prob_df = (
    prob_df.join(last_df, on=["session_id", "candidates"], how="left")
    .filter(pl.col("last").is_null())
    .drop("last")
    .with_columns(pl.col("candidates").cast(pl.Int32))
)
print(removed_prob_df.shape)
removed_prob_df.head()

(6686330, 4)


session_id,candidates,pred,session_count
str,i32,f64,u32
"""fffffa7baf3700…",2439,157.894737,2
"""fffffa7baf3700…",2981,120.877193,2
"""fffffa7baf3700…",10095,111.854637,2
"""fffffa7baf3700…",3,107.243108,2
"""fffffa7baf3700…",1372,106.691729,2


In [23]:
# Sort by prob in descending order to build the candidates.

# Combine with the "first" predictions: sum `pred` per (session, candidate).
# NOTE(review): `first_df_from_label` is not defined in any cell visible in
# this notebook, and this cell's execution count (23) predates the cells above;
# it looks like a leftover from an earlier iteration and would fail on a fresh
# kernel — confirm and remove or update.
first_result = (
    pl.concat([first_df, first_df_from_label])
    .group_by(["session_id", "candidates"])
    .agg(pl.col("pred").sum(), pl.col("session_count").max())
    .sort(by=["session_id", "pred"], descending=True)
)

In [25]:
# Collapse predictions into one candidate list per session and left-join the
# train labels on `session_id`.
# NOTE(review): `result` is not defined in any cell visible in this notebook;
# this cell appears to be stale and would fail on a fresh kernel — confirm.
pred_candidates_df = result.group_by("session_id").agg(pl.col("candidates"))
train_label_df = load_label_data(Path(cfg.dir.data_dir))
candidaates_df = pred_candidates_df.join(train_label_df, on="session_id", how="left")

In [26]:
# Display the per-session candidate lists together with the joined labels.
candidaates_df

session_id,candidates,yad_no
str,list[i64],i64
"""e1415bc59a3b95…","[5490, 1908, 7749]",7749
"""a0c099ff4f1523…","[12802, 5366, … 9652]",11823
"""856a240716615a…","[2499, 6960, … 13320]",6960
"""41d8ef4fba7acc…","[9261, 1019, … 11878]",9261
"""422e390c35af20…","[10095, 2334, … 1028]",5710
"""772c5717ca7276…","[12582, 2888, … 6895]",1791
"""73a9d23563436a…","[11844, 3537, … 8753]",4373
"""e588f9a585f413…","[3267, 6538, … 13303]",6538
"""a931571506f59d…","[7915, 8690, … 1534]",3587
"""115a98ca1d0a9e…","[5222, 12444, … 12019]",5222
