In [1]:
# Move the working directory up one level; the recorded output shows the
# resulting cwd is /kaggle/working.
# NOTE(review): not idempotent — re-running this cell moves up another level.
%cd ..

/kaggle/working


In [4]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config stored under ../experiments/ensemble_001 with the
# `exp=002` override, keep it in the notebook-global `cfg`, and print it as YAML
# so the directories and experiment inputs used below are visible.
with initialize(version_base=None, config_path="../experiments/ensemble_001"):
    cfg = compose(config_name="config.yaml", overrides=["exp=002"])
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 42
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  other_dirs:
  - output/exp/008_split/base
  first_dirs:
  - output/exp/008_split/first004



In [5]:
import logging
import os
import pickle
import sys
import time
from pathlib import Path

import hydra
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from tqdm.auto import tqdm

import utils
import wandb
from utils.load import load_label_data, load_log_data, load_session_data, load_yad_data
from utils.logger import get_logger
from utils.metrics import calculate_metrics

In [6]:
# Read the OOF and test predictions of the first "other" experiment directory
# listed in the config, then preview the OOF frame.
other_oof_df, other_test_df = (
    pl.read_parquet(Path(cfg.exp.other_dirs[0]) / parquet_name)
    for parquet_name in ("oof_pred.parquet", "test_pred.parquet")
)
other_oof_df.head()

session_id,candidates,pred,session_count
str,i32,f64,u32
"""fffffa7baf3700…",2439,2.098421,2
"""fffffa7baf3700…",2981,0.334312,2
"""fffffa7baf3700…",10095,-0.838374,2
"""fffffa7baf3700…",1372,-1.271482,2
"""fffffa7baf3700…",3,-1.31913,2


In [7]:
# Read the OOF and test predictions of the first "first" experiment directory
# listed in the config.
first_oof_df, first_test_df = (
    pl.read_parquet(Path(cfg.exp.first_dirs[0]) / parquet_name)
    for parquet_name in ("oof_pred.parquet", "test_pred.parquet")
)

In [8]:
def make_eval_df(other_oof_df: pl.DataFrame, first_oof_df: pl.DataFrame):
    """Merge two OOF prediction frames into per-session ranked candidates with labels.

    Rows with ``session_count != 1`` are taken from ``other_oof_df`` and rows with
    ``session_count == 1`` from ``first_oof_df``. The combined rows are sorted by
    ``session_id`` and ``pred`` (both descending) and collapsed to one row per
    session holding the list of ``candidates``.

    Args:
        other_oof_df: Frame with ``session_id``, ``candidates``, ``pred`` and
            ``session_count`` columns.
        first_oof_df: Frame with the same columns.

    Returns:
        One row per ``session_id`` with a list column ``candidates``, left-joined
        on ``session_id`` with the frame returned by ``load_label_data`` for
        ``cfg.dir.data_dir`` (reads the notebook-global ``cfg``).
    """
    other_rows = other_oof_df.filter(pl.col("session_count") != 1)
    first_rows = first_oof_df.filter(pl.col("session_count") == 1)

    # session_count only selects the source frame; it is dropped before stacking.
    stacked = pl.concat(
        [other_rows.drop("session_count"), first_rows.drop("session_count")]
    )
    ranked = stacked.sort(by=["session_id", "pred"], descending=True)

    # Collapse the sorted rows into one candidate list per session.
    per_session = ranked.group_by("session_id").agg(pl.col("candidates"))

    labels = load_label_data(Path(cfg.dir.data_dir))
    return per_session.join(labels, on="session_id", how="left")

In [9]:
# Baseline: evaluate the plain combination of the two OOF prediction frames.
oof_candidate_df = make_eval_df(other_oof_df, first_oof_df)
print(oof_candidate_df.head())

# Score the ranked candidate lists against the `yad_no` label column at k=10.
metrics = calculate_metrics(
    oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
)
print(metrics)

shape: (5, 3)
┌──────────────────────────────────┬────────────────────────┬────────┐
│ session_id                       ┆ candidates             ┆ yad_no │
│ ---                              ┆ ---                    ┆ ---    │
│ str                              ┆ list[i32]              ┆ i64    │
╞══════════════════════════════════╪════════════════════════╪════════╡
│ 86a316f8ab5fbceac4806b52c0ad2631 ┆ [11895, 11765, … 1679] ┆ 11895  │
│ f900914271cf87451eb8c5a72b7f68b6 ┆ [7093, 3338, … 1028]   ┆ 10095  │
│ 762d83ac51fa8f63bee8ca8c4d9b79df ┆ [7417, 2783, … 5887]   ┆ 7417   │
│ 5be01ca530f1bc1076b8a930bcec8543 ┆ [9785, 9208, … 4787]   ┆ 9785   │
│ eb3833243b02dc59ce8284bdd789dc45 ┆ [9508, 12946, … 3318]  ┆ 3565   │
└──────────────────────────────────┴────────────────────────┴────────┘
k: 10
avg_num_candidates: 9.990380951721177
recall: 0.5991555189159606
precision: 0.05991555189159607
map@k: 0.404229478775371

[{'k': 10, 'avg_num_candidates': 9.990380951721177, 'recall': 0.5991555189159

In [10]:
def make_submission(other_test_df: pl.DataFrame, first_test_df: pl.DataFrame):
    """Merge two test prediction frames into a 10-column submission frame.

    Rows with ``session_count != 1`` are taken from ``other_test_df`` and rows with
    ``session_count == 1`` from ``first_test_df``. Candidates are ranked per session
    by ``pred`` (descending) and the first ten are spread over the columns
    ``predict_0`` … ``predict_9``.

    Args:
        other_test_df: Frame with ``session_id``, ``candidates``, ``pred`` and
            ``session_count`` columns.
        first_test_df: Frame with the same columns.

    Returns:
        The frame from ``load_session_data(..., "test")`` left-joined with the
        ``predict_*`` columns, nulls replaced by -1 and ``session_id`` dropped
        (reads the notebook-global ``cfg``).
    """
    other_rows = other_test_df.filter(pl.col("session_count") != 1).drop(
        "session_count"
    )
    first_rows = first_test_df.filter(pl.col("session_count") == 1).drop(
        "session_count"
    )
    ranked = pl.concat([other_rows, first_rows]).sort(
        by=["session_id", "pred"], descending=True
    )
    session_df = load_session_data(Path(cfg.dir.data_dir), "test")
    per_session = ranked.group_by("session_id").agg(pl.col("candidates"))

    # One expression per output column: the rank-th element of the candidate list.
    # NOTE(review): for lists shorter than 10, what `list.get` does with an
    # out-of-range index depends on the Polars version — confirm it yields null
    # (which is then filled with -1) on the version in use.
    top_k_columns = [
        pl.col("candidates").list.get(rank).alias(f"predict_{rank}")
        for rank in range(10)
    ]
    wide_preds = per_session.with_columns(top_k_columns).drop("candidates")

    # Left join keeps every test session, including ones without predictions.
    joined = session_df.join(wide_preds, on="session_id", how="left")
    return joined.fill_null(-1).drop("session_id")

In [11]:
# Build the baseline submission frame from the two test prediction frames and
# preview its first rows.
test_candidate_df = make_submission(other_test_df, first_test_df)
test_candidate_df.head()

predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
3560,11561,4545,9534,4714,4420,5466,2680,6563,6488
143,4066,6555,7014,613,8108,11923,6129,11237,12862
757,7710,9190,9910,1774,410,10485,13570,6721,3400
12341,3359,6991,1542,13521,10861,5080,4180,5657,9319
2862,9020,10826,3854,763,3476,6161,12029,9611,5372


## ルールベースでの最後の一個まえ重視

In [31]:
# Dataset split name.
# NOTE(review): no visible cell reads this global — the functions below take
# `mode` as their own parameter.
mode = "train"

def post_process_last_before_pred(cfg, first_df, mode):
    """Add a fixed score boost for each session's second-to-last logged item.

    Loads the log for ``mode``, keeps the rows whose ``seq_no`` equals the
    session's maximum ``seq_no`` minus one, and turns them into prediction rows
    (``pred`` = 100.0, ``session_count`` = 2). These rows are stacked on
    ``first_df`` and scores of identical (``session_id``, ``candidates``) pairs
    are summed.

    Args:
        cfg: Config object; ``cfg.dir.data_dir`` is read.
        first_df: Prediction frame with columns ``session_id``, ``candidates``,
            ``pred``, ``session_count`` (in that order) to be boosted.
        mode: Split name forwarded to ``load_log_data``.

    Returns:
        Frame with ``session_id``, ``candidates``, summed ``pred`` and max
        ``session_count``, sorted by ``session_id`` and ``pred`` descending.

    NOTE(review): every boosted row ends up with ``session_count >= 2``, while
    ``make_eval_df`` / ``make_submission`` keep only ``session_count == 1`` rows
    from their second argument — passing this result there discards the boost.
    """
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    print(log_df.shape)
    log_df = (
        log_df.with_columns(
            # Highest seq_no within each session.
            pl.col("seq_no").max().over("session_id").alias("max_seq_no"),
            pl.col("yad_no").alias("candidates"),
            pl.lit(100.0).alias("pred"),
            pl.lit(2).alias("session_count"),
        )
        .with_columns(
            pl.col("candidates").cast(pl.Int32), pl.col("pred").cast(pl.Float64)
        )
        # Keep only the row just before the last one of each session.
        .filter(pl.col("seq_no") == pl.col("max_seq_no") - 1)
    ).select(["session_id", "candidates", "pred", "session_count"])

    # Stack on the caller-supplied predictions. The original used the global
    # `first_oof_df` here, which silently ignored the `first_df` argument.
    result = (
        pl.concat([first_df, log_df])
        .group_by(["session_id", "candidates"])
        .agg(pl.col("pred").sum(), pl.col("session_count").max())
        .sort(by=["session_id", "pred"], descending=True)
    )
    return result

(419270, 3)


In [32]:
# NOTE(review): in the visible cells `log_df` only exists as a local inside
# `post_process_last_before_pred`, so this cell relies on leftover kernel state.
log_df.shape

(103312, 4)

In [34]:
# Build the post-processed frame explicitly: `result` is otherwise only a local
# inside `post_process_last_before_pred`, so this cell would fail on a fresh
# kernel (Restart & Run All).
result = post_process_last_before_pred(cfg, first_oof_df, "train")

oof_candidate_df = make_eval_df(other_oof_df, result)
print(oof_candidate_df.head())

# Score the ranked candidate lists against the `yad_no` label column at k=10.
metrics = calculate_metrics(
    oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
)
print(metrics)

shape: (5, 3)
┌──────────────────────────────────┬────────────────────────┬────────┐
│ session_id                       ┆ candidates             ┆ yad_no │
│ ---                              ┆ ---                    ┆ ---    │
│ str                              ┆ list[i32]              ┆ i64    │
╞══════════════════════════════════╪════════════════════════╪════════╡
│ 1f8880d26a224466297da54e8a4feec2 ┆ [11760, 8264, … 5043]  ┆ 10952  │
│ 125b342622d89a1de95447a6af4b8b62 ┆ [12132, 12395, … 7057] ┆ 10145  │
│ 0c81bedd3dfde15fe7729cea64af1660 ┆ [6967, 914, … 7024]    ┆ 914    │
│ ff6eba17fe40c83e48f56b275c0324f0 ┆ [2997, 4003, … 10137]  ┆ 8908   │
│ 7b71929504fd4eca6b97cf6aac085d34 ┆ [10915, 9849, … 4072]  ┆ 10915  │
└──────────────────────────────────┴────────────────────────┴────────┘
k: 10
avg_num_candidates: 9.990380951721177
recall: 0.5991485912614566
precision: 0.05991485912614564
map@k: 0.4041116756599718

[{'k': 10, 'avg_num_candidates': 9.990380951721177, 'recall': 0.599148591261

## ルールベースでの session_count==1 の変更

In [22]:
# Preview the "first" OOF predictions to check their columns and dtypes.
first_oof_df.head()

session_id,candidates,pred,session_count
str,i32,f64,i32
"""fffffa7baf3700…",2439,0.245733,2
"""fffffa7baf3700…",1372,0.169726,2
"""fffffa7baf3700…",10095,0.169726,2
"""fffffa7baf3700…",12154,0.141801,2
"""fffffa7baf3700…",3,0.127825,2


In [44]:
def make_first_candidates(mode: str = "train"):
    """Build rule-based candidates for sessions that contain a single log row.

    For every session with ``session_count == 1`` the candidates are all other
    ``yad_no`` sharing the ``sml_cd`` of the session's logged ``yad_no``; the
    score is the ``counts_ranking_location/sml_cd`` value read from the
    unsupervised candidate feature file.

    Args:
        mode: Split name forwarded to ``load_log_data``. Defaults to ``"train"``,
            which reproduces the previous hard-coded behaviour.

    Returns:
        Frame with ``session_id``, ``candidates`` (Int32), ``pred`` (Float64) and
        ``session_count`` (constant 1), sorted by ``session_id`` and ``pred``
        descending. Reads the notebook-global ``cfg``.
    """
    # Resolve the feature file from the config instead of a hard-coded absolute path.
    feature_path = (
        Path(cfg.dir.cand_unsupervised_dir)
        / "ranking_location/sml_cd/yad_feature.parquet"
    )
    sml_df = pl.read_parquet(feature_path).rename(
        {"counts_ranking_location/sml_cd": "pred"}
    )
    yad_df = load_yad_data(Path(cfg.dir.data_dir))
    sml_df = sml_df.join(yad_df.select(["yad_no", "sml_cd"]), on="yad_no").rename(
        {"yad_no": "candidates"}
    )

    # Keep only sessions with session_count == 1 (max seq_no + 1).
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    first_log_df = log_df.with_columns(
        (pl.col("seq_no").max().over("session_id") + 1).alias("session_count")
    ).filter(pl.col("session_count") == 1)

    # Attach sml_cd via yad_df, expand to every yad_no in the same sml_cd, then
    # rank by score within each session.
    first_log_df = (
        (
            first_log_df.join(yad_df, on="yad_no")
            .join(sml_df, on="sml_cd")
            # Do not recommend the item that was itself viewed.
            .filter(pl.col("yad_no") != pl.col("candidates"))
            .select(["session_id", "candidates", "pred"])
        )
        .sort(by=["session_id", "pred"], descending=True)
        .with_columns(pl.lit(1).alias("session_count"))
    )

    return first_log_df.with_columns(
        pl.col("candidates").cast(pl.Int32), pl.col("pred").cast(pl.Float64)
    )

In [45]:
# Build the sml_cd-based candidates for single-log sessions and preview them.
first_sml_df = make_first_candidates()
first_sml_df.head()

session_id,candidates,pred,session_count
str,i32,f64,i32
"""ffff7fb4617164…",2087,495.0,1
"""ffff7fb4617164…",11850,451.0,1
"""ffff7fb4617164…",12240,377.0,1
"""ffff7fb4617164…",6630,258.0,1
"""ffff7fb4617164…",4398,255.0,1


In [46]:
# Evaluate with the rule-based sml_cd candidates replacing the "first" OOF
# predictions for session_count == 1 sessions.
oof_candidate_df = make_eval_df(other_oof_df, first_sml_df)
print(oof_candidate_df.head())

# Score the ranked candidate lists against the `yad_no` label column at k=10.
metrics = calculate_metrics(
    oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
)
print(metrics)

shape: (5, 3)
┌──────────────────────────────────┬────────────────────────┬────────┐
│ session_id                       ┆ candidates             ┆ yad_no │
│ ---                              ┆ ---                    ┆ ---    │
│ str                              ┆ list[i32]              ┆ i64    │
╞══════════════════════════════════╪════════════════════════╪════════╡
│ 981200e80de3337ed9ad0d5f6d99e81c ┆ [9104, 8445, … 12989]  ┆ 858    │
│ 1ec473ec8c14b950c2180dec42cd1a35 ┆ [5445, 13106, … 6289]  ┆ 6690   │
│ 9bba6b013cf1e55f4eac1a8ea1810011 ┆ [787, 8479, … 13733]   ┆ 787    │
│ 6bc277552119730db1af0d8a09f53321 ┆ [9974, 8879, … 11463]  ┆ 7505   │
│ 1922a96268e4629636b4152c3eef8763 ┆ [10236, 6218, … 11702] ┆ 4006   │
└──────────────────────────────────┴────────────────────────┴────────┘
k: 10
avg_num_candidates: 9.936109706336726
recall: 0.555137202197452
precision: 0.055513720219745204
map@k: 0.38502145566075696

[{'k': 10, 'avg_num_candidates': 9.936109706336726, 'recall': 0.55513720219

In [121]:
# Dataset split name.
# NOTE(review): no visible cell reads this global — `concat_label_pred` takes
# `mode` as its own parameter and is called with a literal below.
mode = "train"


def concat_label_pred(first_df, mode):
    """Boost ``first_df`` with label co-occurrence counts for single-log sessions.

    From the train log and labels, counts how often each (last viewed ``yad_no``,
    label ``yad_no``) pair occurs and scores it as ``count * 100``. For every
    session of ``mode`` with ``session_count == 1``, the labels observed for its
    viewed ``yad_no`` become candidates with that score. These rows are stacked
    on ``first_df`` and scores of identical (``session_id``, ``candidates``)
    pairs are summed.

    In ``"train"`` mode each session's own (viewed item, label) pair is removed
    from its count (leave-one-out). Without this, every single-log train session
    would receive its true label as a candidate with at least +100, leaking the
    label into the OOF evaluation.

    Args:
        first_df: Prediction frame with columns ``session_id``, ``candidates``,
            ``pred``, ``session_count`` (in that order).
        mode: Split to build predictions for; forwarded to ``load_log_data`` and
            ``load_session_data``.

    Returns:
        Frame with ``session_id``, ``candidates``, summed ``pred`` and max
        ``session_count``, sorted by ``session_id`` and ``pred`` descending.
        Reads the notebook-global ``cfg``.
    """
    data_dir = Path(cfg.dir.data_dir)
    pair_weight = 100.0  # score contributed by one observed (viewed, label) pair

    # Keep only the last viewed yad_no of every train session and attach its label.
    train_log_df = load_log_data(data_dir, "train")
    train_label_df = load_label_data(data_dir)
    train_last_log_label_df = (
        train_log_df.join(train_label_df, on="session_id", suffix="_label")
        .with_columns(
            (pl.col("seq_no").max().over("session_id") + 1).alias("session_count")
        )
        .filter(pl.col("seq_no") == pl.col("session_count") - 1)
    )
    # For each viewed yad_no, score the labels that followed it by frequency.
    label_pred_df = (
        train_last_log_label_df.group_by(["yad_no", "yad_no_label"])
        .agg(pl.col("yad_no").count().alias("pred"))
        .with_columns(pl.col("pred") * pair_weight, pl.lit(1).alias("session_count"))
        .sort(by=["yad_no", "pred", "session_count"], descending=True)
    )

    # Build predictions for the requested split; reuse the train log if possible.
    log_df = train_log_df if mode == "train" else load_log_data(data_dir, mode)
    last_log_df = log_df.with_columns(
        (pl.col("seq_no").max().over("session_id") + 1).alias("session_count")
    ).filter(pl.col("seq_no") == pl.col("session_count") - 1)
    session_df = load_session_data(data_dir, mode)
    session_last_df = (
        session_df.join(
            last_log_df.select(["session_id", "yad_no", "session_count"]),
            on="session_id",
        )
        .filter(pl.col("session_count") == 1)
        .drop("session_count")
    )
    first_df_from_label = (
        session_last_df.join(label_pred_df, on="yad_no")
        .with_columns(
            pl.col("yad_no_label").alias("candidates").cast(pl.Int32),
            pl.col("session_count").cast(pl.Int32),
        )
        .drop(["yad_no", "yad_no_label"])
        .select(["session_id", "candidates", "pred", "session_count"])
    )

    if mode == "train":
        # Leave-one-out: subtract the session's own contribution from the pair
        # that matches its label, and drop the pair if nothing else supports it.
        own_label_df = train_label_df.select(
            [pl.col("session_id"), pl.col("yad_no").cast(pl.Int32).alias("own_label")]
        )
        first_df_from_label = (
            first_df_from_label.join(own_label_df, on="session_id", how="left")
            .with_columns(
                pl.when(pl.col("candidates") == pl.col("own_label"))
                .then(pl.col("pred") - pair_weight)
                .otherwise(pl.col("pred"))
                .alias("pred")
            )
            .filter(pl.col("pred") > 0)
            .drop("own_label")
        )

    # Stack on the caller-supplied predictions and merge duplicate candidates.
    result = (
        pl.concat([first_df, first_df_from_label])
        .group_by(["session_id", "candidates"])
        .agg(pl.col("pred").sum(), pl.col("session_count").max())
        .sort(by=["session_id", "pred"], descending=True)
    )
    return result


# Label-boosted version of the "first" OOF predictions, kept for inspection.
df = concat_label_pred(first_oof_df, "train")

In [119]:
# Inspect the boosted frame ordered by score; the recorded output starts with
# the lowest `pred` values.
df.sort(by="pred")

session_id,candidates,pred,session_count
str,i32,f64,i32
"""ffca0dda16272d…",12797,-0.509899,2
"""ff588a978241c7…",12676,-0.509899,2
"""ff588a978241c7…",3698,-0.509899,2
"""ff588a978241c7…",8628,-0.509899,2
"""ff3d5a0a6c8cfa…",3698,-0.509899,2
"""ff3d5a0a6c8cfa…",12676,-0.509899,2
"""ff3d5a0a6c8cfa…",8628,-0.509899,2
"""fef5f6fe59aacb…",1987,-0.509899,2
"""fef5f6fe59aacb…",7703,-0.509899,2
"""fef5f6fe59aacb…",13450,-0.509899,2


In [120]:
# Evaluate with the label-boosted "first" predictions. The frame is recomputed
# here rather than reusing a previously assigned variable.
oof_candidate_df = make_eval_df(other_oof_df, concat_label_pred(first_oof_df, "train"))
print(oof_candidate_df.head())

# Score the ranked candidate lists against the `yad_no` label column at k=10.
metrics = calculate_metrics(
    oof_candidate_df, candidates_col="candidates", label_col="yad_no", k=[10]
)
print(metrics)

shape: (5, 4)
┌──────────────────────────────────┬────────────┬──────────┬───────────────┐
│ session_id                       ┆ candidates ┆ pred     ┆ session_count │
│ ---                              ┆ ---        ┆ ---      ┆ ---           │
│ str                              ┆ i32        ┆ f64      ┆ i32           │
╞══════════════════════════════════╪════════════╪══════════╪═══════════════╡
│ fffffa7baf370083ebcdd98f26a7e31a ┆ 2439       ┆ 0.245733 ┆ 2             │
│ fffffa7baf370083ebcdd98f26a7e31a ┆ 1372       ┆ 0.169726 ┆ 2             │
│ fffffa7baf370083ebcdd98f26a7e31a ┆ 10095      ┆ 0.169726 ┆ 2             │
│ fffffa7baf370083ebcdd98f26a7e31a ┆ 12154      ┆ 0.141801 ┆ 2             │
│ fffffa7baf370083ebcdd98f26a7e31a ┆ 3          ┆ 0.127825 ┆ 2             │
└──────────────────────────────────┴────────────┴──────────┴───────────────┘
shape: (5, 4)
┌──────────────────────────────────┬────────────┬───────┬───────────────┐
│ session_id                       ┆ candidates ┆ p

In [122]:
# Build the submission frame using the label-boosted "first" test predictions
# and preview its first rows.
test_candidate_df = make_submission(
    other_test_df, concat_label_pred(first_test_df, "test")
)
test_candidate_df.head()

predict_0,predict_1,predict_2,predict_3,predict_4,predict_5,predict_6,predict_7,predict_8,predict_9
i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
3560,11561,4545,9534,4714,4420,5466,2680,6563,6488
143,4066,6555,7014,7913,8108,12862,6129,11237,12350
757,7710,9190,9910,1774,410,10485,13570,6721,3400
12341,3359,6991,1542,13521,10861,5080,4180,5657,9319
9020,2862,13235,4070,6565,5411,9623,5372,10826,9611
