In [1]:
# Move the kernel's working directory up one level (the cell output shows it
# becomes /kaggle/working). NOTE: this is not idempotent -- re-running the cell
# moves up again, which breaks any cwd-relative path used later.
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the Hydra config of experiment 006 with the `exp=v12_001` override,
# then dump it as YAML so the settings in effect are recorded in the notebook.
with initialize(
    version_base=None,
    config_path="../experiments/006_score_bug_fix",
):
    cfg = compose(
        config_name="config.yaml",
        overrides=["exp=v12_001"],
    )
print(OmegaConf.to_yaml(cfg))

debug: false
seed: 42
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  datasets_dir: /kaggle/working/output/datasets/012/base
  one_epoch: false
  lgbm:
    cat_cols: []
    unuse_cols:
    - yad_no
    - session_id
    - original
    - label
    - fold
    - candidates
    - yad_type
    - wid_cd
    - ken_cd
    - lrg_cd
    - sml_cd
    label_col: label
    verbose_eval: 100
    downsampling_rate: 1.0
    ndcg_eval_at:
    - 5
    - 10
    params:
      metric:
      - map
      - ndcg
      - auc
      objective: lambdarank
      ndcg_eval_at: []
      num_iterations: 4000
      early_stopping_round: 50
      lambda_l1: 0.1
      lambda_l2: 0.1
      num_leaves: 32
      feature_fraction: 0.8
      bagging_fraction: 0.8


In [3]:
import os
import sys
from pathlib import Path

import hydra
import numpy as np
import polars as pl
import torch
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

import utils
import wandb
from utils.load import (
    load_image_embeddings,
    load_label_data,
    load_log_data,
    load_session_data,
    load_yad_data,
)
from utils.metrics import calculate_metrics

In [4]:
# Load the train/test session logs and the train labels from the dataset
# directory named in the config, then preview the train log.
data_dir = Path(cfg.dir.data_dir)
train_log_df = load_log_data(data_dir, "train")
train_label_df = load_label_data(data_dir)
test_log_df = load_log_data(data_dir, "test")
train_log_df.head()

session_id,seq_no,yad_no
str,i64,i64
"""000007603d533d…",0,2395
"""0000ca043ed437…",0,13535
"""0000d4835cf113…",0,123
"""0000fcda1ae1b2…",0,8475
"""000104bdffaaad…",0,96


In [5]:
# Per-session length: the largest seq_no inside the session plus one
# (seq_no starts at 0 in the previewed rows).
seq_len_expr = (pl.col("seq_no").max().over("session_id") + 1).alias("seq_len")

# Attach each session's label to every one of its log rows; the label's yad_no
# column is disambiguated as `yad_no_label` via the join suffix.
train_log_label_df = train_log_df.join(
    train_label_df,
    on="session_id",
    suffix="_label",
).with_columns(seq_len_expr)

train_log_label_df.head()

session_id,seq_no,yad_no,yad_no_label,seq_len
str,i64,i64,i64,i64
"""000007603d533d…",0,2395,4101,1
"""0000ca043ed437…",0,13535,8253,1
"""0000d4835cf113…",0,123,4863,1
"""0000fcda1ae1b2…",0,8475,1652,1
"""000104bdffaaad…",0,96,96,2


In [6]:
# How many sessions have each seq_len: reduce to one row per session first,
# then count the occurrences of every length and list them in ascending order.
(
    train_log_label_df.group_by("session_id")
    .agg(pl.col("seq_len").max())
    .get_column("seq_len")
    .value_counts()
    .sort(by="seq_len")
)

seq_len,counts
i64,u32
1,185386
2,82793
3,15350
4,4025
5,833
6,223
7,65
8,18
9,4
10,1


## seq_len==1をルールベースにしてみる

In [15]:
# For every yad_no that was the LAST item viewed in a session, list the label
# yad_no values seen for it, most frequent first.
(
    train_log_label_df
    # Keep only the final log row of each session (seq_len is max(seq_no) + 1).
    .filter(pl.col("seq_no") == pl.col("seq_len") - 1)
    # Count how often each (last viewed, label) pair occurs.
    .group_by(["yad_no", "yad_no_label"])
    .agg(pl.col("yad_no").count().alias("counts"))
    .sort(by=["yad_no", "counts"], descending=True)
    # maintain_order=True makes the output rows follow the sort above
    # (yad_no descending). Without it polars does not guarantee the order of
    # the groups, so the result could differ from run to run.
    .group_by("yad_no", maintain_order=True)
    .agg(pl.col("yad_no_label"))
)

yad_no,yad_no_label
i64,list[i64]
13806,"[11113, 3326, … 6925]"
13805,"[5068, 5271]"
13804,"[13382, 9181, … 10727]"
13803,"[12752, 12962, … 13106]"
13801,"[496, 5130, … 52]"
13800,"[5491, 12245, … 10758]"
13799,"[2231, 1037, … 9765]"
13798,"[12056, 7722, … 4299]"
13797,"[10159, 3955, … 7820]"
13796,"[9112, 13775, … 13097]"


## seq_lenごとのスコア算出

In [103]:
# Training table of the experiment's dataset version (path comes from the config).
train_parquet_path = Path(cfg.exp.datasets_dir) / "train.parquet"
train_df = pl.read_parquet(train_parquet_path)

In [104]:
# (rows, columns) of the training table.
train_df.shape

(11360661, 110)

In [105]:
# Out-of-fold scores saved by experiment 006_score_bug_fix / v12_001; the array
# has one score per row of train_df (it is attached to train_df as a column below).
# The path is built from the configured experiment directory instead of a
# cwd-relative literal, so it no longer depends on `%cd ..` having been run
# exactly once in this kernel.
oof_path = Path(cfg.dir.exp_dir) / "006_score_bug_fix" / "v12_001" / "oof.npy"
oof = np.load(oof_path)
oof.shape

(11360661,)

In [106]:
# Peek at the raw out-of-fold scores (NumPy abbreviates long arrays in the repr).
oof

array([-0.49877921, -0.68552034, -0.32798068, ...,  4.26175052,
        3.84972775,  2.87522553])

In [107]:
def make_eval_df(pred_df: pl.DataFrame):
    """Build the per-session evaluation frame from row-level predictions.

    Args:
        pred_df: Frame with at least `session_id` and `candidates` columns,
            one row per (session, candidate). The row order inside each
            session is kept when the candidates are gathered into a list,
            so callers should pass rows already sorted by score.

    Returns:
        One row per session with the list of its candidates, left-joined
        with the train labels on `session_id`. The labels are re-read from
        `cfg.dir.data_dir` on every call.
    """
    labels = load_label_data(Path(cfg.dir.data_dir))
    per_session = pred_df.group_by("session_id").agg(pl.col("candidates"))
    return per_session.join(labels, on="session_id", how="left")


# Attach the OOF score to every candidate row, order the rows inside each
# session from highest to lowest score, keep only rows flagged original == 1
# (presumably the non-augmented sessions -- TODO confirm), and retain just the
# columns needed for evaluation.
oof_pred_df = (
    train_df.with_columns(pl.Series("pred", oof))
    .sort(["session_id", "pred"], descending=True)
    .filter(pl.col("original") == 1)
    .select("session_id", "candidates")
)

# One row per session: ranked candidate list plus the session's label.
candidates_df = make_eval_df(oof_pred_df)

In [108]:
# Show the per-session candidate lists next to the label column, ordered by session_id.
candidates_df.sort(by="session_id")

session_id,candidates,yad_no
str,list[i32],i64
"""000007603d533d…","[11882, 2808, … 11407]",4101
"""0000ca043ed437…","[8253, 4488, … 2843]",8253
"""0000d4835cf113…","[9039, 13642, … 13468]",4863
"""0000fcda1ae1b2…","[626, 2272, … 3338]",1652
"""000104bdffaaad…","[96, 902, … 1490]",96
"""00011afe25c343…","[12305, 9981, … 9308]",4823
"""000125c737df18…","[13240, 4574, … 5341]",10378
"""0001763050a10b…","[11958, 10868, … 3318]",10362
"""000178c4d4d567…","[13220, 2232, … 5542]",1227
"""0001e6a407a85d…","[9430, 3752, … 10095]",175


In [109]:
# Overall OOF score at k=10 over all sessions in candidates_df.
# calculate_metrics is a project helper; per the cell output it prints the
# metrics and also returns them as a list with one dict per k.
metrics = calculate_metrics(
    candidates_df, candidates_col="candidates", label_col="yad_no", k=[10]
)
metrics

k: 10
avg_num_candidates: 10.0
recall: 0.589799721508289
precision: 0.0589799721508289
map@k: 0.39922412743717356



[{'k': 10,
  'avg_num_candidates': 10.0,
  'recall': 0.589799721508289,
  'precision': 0.0589799721508289,
  'map@k': 0.39922412743717356}]

In [110]:
# Attach each session's seq_len to its candidate list ONCE; this join does not
# depend on the loop variable, so recomputing it per iteration was wasted work.
session_seq_len_df = train_log_label_df.group_by("session_id").agg(
    pl.col("seq_len").max()
)
candidates_with_len_df = candidates_df.join(session_seq_len_df, on="session_id")

# Score every session length that actually occurs in the data. The previous
# hard-coded range(1, 10) silently skipped seq_len == 10, which is present in
# the seq_len distribution.
for seq_len in candidates_with_len_df["seq_len"].unique().sort().to_list():
    print(seq_len)
    seq_len_candidates_df = candidates_with_len_df.filter(
        pl.col("seq_len") == seq_len
    )

    metrics = calculate_metrics(
        seq_len_candidates_df,
        candidates_col="candidates",
        label_col="yad_no",
        k=[10],
    )
    print(metrics)

1
k: 10
avg_num_candidates: 10.0
recall: 0.36747650847421054
precision: 0.03674765084742105
map@k: 0.12855392754953673

[{'k': 10, 'avg_num_candidates': 10.0, 'recall': 0.36747650847421054, 'precision': 0.03674765084742105, 'map@k': 0.12855392754953673}]
2
k: 10
avg_num_candidates: 10.0
recall: 0.9861461717778073
precision: 0.09861461717778075
map@k: 0.8755191902390337

[{'k': 10, 'avg_num_candidates': 10.0, 'recall': 0.9861461717778073, 'precision': 0.09861461717778075, 'map@k': 0.8755191902390337}]
3
k: 10
avg_num_candidates: 10.0
recall: 0.9990228013029316
precision: 0.09990228013029318
map@k: 0.909852696344553

[{'k': 10, 'avg_num_candidates': 10.0, 'recall': 0.9990228013029316, 'precision': 0.09990228013029318, 'map@k': 0.909852696344553}]
4
k: 10
avg_num_candidates: 10.0
recall: 0.999751552795031
precision: 0.09997515527950311
map@k: 0.9559236912156167

[{'k': 10, 'avg_num_candidates': 10.0, 'recall': 0.999751552795031, 'precision': 0.09997515527950311, 'map@k': 0.955923691215616

In [16]:
# Log rows (with labels attached) of the sessions whose seq_len is 1.
one_log = train_log_label_df.filter(pl.col("seq_len") == 1)

In [22]:
# The same (yad_no, yad_no_label) pair never occurs twice among seq_len == 1
# sessions: in the describe() output `counts` has min = max = 1.
(
    one_log.group_by(["yad_no", "yad_no_label"])
    .agg(pl.col("session_id").count().alias("counts"))
    .describe()
)

describe,yad_no,yad_no_label,counts
str,f64,f64,f64
"""count""",185386.0,185386.0,185386.0
"""null_count""",0.0,0.0,0.0
"""mean""",6902.682689,6894.784407,1.0
"""std""",3999.732525,4008.776842,0.0
"""min""",2.0,1.0,1.0
"""25%""",3412.0,3412.0,1.0
"""50%""",6895.0,6861.0,1.0
"""75%""",10353.0,10350.0,1.0
"""max""",13806.0,13806.0,1.0


In [24]:
# For each viewed yad_no in seq_len == 1 sessions, gather its labels into a
# list and replace the list by its length, i.e. the number of labels observed.
(
    one_log.group_by("yad_no")
    .agg(pl.col("yad_no_label"))
    .with_columns(pl.col("yad_no_label").list.len())
)

yad_no,yad_no_label
i64,u32
7083,17
5115,17
11930,41
6562,17
5246,3
6556,43
12722,20
12454,6
1706,7
11412,6
