In [1]:
# Step the kernel's working directory up one level; the recorded output shows
# it ending up in /kaggle/working.
# NOTE(review): this is not idempotent -- re-running the cell climbs one more
# directory each time. Presumably the project-local `utils` import further
# down depends on this location; confirm before changing it.
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the experiment config: base `config.yaml` from the
# ranking_location_all config directory, with the `exp` group switched to
# `sml_cd` (the printed config shows this sets exp.location_col to sml_cd).
config_dir = "../cand_unsupervised/ranking_location_all"
exp_overrides = ["exp=sml_cd"]

with initialize(version_base=None, config_path=config_dir):
    cfg = compose(config_name="config.yaml", overrides=exp_overrides)

# Dump the composed config so the exact settings are recorded with the run.
print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100
  location_col: sml_cd



In [3]:
import os
import sys
from pathlib import Path

import hydra
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf

import utils
import wandb
from utils.load import load_label_data, load_log_data, load_session_data, load_yad_data
from utils.metrics import calculate_metrics

In [4]:
# Suffix used for the per-experiment column names built later in this
# notebook (f"counts_{exp_name}" / f"rank_{exp_name}").
exp_name: str = "temp"

In [5]:
# Everything is read from the dataset directory named in the Hydra config.
data_dir = Path(cfg.dir.data_dir)

# Train/test logs and the accommodation ("yad") table come from the
# project-local loaders; presumably these return polars DataFrames (they are
# passed to pl.concat / joined on "yad_no" below) -- TODO confirm.
train_log_df = load_log_data(data_dir, "train")
test_log_df = load_log_data(data_dir, "test")
yad_df = load_yad_data(data_dir)

# Stack train and test logs into a single frame; the popularity counts in the
# next step are computed over both splits.
all_log_df = pl.concat([train_log_df, test_log_df])

In [6]:
location_key = cfg.exp.location_col
counts_col = f"counts_{exp_name}"
rank_col = f"rank_{exp_name}"

# Number of log rows per accommodation across train + test, most frequent first.
count_df = all_log_df.get_column("yad_no").value_counts()
count_df = count_df.sort(by="counts", descending=True)

# Rank accommodations by log count inside each location (highest count ranks
# first). rank() runs with its default tie handling, and the join uses the
# default join type, so only accommodations that have a row in count_df are kept.
rank_in_location = pl.col("counts").rank(descending=True).over(location_key)
yad_counts_df = yad_df.join(count_df, on="yad_no").with_columns(
    rank_in_location.alias("rank")
)

# Ranking frame intended for saving, with experiment-suffixed column names.
# NOTE(review): nothing is written to disk in this cell -- it is only previewed.
save_df = yad_counts_df.select(
    [
        "yad_no",
        pl.col("counts").alias(counts_col),
        pl.col("rank").alias(rank_col),
    ]
)
print(save_df.head())
print(save_df.shape)

shape: (5, 3)
┌────────┬─────────────┬───────────┐
│ yad_no ┆ counts_temp ┆ rank_temp │
│ ---    ┆ ---         ┆ ---       │
│ i64    ┆ u32         ┆ f64       │
╞════════╪═════════════╪═══════════╡
│ 1      ┆ 30          ┆ 44.0      │
│ 2      ┆ 29          ┆ 29.0      │
│ 3      ┆ 210         ┆ 1.0       │
│ 4      ┆ 67          ┆ 7.0       │
│ 5      ┆ 30          ┆ 6.0       │
└────────┴─────────────┴───────────┘
(13562, 3)


In [7]:
# Load the train/test session tables inside the project's timer context; the
# recorded output shows it printing "[load session data] done in ... s".
with utils.timer("load session data"):
    session_data_dir = Path(cfg.dir.data_dir)
    train_session_df = load_session_data(session_data_dir, "train")
    test_session_df = load_session_data(session_data_dir, "test")

[load session data] done in 0.3 s


In [10]:
location_key = cfg.exp.location_col
top_n = cfg.exp.num_candidate

# Order all accommodations by log count first: the per-location lists built
# below take rows in this order, so each list runs most-logged -> least-logged.
popularity_sorted_df = yad_counts_df.sort(by="counts", descending=True)

# One row per location holding up to `top_n` accommodation ids, with rows
# ordered by the location key for a stable, readable preview.
location_candidates_df = (
    popularity_sorted_df.group_by(location_key)
    .agg(pl.col("yad_no").head(top_n).alias("candidates"))
    .sort(by=location_key)
)

print("location_candidates_df")
print(location_candidates_df.head(5))

location_candidates_df
shape: (5, 2)
┌──────────────────────────────────┬────────────────────────┐
│ sml_cd                           ┆ candidates             │
│ ---                              ┆ ---                    │
│ str                              ┆ list[i64]              │
╞══════════════════════════════════╪════════════════════════╡
│ 00e15b2eac75d33b9bc37c8a44d6b70a ┆ [10163, 3714, … 6781]  │
│ 0163d7c60f870add58d57c181736ec63 ┆ [9563, 6891, … 5447]   │
│ 01783e93ed4e8d33fcfd93fdb2ea748f ┆ [4954, 669, … 8773]    │
│ 0189f42d8841537a8f76269662d4fe78 ┆ [3057, 11800, … 13720] │
│ 01eeec17159ea4c98a60455cb2e237d8 ┆ [4638, 6377, … 6973]   │
└──────────────────────────────────┴────────────────────────┘


In [None]:
# For every training log row, look up the accommodation's location and attach
# that location's candidate list, then concatenate the lists per session.
# NOTE(review): the join is per log row, so a session with several rows in the
# same location will presumably carry that location's candidates more than
# once -- confirm whether downstream code expects unique candidates.
location_key = cfg.exp.location_col
yad_location_df = yad_df.select(["yad_no", location_key])
(
    train_log_df.join(yad_location_df, on="yad_no")
    .join(location_candidates_df, on=location_key)
    .group_by("session_id")
    .agg(pl.col("candidates").flatten())
)