In [2]:
%cd ..

/kaggle/working


In [15]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose `config.yaml` from the target-encoding config directory with the
# `exp=sml_cd` override applied. Hydra only needs to stay initialised while
# composing, so the YAML dump is built inside the context and printed after.
with initialize(config_path="../cand_supervised/target_encoding", version_base=None):
    cfg = compose(overrides=["exp=sml_cd"], config_name="config.yaml")
    cfg_yaml = OmegaConf.to_yaml(cfg)
print(cfg_yaml)

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100
  location_col: sml_cd



In [12]:
import os
import sys
from pathlib import Path

import hydra
import numpy as np
import polars as pl
import torch
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

import utils
import wandb
from utils.load import (
    load_image_embeddings,
    load_label_data,
    load_log_data,
    load_session_data,
    load_yad_data,
)
from utils.metrics import calculate_metrics

遷移は以下の2種類
- lastからの遷移
- 全体からlastへの遷移

## 確認

In [13]:
# Load the "train" split of the log data and the yad data, both from the
# dataset directory configured in `cfg.dir.data_dir`.
data_root = Path(cfg.dir.data_dir)
log_df = load_log_data(data_root, "train")
yad_df = load_yad_data(data_root)

In [29]:
# Distribution of the largest `seq_no` observed in each session:
# first reduce to one row per session, then count how often each maximum occurs.
max_seq_per_session = log_df.group_by("session_id").agg(pl.col("seq_no").max())
max_seq_per_session.get_column("seq_no").value_counts()

seq_no,counts
i64,u32
6,65
1,82793
7,18
4,833
9,1
2,15350
3,4025
0,185386
5,223
8,4


In [16]:
# Number of log rows per yad_no, most frequent first.
count_df = log_df["yad_no"].value_counts().sort("counts", descending=True)

# Rank expression: position of each yad by count within its location group
# (rank 1 = highest count).
rank_in_location = (
    pl.col("counts").rank(descending=True).over(cfg.exp.location_col).alias("rank")
)

# Attach the counts to the yad table and add the per-location rank.
yad_counts_df = yad_df.join(count_df, on="yad_no").with_columns(rank_in_location)

In [21]:
# Show the ranking columns only, ordered by location group and then by rank.
ranking_cols = ["yad_no", cfg.exp.location_col, "counts", "rank"]
yad_counts_df.select(ranking_cols).sort(by=[cfg.exp.location_col, "rank"])

yad_no,sml_cd,counts,rank
i64,str,u32,f64
10163,"""00e15b2eac75d3…",84,1.0
3714,"""00e15b2eac75d3…",75,2.0
1055,"""00e15b2eac75d3…",49,3.0
1664,"""00e15b2eac75d3…",44,4.0
12490,"""00e15b2eac75d3…",38,5.0
8098,"""00e15b2eac75d3…",34,6.0
4958,"""00e15b2eac75d3…",32,7.0
9266,"""00e15b2eac75d3…",29,8.0
708,"""00e15b2eac75d3…",28,9.0
4605,"""00e15b2eac75d3…",26,10.0


In [24]:
# Load the "train" split of the label data.
label_df = load_label_data(Path(cfg.dir.data_dir), "train")

# Number of label rows per yad_no, most frequent first.
count_label_df = label_df["yad_no"].value_counts().sort("counts", descending=True)

# Rank expression: position of each yad by label count within its location
# group (rank 1 = highest count).
label_rank_in_location = (
    pl.col("counts").rank(descending=True).over(cfg.exp.location_col).alias("rank")
)

# Attach the label counts to the yad table and add the per-location rank.
yad_label_counts_df = yad_df.join(count_label_df, on="yad_no").with_columns(
    label_rank_in_location
)

# Show the ranking columns only, ordered by location group and then by rank.
label_ranking_cols = ["yad_no", cfg.exp.location_col, "counts", "rank"]
yad_label_counts_df.select(label_ranking_cols).sort(
    by=[cfg.exp.location_col, "rank"]
)

yad_no,sml_cd,counts,rank
i64,str,u32,f64
10163,"""00e15b2eac75d3…",49,1.0
3714,"""00e15b2eac75d3…",44,2.0
12490,"""00e15b2eac75d3…",41,3.0
1055,"""00e15b2eac75d3…",35,4.0
1664,"""00e15b2eac75d3…",33,5.0
9266,"""00e15b2eac75d3…",25,6.0
4605,"""00e15b2eac75d3…",23,7.0
1276,"""00e15b2eac75d3…",18,8.5
10689,"""00e15b2eac75d3…",18,8.5
8568,"""00e15b2eac75d3…",17,10.0
