In [3]:
%cd ..

/kaggle/working


In [60]:
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="../cand_unsupervised/transition_prob"):
    cfg = compose(config_name="config.yaml")
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  range_transition: 1
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100



In [5]:
from pathlib import Path

import polars as pl

import utils
from utils.load import load_label_data, load_log_data, load_yad_data
from utils.metrics import calculate_metrics

In [6]:
with utils.timer("load data"):
    train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
    test_log_df = load_log_data(Path(cfg.dir.data_dir), "test")
    all_log_df = pl.concat([train_log_df, test_log_df])

[load data] done in 0.1 s


In [35]:
range_transition = 1
transition_dfs = []

for rti in range(range_transition):
    df = (
        all_log_df.with_columns(
            pl.col("yad_no").alias("from_yad_no"),
            pl.col("yad_no").shift(-(rti + 1)).over("session_id").alias("to_yad_no"),
        )
        .filter(~pl.col("to_yad_no").is_null())
        .filter(pl.col("from_yad_no") != pl.col("to_yad_no"))  # 同じものへは遷移しない
        .select(["from_yad_no", "to_yad_no"])
    )
    transition_dfs.append(df)

In [36]:
transition_df = pl.concat(transition_dfs)

In [37]:
transition_df = (
    transition_df.group_by(["from_yad_no", "to_yad_no"])
    .agg(pl.col("from_yad_no").count().alias("from_to_count"))
    .with_columns(
        pl.col("from_yad_no").count().over(["from_yad_no"]).alias("from_count"),
    )
    .with_columns(
        (pl.col("from_to_count") / pl.col("from_count")).alias("transition_prob")
    )
    .sort(by=["from_yad_no", "to_yad_no"])
    .select(["from_yad_no", "to_yad_no", "transition_prob"])
)

In [38]:
transition_df.head()

from_yad_no,to_yad_no,transition_prob
i64,i64,f64
1,1254,0.25
1,1503,0.25
1,4133,0.25
1,10352,0.25
2,3847,0.25


In [59]:
# session_id ごとに最後の yad_no を取得する
last_log_df = (
    train_log_df.group_by("session_id")
    .agg(pl.all().sort_by("seq_no").last())
    .sort(by="session_id")
)

# 遷移確率を結合し、確率の降順に候補として生成する
candidate_df = (
    last_log_df.join(
        transition_df, left_on="yad_no", right_on="from_yad_no", how="left"
    )
    .sort(by=["session_id", "transition_prob"], descending=True)
    .group_by("session_id")
    .agg(pl.col("to_yad_no").alias("candidates"))
)

candidate_df.head()

session_id,candidates
str,list[i64]
"""65f5b4aae382b2…","[11699, 13308, … 12737]"
"""f90f65bceb463e…","[11635, 4208, … 7587]"
"""8a1357c83aa866…","[8793, 531, … 12880]"
"""713979d622e817…","[10215, 138, … 12956]"
"""a9dff8ef1837cf…",[3173]
