In [1]:
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(
    version_base=None, config_path="../cand_unsupervised/transition_prob_all"
):
    cfg = compose(config_name="config.yaml")
    print(OmegaConf.to_yaml(cfg))

debug: false
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  range_transition: 10
  num_candidate: 100
  k:
  - 1
  - 5
  - 10
  - 50
  - 100



In [3]:
import os
import sys
from pathlib import Path

import hydra
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf

import utils
import wandb
from utils.load import load_label_data, load_log_data, load_session_data
from utils.metrics import calculate_metrics

In [4]:
exp_name = "test"

In [5]:
with utils.timer("load data"):
    train_log_df = load_log_data(Path(cfg.dir.data_dir), "train")
    test_log_df = load_log_data(Path(cfg.dir.data_dir), "test")
    all_log_df = pl.concat([train_log_df, test_log_df])

[load data] done in 0.1 s


In [6]:
with utils.timer("create ranking"):
    transition_dfs = []

    # 遷移を作成
    for rti in range(cfg.exp.range_transition):
        df = (
            all_log_df.with_columns(
                pl.col("yad_no").alias("from_yad_no"),
                pl.col("yad_no")
                .shift(-(rti + 1))
                .over("session_id")
                .alias("to_yad_no"),
            )
            .filter(~pl.col("to_yad_no").is_null())
            .filter(pl.col("from_yad_no") != pl.col("to_yad_no"))  # 同じものへは遷移しない
            .select(["from_yad_no", "to_yad_no"])
        )
        transition_dfs.append(df)
    transition_df = pl.concat(transition_dfs)
    # 集約して確率計算
    transition_df = (
        transition_df.group_by(["from_yad_no", "to_yad_no"])
        .agg(pl.col("from_yad_no").count().alias("from_to_count"))
        .with_columns(
            pl.col("from_yad_no").count().over(["from_yad_no"]).alias("from_count"),
        )
        .with_columns(
            (pl.col("from_to_count") / pl.col("from_count")).alias("transition_prob")
        )
        .sort(by=["from_yad_no", "to_yad_no"])
        .select(["from_yad_no", "to_yad_no", "transition_prob"])
    )

[create ranking] done in 6.4 s


In [7]:
# session_id ごとにランキングの上位10個を予測値とする submission を作成
with utils.timer("load session data"):
    train_session_df = load_session_data(Path(cfg.dir.data_dir), "train")
    test_session_df = load_session_data(Path(cfg.dir.data_dir), "test")

[load session data] done in 0.0 s


In [8]:
def make_candidates(log_df, session_df, transition_df):
    log_df = (
        log_df.sort(by="session_id").with_columns(pl.col("yad_no").alias("from_yad_no"))
    ).select(["session_id", "from_yad_no"])
    candidate_df = (
        log_df.join(transition_df, on="from_yad_no")
        .group_by(["session_id", "to_yad_no"])
        .agg(
            pl.sum("transition_prob").alias("transition_prob"),
        )
        .sort(by=["session_id", "transition_prob"], descending=True)
        .group_by(["session_id"])
        .agg(
            pl.col("to_yad_no").alias("candidates"),
        )
    )
    candidate_df = session_df.join(
        candidate_df, on="session_id", how="left"
    ).with_columns(
        # candidates が null の場合は空のリストを入れておく
        pl.when(pl.col("candidates").is_null())
        .then(pl.Series("empty", [[]]))
        .otherwise(pl.col("candidates"))
        .alias("candidates")
    )

    return candidate_df

session_id,from_yad_no
str,i64
"""000007603d533d…",2395
"""0000ca043ed437…",13535
"""0000d4835cf113…",123
"""0000fcda1ae1b2…",8475
"""000104bdffaaad…",96


In [10]:
train_candidate_df = (
    train_log_df.join(transition_df, on="from_yad_no")
    .group_by(["session_id", "to_yad_no"])
    .agg(
        pl.sum("transition_prob").alias("transition_prob"),
    )
    .sort(by=["session_id", "transition_prob"], descending=True)
    .group_by(["session_id"])
    .agg(
        pl.col("to_yad_no").alias("candidates"),
    )
)

train_candidate_df.head()

session_id,candidates
str,list[i64]
"""3857f94ed0608e…","[4879, 3226, … 3848]"
"""94419110053a9f…","[8971, 2750, … 4649]"
"""ca16665cd7de73…","[867, 11723, … 7690]"
"""e60056e38a77dc…","[3153, 5444, … 299]"
"""402bb0e80aee81…","[8178, 8109, 4364]"


In [None]:
train_candidate_df = train_session_df.join(
    train_candidate_df, on="session_id", how="left"
).with_columns(
    # candidates が null の場合は空のリストを入れておく
    pl.when(pl.col("candidates").is_null())
    .then(pl.Series("empty", [[]]))
    .otherwise(pl.col("candidates"))
    .alias("candidates")
)