In [1]:
# Step up to the parent directory; the recorded output shows this lands in /kaggle/working.
%cd ..

/kaggle/working


In [2]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Compose the 001_baseline dataset-generation config with the debug override
# applied, and dump the composed config as YAML for inspection.
with initialize(config_path="../generate_datasets/001_baseline", version_base=None):
    cfg = compose(overrides=["debug=True"], config_name="config.yaml")
    print(OmegaConf.to_yaml(cfg))

debug: true
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  fold_path: /kaggle/working/output/datasets/make_cv/base/train_fold.parquet
  candidate_info_list:
  - name: transition_prob/base
    max_num_candidates: 100
    dir: /kaggle/working/output/cand_unsupervised/transition_prob/base
    features:
    - name: yad2yad_feature
  - name: ranking_location/sml_cd
    max_num_candidates: 50
    dir: /kaggle/working/output/cand_unsupervised/ranking_location/sml_cd
    features:
    - name: yad_feature
  - name: ranking_location/lrg_cd
    max_num_candidates: 50
    dir: /kaggle/working/output/cand_unsupervised/ranking_location/lrg_cd
    features:
    - name: yad_feature



In [9]:
import os
import sys
from pathlib import Path

import hydra
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf

import utils
from utils.load import load_label_data, load_log_data, load_session_data, load_yad_data
from utils.metrics import calculate_metrics

In [4]:
# Dataset mode handed to the loaders and feature builders below.
mode = "train"

# Numeric yad columns aggregated/joined by the feature builders further down.
# No cell in this notebook defines this name, so a fresh kernel would hit a
# NameError. The list (and its order) is recovered from the aggregated column
# names in the recorded outputs -- confirm against the yad master schema.
numerical_col = [
    "total_room_cnt",
    "wireless_lan_flg",
    "onsen_flg",
    "kd_stn_5min",
    "kd_bch_5min",
    "kd_slp_5min",
]

In [5]:
def load_and_union_candidates(cfg, mode: str):
    """
    Build the long-format candidate table for ``mode``.

    Candidates are the union of
      * every ``yad_no`` that appears in the session's own log, and
      * the first ``max_num_candidates`` entries of each generator listed in
        ``cfg.exp.candidate_info_list`` (read from
        ``<dir>/<mode>_candidate.parquet``).

    The union is de-duplicated per session, exploded to one row per
    (session_id, candidate) pair, and the session's last logged ``yad_no`` is
    removed from its candidates.

    Args:
        cfg: config providing ``dir.data_dir``, ``exp.candidate_info_list``
            and ``debug``.
        mode: dataset mode forwarded to ``load_log_data`` and used in the
            candidate parquet file name.

    Returns:
        DataFrame with columns ``session_id`` and ``candidates``.
    """
    # Load the log once: it feeds both the in-session candidates and the
    # "last yad_no" exclusion at the end.
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)

    # Add the yad_no values seen within each session as candidates.
    candidate_lists = [
        log_df.group_by("session_id").agg(pl.col("yad_no").alias("candidates"))
    ]
    for candidate_info in cfg.exp.candidate_info_list:
        generated = pl.read_parquet(
            Path(candidate_info["dir"]) / f"{mode}_candidate.parquet"
        )
        # Keep only the head of each list and drop sessions left with nothing.
        candidate_lists.append(
            generated.with_columns(
                pl.col("candidates")
                .list.head(candidate_info["max_num_candidates"])
                .alias("candidates")
            ).filter(pl.col("candidates").list.len() > 0)
        )

    # Merge all sources into one de-duplicated list per session.
    union_df = (
        pl.concat(candidate_lists)
        .group_by("session_id")
        .agg(pl.col("candidates").flatten())
        .with_columns(pl.col("candidates").list.unique())
        .select(["session_id", "candidates"])
    )

    if cfg.debug:
        # Debug runs keep at most 5 candidates per session.
        union_df = union_df.with_columns(pl.col("candidates").list.head(5))

    # Expand the lists: one row per (session_id, candidate).
    candidate_df = union_df.explode("candidates")

    # Exclude the last yad_no of each session. An anti join keeps exactly the
    # left rows without a match, so no helper flag column or sort is needed.
    last_df = log_df.group_by("session_id").agg(
        pl.col("yad_no").last().alias("candidates")
    )
    return candidate_df.join(last_df, on=["session_id", "candidates"], how="anti")

In [6]:
def convert_to_32bit(df):
    """Return ``df`` with Int64 columns cast to Int32 and Float64 columns cast to Float32.

    Columns of any other dtype are left untouched.
    """
    downcasts = []
    for name, dtype in zip(df.columns, df.dtypes):
        if dtype == pl.Int64:
            downcasts.append(pl.col(name).cast(pl.Int32))
        elif dtype == pl.Float64:
            downcasts.append(pl.col(name).cast(pl.Float32))
    return df.with_columns(downcasts)

In [7]:
# Build the long-format (session_id, candidates) table for the selected mode.
candidate_df = load_and_union_candidates(cfg, mode)

In [8]:
# Downcast Int64/Float64 columns to their 32-bit counterparts, then report the
# table size and preview the first rows.
candidate_df = convert_to_32bit(candidate_df)
print(candidate_df.shape)
candidate_df.head()

(1415117, 2)


session_id,candidates
str,i32
"""3c4f8bbb00915a…",540
"""3c4f8bbb00915a…",554
"""3c4f8bbb00915a…",781
"""3c4f8bbb00915a…",825
"""3c4f8bbb00915a…",1092


In [58]:
# Preview the label table with the flags given to label rows:
# original=False, label=True.
label_df = load_label_data(Path(cfg.dir.data_dir))
label_df = label_df.with_columns(
    [pl.lit(False).alias("original"), pl.lit(True).alias("label")]
)
label_df.head()

session_id,yad_no,original,label
str,i64,bool,bool
"""000007603d533d…",4101,False,True
"""0000ca043ed437…",8253,False,True
"""0000d4835cf113…",4863,False,True
"""0000fcda1ae1b2…",1652,False,True
"""000104bdffaaad…",96,False,True


In [63]:
def concat_label_fold(cfg, mode: str, candidate_df):
    """
    Attach ``original``, ``label`` and ``fold`` to the candidates (train only).

    Candidate rows are tagged original=True / label=False, label rows are
    tagged original=False / label=True, and the two are summed per
    (session_id, candidates) pair, so both columns come out as counts.
    The fold assignment is then joined on ``session_id``.

    Note carried over from the original docstring: when the validation score
    is computed, ``original`` is taken out of the calculation.

    For any mode other than "train" the input is returned unchanged.
    """
    # Only the training data gets labels and folds.
    if mode != "train":
        return candidate_df

    generated_rows = candidate_df.with_columns(
        pl.lit(True).alias("original"), pl.lit(False).alias("label")
    )
    # Reshape the label table to the same columns as the candidate rows.
    answer_rows = (
        convert_to_32bit(load_label_data(Path(cfg.dir.data_dir)))
        .with_columns(
            pl.col("yad_no").alias("candidates"),
            pl.lit(False).alias("original"),
            pl.lit(True).alias("label"),
        )
        .drop("yad_no")
    )
    merged = (
        pl.concat([generated_rows, answer_rows])
        .group_by(["session_id", "candidates"])
        .agg(pl.sum("original"), pl.sum("label"))
    )
    folds = pl.read_parquet(cfg.exp.fold_path)
    return merged.join(folds, on="session_id")

In [64]:
# Preview the candidate table after original / label / fold have been attached.
concat_label_fold(cfg, mode, candidate_df).head()

session_id,candidates,original,label,fold
str,i32,u32,u32,i64
"""bbdbf88dd0d1d7…",781,1,0,1
"""bbdbf88dd0d1d7…",1092,1,0,1
"""03256142f584cb…",2040,1,0,1
"""02f96519fc2974…",397,1,0,0
"""a00f5a46ab3faa…",708,1,0,2


In [22]:
# Preview the yad master table with nulls replaced by 0.
# `yad_df` is loaded here explicitly: elsewhere in the notebook it only exists
# as a local variable inside functions, so a fresh kernel would raise NameError.
yad_df = load_yad_data(Path(cfg.dir.data_dir))
yad_df.fill_null(0).head()

yad_no,yad_type,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,kd_conv_walk_5min,wid_cd,ken_cd,lrg_cd,sml_cd
i64,i64,f64,f64,i64,f64,f64,f64,f64,str,str,str,str
1,0,129.0,1.0,0,1.0,0.0,0.0,1.0,"""f0112abf369fb0…","""072c85e1653e10…","""449c52ef581d5f…","""677a32689cd1ad…"
2,0,23.0,1.0,0,0.0,0.0,0.0,0.0,"""d86102dd9c232b…","""b4d2fb4e51ea7b…","""5c9a8f48e9df02…","""4ee16ee838dd27…"
3,0,167.0,1.0,1,1.0,0.0,0.0,1.0,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""ab9480fd72a44d…"
4,0,144.0,1.0,0,1.0,0.0,0.0,1.0,"""46e33861f921c3…","""107c7305a74c8d…","""52c9ea83f2cfe9…","""1cc3e1838bb0fd…"
5,0,41.0,1.0,1,0.0,0.0,0.0,0.0,"""43875109d1dab9…","""75617bb07a2785…","""9ea5a911019b66…","""be1b876af18afc…"


In [39]:
def concat_session_feature(cfg, mode: str, candidate_df: pl.DataFrame):
    """
    Attach per-session aggregate features to every candidate row.

    The session log (session_id, seq_no, yad_no) is joined with the yad master
    data (yado.csv, nulls replaced by 0) and aggregated per session. For each
    column in the notebook-level list ``numerical_col`` this produces
    ``<col>_session_sum``, ``<col>_session_min``, ``<col>_session_max`` and
    ``<col>_session_std``.

    TODO: add the categorical_col information later.

    Args:
        cfg: config providing ``dir.data_dir``.
        mode: dataset mode forwarded to ``load_log_data``.
        candidate_df: candidate table containing a ``session_id`` column.

    Returns:
        ``candidate_df`` joined (polars default join, i.e. inner) with the
        session features on ``session_id``.
    """
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    yad_df = load_yad_data(Path(cfg.dir.data_dir))
    log_yad_df = log_df.join(yad_df.fill_null(0), on="yad_no")

    # Pass the key positionally: newer polars releases treat `by=` as a named
    # key expression, so the key column would be called "by" and the join on
    # "session_id" below would fail.
    session_feature_df = log_yad_df.group_by("session_id").agg(
        [pl.sum(col).name.suffix("_session_sum") for col in numerical_col]
        + [pl.min(col).name.suffix("_session_min") for col in numerical_col]
        + [pl.max(col).name.suffix("_session_max") for col in numerical_col]
        # NOTE(review): the recorded preview shows null std values -- presumably
        # sessions with a single log row; confirm downstream null handling.
        + [pl.std(col).name.suffix("_session_std") for col in numerical_col]
    )

    return candidate_df.join(session_feature_df, on="session_id")

In [41]:
# Attach the per-session aggregate features to the candidates and preview the result.
candidate_session_df = concat_session_feature(cfg, mode, candidate_df)
candidate_session_df.head()

session_id,candidates,total_room_cnt_session_sum,wireless_lan_flg_session_sum,onsen_flg_session_sum,kd_stn_5min_session_sum,kd_bch_5min_session_sum,kd_slp_5min_session_sum,total_room_cnt_session_min,wireless_lan_flg_session_min,onsen_flg_session_min,kd_stn_5min_session_min,kd_bch_5min_session_min,kd_slp_5min_session_min,total_room_cnt_session_max,wireless_lan_flg_session_max,onsen_flg_session_max,kd_stn_5min_session_max,kd_bch_5min_session_max,kd_slp_5min_session_max,total_room_cnt_session_std,wireless_lan_flg_session_std,onsen_flg_session_std,kd_stn_5min_session_std,kd_bch_5min_session_std,kd_slp_5min_session_std
str,i32,f64,f64,i64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""3c4f8bbb00915a…",540,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,
"""3c4f8bbb00915a…",554,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,
"""3c4f8bbb00915a…",781,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,
"""3c4f8bbb00915a…",825,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,
"""3c4f8bbb00915a…",1092,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,


In [49]:
def concat_candidate_feature(cfg, mode: str, candidate_df: pl.DataFrame):
    """
    Attach the candidate's own yad features to every candidate row.

    The yad master data is downcast to 32-bit, reduced to ``yad_no`` plus the
    columns in the notebook-level list ``numerical_col``, and joined onto
    ``candidate_df`` by matching ``candidates`` against ``yad_no``.
    Nulls in the yad data are left as-is here. ``mode`` is not used.

    TODO: add the categorical_col information later.
    """
    yad_features = convert_to_32bit(load_yad_data(Path(cfg.dir.data_dir))).select(
        ["yad_no"] + numerical_col
    )
    return candidate_df.join(yad_features, left_on="candidates", right_on="yad_no")

In [53]:
# Add the candidate's own yad features on top of the session features and preview the result.
candidate_session_cand_df = concat_candidate_feature(cfg, mode, candidate_session_df)
candidate_session_cand_df.head()

session_id,candidates,total_room_cnt_session_sum,wireless_lan_flg_session_sum,onsen_flg_session_sum,kd_stn_5min_session_sum,kd_bch_5min_session_sum,kd_slp_5min_session_sum,total_room_cnt_session_min,wireless_lan_flg_session_min,onsen_flg_session_min,kd_stn_5min_session_min,kd_bch_5min_session_min,kd_slp_5min_session_min,total_room_cnt_session_max,wireless_lan_flg_session_max,onsen_flg_session_max,kd_stn_5min_session_max,kd_bch_5min_session_max,kd_slp_5min_session_max,total_room_cnt_session_std,wireless_lan_flg_session_std,onsen_flg_session_std,kd_stn_5min_session_std,kd_bch_5min_session_std,kd_slp_5min_session_std,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min
str,i32,f64,f64,i64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f32,i32,f32,f32,f32
"""3c4f8bbb00915a…",540,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,,103.0,1.0,0,1.0,,
"""3c4f8bbb00915a…",554,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,,134.0,1.0,0,,,
"""3c4f8bbb00915a…",781,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,,228.0,1.0,0,1.0,,
"""3c4f8bbb00915a…",825,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,,203.0,1.0,0,1.0,,
"""3c4f8bbb00915a…",1092,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,493.0,1.0,0,1.0,0.0,0.0,,,,,,,296.0,1.0,0,1.0,,


In [None]:
def concat_session_candidate_feature(cfg, mode: str, candidate_df: pl.DataFrame):
    """
    Placeholder, not implemented yet: currently does nothing and returns None.

    Intended to extract features describing the relationship between a session
    and a candidate. Examples: how many times (or what share of the time) the
    candidate appears in the session, and how many times (or what share)
    items from the same region as the candidate were viewed.
    """
    return None

session_id,fold
str,i64
"""000007603d533d…",2
"""0000ca043ed437…",2
"""0000d4835cf113…",0
"""0000fcda1ae1b2…",4
"""000104bdffaaad…",3
"""00011afe25c343…",1
"""000125c737df18…",0
"""0001763050a10b…",1
"""000178c4d4d567…",1
"""0001e6a407a85d…",1
