In [2]:
%cd ..

/kaggle/working


In [20]:
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="../generate_datasets/002_add_features"):
    cfg = compose(config_name="config.yaml", overrides=["debug=True"])
    print(OmegaConf.to_yaml(cfg))

debug: true
seed: 7
dir:
  data_dir: /kaggle/working/input/atmaCup16_Dataset
  output_dir: /kaggle/working/output
  exp_dir: /kaggle/working/output/exp
  cand_unsupervised_dir: /kaggle/working/output/cand_unsupervised
  cand_supervised_dir: /kaggle/working/output/cand_supervised
  datasets_dir: /kaggle/working/output/datasets
exp:
  fold_path: /kaggle/working/output/datasets/make_cv/base/train_fold.parquet
  candidate_info_list:
  - name: transition_prob/base
    max_num_candidates: 100
    dir: /kaggle/working/output/cand_unsupervised/transition_prob/base
  - name: ranking_location/sml_cd
    max_num_candidates: 50
    dir: /kaggle/working/output/cand_unsupervised/ranking_location/sml_cd
  - name: ranking_location/lrg_cd
    max_num_candidates: 50
    dir: /kaggle/working/output/cand_unsupervised/ranking_location/lrg_cd
  transition_prob_path: /kaggle/working/output/cand_unsupervised/transition_prob/base/yad2yad_feature.parquet



In [38]:
import os
import sys
from pathlib import Path

import hydra
import polars as pl
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf

from utils.data import convert_to_32bit
from utils.load import load_label_data, load_log_data, load_yad_data

numerical_cols = [
    "total_room_cnt",
    "wireless_lan_flg",
    "onsen_flg",
    "kd_stn_5min",
    "kd_bch_5min",
    "kd_slp_5min",
]

categorical_cols = [
    "yad_type",
    "wid_cd",
    "ken_cd",
    "lrg_cd",
    "sml_cd",
]

from sklearn.preprocessing import OrdinalEncoder

yad_df = load_yad_data(Path(cfg.dir.data_dir))
ordinal_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
ordinal_encoder.fit(yad_df[categorical_cols].to_numpy())

In [13]:
def load_and_union_candidates(cfg, mode: str):
    # logデータのsession中のyad_noを候補に加える
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    df = log_df.group_by("session_id").agg(pl.col("yad_no").alias("candidates"))
    dfs = [df]
    for candidate_info in cfg.exp.candidate_info_list:
        df = pl.read_parquet(Path(candidate_info["dir"]) / f"{mode}_candidate.parquet")
        df = df.with_columns(
            pl.col("candidates")
            .list.head(candidate_info["max_num_candidates"])
            .alias("candidates")
        ).filter(pl.col("candidates").list.len() > 0)

        dfs.append(df)
    df = pl.concat(dfs)
    df = (
        df.group_by("session_id")
        .agg(pl.col("candidates").flatten())
        .with_columns(pl.col("candidates").list.unique())
    ).select(["session_id", "candidates"])

    # リストを展開
    candidate_df = df.explode("candidates")

    # セッション最後のyad_noを除外
    last_df = (
        load_log_data(Path(cfg.dir.data_dir), mode)
        .group_by("session_id")
        .agg(pl.col("yad_no").last().alias("candidates"))
        .with_columns(pl.lit(True).alias("last"))
        .sort(by="session_id")
    )
    candidate_df = (
        candidate_df.join(last_df, on=["session_id", "candidates"], how="left")
        .filter(pl.col("last").is_null())
        .drop("last")
    )
    return candidate_df


def concat_label_fold(cfg, mode: str, candidate_df):
    """
    train に対して original, label, fold を付与する
    validationのスコア計算時にはoriginalを外して計算を行う
    """
    if mode == "train":
        candidate_df = (
            pl.concat(
                [
                    candidate_df.with_columns(
                        pl.lit(True).alias("original"), pl.lit(False).alias("label")
                    ),
                    load_label_data(Path(cfg.dir.data_dir))
                    .with_columns(
                        pl.col("yad_no").alias("candidates"),
                        pl.lit(False).alias("original"),
                        pl.lit(True).alias("label"),
                    )
                    .drop("yad_no"),
                ]
            )
            .group_by(["session_id", "candidates"])
            .agg(pl.sum("original"), pl.sum("label"))
        )
        fold_df = pl.read_parquet(cfg.exp.fold_path)
        candidate_df = candidate_df.join(fold_df, on="session_id")
    return candidate_df


def concat_session_feature(cfg, mode: str, candidate_df: pl.DataFrame):
    """
    # TODO: categorical_colsの情報もあとで追加する
    session_id, seq_no, yad_no に yado.csv を結合して集約し、セッションに関する特徴量を作成する
    """
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    yad_df = load_yad_data(Path(cfg.dir.data_dir))
    log_yad_df = log_df.join(yad_df.fill_null(0), on="yad_no")
    log_yad_df = log_yad_df.group_by(by="session_id").agg(
        [pl.sum(col).name.suffix("_session_sum") for col in numerical_cols]
        + [pl.min(col).name.suffix("_session_min") for col in numerical_cols]
        + [pl.max(col).name.suffix("_session_max") for col in numerical_cols]
        + [pl.std(col).name.suffix("_session_std") for col in numerical_cols]
    )

    candidate_df = candidate_df.join(log_yad_df, on="session_id")

    return candidate_df


def concat_candidate_feature(cfg, mode: str, candidate_df: pl.DataFrame):
    """
    # TODO: categorical_colsの情報もあとで追加する
    candidateの特徴量を抽出する
    """
    yad_df = load_yad_data(Path(cfg.dir.data_dir))
    candidate_yad_df = candidate_df.join(
        yad_df.select(["yad_no"] + numerical_cols + categorical_cols),
        left_on="candidates",
        right_on="yad_no",
    )

    return candidate_yad_df

In [14]:
mode = "train"
candidate_df = load_and_union_candidates(cfg, mode)
candidate2_df = concat_label_fold(cfg, mode, candidate_df)
candidate3_df = concat_candidate_feature(cfg, mode, candidate2_df)
candidate3_df.head()

session_id,candidates,original,label,fold,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,yad_type,wid_cd,ken_cd,lrg_cd,sml_cd
str,i64,u32,u32,i64,f64,f64,i64,f64,f64,f64,i64,str,str,str,str
"""700ab0108bc03f…",1229,1,0,3,418.0,1.0,0,1.0,,,0,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""f7b42d92528e7a…"
"""700ab0108bc03f…",2439,1,0,3,81.0,1.0,0,1.0,,,0,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""ab9480fd72a44d…"
"""700ab0108bc03f…",2974,1,0,3,130.0,1.0,0,1.0,,,0,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""f7b42d92528e7a…"
"""700ab0108bc03f…",3137,1,0,3,350.0,,0,,,,0,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""1d9f09b9e2bd43…"
"""700ab0108bc03f…",7169,1,0,3,222.0,1.0,0,1.0,,,0,"""46e33861f921c3…","""572d60f0f5212a…","""8a623b960557e8…","""f7b42d92528e7a…"


In [29]:
def concat_session_candidate_feature(cfg, mode: str, candidate_df: pl.DataFrame):
    """
    session中の特徴とcandidateの関係性を特徴量として抽出する
    例: session中におけるcandidateの出現回数(割合)、candidateと同一地域のものを見た回数(割合)
    """
    original_cols = candidate_df.columns

    # 同じ categorical の出現回数
    ## (series_id, categorical) でグループ化して、session_id ごとに出現回数を集計する
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    yad_df = load_yad_data(Path(cfg.dir.data_dir))
    log_yad_df = log_df.join(yad_df.fill_null(0), on="yad_no")
    for col in categorical_cols:
        tmp = (
            log_yad_df.group_by(by=["session_id", col])
            .agg(pl.count("session_id").alias(f"same_{col}_count"))
            .with_columns(
                pl.col(f"same_{col}_count").sum().over("session_id").alias("seq_sum")
            )
            .with_columns(
                (pl.col(f"same_{col}_count") / pl.col("seq_sum")).alias(
                    f"same_{col}_rate"
                )
            )
        )
        candidate_df = candidate_df.join(
            tmp.select(["session_id", col, f"same_{col}_count", f"same_{col}_rate"]),
            on=["session_id", col],
            how="left",
        )

    # transition probを追加
    yad2yad_prob = pl.read_parquet(cfg.exp.transition_prob_path)
    log_df = load_log_data(Path(cfg.dir.data_dir), mode)
    last_log_df = (
        log_df.group_by("session_id")
        .agg(pl.all().sort_by("seq_no").last())
        .sort(by="session_id")
        .with_columns(pl.col("yad_no").alias("from_yad_no"))
    ).select(["session_id", "from_yad_no"])
    last_log_prob_df = last_log_df.join(yad2yad_prob, on="from_yad_no")
    candidate_df = candidate_df.join(
        last_log_prob_df,
        left_on=["session_id", "candidates"],
        right_on=["session_id", "to_yad_no"],
    ).drop("from_yad_no")

    # 増えたカラムを出力
    new_cols = [col for col in candidate_df.columns if col not in original_cols]
    print(f"new_cols: {new_cols}")

    return candidate_df


candidate4_df = concat_session_candidate_feature(cfg, mode, candidate3_df)

new_cols: ['same_yad_type_count', 'same_yad_type_rate', 'same_wid_cd_count', 'same_wid_cd_rate', 'same_ken_cd_count', 'same_ken_cd_rate', 'same_lrg_cd_count', 'same_lrg_cd_rate', 'same_sml_cd_count', 'same_sml_cd_rate', 'transition_prob_transition_prob/base']


In [42]:
transformed = ordinal_encoder.transform(candidate4_df[categorical_cols].to_numpy())

candidate4_df.with_columns(
    [
        pl.Series(name=col, values=transformed[:, i])
        for i, col in enumerate(categorical_cols)
    ]
)

session_id,candidates,original,label,fold,total_room_cnt,wireless_lan_flg,onsen_flg,kd_stn_5min,kd_bch_5min,kd_slp_5min,yad_type,wid_cd,ken_cd,lrg_cd,sml_cd,same_yad_type_count,same_yad_type_rate,same_wid_cd_count,same_wid_cd_rate,same_ken_cd_count,same_ken_cd_rate,same_lrg_cd_count,same_lrg_cd_rate,same_sml_cd_count,same_sml_cd_rate,transition_prob_transition_prob/base
str,i64,u32,u32,i64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,u32,f64,u32,f64,u32,f64,u32,f64,u32,f64,f64
"""700ab0108bc03f…",2974,1,0,3,130.0,1.0,0,1.0,,,0.0,3.0,18.0,153.0,519.0,3,1.0,3,1.0,3,1.0,3,1.0,,,0.038462
"""700ab0108bc03f…",7561,1,0,3,216.0,1.0,0,,,,0.0,3.0,18.0,153.0,69.0,3,1.0,3,1.0,3,1.0,3,1.0,3,1.0,0.038462
"""700ab0108bc03f…",13549,1,0,3,706.0,1.0,0,1.0,,,0.0,3.0,5.0,97.0,66.0,3,1.0,3,1.0,,,,,,,0.038462
"""2180c48c8cd98a…",1268,1,0,2,159.0,1.0,0,,,,0.0,10.0,20.0,137.0,350.0,1,1.0,1,1.0,1,1.0,1,1.0,1,1.0,0.05
"""2180c48c8cd98a…",1868,1,0,2,170.0,1.0,0,,,,0.0,10.0,20.0,137.0,350.0,1,1.0,1,1.0,1,1.0,1,1.0,1,1.0,0.05
"""2180c48c8cd98a…",2179,1,0,2,252.0,1.0,0,1.0,,,0.0,10.0,20.0,137.0,350.0,1,1.0,1,1.0,1,1.0,1,1.0,1,1.0,0.45
"""2180c48c8cd98a…",5372,1,0,2,154.0,1.0,0,1.0,,,0.0,10.0,20.0,137.0,350.0,1,1.0,1,1.0,1,1.0,1,1.0,1,1.0,0.05
"""1b02208d5a56eb…",5937,1,0,2,110.0,1.0,0,1.0,,,0.0,8.0,38.0,45.0,389.0,2,1.0,2,1.0,2,1.0,2,1.0,2,1.0,0.058824
"""1b02208d5a56eb…",8157,1,0,2,156.0,1.0,0,1.0,,,0.0,8.0,38.0,45.0,389.0,2,1.0,2,1.0,2,1.0,2,1.0,2,1.0,0.029412
"""1b02208d5a56eb…",13229,1,0,2,105.0,1.0,0,,,,0.0,8.0,38.0,45.0,389.0,2,1.0,2,1.0,2,1.0,2,1.0,2,1.0,0.029412
