# １日ごとに後処理を行い順番を並び替える

In [1]:
import sys, os
# Put the parent directory on the import path — presumably so the project-level
# `src` package imported in a later cell resolves when the notebook runs from this folder.
sys.path.append(os.pardir)

In [2]:
from pathlib import Path
import numpy as np
import polars as pl
import os
from hydra import initialize, compose

# Compose the Hydra config "cv_train" from ../run/conf, overriding exp_name with exp013.
with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("cv_train", overrides=["exp_name=exp013"])

In [3]:
# Load the raw series and the labelled events, parsing their offset-aware timestamp strings.
timestamp_format = "%Y-%m-%dT%H:%M:%S%z"
data_dir = Path(cfg.dir.data_dir)

train_df = pl.read_parquet(data_dir / "train_series.parquet").with_columns(
    pl.col("timestamp").str.to_datetime(timestamp_format)
)
# Event rows containing any null value are discarded before the timestamps are parsed.
event_df = (
    pl.read_csv(data_dir / "train_events.csv")
    .drop_nulls()
    .with_columns(pl.col("timestamp").str.to_datetime(timestamp_format))
)

In [4]:
# Read the per-fold validation prediction CSVs of this experiment and stack them into one frame.
exp_dir = Path(cfg.dir.cv_model_dir) / cfg.exp_name / "cv"
oof_event_df_list = [
    pl.read_csv(exp_dir / f"val_pred_df_fold{fold_idx}.csv") for fold_idx in range(cfg.num_fold)
]
oof_event_df = pl.concat(oof_event_df_list)

In [5]:
# Cast `step` to UInt32 (presumably to match the dtype of `step` in the series frame), then preview.
oof_event_df = oof_event_df.with_columns(pl.col("step").cast(pl.UInt32))
oof_event_df.head()

row_id,series_id,step,event,score
i64,str,u32,str,f64
0,"""0402a003dae9""",3054,"""wakeup""",0.029968
1,"""0402a003dae9""",5321,"""onset""",0.028671
2,"""0402a003dae9""",5470,"""onset""",0.907227
3,"""0402a003dae9""",5741,"""onset""",0.024094
4,"""0402a003dae9""",5934,"""onset""",0.493652


In [6]:
# Baseline: score the raw out-of-fold predictions against the labelled events
# with `event_detection_ap`, before any post-processing is applied.
from src.utils.metrics import event_detection_ap
score = event_detection_ap(
    event_df.to_pandas(),
    oof_event_df.to_pandas(),
)
score

0.7392876263644894

In [20]:
def post_process_event_score_normalize_by_day(
    event_df: pl.DataFrame,
    series_df: pl.DataFrame,
    day_start_hour_dict: "dict[str, int] | None" = None,
) -> pl.DataFrame:
    """Dampen event scores on days whose predictions carry a large total score.

    Predictions are bucketed per series, per event type and per "day", where a
    day starts at an event-specific hour. Within each bucket the scores are
    summed, the sum is floored at 1, and every score in the bucket is replaced
    by ``score ** (2 * floored_sum)``.

    Args:
        event_df: Predicted events with at least the columns ``series_id``,
            ``step``, ``event`` and ``score``.
        series_df: Series data providing ``series_id``, ``step`` and a datetime
            ``timestamp`` column, used to look up the time of each predicted step.
        day_start_hour_dict: Maps an event name to the hour at which its day
            starts. Defaults to ``{"onset": 12, "wakeup": 20}``. The hour is
            applied to the ``timestamp`` values as stored; no timezone
            conversion happens here.

    Returns:
        The rescored predictions. Only event types listed in
        ``day_start_hour_dict`` are kept, rows whose ``(series_id, step)`` has
        no match in ``series_df`` are dropped by the inner join, and the joined
        ``timestamp`` column is retained in the output.
    """
    if day_start_hour_dict is None:
        day_start_hour_dict = {"onset": 12, "wakeup": 20}

    # Cast the join key before joining — assumes `step` in series_df is UInt32; TODO confirm.
    event_df = event_df.with_columns(pl.col("step").cast(pl.UInt32))

    # Attach the timestamp of every predicted step (inner join on series_id + step).
    event_df = event_df.join(
        series_df.select(["series_id", "step", "timestamp"]),
        on=["series_id", "step"],
    )

    # Total score per (series, day). A missing sum and any sum below 1 are both treated as 1,
    # so the exponent below is never smaller than 2.
    daily_score_sum = pl.col("score").sum().over(["series_id", "date"]).fill_null(1.0)
    floored_score_sum = pl.when(daily_score_sum < 1).then(1.0).otherwise(daily_score_sum)

    # Dividing by the daily sum penalises too strongly, so the score is raised to a power
    # that grows with the daily sum instead. Recorded OOF score: 0.7395099476450617.
    # Alternatives that were tried, with the author's recorded notes (labels kept verbatim;
    # which knob each label refers to is presumed, not confirmed):
    #   * score / score_sum ** p
    #       p=0.5: 0.7361688492242824, p=1.0: 0.7177877757639883, p=2.0: 0.6715313626079408
    #   * keep score when score_sum > 3.0, otherwise multiply it by a factor
    #       0.8: 0.7181488534323635, 0.9: 0.7325261406573275
    #   * keep score when score_sum > 4.0 and score > 0.7, otherwise multiply it by 0.95
    #       0.5: 0.738580529726147, 0.7: 0.7386160510811913, 0.9→0.95: 0.7392161845499938
    dampened_score = pl.col("score").pow(floored_score_sum * 2).alias("score")

    result_event_df_list = []
    for event, day_start_hour in day_start_hour_dict.items():
        one_event_df = (
            event_df.filter(pl.col("event") == event)
            # Shift back by the day-start hour so the calendar date of the shifted
            # timestamp identifies the event-specific "day".
            .with_columns(
                pl.col("timestamp").dt.offset_by(f"-{day_start_hour}h").dt.date().alias("date")
            )
            .with_columns(dampened_score)
            .drop("date")
        )
        result_event_df_list.append(one_event_df)

    # Stack the per-event results back into a single frame.
    return pl.concat(result_event_df_list)


In [21]:

# Apply the per-day post-processing to the out-of-fold predictions and re-score them
# with the same metric and labels as the baseline above.
sub_df = post_process_event_score_normalize_by_day(oof_event_df, train_df)
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df.to_pandas(),
)
score

0.7395099476450617

In [None]:
# NOTE(review): `oof_event_df_time` is not defined in any cell shown here and this cell has no
# execution count — running it as-is would presumably raise NameError. Confirm or remove.
oof_event_df_time.head()