In [1]:
import sys, os
sys.path.append(os.pardir)

In [2]:
from pathlib import Path
import numpy as np
import polars as pl
import os
from hydra import initialize, compose
import plotly.express as px

with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("cv_train")

In [70]:
from src.utils.metrics import event_detection_ap
from src.utils.periodicity import get_periodicity_dict
from src.utils.post_process import make_submission
from src.utils.common import trace

periodicity_dict = get_periodicity_dict(cfg)

In [4]:
event_df = pl.read_csv(Path(cfg.dir.data_dir) / "train_events.csv").drop_nulls()
event_df = event_df.with_columns(
    pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
)

### 1段目モデルの予測値

In [21]:
sakami_path2col_dict = {
   "148_gru_scale_factor.parquet": "148_gru_scale_factor",
    "156_gru_transformer_residual.parquet" : "156_gru_transformer_residual",
}
kami_path2col_dict = {
    "../output/cv_inference/exp065_split_drop/single/train_pred.parquet": "exp065_split_drop",
}
weight_cols = {
    "148_gru_scale_factor": 0.42164168,
    "156_gru_transformer_residual": 0.48143932,
    "exp065_split_drop": 0.096919,
}
events = ["onset", "wakeup"]


In [15]:
train_df = pl.read_parquet(Path(cfg.dir.data_dir) / "train_series.parquet")
train_df = train_df.with_columns(
            pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
)
df_list = [train_df]
for path, name in sakami_path2col_dict.items():
    pred_df = pl.read_parquet(path)
    pred_df = pred_df.with_columns(
        [pl.col("prediction_"+event).cast(pl.Float32).alias(name+"_"+event) for event in events]
    ).select([name+"_"+event for event in events])
    df_list.append(pred_df)

for path, name in kami_path2col_dict.items():
    pred_df = pl.read_parquet(path)
    pred_df = pred_df.with_columns(
        [pl.col("pred_"+event).cast(pl.Float32).alias(name+"_"+event) for event in events]
    ).select([name+"_"+event for event in events])
    df_list.append(pred_df)

pred_all_df = pl.concat(df_list, how="horizontal")
pred_1st_df = pred_all_df.with_columns(
    [pl.sum_horizontal([
        pl.col(f"{name}_{event}")* weight for (name, weight) in weight_cols.items()]).alias(f"prediction_{event}")
    for event in events]
).select(["series_id", "step", "timestamp"] + [f"prediction_{event}" for event in events])

pred_1st_df.head()

series_id,step,timestamp,prediction_onset,prediction_wakeup
str,u32,"datetime[μs, UTC]",f32,f32
"""038441c925bb""",0,2018-08-14 19:30:00 UTC,0.001358,0.002242
"""038441c925bb""",1,2018-08-14 19:30:05 UTC,0.001213,0.001838
"""038441c925bb""",2,2018-08-14 19:30:10 UTC,0.000926,0.001025
"""038441c925bb""",3,2018-08-14 19:30:15 UTC,0.000912,0.000642
"""038441c925bb""",4,2018-08-14 19:30:20 UTC,0.001171,0.000688


### 2段目モデルの予測値

In [62]:
pred_2nd_df = (
    pl.read_parquet("./pred_onset.parquet")
    .rename({"label_pred": "stacking_prediction_onset"})
    .drop("label")
    .join(
        pl.read_parquet("./pred_wakeup.parquet")
        .rename({"label_pred": "stacking_prediction_wakeup"})
        .drop("label"),
        on=["series_id", "step"],
        how="left",
    )
)
pred_2nd_df = pred_2nd_df.with_columns(
    ((pl.col("step") - pl.col("step").shift(1)) != 12)
    .cast(pl.UInt64)
    .cumsum()
    .over("series_id")
    .fill_null(0)
    .alias("chunk_id")
).with_columns(pl.col('step').cast(pl.UInt32))
pred_2nd_df = pred_2nd_df.join(train_df, on=['series_id', 'step'], how='left')

pred_2nd_df.head()

series_id,step,stacking_prediction_onset,stacking_prediction_wakeup,chunk_id,timestamp,anglez,enmo
str,u32,f64,f64,u64,"datetime[μs, UTC]",f32,f32
"""038441c925bb""",0,8e-06,1e-05,0,2018-08-14 19:30:00 UTC,2.6367,0.0217
"""038441c925bb""",12,8e-06,8e-06,0,2018-08-14 19:31:00 UTC,2.4129,0.0218
"""038441c925bb""",24,7e-06,3.1e-05,0,2018-08-14 19:32:00 UTC,30.002501,0.0082
"""038441c925bb""",36,9e-06,1.9e-05,0,2018-08-14 19:33:00 UTC,-79.968803,0.0136
"""038441c925bb""",48,6e-06,7e-06,0,2018-08-14 19:34:00 UTC,-80.014297,0.0141


## 2段目event detection + 1段目のfind peaks 

In [53]:
from src.utils.detect_peak import post_process_from_2nd

In [58]:
from scipy.ndimage import maximum_filter1d
from tqdm.auto import tqdm
from scipy.signal import find_peaks

def make_additional_event_after_2nd(
    preds_df: pl.DataFrame,
    sub_2nd_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    zero_range: int = 200,
    event2col: dict[str, str] = {"onset": "prediction_onset", "wakeup": "prediction_wakeup"},
) -> pl.DataFrame:
    events = ["onset", "wakeup"]
    event_dfs = []
    series2steps = {event:{} for event in events}
    for (series_id,event), event_series_df in sub_2nd_df.group_by(["series_id", 'event']):
        for event in events:
            series2steps[event]['series_id'] = event_series_df.get_column('step').to_numpy()

    for series_id, series_df in tqdm(
        preds_df.group_by("series_id"), desc="find peaks", leave=False, total=len(preds_df["series_id"].unique())
    ):
        for event in events:
            event_preds = series_df[event2col[event]].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]

            # 2nd のイベントから zero_range 分の確率を0にする
            event_steps = series2steps[event]['series_id']
            peak_flag = np.zeros(max(len(event_preds), event_steps.max()+1))
            peak_flag[event_steps] = 1.0
            event_preds *= 1-maximum_filter1d(peak_flag, size=zero_range)[:len(event_preds)]

            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({event2col[event]: "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )
    submission_df = (
        pl.concat(event_dfs).sort(["series_id", "step"]).with_columns(pl.arange(0, pl.count()).alias("row_id"))
    )
    return submission_df.select(["row_id", "series_id", "step", "event", "score"])


In [86]:
def make_submission_additional(
    pred_1st_df, pred_2nd_df, height_2nd=0.001, additional_event_weight = 0.01, zero_range=200
):
    sub_2nd_df = post_process_from_2nd(
        pred_2nd_df,
        later_date_max_sub_rate=None,
        height = height_2nd,
    )

    additional_event = make_additional_event_after_2nd(
        pred_1st_df, sub_2nd_df, periodicity_dict, zero_range=zero_range
    )

    sub_2nd_df = sub_2nd_df.with_columns(
        pl.col('row_id').cast(pl.UInt32),
        pl.col('step').cast(pl.UInt32),
        pl.col('score').cast(pl.Float32),
    )
    additional_event = additional_event.with_columns(
        pl.col('row_id').cast(pl.UInt32),
        pl.col('step').cast(pl.UInt32),
        pl.col('score').cast(pl.Float32)*additional_event_weight,
    )

    submission_df = pl.concat([sub_2nd_df, additional_event])

    return submission_df, sub_2nd_df


In [85]:
for height in [0.0001, 0.001, 0.005, 0.01]:
    submission_df, sub_2nd_df = make_submission_additional(pred_1st_df, pred_2nd_df, height_2nd=height)

    score_first = event_detection_ap(
        event_df.to_pandas(),
        sub_2nd_df.to_pandas(),
    )    
    score = event_detection_ap(
        event_df.to_pandas(),
        submission_df.to_pandas()
    )
    print(f"2nd height {height}: {score_first:.6}({len(sub_2nd_df)}) → {score:.6}({len(submission_df)})")


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.0001: 0.833825(272239) → 0.833938(656688)


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.001: 0.833816(250027) → 0.833938(625715)


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.005: 0.833704(168174) → 0.833916(562137)


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.01: 0.833484(121523) → 0.83383(497211)


In [87]:
for height in [0.0001, 0.001, 0.005, 0.01]:
    submission_df, sub_2nd_df = make_submission_additional(
        pred_1st_df, pred_2nd_df, height_2nd=height, zero_range=100
    )

    score_first = event_detection_ap(
        event_df.to_pandas(),
        sub_2nd_df.to_pandas(),
    )    
    score = event_detection_ap(
        event_df.to_pandas(),
        submission_df.to_pandas()
    )
    print(f"2nd height {height}: {score_first:.6}({len(sub_2nd_df)}) → {score:.6}({len(submission_df)})")


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.0001: 0.833825(272239) → 0.833941(675560)


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.001: 0.833816(250027) → 0.833934(641685)


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.005: 0.833704(168174) → 0.833922(577854)


detect onset peaks:   0%|          | 0/277 [00:00<?, ?it/s]

detect wakeup peaks:   0%|          | 0/277 [00:00<?, ?it/s]

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

2nd height 0.01: 0.833484(121523) → 0.83384(519084)
