## 後半にアノテーションミスがあるケースを考慮した後処理

In [1]:
import sys, os
sys.path.append(os.pardir)

In [13]:
from pathlib import Path
import numpy as np
import polars as pl
import os
from hydra import initialize, compose
from tqdm.auto import tqdm
with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("cv_train", overrides=["exp_name=exp054_zero_periodicity"])

In [4]:
from src.utils.metrics import event_detection_ap
from src.utils.periodicity import get_periodicity_dict
from src.utils.common import trace
periodicity_dict = get_periodicity_dict(cfg)



In [5]:
train_df = pl.read_parquet(Path(cfg.dir.data_dir) / "train_series.parquet")
train_df = train_df.with_columns(
            pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
        )

In [6]:
event_df = pl.read_csv(Path(cfg.dir.data_dir) / "train_events.csv").drop_nulls()
event_df = event_df.with_columns(
    pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
)

In [7]:
## pred1
pred1_df = pl.read_parquet("valid_preds.parquet")
pred1_df.describe()

describe,row_id,prediction_onset,prediction_wakeup
str,f64,f64,f64
"""count""",127946340.0,127946340.0,127946340.0
"""null_count""",0.0,0.0,0.0
"""mean""",63973000.0,0.001201,0.001185
"""std""",36935000.0,0.017525,0.017998
"""min""",0.0,7.0448e-22,1.982e-18
"""25%""",31986585.0,4.2278e-11,1.0695e-08
"""50%""",63973170.0,1.0348e-07,2.5653e-07
"""75%""",95959755.0,9e-06,4e-06
"""max""",127946339.0,0.740916,0.765315


In [8]:
# 結合
pred_all_df = pl.concat([train_df, pred1_df], how="horizontal")
pred_all_df.head()

series_id,step,timestamp,anglez,enmo,row_id,prediction_onset,prediction_wakeup
str,u32,"datetime[μs, UTC]",f32,f32,i64,f32,f32
"""038441c925bb""",0,2018-08-14 19:30:00 UTC,2.6367,0.0217,0,3e-06,0.001149
"""038441c925bb""",1,2018-08-14 19:30:05 UTC,2.6368,0.0215,1,4e-06,0.001166
"""038441c925bb""",2,2018-08-14 19:30:10 UTC,2.637,0.0216,2,6.6307e-07,0.000871
"""038441c925bb""",3,2018-08-14 19:30:15 UTC,2.6368,0.0213,3,2e-06,0.001289
"""038441c925bb""",4,2018-08-14 19:30:20 UTC,2.6368,0.0215,4,1.9404e-07,0.001127


In [25]:
from scipy.signal import find_peaks

def make_submission(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    daily_score_offset: float = 1.0,
    prefix:str ="prediction"
) -> pl.DataFrame:

    event_dfs = []

    for series_id, series_df in tqdm(preds_df.group_by("series_id"), desc="find peaks", leave=False, total=len(preds_df["series_id"].unique())):
        for event in ["onset", "wakeup"]:
            event_preds = series_df[f"{prefix}_{event}"].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]
            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({f"{prefix}_{event}": "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )
    submission_df = (
        pl.concat(event_dfs)
        .with_columns(pl.col("timestamp").dt.offset_by("2h").dt.date().alias("date"))
        .with_columns(
            pl.col("score") / (pl.col("score").sum().over(["series_id", "event", "date"]) + daily_score_offset)
        )
        .sort(["series_id", "step"])
        .with_columns(pl.arange(0, pl.count()).alias("row_id"))
        .select(["row_id", "series_id", "step", "event", "score"])
    )
    return submission_df

In [16]:
with trace('make submission'):
    sub_df1 = make_submission(
        pred_all_df,
        periodicity_dict= periodicity_dict,
        height = 0.001,
        distance = 107,
        daily_score_offset = 1.0,
    ) 
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df1.to_pandas(),
)
score

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031793680267056

In [33]:
for offset in range(-5, 6, 1):
    print(offset)
    with trace('make submission'):
        sub_df1 = make_submission(
            pred_all_df,
            periodicity_dict= periodicity_dict,
            height = 0.001,
            distance = 107,
            daily_score_offset = 1.0,
            offset_by=f"{offset}h"
        ) 
    score = event_detection_ap(
        event_df.to_pandas(),
        sub_df1.to_pandas(),
    )
    print(score)

-5


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):12.1sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.801138574549938
-4


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.6sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.802156173353346
-3


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.5sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.798230255024615
-2


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.8sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.7990172158116352
-1


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8005801991963586
0


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8024222952712678
1


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.4GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8030249691675242
2


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.8sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031793680267056
3


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):12.9sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031269658553184
4


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.9sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031373386457663
5


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.5sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031147920261006


In [30]:
for offset in range(0, 24):
    print(offset)
    with trace('make submission'):
        sub_df1 = make_submission(
            pred_all_df,
            periodicity_dict= periodicity_dict,
            height = 0.001,
            distance = 107,
            daily_score_offset = 1.0,
            offset_by=f"{offset}h"
        ) 
    score = event_detection_ap(
        event_df.to_pandas(),
        sub_df1.to_pandas(),
    )
    print(score)
    print()

0


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.4GB):11.8sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8024222952712678
4


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.9sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031373386457663
8


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.4GB(+0.5GB):11.4sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8029099078629006
12


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.9GB(+0.5GB):11.5sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.7989729262264974
16


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):12.8sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8020826682398392
20


find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.4GB(+0.5GB):12.4sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.802156173353346


## 12の倍数のときに一個だけずらす

In [73]:
from scipy.signal import find_peaks

def make_submissionV3(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    daily_score_offset: float = 1.0,
    prefix:str ="prediction"
) -> pl.DataFrame:

    event_dfs = []

    for series_id, series_df in tqdm(preds_df.group_by("series_id"), desc="find peaks", leave=False, total=len(preds_df["series_id"].unique())):
        for event in ["onset", "wakeup"]:
            event_preds = series_df[f"{prefix}_{event}"].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]
            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({f"{prefix}_{event}": "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )
    submission_df = (
        pl.concat(event_dfs)
        .with_columns(pl.col("timestamp").dt.offset_by("2h").dt.date().alias("date"))
        .with_columns(
            pl.col("score") / (pl.col("score").sum().over(["series_id", "event", "date"]) + daily_score_offset)
        )
        .with_columns(pl.when(pl.col("step")%12==0).then(pl.col("step")+1).otherwise(pl.col("step")).alias('step'))
        .sort(["series_id", "step"])
        .with_columns(pl.arange(0, pl.count()).alias("row_id"))
        .select(["row_id", "series_id", "step", "event", "score"])
    )
    return submission_df

In [74]:
with trace('make submission'):
    sub_df1 = make_submissionV3(
        pred_all_df,
        periodicity_dict= periodicity_dict,
        height = 0.001,
        distance = 107,
        daily_score_offset = 1.0,
    ) 
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df1.to_pandas(),
)
score

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[10.0GB(+0.5GB):11.6sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031793680267056

後の日付のスコアを下げる

In [59]:
from scipy.signal import find_peaks

def make_submissionv2(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    daily_score_offset: float = 1.0,
    prefix:str ="prediction",
    date_rate = 1
) -> pl.DataFrame:

    event_dfs = []

    for series_id, series_df in tqdm(preds_df.group_by("series_id"), desc="find peaks", leave=False, total=len(preds_df["series_id"].unique())):
        for event in ["onset", "wakeup"]:
            event_preds = series_df[f"{prefix}_{event}"].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]
            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({f"{prefix}_{event}": "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )
    
    # date 1日ごとにスコアを減衰させる

    submission_df = (
        pl.concat(event_dfs)
        .with_columns(pl.col("timestamp").dt.offset_by("2h").dt.date().alias("date"))
        .with_columns(pl.col("date").min().over("series_id").alias("min_date"),
                     pl.col("date").max().over("series_id").alias("max_date")
                     )
        .with_columns(
            pl.col("score") / (pl.col("score").sum().over(["series_id", "event", "date"]) + daily_score_offset)
        )
        .with_columns(pl.col("score") *
                      (1 - (
                          (1 - pl.lit(date_rate)) * (((pl.col('date')-pl.col("min_date")).dt.days() / ((pl.col('max_date')-pl.col("min_date")).dt.days() + 1.0)))
                      )
                     )
                     )
        .sort(["series_id", "step"])
        .with_columns(pl.arange(0, pl.count()).alias("row_id"))
        .select(["row_id", "series_id", "step", "event", "score"])
    )
    return submission_df

In [60]:
with trace('make submission'):
    sub_df1 = make_submissionv2(
        pred_all_df,
        periodicity_dict= periodicity_dict,
        height = 0.001,
        distance = 107,
        daily_score_offset = 1.0,
        date_rate = 1
    ) 
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df1.to_pandas(),
)
score

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8031793680267056

In [62]:
for date_rate in [1.0, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7]:
    
    with trace('make submission'):
        sub_df1 = make_submissionv2(
            pred_all_df,
            periodicity_dict= periodicity_dict,
            height = 0.001,
            distance = 107,
            daily_score_offset = 1.0,
            date_rate = date_rate
        ) 
    score = event_detection_ap(
        event_df.to_pandas(),
        sub_df1.to_pandas(),
    )
    print(date_rate,score)
    print()

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.8sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

1.0 0.8031793680267056



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):12.2sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.95 0.8043326089030383



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.9sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.9 0.8046021252760622



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.5sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.85 0.8040990530324634



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.4GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.8 0.8031464015898544



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[10.0GB(+0.5GB):11.8sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.75 0.8018956228955505



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.7 0.8003138751932118



In [66]:
from scipy.signal import find_peaks

def make_submissionv2(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    daily_score_offset: float = 1.0,
    prefix:str ="prediction",
    one_date_diff_rate = 0
) -> pl.DataFrame:

    event_dfs = []

    for series_id, series_df in tqdm(preds_df.group_by("series_id"), desc="find peaks", leave=False, total=len(preds_df["series_id"].unique())):
        for event in ["onset", "wakeup"]:
            event_preds = series_df[f"{prefix}_{event}"].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]
            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({f"{prefix}_{event}": "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )
    
    # date 1日ごとにスコアを減衰させる

    submission_df = (
        pl.concat(event_dfs)
        .with_columns(pl.col("timestamp").dt.offset_by("2h").dt.date().alias("date"))
        .with_columns(pl.col("date").min().over("series_id").alias("min_date"),
                     pl.col("date").max().over("series_id").alias("max_date")
                     )
        .with_columns(
            pl.col("score") / (pl.col("score").sum().over(["series_id", "event", "date"]) + daily_score_offset)
        )
        .with_columns(pl.col("score") * 
                      (1 - (
                          (one_date_diff_rate) * ((pl.col('date')-pl.col("min_date")).dt.days())
                      )
                     )
                     )
        .sort(["series_id", "step"])
        .with_columns(pl.arange(0, pl.count()).alias("row_id"))
        .select(["row_id", "series_id", "step", "event", "score"])
    )
    return submission_df

In [67]:
for one_date_diff_rate in [0, 0.001, 0.002, 0.005, 0.01]:
    
    with trace('make submission'):
        sub_df1 = make_submissionv2(
            pred_all_df,
            periodicity_dict= periodicity_dict,
            height = 0.001,
            distance = 107,
            daily_score_offset = 1.0,
            one_date_diff_rate = one_date_diff_rate
        ) 
    score = event_detection_ap(
        event_df.to_pandas(),
        sub_df1.to_pandas(),
    )
    print(date_rate,score)
    print()

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.4GB):12.1sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0 0.8031793680267056



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):12.2sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0 0.8039205304200319



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0 0.8043584890072324



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.7sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0 0.803949635086096



find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[9.5GB(+0.5GB):11.6sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0 0.8005937334779489

