In [1]:
import os
import sys

# Make the parent directory importable so the project-level `src` package resolves.
sys.path.append(os.pardir)

In [2]:
import os
from pathlib import Path

import numpy as np
import polars as pl
from hydra import compose, initialize

# Compose the "cv_train" hydra config with the experiment name overridden;
# the resulting cfg is what the rest of the notebook reads paths from.
with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("cv_train", overrides=["exp_name=exp078_lstm"])

In [3]:
from src.utils.metrics import event_detection_ap
from src.utils.periodicity import get_periodicity_dict
from src.utils.common import trace
# Per-series lookup built from the config. make_submission indexes it by
# series_id and multiplies predictions by (1 - value) for each step.
# NOTE(review): presumably flags periodic (non-wear) steps — confirm in src.utils.periodicity.
periodicity_dict = get_periodicity_dict(cfg)



In [4]:
# Load the raw series and parse the timestamp strings (which carry a UTC offset)
# into datetimes in a single chain.
train_series_path = Path(cfg.dir.data_dir) / "train_series.parquet"
train_df = pl.read_parquet(train_series_path).with_columns(
    pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
)

In [5]:
# Ground-truth events: rows containing nulls are dropped before the timestamp
# strings are parsed into datetimes.
events_csv_path = Path(cfg.dir.data_dir) / "train_events.csv"
event_df = (
    pl.read_csv(events_csv_path)
    .drop_nulls()
    .with_columns(pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z"))
)

In [6]:
## pred1
# First prediction set, read via a path relative to the working directory.
# Its score columns use the "prediction" prefix (prediction_onset / prediction_wakeup).
pred1_df = pl.read_parquet("valid_preds.parquet")
pred1_df.describe()

describe,row_id,prediction_onset,prediction_wakeup
str,f64,f64,f64
"""count""",127946340.0,127946340.0,127946340.0
"""null_count""",0.0,0.0,0.0
"""mean""",63973000.0,0.001201,0.001185
"""std""",36935000.0,0.017525,0.017998
"""min""",0.0,7.0448e-22,1.982e-18
"""25%""",31986585.0,4.2278e-11,1.0695e-08
"""50%""",63973170.0,1.0348e-07,2.5653e-07
"""75%""",95959755.0,9e-06,4e-06
"""max""",127946339.0,0.740916,0.765315


In [9]:
## pred2
# Second prediction set (score columns use the "pred" prefix); every score
# column is cast to Float32 right after loading.
pred2_score_cols = ["pred_sleep", "pred_onset", "pred_wakeup"]
pred2_df = pl.read_parquet(
    "../output/cv_inference/exp078_lstm/single/train_pred.parquet"
).with_columns([pl.col(name).cast(pl.Float32) for name in pred2_score_cols])
pred2_df.describe()

describe,pred_sleep,pred_onset,pred_wakeup
str,f64,f64,f64
"""count""",127946340.0,127946340.0,127946340.0
"""null_count""",0.0,0.0,0.0
"""mean""",0.239709,0.005477,0.005221
"""std""",0.396118,0.040984,0.040704
"""min""",1.3e-05,5.9605e-08,1.7881e-07
"""25%""",0.001423,0.000173,0.000195
"""50%""",0.007534,0.000601,0.000581
"""75%""",0.329834,0.00176,0.001569
"""max""",0.99707,0.945801,0.940918


In [10]:
# Combine the raw series with both prediction sets.
# A horizontal concat places the frames side by side and pairs rows purely by
# position, so the three frames must have the same number of rows; fail loudly
# here instead of producing silently misaligned predictions.
# NOTE(review): row *order* is also assumed identical across the frames — confirm.
frame_heights = {
    "train_df": train_df.height,
    "pred1_df": pred1_df.height,
    "pred2_df": pred2_df.height,
}
assert len(set(frame_heights.values())) == 1, f"row counts differ: {frame_heights}"

pred_all_df = pl.concat([train_df, pred1_df, pred2_df], how="horizontal")
pred_all_df.head()

series_id,step,timestamp,anglez,enmo,row_id,prediction_onset,prediction_wakeup,pred_sleep,pred_onset,pred_wakeup
str,u32,"datetime[μs, UTC]",f32,f32,i64,f32,f32,f32,f32,f32
"""038441c925bb""",0,2018-08-14 19:30:00 UTC,2.6367,0.0217,0,3e-06,0.001149,0.0336,0.004398,0.011375
"""038441c925bb""",1,2018-08-14 19:30:05 UTC,2.6368,0.0215,1,4e-06,0.001166,0.028183,0.003567,0.009117
"""038441c925bb""",2,2018-08-14 19:30:10 UTC,2.637,0.0216,2,6.6307e-07,0.000871,0.017334,0.001902,0.004608
"""038441c925bb""",3,2018-08-14 19:30:15 UTC,2.6368,0.0213,3,2e-06,0.001289,0.012047,0.001182,0.002462
"""038441c925bb""",4,2018-08-14 19:30:20 UTC,2.6368,0.0215,4,1.9404e-07,0.001127,0.012299,0.001407,0.002687


In [23]:
from scipy.signal import find_peaks
from tqdm.auto import tqdm

def make_submission(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    day_norm: bool = False,
    daily_score_offset: float = 1.0,
    pred_prefix: str = "prediction",
    late_date_rate: float | None = None,
) -> pl.DataFrame:
    """Turn per-step event scores into a peak-based submission table.

    For every series and for each of the events "onset" and "wakeup", the
    column ``f"{pred_prefix}_{event}"`` is multiplied by
    ``1 - periodicity_dict[series_id]`` and passed to ``scipy.signal.find_peaks``.
    The rows at the detected peaks are kept; their ``score`` is the value of
    the original (unmasked) prediction column.

    Args:
        preds_df: Frame with ``series_id``, ``step``, ``timestamp`` and the
            ``{pred_prefix}_onset`` / ``{pred_prefix}_wakeup`` columns.
        periodicity_dict: Maps ``series_id`` to an array that is truncated to
            the series length and used as ``1 - array`` to mask predictions.
        height: Minimum masked score for a peak (forwarded to ``find_peaks``).
        distance: Minimum spacing between peaks (forwarded to ``find_peaks``).
        day_norm: If True, divide each score by the sum of scores sharing the
            same series, event and date, plus ``daily_score_offset``.
        daily_score_offset: Constant added to the daily sum when ``day_norm``
            is enabled; unused otherwise.
        pred_prefix: Prefix of the prediction columns to read.
        late_date_rate: If given, scale scores by
            ``1 - (1 - late_date_rate) * elapsed_days / (span_days + 1)``,
            where ``elapsed_days`` counts from the first date of the series
            and ``span_days`` is the last date minus the first date. Scores on
            the first date are therefore left unchanged.

    Returns:
        Frame with columns ``row_id``, ``series_id``, ``step``, ``event`` and
        ``score``. ``row_id`` numbers the rows after sorting by
        ``series_id`` and ``step``.
    """
    n_series = preds_df["series_id"].n_unique()
    event_dfs = []

    for series_id, series_df in tqdm(
        preds_df.group_by("series_id"), desc="find peaks", leave=False, total=n_series
    ):
        # Depending on the polars version, iterating a group_by yields the key
        # either as a plain value or as a 1-tuple; normalise so the dict lookup
        # below works in both cases.
        if isinstance(series_id, tuple):
            series_id = series_id[0]

        for event in ["onset", "wakeup"]:
            score_col = f"{pred_prefix}_{event}"
            # Copy so the in-place masking never writes into the frame's data.
            event_preds = series_df[score_col].to_numpy().copy()
            event_preds *= 1 - periodicity_dict[series_id][: len(event_preds)]
            # find_peaks returns positions within event_preds.
            steps = find_peaks(event_preds, height=height, distance=distance)[0]
            # NOTE(review): filtering on `step` assumes step equals the row
            # position inside the series (0-based, contiguous) — confirm.
            event_dfs.append(
                series_df.filter(pl.col("step").is_in(steps))
                .with_columns(pl.lit(event).alias("event"))
                .rename({score_col: "score"})
                .select(["series_id", "step", "timestamp", "event", "score"])
            )

    submission_df = (
        pl.concat(event_dfs)
        .sort(["series_id", "step"])
        .with_columns(pl.arange(0, pl.count()).alias("row_id"))
    )

    # Both optional adjustments bucket rows by the date of (timestamp + 2h).
    date_expr = pl.col("timestamp").dt.offset_by("2h").dt.date().alias("date")

    if day_norm:
        submission_df = submission_df.with_columns(date_expr).with_columns(
            pl.col("score")
            / (pl.col("score").sum().over(["series_id", "event", "date"]) + daily_score_offset)
        )

    if late_date_rate is not None:
        # NOTE(review): newer polars releases appear to rename dt.days() to
        # dt.total_days() — confirm against the pinned polars version.
        submission_df = (
            submission_df.with_columns(date_expr)
            .with_columns(
                pl.col("date").min().over("series_id").alias("min_date"),
                pl.col("date").max().over("series_id").alias("max_date"),
            )
            .with_columns(
                pl.col("score")
                * (
                    1
                    - (
                        (1 - pl.lit(late_date_rate))
                        * (
                            (pl.col("date") - pl.col("min_date")).dt.days()
                            / ((pl.col("max_date") - pl.col("min_date")).dt.days() + 1.0)
                        )
                    )
                )
            )
        )

    return submission_df.select(["row_id", "series_id", "step", "event", "score"])

In [25]:
with trace('make submission'):
    sub_df2 = make_submission(
        pred_all_df,
        periodicity_dict=periodicity_dict,
        height=0.001,
        distance=101,
        day_norm=False,
        daily_score_offset=1.0,
        # The make_submission definition that accepts day_norm names this
        # parameter pred_prefix; passing prefix= raises TypeError on a fresh kernel.
        pred_prefix="pred",
    )
# Score the extracted events against the ground-truth events.
score = event_detection_ap(
    event_df.to_pandas(),
    sub_df2.to_pandas(),
)
score

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[11.4GB(+0.4GB):12.4sec] make submission 


Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.788422374022332

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.788422374022332

In [16]:
from scipy.signal import find_peaks
from tqdm.auto import tqdm

def make_submission(
    preds_df: pl.DataFrame,
    periodicity_dict: dict[str, np.ndarray],
    height: float = 0.001,
    distance: int = 100,
    daily_score_offset: float = 1.0,
    prefix:str ="prediction"
) -> pl.DataFrame:
    """Extract onset/wakeup peaks per series and return them as a submission frame.

    The ``{prefix}_onset`` and ``{prefix}_wakeup`` columns are multiplied by
    ``1 - periodicity_dict[series_id]`` and searched with ``find_peaks`` using
    ``height`` and ``distance``. Rows whose ``step`` is among the detected
    peak indices are kept, with the original prediction value as ``score``.

    ``daily_score_offset`` is accepted but not used by this implementation.

    Returns:
        Frame with columns ``row_id``, ``series_id``, ``step``, ``event`` and
        ``score``, sorted by ``series_id`` and ``step``.
    """
    kept_columns = ["series_id", "step", "timestamp", "event", "score"]
    series_groups = tqdm(
        preds_df.group_by("series_id"),
        desc="find peaks",
        leave=False,
        total=len(preds_df["series_id"].unique()),
    )

    peak_frames = []
    for sid, frame in series_groups:
        for kind in ("onset", "wakeup"):
            score_col = f"{prefix}_{kind}"
            # Work on a copy so the in-place masking leaves the frame untouched.
            masked = frame[score_col].to_numpy().copy()
            masked *= 1 - periodicity_dict[sid][: len(masked)]
            peak_idx, _ = find_peaks(masked, height=height, distance=distance)

            picked = frame.filter(pl.col("step").is_in(peak_idx))
            picked = picked.with_columns(pl.lit(kind).alias("event"))
            picked = picked.rename({score_col: "score"})
            peak_frames.append(picked.select(kept_columns))

    merged = pl.concat(peak_frames).sort(["series_id", "step"])
    numbered = merged.with_columns(pl.arange(0, pl.count()).alias("row_id"))
    return numbered.select(["row_id", "series_id", "step", "event", "score"])

In [17]:
# Peak-extraction settings for the "pred"-prefixed prediction columns.
peak_params = dict(height=0.001, distance=101, daily_score_offset=1.0, prefix="pred")

with trace('make submission'):
    sub_df2 = make_submission(pred_all_df, periodicity_dict=periodicity_dict, **peak_params)

find peaks:   0%|          | 0/277 [00:00<?, ?it/s]

[11.4GB(+0.5GB):11.5sec] make submission 


In [18]:
# The metric is called with pandas frames: ground truth first, submission second.
events_pd = event_df.to_pandas()
submission_pd = sub_df2.to_pandas()
score = event_detection_ap(events_pd, submission_pd)
score

Matching detections to ground truth events:   0%|          | 0/538 [00:00<?, ?it/s]

0.788422374022332

In [15]:
import numpy as np
import torch

In [30]:
batch_size, height, n_timesteps = 2, 3, 16
# Draw from a locally seeded generator so the demo tensor is identical on every
# run, without touching torch's global RNG state.
x = torch.randn(
    batch_size, height, n_timesteps, generator=torch.Generator().manual_seed(0)
)

In [31]:
# Print the shape first, then let the notebook display the tensor values.
print(x.shape)
x

torch.Size([2, 3, 16])


tensor([[[-0.3043, -0.4002,  2.3879,  0.3315, -0.5977,  0.1647,  0.2765,
          -0.2068, -1.2563, -0.2078, -0.7170,  0.0813, -2.0867,  1.5346,
          -0.0893,  0.5615],
         [ 0.4196, -0.4105, -0.4590,  0.6873, -0.5096, -0.1428,  0.3947,
           0.5123, -0.7020,  2.0761, -1.4539,  0.0517, -0.3946, -0.9039,
           0.4739,  0.2793],
         [-0.7913, -1.5467,  0.9308,  0.5389, -0.8949, -0.0334,  1.7299,
           1.9157,  0.9344,  0.3280,  0.2866, -2.9284, -1.0829,  1.1879,
           0.7656,  0.4795]],

        [[-0.2544, -1.5696, -0.7613, -1.5473, -0.3800,  0.2525, -1.7151,
          -1.1560,  0.4908, -0.3029,  1.8000,  0.0819,  0.7768,  0.9647,
           0.7437,  0.9847],
         [ 0.5525, -0.1429,  0.4380, -0.1435, -1.2380, -1.3140,  0.9394,
           2.2703, -0.1071,  0.2021,  0.4899, -0.3443, -0.8482,  0.6317,
          -1.5534, -0.2310],
         [-0.4718, -1.8672,  0.2409,  0.6980,  0.0163, -1.4306, -0.4520,
          -0.3687, -0.7925,  0.6190, -0.3567,  1.0

In [39]:
split_times = 3  # number of scales: the time axis is split into 2, 4 and 8 groups
# Read the length from the tensor itself so this cell cannot drift out of sync
# with the cell that creates x.
n_timesteps = x.shape[-1]

# Slot 0 along dim 1 keeps the original signal; slot si + 1 is overwritten
# below with the "adjacent groups swapped" signal minus the original.
x_repeat = x.unsqueeze(dim=1).repeat((1, split_times + 1, 1, 1))

for si in range(split_times):
    n_group = 2 ** (si + 1)  # split the time axis into n_group groups
    len_group = n_timesteps // n_group
    for gi in range(0, n_group, 2):  # swap each adjacent pair of groups
        start = len_group * gi
        mid = start + len_group
        end = mid + len_group
        # x is never modified, so both halves can be read from it directly
        # and no temporary clone is needed for the swap.
        x_repeat[:, si + 1, :, start:mid] = x[:, :, mid:end]
        x_repeat[:, si + 1, :, mid:end] = x[:, :, start:mid]

    # Difference from the value at the current (un-swapped) position.
    x_repeat[:, si + 1, :, :] -= x

In [41]:
# Expected layout: (batch_size, split_times + 1, height, n_timesteps).
x_repeat.shape

torch.Size([2, 4, 3, 16])

torch.Size([2, 3, 3, 16])

In [35]:
# repeat((1, 1, 2)) tiles only the last axis twice: 16 timesteps become 32.
x.repeat((1,1,2)).shape

torch.Size([2, 3, 32])