# シリーズごとの時間の偏りなどがないか調べる


In [1]:
import sys, os
sys.path.append(os.pardir)

In [2]:
from pathlib import Path
import numpy as np
import polars as pl
import os
from hydra import initialize, compose

with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("cv_score", overrides=["exp_name=exp013"])

In [29]:
# series の読み込み
train_df = pl.read_parquet(Path(cfg.dir.data_dir) / "train_series.parquet")
train_df = train_df.with_columns(
            pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
        )
event_df = pl.read_csv(Path(cfg.dir.data_dir) / "train_events.csv")
event_df = event_df.with_columns(
    pl.col("timestamp").str.to_datetime("%Y-%m-%dT%H:%M:%S%z")
)

In [22]:
# 開始の時間
start_df = train_df.filter(pl.col("step")==0).with_columns(
            pl.col('timestamp').dt.hour().cast(pl.UInt8).alias('hour'),
            pl.col('timestamp').dt.minute().cast(pl.UInt8).alias('minute')
)

start_df

series_id,step,timestamp,anglez,enmo,hour,minute
str,u32,"datetime[μs, UTC]",f32,f32,u8,u8
"""038441c925bb""",0,2018-08-14 19:30:00 UTC,2.6367,0.0217,19,30
"""03d92c9f6f8a""",0,2018-05-31 16:00:00 UTC,38.892899,0.0803,16,0
"""0402a003dae9""",0,2018-12-18 17:45:00 UTC,-77.314903,0.0868,17,45
"""04f547b8017d""",0,2018-11-28 17:00:00 UTC,0.6793,0.0,17,0
"""05e1944c3818""",0,2018-11-16 23:00:00 UTC,-86.747398,0.0153,23,0
"""062cae666e2a""",0,2019-02-01 21:45:00 UTC,2.6827,0.003,21,45
"""062dbd4c95e6""",0,2018-08-22 16:15:00 UTC,10.9539,0.0839,16,15
"""08db4255286f""",0,2018-11-05 14:00:00 UTC,-30.845301,0.0447,14,0
"""0a96f4993bd7""",0,2018-05-03 14:30:00 UTC,3.2724,0.0386,14,30
"""0cd1e3d0ed95""",0,2017-12-08 20:00:00 UTC,-19.9536,0.0217,20,0


In [26]:
# 終了の時間
end_df = train_df.with_columns(pl.col('step').diff(-1).alias('diff')).filter(pl.col('diff').is_null().or_(pl.col('diff')>0)).with_columns(
            (pl.col('timestamp')+ pl.duration(seconds=5)),
            (pl.col('step')+1).alias('step'),
            ).with_columns(
            pl.col('timestamp').dt.hour().cast(pl.UInt8).alias('hour'),
            pl.col('timestamp').dt.minute().cast(pl.UInt8).alias('minute')
)
end_df

series_id,step,timestamp,anglez,enmo,diff,hour,minute
str,u32,"datetime[μs, UTC]",f32,f32,i64,u8,u8
"""038441c925bb""",389880,2018-09-06 09:00:00 UTC,-28.6567,0.0125,389879,9,0
"""03d92c9f6f8a""",724140,2018-07-12 13:45:00 UTC,-9.2718,0.032,724139,13,45
"""0402a003dae9""",397260,2019-01-10 17:30:00 UTC,14.2046,0.0896,397259,17,30
"""04f547b8017d""",637560,2019-01-04 14:30:00 UTC,21.4674,0.0795,637559,14,30
"""05e1944c3818""",400860,2018-12-10 03:45:00 UTC,-26.2253,0.0172,400859,3,45
"""062cae666e2a""",442440,2019-02-27 12:15:00 UTC,-14.9396,0.0366,442439,12,15
"""062dbd4c95e6""",778680,2018-10-06 17:45:00 UTC,-80.813599,0.0,778679,17,45
"""08db4255286f""",440280,2018-12-01 01:30:00 UTC,-29.593,0.0011,440279,1,30
"""0a96f4993bd7""",256860,2018-05-18 11:15:00 UTC,-10.2738,0.0265,256859,11,15
"""0cd1e3d0ed95""",370260,2017-12-30 06:15:00 UTC,17.777901,0.0216,370259,6,15


In [27]:
start_end_df = start_df.join(end_df, on="series_id", suffix="_end")
start_end_df

series_id,step,timestamp,anglez,enmo,hour,minute,step_end,timestamp_end,anglez_end,enmo_end,diff,hour_end,minute_end
str,u32,"datetime[μs, UTC]",f32,f32,u8,u8,u32,"datetime[μs, UTC]",f32,f32,i64,u8,u8
"""038441c925bb""",0,2018-08-14 19:30:00 UTC,2.6367,0.0217,19,30,389880,2018-09-06 09:00:00 UTC,-28.6567,0.0125,389879,9,0
"""03d92c9f6f8a""",0,2018-05-31 16:00:00 UTC,38.892899,0.0803,16,0,724140,2018-07-12 13:45:00 UTC,-9.2718,0.032,724139,13,45
"""0402a003dae9""",0,2018-12-18 17:45:00 UTC,-77.314903,0.0868,17,45,397260,2019-01-10 17:30:00 UTC,14.2046,0.0896,397259,17,30
"""04f547b8017d""",0,2018-11-28 17:00:00 UTC,0.6793,0.0,17,0,637560,2019-01-04 14:30:00 UTC,21.4674,0.0795,637559,14,30
"""05e1944c3818""",0,2018-11-16 23:00:00 UTC,-86.747398,0.0153,23,0,400860,2018-12-10 03:45:00 UTC,-26.2253,0.0172,400859,3,45
"""062cae666e2a""",0,2019-02-01 21:45:00 UTC,2.6827,0.003,21,45,442440,2019-02-27 12:15:00 UTC,-14.9396,0.0366,442439,12,15
"""062dbd4c95e6""",0,2018-08-22 16:15:00 UTC,10.9539,0.0839,16,15,778680,2018-10-06 17:45:00 UTC,-80.813599,0.0,778679,17,45
"""08db4255286f""",0,2018-11-05 14:00:00 UTC,-30.845301,0.0447,14,0,440280,2018-12-01 01:30:00 UTC,-29.593,0.0011,440279,1,30
"""0a96f4993bd7""",0,2018-05-03 14:30:00 UTC,3.2724,0.0386,14,30,256860,2018-05-18 11:15:00 UTC,-10.2738,0.0265,256859,11,15
"""0cd1e3d0ed95""",0,2017-12-08 20:00:00 UTC,-19.9536,0.0217,20,0,370260,2017-12-30 06:15:00 UTC,17.777901,0.0216,370259,6,15


# event

In [34]:
event_df_info = event_df.with_columns(
    pl.col('timestamp').dt.hour().cast(pl.UInt8).alias('hour'),
    pl.col('timestamp').dt.minute().cast(pl.UInt8).alias('minute')
).with_columns(
    (pl.col('minute')%15).alias('minute_15')
)
event_df_info

series_id,night,event,step,timestamp,hour,minute,minute_15
str,i64,str,i64,"datetime[μs, UTC]",u8,u8,u8
"""038441c925bb""",1,"""onset""",4992,2018-08-15 02:26:00 UTC,2,26,11
"""038441c925bb""",1,"""wakeup""",10932,2018-08-15 10:41:00 UTC,10,41,11
"""038441c925bb""",2,"""onset""",20244,2018-08-15 23:37:00 UTC,23,37,7
"""038441c925bb""",2,"""wakeup""",27492,2018-08-16 09:41:00 UTC,9,41,11
"""038441c925bb""",3,"""onset""",39996,2018-08-17 03:03:00 UTC,3,3,3
"""038441c925bb""",3,"""wakeup""",44400,2018-08-17 09:10:00 UTC,9,10,10
"""038441c925bb""",4,"""onset""",57240,2018-08-18 03:00:00 UTC,3,0,0
"""038441c925bb""",4,"""wakeup""",62856,2018-08-18 10:48:00 UTC,10,48,3
"""038441c925bb""",5,"""onset""",,,,,
"""038441c925bb""",5,"""wakeup""",,,,,


In [47]:
start_end_df.join(event_df_info, on='series_id').group_by('minute').agg([
    (pl.col('minute_15')==i).sum().alias(f'm{i}') for i in range(0, 15)
]+[(pl.col('timestamp').is_null()).sum().alias('null_num')]
)

minute,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14,null_num
u8,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
30,366,61,68,348,44,94,154,412,80,63,96,411,48,90,135,0
0,396,61,84,374,43,90,141,385,74,42,68,333,37,89,142,0
15,392,75,100,418,49,103,165,449,98,76,106,391,50,108,192,0
45,304,43,56,258,26,85,132,345,58,63,79,287,48,79,121,0


In [50]:
start_end_df.join(event_df_info, on='series_id').group_by('series_id').agg([
    (pl.col('minute_15')==i).sum().alias(f'm{i}') for i in range(0, 15)
]
)

series_id,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14
str,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
"""f0482490923c""",6,0,0,1,0,0,1,4,0,0,0,6,0,2,0
"""3c336d6ba566""",5,1,0,3,1,1,1,4,2,0,0,6,0,1,1
"""8b8b9e29171c""",2,1,1,6,0,2,0,1,0,0,0,3,0,1,1
"""612aa8ba44e2""",4,0,2,3,3,0,1,1,1,1,0,5,0,3,0
"""1762ab70ec76""",5,0,2,4,0,1,0,4,4,0,3,9,1,1,4
"""655f19eabf1e""",13,1,1,5,2,0,3,8,2,0,0,4,1,2,2
"""fa149c3c4bde""",6,1,0,5,0,1,1,4,1,1,1,9,0,1,1
"""efbfc4526d58""",0,0,1,2,0,0,1,1,0,0,0,3,0,0,1
"""7476c0bd18d2""",1,0,0,1,0,0,0,0,0,1,0,1,0,0,0
"""e4500e7e19e1""",2,0,0,2,0,1,0,1,0,0,0,0,0,0,0


In [51]:
start_end_df.join(event_df_info, on='series_id').group_by('hour').agg([
    (pl.col('minute_15')==i).sum().alias(f'm{i}') for i in range(0, 15)
]
)

hour,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14
u8,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
17,188,29,38,174,26,48,76,185,46,25,45,180,31,53,72
19,98,12,20,90,10,18,34,96,13,15,17,82,12,15,31
22,73,17,18,70,8,15,21,88,18,17,13,62,13,21,30
20,205,29,42,217,22,45,92,232,40,35,48,205,22,39,82
14,92,5,17,75,6,26,39,91,6,17,16,86,3,18,35
23,48,11,3,48,2,13,13,45,5,11,7,44,5,8,20
18,63,12,19,62,13,15,26,69,13,11,15,55,6,15,32
13,13,1,1,7,0,1,6,14,0,2,3,5,0,3,2
16,369,70,82,332,37,94,152,420,96,61,105,370,45,93,150
15,102,14,27,89,12,32,41,105,18,25,23,95,10,33,50


In [52]:
start_end_df.join(event_df_info, on='series_id').group_by('minute_end').agg([
    (pl.col('minute_15')==i).sum().alias(f'm{i}') for i in range(0, 15)
]
)

minute_end,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11,m12,m13,m14
u8,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,397,65,73,341,41,99,139,422,98,54,78,391,39,87,142
30,325,55,71,301,39,83,148,335,53,54,84,282,35,74,146
15,446,77,90,437,42,108,164,482,78,78,106,435,66,121,169
45,290,43,74,319,40,82,141,352,81,58,81,314,43,84,133
