In [1]:
import os
import gc
import time
import json
import joblib
import random
import math
from math import pi, sqrt, exp
from tqdm.auto import tqdm

import pyarrow as pa
from pyarrow.parquet import ParquetFile
from scipy.interpolate import interp1d
import pandas as pd
import numpy as np
import sklearn, sklearn.model_selection

In [2]:
class PATHS:
    """Namespace of input file locations, all rooted at ``MAIN_DIR``."""

    # Root directory of the dataset (note the trailing slash, which the
    # file paths below rely on).
    MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"

    # CSV inputs
    SUBMISSION = f"{MAIN_DIR}sample_submission.csv"
    TRAIN_EVENTS = f"{MAIN_DIR}train_events.csv"

    # Parquet inputs
    TRAIN_SERIES = f"{MAIN_DIR}train_series.parquet"
    TEST_SERIES = f"{MAIN_DIR}test_series.parquet"

In [3]:
# Output directory for the per-series CSV files written by the preprocessing
# loop; create it up front (no error if it already exists).
out_dir = "train_csvs"
os.makedirs(name=out_dir, exist_ok=True)

### Load data

In [4]:
class DataReader:
    """Load the input files by short dataset name.

    Each known name maps to a file path, the reader to use (parquet or CSV)
    and whether rows without a ``timestamp`` should be dropped after loading.
    """

    def __init__(self):
        # Mapping for data loading: dataset name -> loading properties.
        self.names_mapping = {
            "submission": {"path": PATHS.SUBMISSION, "is_parquet": False, "has_timestamp": False},
            "train_events": {"path": PATHS.TRAIN_EVENTS, "is_parquet": False, "has_timestamp": True},
            "train_series": {"path": PATHS.TRAIN_SERIES, "is_parquet": True, "has_timestamp": True},
            "test_series": {"path": PATHS.TEST_SERIES, "is_parquet": True, "has_timestamp": True},
        }
        # Derived from the mapping so the two can never drift apart.
        self.valid_names = list(self.names_mapping)

    def verify(self, filename: str):
        """Check that ``filename`` is one of the known dataset names.

        Args:
            filename: dataset name to check against ``self.valid_names``.

        Raises:
            ValueError: if ``filename`` is not a known dataset name.
        """
        if filename not in self.valid_names:
            # Fail loudly here instead of letting ``load`` hit a bare KeyError.
            raise ValueError(
                f"PLEASE ENTER A VALID DATASET NAME (got {filename!r}), "
                f"VALID NAMES ARE: {self.valid_names}"
            )

        return

    def load(self, filename: str):
        """Read the dataset registered under ``filename`` into a DataFrame.

        Args:
            filename: one of the names in ``self.valid_names``.

        Returns:
            pandas.DataFrame: the loaded table; for datasets flagged with
            ``has_timestamp`` the rows whose ``timestamp`` is missing are
            removed.

        Raises:
            ValueError: if ``filename`` is not a known dataset name.
        """
        self.verify(filename)
        data_props = self.names_mapping[filename]

        if data_props["is_parquet"]:
            df = pd.read_parquet(data_props["path"])
        else:
            df = pd.read_csv(data_props["path"])

        if data_props["has_timestamp"]:
            # Rows without a timestamp are discarded.
            df = df.dropna(subset=["timestamp"])

        # Release the memory of temporaries created while reading/filtering.
        gc.collect()

        return df

In [5]:
# Load the training series first, then the training events, through one reader.
reader = DataReader()
series_df, events_df = (
    reader.load(filename=dataset_name)
    for dataset_name in ("train_series", "train_events")
)

In [6]:
# Show both loaded frames, one after the other, as plain text.
print(series_df, events_df, sep="\n")

              series_id    step                 timestamp     anglez    enmo
0          038441c925bb       0  2018-08-14T15:30:00-0400   2.636700  0.0217
1          038441c925bb       1  2018-08-14T15:30:05-0400   2.636800  0.0215
2          038441c925bb       2  2018-08-14T15:30:10-0400   2.637000  0.0216
3          038441c925bb       3  2018-08-14T15:30:15-0400   2.636800  0.0213
4          038441c925bb       4  2018-08-14T15:30:20-0400   2.636800  0.0215
...                 ...     ...                       ...        ...     ...
127946335  fe90110788d2  592375  2017-09-08T00:14:35-0400 -27.277500  0.0204
127946336  fe90110788d2  592376  2017-09-08T00:14:40-0400 -27.032499  0.0233
127946337  fe90110788d2  592377  2017-09-08T00:14:45-0400 -26.841200  0.0202
127946338  fe90110788d2  592378  2017-09-08T00:14:50-0400 -26.723900  0.0199
127946339  fe90110788d2  592379  2017-09-08T00:14:55-0400 -31.521601  0.0205

[127946340 rows x 5 columns]
          series_id  night   event      step  

### Preprocess

In [7]:
# Width, in steps, of the label window placed around each event.
# NOTE(review): the original note read "12h * 60min"; with the 5-second
# spacing visible in the printed series, 720 steps would be one hour
# (12 steps/min * 60 min) - confirm the intended unit.
SIGMA = 720

def gauss(n=SIGMA, sigma=SIGMA*0.15):
    """Sample a Gaussian density at integer offsets centred on zero.

    Args:
        n (int): width of the sampled range; offsets run from ``-int(n/2)``
            to ``int(n/2)`` inclusive (default: SIGMA).
        sigma (float): standard deviation of the Gaussian.

    Returns:
        list[float]: one density value per offset, peak in the middle.
    """
    half_width = int(n / 2)
    # Factors that do not depend on the offset are computed once.
    scale = 1 / (sigma * sqrt(2 * pi))
    two_variance = 2 * sigma**2

    values = []
    for offset in range(-half_width, half_width + 1):
        values.append(scale * exp(-float(offset)**2 / two_variance))
    return values

In [8]:
def _generate_labels_gaussian(X, y):
    """
    Args:
        X:
        y (list): [(start_step, end_step), ...]
            ex. [(4992, 10932), (20244, 27492), (39996, 44400), (57240, 62856), ...]
    """
    # start_stepとend_stepの予測のため、2ついる
    labels_gaussian = np.zeros(shape=(len(X), 2))

    for start_step, end_step in y:
        gauss_values = gauss()

        start_range = max(0, start_step - SIGMA//2) # 0以上の値
        end_range = min(len(X), end_step + SIGMA//2) # 説明変数長以下の値

        # ガウス分布を開始点と終了点へ適用
        labels_gaussian[start_range: start_step + SIGMA//2, 0] = gauss_values[:start_step + SIGMA//2 - start_range]
        labels_gaussian[end_step: end_range, 1] = gauss_values[-(end_range - end_step):]

    labels_gaussian /= np.max(labels_gaussian + 1e-12)
    
    return labels_gaussian

In [9]:
# NOTE(review): `labels` and `data` are never filled in this cell; they are
# kept only in case a later cell relies on them being defined.
labels = []
data = []
ids = series_df["series_id"].unique()

# Per-series frames are collected here and concatenated once after the loop
# (concatenating inside the loop re-copies all previous rows every time).
per_series_frames = []
for viz_id in tqdm(ids, total=len(ids)):
    viz_events = events_df[events_df["series_id"] == viz_id]
    # reset_index() makes the index equal to the row position within this series.
    viz_series = series_df.loc[(series_df["series_id"] == viz_id)].copy().reset_index()

    # NOTE(review): every timestamp is expressed in a fixed UTC-04:00 zone, so
    # "hour" follows that offset even for rows recorded with another offset -
    # confirm this is intended.
    viz_series["dt"] = pd.to_datetime(
        viz_series["timestamp"],
        format="%Y-%m-%dT%H:%M:%S%z",
    ).astype("datetime64[ns, UTC-04:00]")
    viz_series["hour"] = viz_series["dt"].dt.hour

    # Collect (onset position, wakeup position) for each onset that is
    # immediately followed by a wakeup of the same night.
    viz_labels = []
    for i in range(len(viz_events) - 1):
        current, following = viz_events.iloc[i], viz_events.iloc[i + 1]
        if (
            current["event"] == "onset"
            and following["event"] == "wakeup"
            and current["night"] == following["night"]
        ):
            start, end = current["timestamp"], following["timestamp"]

            # Row position of the first series row with a matching timestamp.
            start_id = viz_series.loc[viz_series["timestamp"] == start].index.values[0]
            end_id = viz_series.loc[viz_series["timestamp"] == end].index.values[0]

            viz_labels.append((start_id, end_id))

    labels_gaussian = _generate_labels_gaussian(X=viz_series, y=viz_labels)

    viz_series["onset"] = labels_gaussian[:, 0]
    viz_series["wakeup"] = labels_gaussian[:, 1]

    df = viz_series[["step", "anglez", "enmo", "hour", "onset", "wakeup"]]

    # Save one CSV per series id.
    df.to_csv(f"{out_dir}/{viz_id}.csv", index=False)

    per_series_frames.append(df)

# Save the global statistics of the accelerometer features.
dfs = pd.concat(per_series_frames, axis=0).reset_index()
np.save("enmo_mean.npy", dfs["enmo"].mean())
np.save("enmo_std.npy", dfs["enmo"].std())
np.save("anglez_mean.npy", dfs["anglez"].mean())
np.save("anglez_std.npy", dfs["anglez"].std())

  0%|          | 0/277 [00:00<?, ?it/s]

In [10]:
# Report the feature statistics over all series, one line per feature.
print(
    f"enmo mean: {dfs['enmo'].mean():.5f}, enmo std: {dfs['enmo'].std():.5f}",
    f"anglez mean: {dfs['anglez'].mean():.5f}, anglez std: {dfs['anglez'].std():.5f}",
    sep="\n",
)

enmo mean: 0.04132, enmo std: 0.10183
anglez mean: -8.81045, anglez std: 35.52188
