In [1]:
from hydra import initialize, compose

with initialize(config_path="../run/conf", version_base=None):
    cfg = compose("train")

In [2]:
from pathlib import Path
import yaml

from glob import glob
import numpy as np
import pandas as pd
import polars as pl


In [3]:
print(len(cfg.split['train_series_ids'])+ len(cfg.split['valid_series_ids']))

series_ids = [str(path).split('/')[-1] for path in (Path(cfg.dir.processed_dir) / 'train').glob("*")]
len(series_ids)

277


277

In [4]:
# ラベルがある event データ

event_df = pl.read_csv(Path(cfg.dir.data_dir) / "train_events.csv").drop_nulls()

q_cut_num = 10 # no_event は含まないので全体では11クラス

event_count_df = event_df.group_by('series_id').count()
event_count_df = event_count_df.select(
    pl.col("series_id"),
    pl.col("count"),
    pl.col("count").qcut(q_cut_num, 
                         labels=[str(i) for i in range(1, q_cut_num+1)]).alias('class').cast(pl.Utf8)
)

event_count_df.head(10)

series_id,count,class
str,u32,str
"""d25e479ecbb7""",42,"""6"""
"""0a96f4993bd7""",30,"""4"""
"""40dce6018935""",42,"""6"""
"""f0482490923c""",20,"""2"""
"""d2d6b9af0553""",30,"""4"""
"""03d92c9f6f8a""",16,"""2"""
"""2b8d87addea9""",60,"""10"""
"""f2c2436cf7b7""",40,"""6"""
"""c908a0ad3e31""",32,"""4"""
"""2f7504d0f426""",36,"""5"""


In [81]:
no_event = [si for si in series_ids if si not in list(event_count_df.get_column('series_id'))]
no_event_df = pl.DataFrame({"series_id": no_event, 
                            "count":[0 for _ in range(len(no_event))],
                           "class": [str(0) for _ in range(len(no_event))]
                            }
                          )
no_event_df = no_event_df.select(
    pl.col("series_id"),
    pl.col("count").cast(pl.UInt32),
    pl.col("class").cast(pl.Utf8),
)
no_event_df

series_id,count,class
str,u32,str
"""2fc653ca75c7""",0,"""0"""
"""a3e59c2ce3f6""",0,"""0"""
"""c5d08fc3e040""",0,"""0"""
"""c7b1283bb7eb""",0,"""0"""
"""0f9e60a8e56d""",0,"""0"""
"""390b487231ce""",0,"""0"""
"""e11b9d69f856""",0,"""0"""
"""89c7daa72eee""",0,"""0"""


In [83]:
# 二つを結合
all_df = pl.concat([event_count_df,no_event_df])
all_df.head(5)

series_id,count,class
str,u32,str
"""bfa54bd26187""",42,"""6"""
"""d515236bdeec""",28,"""3"""
"""04f547b8017d""",42,"""6"""
"""51c49c540b4e""",32,"""4"""
"""8898e6db816d""",16,"""2"""


### 5Fold に分割

In [94]:
from sklearn.model_selection import StratifiedKFold
import yaml

X = all_df.drop("class")
y = all_df.get_column('class')

skf = StratifiedKFold(n_splits=5)

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    fold_dict = {
        "train_series_ids": list(all_df.get_column('series_id').take(train_index)),
        "valid_series_ids": list(all_df.get_column('series_id').take(test_index)),
    }

    with open(f"../run/conf/split/stratify_fold_{i}.yaml", "w") as wf:
        yaml.dump(fold_dict, wf)
