# CatBoost Starter for Brain Comp
EEGの差分の集計特徴量を用いたNotebook。

- ver1: CV, LB

### Version Notes
- Version 1 - EEGのT=20, 30, 40秒における、10秒時間窓 / 20秒時間窓のmean, min, max, std, max-minを特徴量とした

# Load Libraries

In [1]:
import os, gc
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
import pandas as pd
import numpy as np
import polars as pl
import matplotlib.pyplot as plt

VER = 1

# Load Test Data

In [4]:
df_test = pl.read_csv('../input/hms-harmful-brain-activity-classification/test.csv')
print('Train shape:', df_test.shape )
df_test.head()

Train shape: (1, 3)


spectrogram_id,eeg_id,patient_id
i64,i64,i64
853520,3911565283,6885


# Create Multiple Eeg Id Train Data
データの説明[here][1]には、テストデータには、同じ `eeg_id` からの複数の crop は含まれていないと記載されている。

# Feature Engineer


CatBoostモデルに入れる特徴量を作成する。


In [6]:
PATH = '../input/hms-harmful-brain-activity-classification/test_eegs/'
files = os.listdir(PATH)
print(f'There are {len(files)} eeg parquet files')

all_eegs = {}
temp = pl.DataFrame()
for i, f in enumerate(files):
    if i%100 == 0: print(i, ', ', end = '')

    eeg_id = int(f.split('.')[0])    
    if i >= 0:
        temp_eeg = pl.read_parquet(f'{PATH}{f}')
    
        # eegデータにおける、計測開始から各行までの秒数 seconds_eeg
        t = 1/200 # EEGのサンプリング周波数は200Hzなので、1行は1/200秒ごとの計測値を表す  
        seconds = [i*t for i in range(len(temp_eeg))]
        max_second = max(seconds)
        temp_eeg = temp_eeg.with_columns(pl.Series(seconds).alias("seconds_eeg")) 
        

        # 差分特徴量を作る
        temp_eeg = temp_eeg.with_columns(
            (pl.col("Fp1") - pl.col("T3")).alias("LL_Fp1-T3"),
            (pl.col("T3") - pl.col("O1")).alias("LL_T3-O1"),
            (pl.col("Fp1") - pl.col("C3")).alias("LP_Fp1-C3"),
            (pl.col("C3") - pl.col("O1")).alias("LP_C3-O1"),
            (pl.col("Fp2") - pl.col("C4")).alias("RP_Fp2-C4"),
            (pl.col("C4") - pl.col("O2")).alias("RP_C4-O2"),
            (pl.col("Fp2") - pl.col("T4")).alias("RR_Fp2-T4"),
            (pl.col("T4") - pl.col("O2")).alias("RR_T4-O2"),
        )

        # 差分特徴量の集計特徴量をつくる
        # window = 4000 # 20秒窓 (20秒 / (1レコード1/200秒) = 4000レコード)
        window_list = [2000, 4000]

        for window in window_list:

            str_window = str(int(window / (200))) + "s"
            temp_eeg = temp_eeg.with_columns(
                # LL: Fp1 - T3
                pl.col("LL_Fp1-T3").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_LL_Fp1-T3"),
                pl.col("LL_Fp1-T3").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_LL_Fp1-T3"),
                pl.col("LL_Fp1-T3").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_LL_Fp1-T3"),
                pl.col("LL_Fp1-T3").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_LL_Fp1-T3"),
                (pl.col("LL_Fp1-T3").rolling_max(window_size=window, center=True) - pl.col("LL_Fp1-T3").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_LL_Fp1-T3"),

                # LL: Fp1 - T3
                pl.col("LL_T3-O1").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_LL_T3-O1"),
                pl.col("LL_T3-O1").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_LL_T3-O1"),
                pl.col("LL_T3-O1").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_LL_T3-O1"),
                pl.col("LL_T3-O1").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_LL_T3-O1"),
                (pl.col("LL_T3-O1").rolling_max(window_size=window, center=True) - pl.col("LL_T3-O1").rolling_min(window_size=window, center=True)).alias("max-min__" + str_window + "_LL_T3-O1"),


                # LP: Fp1 - C3
                pl.col("LP_Fp1-C3").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_LP_Fp1-C3"),
                pl.col("LP_Fp1-C3").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_LP_Fp1-C3"),
                pl.col("LP_Fp1-C3").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_LP_Fp1-C3"),
                pl.col("LP_Fp1-C3").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_LP_Fp1-C3"),
                (pl.col("LP_Fp1-C3").rolling_max(window_size=window, center=True) - pl.col("LP_Fp1-C3").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_LP_Fp1-C3"),

                # LP: C3 - O1
                pl.col("LP_C3-O1").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_LP_C3-O1"),
                pl.col("LP_C3-O1").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_LP_C3-O1"),
                pl.col("LP_C3-O1").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_LP_C3-O1"),
                pl.col("LP_C3-O1").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_LP_C3-O1"),
                (pl.col("LP_C3-O1").rolling_max(window_size=window, center=True) - pl.col("LP_C3-O1").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_LP_C3-O1"),


                # RP: Fp1 - C4
                pl.col("RP_Fp2-C4").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_RP_Fp2-C4"),
                pl.col("RP_Fp2-C4").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_RP_Fp2-C4"),
                pl.col("RP_Fp2-C4").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_RP_Fp2-C4"),
                pl.col("RP_Fp2-C4").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_RP_Fp2-C4"),
                (pl.col("RP_Fp2-C4").rolling_max(window_size=window, center=True) - pl.col("RP_Fp2-C4").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_RP_Fp2-C4"),

                # RP: C4 - O2
                pl.col("RP_C4-O2").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_RP_C4-O2"),
                pl.col("RP_C4-O2").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_RP_C4-O2"),
                pl.col("RP_C4-O2").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_RP_C4-O2"),
                pl.col("RP_C4-O2").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_RP_C4-O2"),
                (pl.col("RP_C4-O2").rolling_max(window_size=window, center=True) - pl.col("RP_C4-O2").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_RP_C4-O2"),


                # RR: Fp2 - T4
                pl.col("RR_Fp2-T4").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_RR_Fp2-T4"),
                pl.col("RR_Fp2-T4").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_RR_Fp2-T4"),
                pl.col("RR_Fp2-T4").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_RR_Fp2-T4"),
                pl.col("RR_Fp2-T4").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_RR_Fp2-T4"),
                (pl.col("RR_Fp2-T4").rolling_max(window_size=window, center=True) - pl.col("RR_Fp2-T4").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_RR_Fp2-T4"),

                # RR: T4 - O2
                pl.col("RR_T4-O2").rolling_mean(window_size=window, center=True).alias("mean_" + str_window + "_RR_T4-O2"),
                pl.col("RR_T4-O2").rolling_max(window_size=window, center=True).alias("min_" + str_window + "_RR_T4-O2"),
                pl.col("RR_T4-O2").rolling_min(window_size=window, center=True).alias("max_" + str_window + "_RR_T4-O2"),
                pl.col("RR_T4-O2").rolling_std(window_size=window, center=True).alias("std_" + str_window + "_RR_T4-O2"),
                (pl.col("RR_T4-O2").rolling_max(window_size=window, center=True) - pl.col("RR_T4-O2").rolling_min(window_size=window, center=True)).alias("max-min_" + str_window + "_RR_T4-O2"),
            )

        # 集計特徴量とEKG以外drop
        temp_eeg = temp_eeg.drop(
            ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2',
             'LL_Fp1-T3', 'LL_T3-O1', 'LP_Fp1-C3', 'LP_C3-O1', 'RP_Fp2-C4', 'RP_C4-O2', 'RR_Fp2-T4', 'RR_T4-O2'])
        
        # eeg_id を追加
            # 型        
        temp_eeg = temp_eeg.with_columns(pl.lit(str(eeg_id)).alias("eeg_id"))

        # 全ての
        temp = pl.concat([temp, temp_eeg], how = 'vertical')

    else:
        continue

There are 1 eeg parquet files
0 , 

In [7]:
temp = temp.with_columns(pl.col("eeg_id").cast(pl.Int64).alias("eeg_id"))
temp.tail(5)

EKG,seconds_eeg,mean_10s_LL_Fp1-T3,min_10s_LL_Fp1-T3,max_10s_LL_Fp1-T3,std_10s_LL_Fp1-T3,max-min_10s_LL_Fp1-T3,mean_10s_LL_T3-O1,min_10s_LL_T3-O1,max_10s_LL_T3-O1,std_10s_LL_T3-O1,max-min__10s_LL_T3-O1,mean_10s_LP_Fp1-C3,min_10s_LP_Fp1-C3,max_10s_LP_Fp1-C3,std_10s_LP_Fp1-C3,max-min_10s_LP_Fp1-C3,mean_10s_LP_C3-O1,min_10s_LP_C3-O1,max_10s_LP_C3-O1,std_10s_LP_C3-O1,max-min_10s_LP_C3-O1,mean_10s_RP_Fp2-C4,min_10s_RP_Fp2-C4,max_10s_RP_Fp2-C4,std_10s_RP_Fp2-C4,max-min_10s_RP_Fp2-C4,mean_10s_RP_C4-O2,min_10s_RP_C4-O2,max_10s_RP_C4-O2,std_10s_RP_C4-O2,max-min_10s_RP_C4-O2,mean_10s_RR_Fp2-T4,min_10s_RR_Fp2-T4,max_10s_RR_Fp2-T4,std_10s_RR_Fp2-T4,max-min_10s_RR_Fp2-T4,…,max-min_20s_LL_Fp1-T3,mean_20s_LL_T3-O1,min_20s_LL_T3-O1,max_20s_LL_T3-O1,std_20s_LL_T3-O1,max-min__20s_LL_T3-O1,mean_20s_LP_Fp1-C3,min_20s_LP_Fp1-C3,max_20s_LP_Fp1-C3,std_20s_LP_Fp1-C3,max-min_20s_LP_Fp1-C3,mean_20s_LP_C3-O1,min_20s_LP_C3-O1,max_20s_LP_C3-O1,std_20s_LP_C3-O1,max-min_20s_LP_C3-O1,mean_20s_RP_Fp2-C4,min_20s_RP_Fp2-C4,max_20s_RP_Fp2-C4,std_20s_RP_Fp2-C4,max-min_20s_RP_Fp2-C4,mean_20s_RP_C4-O2,min_20s_RP_C4-O2,max_20s_RP_C4-O2,std_20s_RP_C4-O2,max-min_20s_RP_C4-O2,mean_20s_RR_Fp2-T4,min_20s_RR_Fp2-T4,max_20s_RR_Fp2-T4,std_20s_RR_Fp2-T4,max-min_20s_RR_Fp2-T4,mean_20s_RR_T4-O2,min_20s_RR_T4-O2,max_20s_RR_T4-O2,std_20s_RR_T4-O2,max-min_20s_RR_T4-O2,eeg_id
f32,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i64
-34.799999,49.975,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3911565283
-27.799999,49.98,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3911565283
21.98,49.985,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3911565283
-5.8,49.99,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3911565283
-54.950001,49.995,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3911565283


In [8]:
center_seconds_list = [20, 30, 40]
for sec in center_seconds_list:
    # ある時刻における集計特徴量を取得
    temp_sec = temp.filter(pl.col("seconds_eeg") == sec)
    temp_sec = temp_sec.drop(["EKG", "seconds_eeg"])
    
    print(temp_sec.head())

    # 列名を一括で変更(eeg_idはもとに戻す)
    col_suffix = "_at_" + str(sec) + "s"
    col_eeg_id = "eeg_id_at_" + str(sec) + "s"
    list_original_col = temp_sec.columns
    temp_sec = temp_sec.with_columns(pl.all().name.suffix(col_suffix)).drop(list_original_col) # suffixを付与して、元の列名の列を削除
    temp_sec = temp_sec.rename({col_eeg_id: "eeg_id"})

    print(df_test.shape)

    # join
    df_test = df_test.join(temp_sec, on = "eeg_id", how = "left")

    print(df_test.shape)

shape: (1, 81)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ mean_10s_ ┆ min_10s_L ┆ max_10s_L ┆ std_10s_L ┆ … ┆ max_20s_R ┆ std_20s_R ┆ max-min_2 ┆ eeg_id   │
│ LL_Fp1-T3 ┆ L_Fp1-T3  ┆ L_Fp1-T3  ┆ L_Fp1-T3  ┆   ┆ R_T4-O2   ┆ R_T4-O2   ┆ 0s_RR_T4- ┆ ---      │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ O2        ┆ i64      │
│ f32       ┆ f32       ┆ f32       ┆ f32       ┆   ┆ f32       ┆ f32       ┆ ---       ┆          │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆ f32       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ -4.881917 ┆ 92.529999 ┆ -92.77000 ┆ 27.796616 ┆ … ┆ -140.1300 ┆ 38.643867 ┆ 242.26001 ┆ 39115652 │
│           ┆           ┆ 4         ┆           ┆   ┆ 05        ┆           ┆           ┆ 83       │
└───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────

In [10]:
df_test.columns

['spectrogram_id',
 'eeg_id',
 'patient_id',
 'mean_10s_LL_Fp1-T3_at_20s',
 'min_10s_LL_Fp1-T3_at_20s',
 'max_10s_LL_Fp1-T3_at_20s',
 'std_10s_LL_Fp1-T3_at_20s',
 'max-min_10s_LL_Fp1-T3_at_20s',
 'mean_10s_LL_T3-O1_at_20s',
 'min_10s_LL_T3-O1_at_20s',
 'max_10s_LL_T3-O1_at_20s',
 'std_10s_LL_T3-O1_at_20s',
 'max-min__10s_LL_T3-O1_at_20s',
 'mean_10s_LP_Fp1-C3_at_20s',
 'min_10s_LP_Fp1-C3_at_20s',
 'max_10s_LP_Fp1-C3_at_20s',
 'std_10s_LP_Fp1-C3_at_20s',
 'max-min_10s_LP_Fp1-C3_at_20s',
 'mean_10s_LP_C3-O1_at_20s',
 'min_10s_LP_C3-O1_at_20s',
 'max_10s_LP_C3-O1_at_20s',
 'std_10s_LP_C3-O1_at_20s',
 'max-min_10s_LP_C3-O1_at_20s',
 'mean_10s_RP_Fp2-C4_at_20s',
 'min_10s_RP_Fp2-C4_at_20s',
 'max_10s_RP_Fp2-C4_at_20s',
 'std_10s_RP_Fp2-C4_at_20s',
 'max-min_10s_RP_Fp2-C4_at_20s',
 'mean_10s_RP_C4-O2_at_20s',
 'min_10s_RP_C4-O2_at_20s',
 'max_10s_RP_C4-O2_at_20s',
 'std_10s_RP_C4-O2_at_20s',
 'max-min_10s_RP_C4-O2_at_20s',
 'mean_10s_RR_Fp2-T4_at_20s',
 'min_10s_RR_Fp2-T4_at_20s',
 'max_10s_

In [11]:
# Testデータでは、eeg_id, patient_id, spectrogram_id, label_idは1つしかない
FEATURES = df_test.drop(["eeg_id", "spectrogram_id", "patient_id"]).columns
FEATURES

['mean_10s_LL_Fp1-T3_at_20s',
 'min_10s_LL_Fp1-T3_at_20s',
 'max_10s_LL_Fp1-T3_at_20s',
 'std_10s_LL_Fp1-T3_at_20s',
 'max-min_10s_LL_Fp1-T3_at_20s',
 'mean_10s_LL_T3-O1_at_20s',
 'min_10s_LL_T3-O1_at_20s',
 'max_10s_LL_T3-O1_at_20s',
 'std_10s_LL_T3-O1_at_20s',
 'max-min__10s_LL_T3-O1_at_20s',
 'mean_10s_LP_Fp1-C3_at_20s',
 'min_10s_LP_Fp1-C3_at_20s',
 'max_10s_LP_Fp1-C3_at_20s',
 'std_10s_LP_Fp1-C3_at_20s',
 'max-min_10s_LP_Fp1-C3_at_20s',
 'mean_10s_LP_C3-O1_at_20s',
 'min_10s_LP_C3-O1_at_20s',
 'max_10s_LP_C3-O1_at_20s',
 'std_10s_LP_C3-O1_at_20s',
 'max-min_10s_LP_C3-O1_at_20s',
 'mean_10s_RP_Fp2-C4_at_20s',
 'min_10s_RP_Fp2-C4_at_20s',
 'max_10s_RP_Fp2-C4_at_20s',
 'std_10s_RP_Fp2-C4_at_20s',
 'max-min_10s_RP_Fp2-C4_at_20s',
 'mean_10s_RP_C4-O2_at_20s',
 'min_10s_RP_C4-O2_at_20s',
 'max_10s_RP_C4-O2_at_20s',
 'std_10s_RP_C4-O2_at_20s',
 'max-min_10s_RP_C4-O2_at_20s',
 'mean_10s_RR_Fp2-T4_at_20s',
 'min_10s_RR_Fp2-T4_at_20s',
 'max_10s_RR_Fp2-T4_at_20s',
 'std_10s_RR_Fp2-T4_at_20s

In [12]:
# pandasに変換
df_test = df_test.to_pandas()

# Train CatBoost
CatBoost は、超高速トレーニングのために (パラメーター `task_type='GPU'` を追加すると) 両方の Kaggle T4 GPU を自動的に使用する。

In [14]:
import catboost as cat
from catboost import CatBoostClassifier, Pool
print('CatBoost version',cat.__version__)

CatBoost version 1.2.2


## Create Model Folder

In [17]:
%%time
from sklearn.model_selection import KFold, GroupKFold
import json

preds = []

for i in range(5):
    print(i,', ',end='')
    model = CatBoostClassifier(task_type='GPU')
    model.load_model(f'../input/catboost-eeg-feature-ver{VER}-train/models/CAT_v{VER}_f{i}.cat')
    
    test_pool = Pool(
        data = df_test[FEATURES]
    )
    
    pred = model.predict_proba(test_pool)
    preds.append(pred)
pred = np.mean(preds,axis=0)
print()
print('Test preds shape',pred.shape)


0 , 1 , 2 , 3 , 4 , 
Test preds shape (1, 6)
CPU times: total: 15.6 ms
Wall time: 68.4 ms


In [24]:
sub = pd.DataFrame({'eeg_id':df_test.eeg_id.values})
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
sub[TARGETS] = pred
sub.to_csv('submission.csv',index=False)
print('Submissionn shape',sub.shape)
sub.head()

Submissionn shape (1, 7)


Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.120482,0.173887,0.112989,0.086354,0.103649,0.402639


In [25]:
# SANITY CHECK TO CONFIRM PREDICTIONS SUM TO ONE
sub.iloc[:,-6:].sum(axis=1)

0    1.0
dtype: float64