# KNN Starter for Brain Comp

Kaggleの脳波コンペ用のStarter Notebookである。SpectrogramとEEGを使用（UPDATE済み）。より多くのSpectrogram / EEG特徴量をエンジニアリングすることで、CV / LBスコアを改善できる。

# Load Libraries

In [26]:
import os, gc
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

VER = 1

# Load Train Data

In [27]:
# df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
df = pd.read_csv('input/train.csv')
TARGETS = df.columns[-6:]
print('Train shape:', df.shape )
print('Targets', list(TARGETS))
df.head()

Train shape: (106800, 15)
Targets ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


# Create Non-Overlapping Eeg Id Train Data

In [28]:
# EEGデータをeeg_idでグループ化して、前処理と集計をおこなう

# 最初の'spectrogram_id'と最小の'spectrogram_label_offset_seconds'を集計
train = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_id':'first','spectrogram_label_offset_seconds':'min'})
train.columns = ['spec_id','min']

# 最大の'spectrogram_label_offset_seconds'を集計
tmp = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg(
    {'spectrogram_label_offset_seconds':'max'})
train['max'] = tmp

# 'patient_id'を集計
tmp = df.groupby('eeg_id')[['patient_id']].agg('first')
train['patient_id'] = tmp

# TARGETSの各列の値をそれぞれ合計
tmp = df.groupby('eeg_id')[TARGETS].agg('sum')
for t in TARGETS:
    train[t] = tmp[t].values
    
# TARGETSの各列を正規化
y_data = train[TARGETS].values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train[TARGETS] = y_data

# 'expert_consensus'を集計
tmp = df.groupby('eeg_id')[['expert_consensus']].agg('first')
train['target'] = tmp

# eeg_idをインデックスから列に戻す
train = train.reset_index()
print('Train non-overlapp eeg_id shape:', train.shape )
train.head()

Train non-overlapp eeg_id shape: (17089, 12)


Unnamed: 0,eeg_id,spec_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


# Feature Engineer
特徴量を生成する。
- 400個のスペクトログラムの中央の10分 / 20秒の時間平均 / 時間最小値
- EEGから生成した EEGスペクトログラム

In [29]:
# Falseのとき、CHRIS DEOTTE氏が作成したデータを利用
READ_SPEC_FILES = False
READ_EEG_SPEC_FILES = False

In [30]:
%%time
# READ ALL SPECTROGRAMS
# スペクトログラムの読み込み
# PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
PATH = '../input/hms-harmful-brain-activity-classification/train_spectrograms/'
files = os.listdir(PATH)
print(f'There are {len(files)} spectrogram parquets')

if READ_SPEC_FILES:    
    spectrograms = {}
    for i,f in enumerate(files):
        if i%100==0: print(i,', ',end='')
        tmp = pd.read_parquet(f'{PATH}{f}')
        name = int(f.split('.')[0])
        spectrograms[name] = tmp.iloc[:,1:].values
else:
    # spectrograms = np.load('../input/brain-spectrograms/specs.npy',allow_pickle=True).item()
    spectrograms = np.load('../input/hms-harmful-brain-activity-classification/brain-spectrograms/specs.npy',allow_pickle=True).item()

There are 11138 spectrogram parquets
CPU times: total: 62.5 ms
Wall time: 6.16 s


In [31]:
%%time
# READ ALL EEG SPECTROGRAMS
# 生のEEGデータから作成されたEEG スペクトログラムの読み込み
if READ_EEG_SPEC_FILES:
    all_eegs = {}
    for i,e in enumerate(train.eeg_id.values):
        if i%100==0: print(i,', ',end='')
        x = np.load(f'../input/hms-harmful-brain-activity-classification/brain-eeg-spectrograms/EEG_Spectrograms/{e}.npy')
        all_eegs[e] = x
else:
    # all_eegs = np.load('/kaggle/input/brain-eeg-spectrograms/eeg_specs.npy',allow_pickle=True).item()
    all_eegs = np.load('../input/hms-harmful-brain-activity-classification/brain-eeg-spectrograms/eeg_specs.npy',allow_pickle=True).item()

CPU times: total: 984 ms
Wall time: 20.3 s


In [32]:
%time
# ENGINEER FEATURES
import warnings
warnings.filterwarnings('ignore')

# FEATURE NAMES
# 元の Spectrogram のファイルから、列名を取得
SPEC_COLS = pd.read_parquet(f'{PATH}1000086677.parquet').columns[1:]
FEATURES = [f'{c}_mean_10m' for c in SPEC_COLS]             # スペクトログラム: 10分の時間窓の時間平均
FEATURES += [f'{c}_min_10m' for c in SPEC_COLS]             # スペクトログラム: 10分の時間窓の時間最小値
FEATURES += [f'{c}_mean_20s' for c in SPEC_COLS]            # スペクトログラム: 20秒の時間窓の時間平均
FEATURES += [f'{c}_min_20s' for c in SPEC_COLS]             # スペクトログラム: 20秒の時間窓の時間最小値
FEATURES += [f'eeg_mean_f{x}_10s' for x in range(512)]      # 脳波: 10分の時間窓の時間平均
FEATURES += [f'eeg_min_f{x}_10s' for x in range(512)]       # 脳波: 10分の時間窓の時間最小値
FEATURES += [f'eeg_max_f{x}_10s' for x in range(512)]       # 脳波: 20秒の時間窓の時間平均
FEATURES += [f'eeg_std_f{x}_10s' for x in range(512)]       # 脳波: 20秒の時間窓の時間平均
print(f'We are creating {len(FEATURES)} features for {len(train)} rows... ',end='')

data = np.zeros((len(train),len(FEATURES)))
for k in range(len(train)):
    if k%100==0: print(k,', ',end='')
    row = train.iloc[k]
    r = int( (row['min'] + row['max'])//4 ) 

    ### スペクトログラム特徴量の計算
    # 10 MINUTE WINDOW FEATURES (MEANS and MINS)
    # 10分時間窓の特徴量計算
    x = np.nanmean(spectrograms[row.spec_id][r:r+300,:],axis=0)
    data[k,:400] = x
    x = np.nanmin(spectrograms[row.spec_id][r:r+300,:],axis=0)
    data[k,400:800] = x

    # 20 SECOND WINDOW FEATURES (MEANS and MINS)
    # 20秒時間窓の特徴量計算
    x = np.nanmean(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
    data[k,800:1200] = x
    x = np.nanmin(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
    data[k,1200:1600] = x

    ### EEG特徴量の計算
    # RESHAPE EEG SPECTROGRAMS 128x256x4 => 512x256
    eeg_spec = np.zeros((512,256),dtype='float32')
    xx = all_eegs[row.eeg_id]
    for j in range(4): eeg_spec[128*j:128*(j+1),] = xx[:,:,j]

    # 10 SECOND WINDOW FROM EEG SPECTROGRAMS 
    # 10秒時間窓の特徴量の計算（mean, min, max, std）
    x = np.nanmean(eeg_spec.T[100:-100,:],axis=0)
    data[k,1600:2112] = x
    x = np.nanmin(eeg_spec.T[100:-100,:],axis=0)
    data[k,2112:2624] = x
    x = np.nanmax(eeg_spec.T[100:-100,:],axis=0)
    data[k,2624:3136] = x
    x = np.nanstd(eeg_spec.T[100:-100,:],axis=0)
    data[k,3136:3648] = x

train[FEATURES] = data
print(); print('New train shape:',train.shape)

CPU times: total: 0 ns
Wall time: 0 ns


We are creating 3648 features for 17089 rows... 0 , 100 , 200 , 300 , 400 , 500 , 600 , 700 , 800 , 900 , 1000 , 1100 , 1200 , 1300 , 1400 , 1500 , 1600 , 1700 , 1800 , 1900 , 2000 , 2100 , 2200 , 2300 , 2400 , 2500 , 2600 , 2700 , 2800 , 2900 , 3000 , 3100 , 3200 , 3300 , 3400 , 3500 , 3600 , 3700 , 3800 , 3900 , 4000 , 4100 , 4200 , 4300 , 4400 , 4500 , 4600 , 4700 , 4800 , 4900 , 5000 , 5100 , 5200 , 5300 , 5400 , 5500 , 5600 , 5700 , 5800 , 5900 , 6000 , 6100 , 6200 , 6300 , 6400 , 6500 , 6600 , 6700 , 6800 , 6900 , 7000 , 7100 , 7200 , 7300 , 7400 , 7500 , 7600 , 7700 , 7800 , 7900 , 8000 , 8100 , 8200 , 8300 , 8400 , 8500 , 8600 , 8700 , 8800 , 8900 , 9000 , 9100 , 9200 , 9300 , 9400 , 9500 , 9600 , 9700 , 9800 , 9900 , 10000 , 10100 , 10200 , 10300 , 10400 , 10500 , 10600 , 10700 , 10800 , 10900 , 11000 , 11100 , 11200 , 11300 , 11400 , 11500 , 11600 , 11700 , 11800 , 11900 , 12000 , 12100 , 12200 , 12300 , 12400 , 12500 , 12600 , 12700 , 12800 , 12900 , 13000 , 13100 , 13200 , 

In [33]:
# FREE MEMORY
del all_eegs, spectrograms, data
gc.collect()

0

In [34]:
# KNNに入れるため、NaNを埋める
# ここでは、列ごとの平均にする
train = train.fillna(train.mean())

In [35]:
train.columns

Index(['eeg_id', 'spec_id', 'min', 'max', 'patient_id', 'seizure_vote',
       'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote',
       ...
       'eeg_std_f502_10s', 'eeg_std_f503_10s', 'eeg_std_f504_10s',
       'eeg_std_f505_10s', 'eeg_std_f506_10s', 'eeg_std_f507_10s',
       'eeg_std_f508_10s', 'eeg_std_f509_10s', 'eeg_std_f510_10s',
       'eeg_std_f511_10s'],
      dtype='object', length=3660)

# Train knn

In [36]:
# target列だけでなく、*_vote列も削除？
TARS = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}

X = train.drop(['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote',  'target'], axis = 1).values
y = train['target'].map(TARS).values

In [37]:
X.shape

(17089, 3653)

In [38]:
from joblib import dump, load

In [39]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.neighbors import KNeighborsClassifier
import json

all_oof = []
all_true = []
# Target変数のラベルの定義



gkf = GroupKFold(n_splits=5)
for i, (train_index, valid_index) in enumerate(gkf.split(train, train.target, train.patient_id)):   
    
    print('#'*25)
    print(f'### Fold {i+1}')
    print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
    print('#'*25)
    
    # trainデータ / validデータを作成
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    model = KNeighborsClassifier(n_neighbors = 6, metric = 'minkowski', p = 2)

    # モデルの学習
    model.fit(X_train, y_train)

    # モデルの保存
    dump(model, f'models/KNN_v{VER}_f{i}.cat')

    # モデルのパラメータの保存
    params = model.get_params()
    with open(f'models/KNN_v{VER}_f{i}_params.txt', 'w') as f:
        json.dump(params, f)

    # 各 fold の予測確率
    oof = model.predict_proba(X_valid)
    all_oof.append(oof)
    all_true.append(train.loc[valid_index, TARGETS].values)
    
    del X_train, X_valid, y_train, y_valid, oof #model
    gc.collect()
    
    #break
    
all_oof = np.concatenate(all_oof)   # 全validationデータに対する予測確率
all_true = np.concatenate(all_true) # 全validationデータのターゲット変数の実測値


#########################
### Fold 1
### train size 13671, valid size 3418
#########################
#########################
### Fold 2
### train size 13671, valid size 3418
#########################
#########################
### Fold 3
### train size 13671, valid size 3418
#########################
#########################
### Fold 4
### train size 13671, valid size 3418
#########################
#########################
### Fold 5
### train size 13672, valid size 3417
#########################


# Feature Importance
feature importance 上位25件

In [40]:
# from sklearn.inspection import permutation_importance
# result = permutation_importance(model, X, y, n_repeats = 10, random_state = 0)
# print(result.importances_mean)

# CV Score for CatBoost
This is CV score for our CatBoost model.

In [41]:
import sys
# sys.path.append('/kaggle/input/kaggle-kl-div')
sys.path.append('../input/kaggle-kl-div')
from kaggle_kl_div import score

oof = pd.DataFrame(all_oof.copy())
oof['id'] = np.arange(len(oof))

true = pd.DataFrame(all_true.copy())
true['id'] = np.arange(len(true))

cv = score(solution=true, submission=oof, row_id_column_name='id')
print('CV Score KL-Div for KNN =',cv)

# CV Score KL-Divの保存
with open(f'models/CV_Score_v{VER}.txt', 'w') as f:
    f.write(str(cv) + "\n")

CV Score KL-Div for KNN = 10.38234918145436


# CV Score for Preds 1/6
This is CV score for Kaggle's sample submission.csv which uses equal predictions of 1/6 for all targets.

各目的変数の重みをすべて 1/6 にしたときのCVスコア

In [42]:
oof = pd.DataFrame(all_oof.copy())
for c in oof.columns:
    oof[c] = 1/6.
oof['id'] = np.arange(len(oof))

true = pd.DataFrame(all_true.copy())
true['id'] = np.arange(len(true))

cv = score(solution=true, submission=oof, row_id_column_name='id')
print('CV Score for "Use Equal Preds 1/6" =',cv)

# CV Score for "Use Equal Preds 1/6"の保存
with open(f'models/CV_Score_USE_EQUAL_PREDS_v{VER}.txt', 'w') as f:
    f.write(str(cv) + "\n")

CV Score for "Use Equal Preds 1/6" = 1.4563246139800803


# CV Score for EEG_Id Means
This is CV score for current highest scoring public notebook [here][1] which uses train means as predictions.

学習データのmeanを用いたCVスコア

[1]: https://www.kaggle.com/code/seshurajup/eda-train-csv

In [43]:
all_oof2 = []

gkf = GroupKFold(n_splits=5)
for i, (train_index, valid_index) in enumerate(gkf.split(train, train.target, train.patient_id)):  
    #print('#'*25)
    #print(f'### Fold {i+1}')
        
    y_train = train.iloc[train_index][TARGETS].values
    y_valid = train.iloc[valid_index][TARGETS].values
    
    #print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
    #print('#'*25)
        
    oof = y_valid.copy()
    for j in range(6):
        oof[:,j] = y_train[:,j].mean()
    oof = oof / oof.sum(axis=1,keepdims=True)
    all_oof2.append(oof)
    
all_oof2 = np.concatenate(all_oof2)

In [44]:
oof = pd.DataFrame(all_oof2.copy())
oof['id'] = np.arange(len(oof))

true = pd.DataFrame(all_true.copy())
true['id'] = np.arange(len(true))

cv = score(solution=true, submission=oof, row_id_column_name='id')
print('CV Score for "Use Train Means" =',cv)

# CV Score for "Use Equal Preds 1/6"の保存
with open(f'models/CV_Score_EEG_ID_MEANS_v{VER}.txt', 'w') as f:
    f.write(str(cv) + "\n")

CV Score for "Use Train Means" = 1.2641160568651757
