Base notebook: https://www.kaggle.com/code/cdeotte/catboost-starter-lb-0-60

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import time

import os, gc
# Restrict CUDA-based libraries started from this process to GPU devices 0 and 1.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [None]:
# Load the competition training metadata table.
train_df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
# The last six columns are used as the target columns throughout the notebook.
TARGETS = train_df.columns[-6:]
print('Train shape:', train_df.shape )
print('Targets', list(TARGETS))
train_df.head()

In [None]:
# train_df['eeg_sub_id'].unique() # to 742

In [None]:
# Collapse train_df to a single row per eeg_id (non-overlapping EEG ids).
by_eeg = train_df.groupby('eeg_id')

# First spectrogram id plus the smallest / largest label offset seen for each EEG.
train = by_eeg.agg(
    spec_id=('spectrogram_id', 'first'),
    min_spec=('spectrogram_label_offset_seconds', 'min'),
    max_spec=('spectrogram_label_offset_seconds', 'max'),
)
train['patient_id'] = by_eeg['patient_id'].first()

# Sum the votes over all rows of an EEG, then normalise each row to sum to one.
y_data = by_eeg[TARGETS].sum().values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train[TARGETS] = y_data

# Hard label: the first expert_consensus value recorded for the EEG.
train['target'] = by_eeg['expert_consensus'].first()

# Turn the eeg_id index back into an ordinary column (rows become 0..n-1).
train = train.reset_index()

print('Train non-overlapp eeg_id shape:', train.shape )
train.head()

In [None]:
# Directories of the per-id parquet files shipped with the competition data
# (one file is read from each below to obtain column names).
SPEC_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
EEG_PATH = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/'

In [None]:
%%time
# READ ALL EEGS
train_eegs = np.load('',allow_pickle=True).item()

In [None]:
%%time
# READ ALL SPECTROGRAMS from Chris Deotte dataset
# The .npy file stores one pickled object; .item() unwraps it from the 0-d array.
# Later cells index the result by spectrogram id (spectrograms[row.spec_id]).
# NOTE: allow_pickle=True can execute code embedded in the file; load trusted datasets only.
spectrograms = np.load('/kaggle/input/brain-spectrograms/specs.npy',allow_pickle=True).item()

In [None]:
%%time
# READ ALL EEG SPECTROGRAMS from Chris Deotte dataset
# The .npy file stores one pickled object; .item() unwraps it from the 0-d array.
# Later cells index the result by eeg id (all_eegs[row.eeg_id]).
# NOTE: allow_pickle=True can execute code embedded in the file; load trusted datasets only.
all_eegs = np.load('/kaggle/input/brain-eeg-spectrograms/eeg_specs.npy',allow_pickle=True).item()

In [None]:
# Column names of one sample spectrogram file, skipping the first column
# (presumably a time column; confirm against the parquet schema).
SPEC_COLS = pd.read_parquet(f'{SPEC_PATH}1000086677.parquet').columns[1:]
print(f'{len(SPEC_COLS)} columns')
SPEC_COLS

In [None]:
# Column names of one sample raw-EEG file. Only the commented-out 50 second
# feature names in the next cell refer to EEG_COLS.
EEG_COLS = pd.read_parquet(f'{EEG_PATH}1000913311.parquet').columns
EEG_COLS

In [None]:
# SPEC FEATURES: seven statistics per spectrogram column, first for the
# 10 MINUTE WINDOW and then for the 20 SECOND WINDOW. The nesting order
# (window -> statistic -> column) fixes the column layout of the feature matrix.
FEATURES = [
    f'{c}_{stat}_{window}'
    for window in ('10m', '20s')
    for stat in ('mean', 'std', 'min', '25%', '50%', '75%', 'max')
    for c in SPEC_COLS
]

# EEG FEATURES 50 SECOND WINDOW (disabled):
# FEATURES += [f'{c}_{stat}_50s'
#              for stat in ('mean','std','min','25%','50%','75%','max','skew','kurtosis')
#              for c in EEG_COLS]

# EEG FEATURES 10 SECOND WINDOW: four statistics for each of the 512 rows of
# the stacked EEG spectrogram.
FEATURES += [
    f'eeg_{stat}_f{x}_10s'
    for stat in ('mean', 'min', 'max', 'std')
    for x in range(512)
]
print(f'We are creating {len(FEATURES)} features for {len(train)} rows... ')

In [None]:
# Feature matrix: one row per train row, one column per FEATURES entry (filled in below).
data = np.zeros((len(train),len(FEATURES)))  
data.shape

In [None]:
# row = train.iloc[0]
# np.nanpercentile(spectrograms[row.spec_id], 25, axis=0)
# from scipy.stats import skew, kurtosis
# x = skew(train_eegs[row.spec_id][r+145:r+155,:],axis=0)

In [None]:
%time
import warnings
warnings.filterwarnings('ignore')

# ENGINEER FEATURES
for k in range(len(train)):
    if k%100==0: print(k,', ',end='')
    row = train.iloc[k]
    r = int( (row['min_spec'] + row['max_spec'])//4 ) 

    # 10 MINUTE WINDOW FEATURES
    x = np.nanmean(spectrograms[row.spec_id][r:r+300,:],axis=0)
    data[k,:400] = x
    x = np.nanstd(spectrograms[row.spec_id][r:r+300,:],axis=0)
    data[k,400:800] = x
    x = np.nanmin(spectrograms[row.spec_id][r:r+300,:],axis=0)
    data[k,800:1200] = x
    x = np.nanpercentile(spectrograms[row.spec_id][r:r+300,:], 25, axis=0)
    data[k,1200:1600] = x
    x = np.nanpercentile(spectrograms[row.spec_id][r:r+300,:], 50, axis=0)
    data[k,1600:2000] = x
    x = np.nanpercentile(spectrograms[row.spec_id][r:r+300,:], 75, axis=0)
    data[k,2000:2400] = x
    x = np.nanmax(spectrograms[row.spec_id][r:r+300,:],axis=0)
    data[k,2400:2800] = x

    # 20 SECOND WINDOW FEATURES
    x = np.nanmean(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
    data[k,2800:3200] = x
    x = np.nanstd(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
    data[k,3200:3600] = x
    x = np.nanmin(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
    data[k,3600:4000] = x
    x = np.nanpercentile(spectrograms[row.spec_id][r+145:r+155,:], 25, axis=0)
    data[k,4000:4400] = x
    x = np.nanpercentile(spectrograms[row.spec_id][r+145:r+155,:], 50, axis=0)
    data[k,4400:4800] = x
    x = np.nanpercentile(spectrograms[row.spec_id][r+145:r+155,:], 75, axis=0)
    data[k,4800:5200] = x
    x = np.nanmax(spectrograms[row.spec_id][r+145:r+155,:],axis=0)
    data[k,5200:5600] = x

    # RESHAPE EEG SPECTROGRAMS 128x256x4 => 512x256
    eeg_spec = np.zeros((512,256),dtype='float32')
    xx = all_eegs[row.eeg_id]
    for j in range(4): eeg_spec[128*j:128*(j+1),] = xx[:,:,j]

    # 10 SECOND WINDOW FROM EEG SPECTROGRAMS 
    x = np.nanmean(eeg_spec.T[100:-100,:],axis=0)
    data[k,5600:6112] = x
    x = np.nanmin(eeg_spec.T[100:-100,:],axis=0)
    data[k,6112:6624] = x
    x = np.nanmax(eeg_spec.T[100:-100,:],axis=0)
    data[k,6624:7136] = x
    x = np.nanstd(eeg_spec.T[100:-100,:],axis=0)
    data[k,7136:7648] = x

train[FEATURES] = data
print(); print('New train shape:',train.shape)

In [None]:
# Drop the large source dictionaries and the scratch matrix; the features now live in train.
del all_eegs, spectrograms, data
gc.collect()

In [None]:
# Show the first row of the engineered training frame.
train[:1]

In [None]:
# Summarise the dtypes and memory footprint of the training frame.
train.info()

# Train

In [None]:
import catboost as cat
from catboost import CatBoostClassifier, Pool

# Record the library version alongside the results.
print('CatBoost version',cat.__version__)

In [None]:
from sklearn.model_selection import KFold, GroupKFold

# Integer class id for each expert_consensus label.
TARS = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}

# Out-of-fold probabilities and the matching vote distributions, one array per fold.
all_oof, all_true = [], []

# Five folds grouped by patient_id, so a patient never appears on both sides of a split.
gkf = GroupKFold(n_splits=5)
fold_splits = gkf.split(train, train.target, train.patient_id)
for i, (train_index, valid_index) in enumerate(fold_splits):

    banner = '#'*25
    print(banner)
    print(f'### Fold {i+1}')
    print(f'### train size {len(train_index)}, valid size {len(valid_index)}')
    print(banner)

    # One Pool per side of the split; labels are the mapped class ids.
    train_pool, valid_pool = (
        Pool(data = train.loc[idx,FEATURES],
             label = train.loc[idx,'target'].map(TARS))
        for idx in (train_index, valid_index)
    )

    model = CatBoostClassifier(task_type='GPU', loss_function='MultiClass')
    model.fit(train_pool, verbose=100, eval_set=valid_pool)
    model.save_model(f'CAT_f{i}.cat')

    # Keep the validation probabilities together with the soft vote targets.
    all_oof.append(model.predict_proba(valid_pool))
    all_true.append(train.loc[valid_index, TARGETS].values)

    # `model` is deliberately kept: the feature-importance cell reads it.
    del train_pool, valid_pool
    gc.collect()

all_oof = np.concatenate(all_oof)
all_true = np.concatenate(all_true)

# Cross-validation (CV) score

In [None]:
import sys
sys.path.append('/kaggle/input/kaggle-kl-div')
from kaggle_kl_div import score

# The metric is called with two frames that share an 'id' row-identifier column.
oof = pd.DataFrame(all_oof.copy()).assign(id=np.arange(len(all_oof)))
true = pd.DataFrame(all_true.copy()).assign(id=np.arange(len(all_true)))

cv = score(solution=true, submission=oof, row_id_column_name='id')
print('CV Score KL-Div for CatBoost =',cv)

In [None]:
TOP = 25

# Importances are read from `model`, i.e. whichever model the notebook bound last.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)

# The TOP largest importances, ascending, and the y positions they are drawn at.
top_idx = sorted_idx[-TOP:]
bar_pos = np.arange(len(sorted_idx))[-TOP:]

fig = plt.figure(figsize=(10, 8))
plt.barh(bar_pos, feature_importance[top_idx], align='center')
plt.yticks(bar_pos, np.array(FEATURES)[top_idx])
plt.title(f'Feature Importance - Top {TOP}')
plt.show()

# Test

In [None]:
# Load the test metadata; later cells read its spectrogram_id and eeg_id columns.
test = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
print('Test shape',test.shape)
test.head()

In [None]:
import pywt, librosa

# Wavelet name handed to denoise(); a falsy value (None) skips denoising.
USE_WAVELET = None 

# Labels for the four spectrogram channels, in the same order as FEATS.
NAMES = ['LL','LP','RP','RR']

# One electrode chain per channel; spectrogram_from_eeg differences each
# consecutive pair of electrodes within a chain (4 differences per chain).
FEATS = [['Fp1','F7','T3','T5','O1'],
         ['Fp1','F3','C3','P3','O1'],
         ['Fp2','F8','T4','T6','O2'],
         ['Fp2','F4','C4','P4','O2']]

# DENOISE FUNCTION
def maddest(d, axis=None):
    """Return the mean absolute deviation of ``d`` about its mean.

    Both the centring mean and the final average are taken along ``axis``
    (over all elements when ``axis`` is None). Note that this is a *mean*
    absolute deviation, not a median-based one.
    """
    centre = np.mean(d, axis)
    abs_dev = np.absolute(d - centre)
    return np.mean(abs_dev, axis)

def denoise(x, wavelet='haar', level=1):
    """Wavelet-threshold denoising of a 1-D signal.

    Parameters
    ----------
    x : signal to denoise.
    wavelet : wavelet name passed to PyWavelets.
    level : which coefficient array, counted from the end of the
        decomposition (``coeffs[-level]``), is used to estimate the noise scale.

    Returns
    -------
    The signal reconstructed from the thresholded coefficients.
    """
    # Decompose with periodization boundary handling.
    coeffs = pywt.wavedec(x, wavelet, mode="per")

    # Noise scale from the chosen coefficient array, then the threshold
    # sigma * sqrt(2 * ln(n)).
    sigma = (1/0.6745) * maddest(coeffs[-level])
    uthresh = sigma * np.sqrt(2*np.log(len(x)))

    # Hard-threshold everything except the first (approximation) array.
    coeffs[1:] = [pywt.threshold(band, value=uthresh, mode='hard') for band in coeffs[1:]]

    return pywt.waverec(coeffs, wavelet, mode='per')

def spectrogram_from_eeg(parquet_path, display=False):
    """Build a 4-channel mel spectrogram image from one raw EEG parquet file.

    The middle 10,000 rows of the recording are used (50 seconds at the
    sr=200 passed to librosa). For each electrode chain in FEATS the four
    consecutive electrode differences are converted to log-mel spectrograms
    and averaged into one channel.

    Parameters
    ----------
    parquet_path : path of the EEG parquet file; its file name without the
        extension is used as the EEG id in plot titles.
    display : when true, plot the four spectrogram channels and a signal plot.

    Returns
    -------
    np.ndarray of shape (128, 256, 4), dtype float32, channels ordered as NAMES.
    """

    # Take the id from the path instead of relying on a notebook-level `eeg_id`
    # variable, so display=True works no matter where the function is called from.
    eeg_id = str(parquet_path).split('/')[-1].split('.')[0]

    # LOAD MIDDLE 50 SECONDS OF EEG SERIES
    eeg = pd.read_parquet(parquet_path)
    middle = (len(eeg)-10_000)//2
    eeg = eeg.iloc[middle:middle+10_000]

    # VARIABLE TO HOLD SPECTROGRAM
    img = np.zeros((128,256,4),dtype='float32')

    if display: plt.figure(figsize=(10,7))
    signals = []
    for k in range(4):
        COLS = FEATS[k]

        for kk in range(4):

            # COMPUTE PAIR DIFFERENCES
            x = eeg[COLS[kk]].values - eeg[COLS[kk+1]].values

            # FILL NANS with the signal mean; an all-NaN signal becomes all zeros
            m = np.nanmean(x)
            if np.isnan(x).mean()<1: x = np.nan_to_num(x,nan=m)
            else: x[:] = 0

            # DENOISE
            if USE_WAVELET:
                x = denoise(x, wavelet=USE_WAVELET)
            signals.append(x)

            # RAW SPECTROGRAM
            mel_spec = librosa.feature.melspectrogram(y=x, sr=200, hop_length=len(x)//256,
                  n_fft=1024, n_mels=128, fmin=0, fmax=20, win_length=128)

            # LOG TRANSFORM; trim the frame count down to a multiple of 32
            width = (mel_spec.shape[1]//32)*32
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:,:width]

            # STANDARDIZE TO -1 TO 1
            mel_spec_db = (mel_spec_db+40)/40
            img[:,:,k] += mel_spec_db

        # AVERAGE THE 4 MONTAGE DIFFERENCES
        img[:,:,k] /= 4.0

        if display:
            plt.subplot(2,2,k+1)
            plt.imshow(img[:,:,k],aspect='auto',origin='lower')
            plt.title(f'EEG {eeg_id} - Spectrogram {NAMES[k]}')

    if display:
        plt.show()
        plt.figure(figsize=(10,5))
        offset = 0
        # NOTE(review): `signals` holds 16 pair differences (4 per chain), yet only
        # signals[0..3] are drawn, labelled with NAMES[3-k] and offset using
        # signals[3-k]. Left unchanged; confirm what this plot is meant to show.
        for k in range(4):
            if k>0: offset -= signals[3-k].min()
            plt.plot(range(10_000),signals[k]+offset,label=NAMES[3-k])
            offset += signals[3-k].max()
        plt.legend()
        plt.title(f'EEG {eeg_id} Signals')
        plt.show()
        print(); print('#'*25); print()

    return img

In [None]:
# Directory holding the raw test EEG parquet files.
PATH2 = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/'
# Number of leading EEGs to plot while converting; 0 disables plotting.
DISPLAY = 0
EEG_IDS2 = test.eeg_id.unique()
# Maps eeg_id -> spectrogram image returned by spectrogram_from_eeg.
all_eegs2 = {}

print('Converting Test EEG to Spectrograms...'); print()
for i,eeg_id in enumerate(EEG_IDS2):
        
    # CREATE SPECTROGRAM FROM EEG PARQUET
    img = spectrogram_from_eeg(f'{PATH2}{eeg_id}.parquet', i<DISPLAY)
    all_eegs2[eeg_id] = img

In [None]:
# FEATURE ENGINEER TEST
PATH2 = '/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/'
data = np.zeros((len(test),len(FEATURES)))

# Column-wise statistics in the order their 400-wide (spectrogram) and
# 512-wide (EEG) column groups are laid out in `data`.
spec_stats = (
    lambda w: np.nanmean(w, axis=0),
    lambda w: np.nanstd(w, axis=0),
    lambda w: np.nanmin(w, axis=0),
    lambda w: np.nanpercentile(w, 25, axis=0),
    lambda w: np.nanpercentile(w, 50, axis=0),
    lambda w: np.nanpercentile(w, 75, axis=0),
    lambda w: np.nanmax(w, axis=0),
)
eeg_stats = (np.nanmean, np.nanmin, np.nanmax, np.nanstd)

# ENGINEER FEATURES
for k in range(len(test)):
    row = test.iloc[k]
    r = int( row.spectrogram_id )
    spec = pd.read_parquet(f'{PATH2}{r}.parquet')

    # Every column except the first, as one array shared by both windows.
    values = spec.iloc[:,1:].values

    # 10 MINUTE WINDOW FEATURES (all rows), then 20 SECOND WINDOW FEATURES (rows 145-154)
    col = 0
    for window in (values, values[145:155]):
        for stat in spec_stats:
            data[k,col:col+400] = stat(window)
            col += 400

    # RESHAPE EEG SPECTROGRAMS 128x256x4 => 512x256
    eeg_spec = np.zeros((512,256),dtype='float32')
    xx = all_eegs2[row.eeg_id]
    for j in range(4): eeg_spec[128*j:128*(j+1),] = xx[:,:,j]

    # 10 SECOND WINDOW FROM EEG SPECTROGRAMS
    eeg_mid = eeg_spec.T[100:-100,:]
    for stat in eeg_stats:
        data[k,col:col+512] = stat(eeg_mid,axis=0)
        col += 512

test[FEATURES] = data
print(); print('New test shape:',test.shape)

In [None]:
# Display the test frame with its engineered feature columns.
test

# Submission

In [None]:
# INFER CATBOOST ON TEST
# The test features are the same for every fold model, so the Pool is built once.
test_pool = Pool(
    data = test[FEATURES]
)

# One probability matrix per saved fold model (CAT_f0.cat ... CAT_f4.cat).
preds = []
for i in range(5):
    print(i,', ',end='')
    model = CatBoostClassifier(task_type='GPU')
    model.load_model(f'CAT_f{i}.cat')
    preds.append(model.predict_proba(test_pool))

In [None]:
# Average the fold models' probability matrices element-wise.
pred = np.mean(preds,axis=0)
print(pred)
print('Test preds shape',pred.shape)

In [None]:
# Submission frame: eeg_id followed by the averaged probabilities, one column per TARGETS name.
sub = pd.DataFrame({'eeg_id':test.eeg_id.values})
sub[TARGETS] = pred
sub

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv('submission.csv',index=False)
print('Submissionn shape',sub.shape)
sub.head()

In [None]:
# SANITY CHECK TO CONFIRM PREDICTIONS SUM TO ONE
# Row sums over the last six columns (the probability columns); each should be close to 1.
sub.iloc[:,-6:].sum(axis=1)