In [None]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import pywt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import log_loss, confusion_matrix, f1_score

In [None]:
# Expose the first two CUDA devices to TensorFlow.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

# Pick a distribution strategy that matches the hardware TensorFlow can actually see:
# several GPUs -> mirrored training, one GPU -> that GPU, no GPU -> CPU fallback
# (the previous code targeted "/gpu:0" even when no GPU was listed).
gpus = tf.config.list_physical_devices('GPU')
if len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy()
    print(f'Using {len(gpus)} GPUs')
elif len(gpus) == 1:
    strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")
    print(f'Using {len(gpus)} GPU')
else:
    strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    print('No GPU found, using CPU')

In [None]:
# Competition tables: training labels, test metadata and the submission template.
train_df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
test_df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
submission = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv')

# Pre-computed spectrograms stored as a pickled object; later code calls `.item()` on the
# loaded array and indexes the result by spectrogram id.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load this file from a trusted source.
spectrograms = np.load('/kaggle/input/brain-spectrograms/specs.npy',allow_pickle=True)

In [None]:
# Preview the first rows of the raw train and test tables.
display(train_df.head())
display(test_df.head())

In [None]:
# The six per-class vote columns are the last six columns of train_df.
train_votes = train_df.columns[-6:]

# One row per eeg_id: first spectrogram id / patient id / consensus label,
# plus the smallest and largest spectrogram label offset seen for that EEG.
train = train_df.groupby('eeg_id').agg(
    spec_id=('spectrogram_id', 'first'),
    spec_offset_min=('spectrogram_label_offset_seconds', 'min'),
    spec_offset_max=('spectrogram_label_offset_seconds', 'max'),
    patient_id=('patient_id', 'first'),
    target=('expert_consensus', 'first'),
)

# Sum the votes over every row of an EEG, then normalise each row so the six values sum to 1.
tmp = train_df.groupby('eeg_id')[train_votes].sum()
tmp = tmp.div(tmp.sum(axis=1), axis=0)

# Both frames are indexed by eeg_id, so the assignment aligns row by row.
train[train_votes] = tmp

# Keep the target as the last column for readability, then turn eeg_id back into a column.
train = train[train.columns.drop('target').tolist() + ['target']].reset_index()

display(train.head())

## Denoising function

This can be useful later to generate spectrograms from the EEG signals.

In [None]:
def maddest(d, axis=None):
    """Return the mean absolute deviation of `d` around its mean.

    Parameters
    ----------
    d : array-like
        Input values.
    axis : int or None
        Axis along which the deviation is computed; None (default) uses all values.

    Returns
    -------
    A scalar when `axis` is None, otherwise an array with `axis` removed.
    """
    d = np.asarray(d)
    # keepdims=True keeps the reduced axis so the mean broadcasts against `d`
    # for every axis, not only for axis=None or the leading axis.
    center = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - center), axis=axis)

def denoise(x, wavelet='haar', level=1):
    """Wavelet-denoise every column of a DataFrame independently.

    For each column the signal is decomposed with `pywt.wavedec`, a noise scale is
    estimated from the `level`-th coefficient array counted from the end, every detail
    coefficient array is hard-thresholded, and the signal is rebuilt with `pywt.waverec`.

    Parameters
    ----------
    x : pandas.DataFrame
        One signal per column.
    wavelet : str
        Wavelet name passed to pywt.
    level : int
        Which coefficient array (from the end) is used to estimate the noise scale.

    Returns
    -------
    pandas.DataFrame with the same column names holding the reconstructed signals.
    """
    denoised = {}

    for column in x.columns:
        coefficients = pywt.wavedec(x[column], wavelet, mode="per")

        # noise scale estimated from the chosen coefficient array
        sigma = (1/0.6745) * maddest(coefficients[-level])
        threshold = sigma * np.sqrt(2*np.log(len(x)))

        # the first (approximation) array is kept untouched; the others are thresholded
        thresholded = [pywt.threshold(c, value=threshold, mode='hard') for c in coefficients[1:]]
        coefficients = [coefficients[0]] + thresholded

        denoised[column] = pywt.waverec(coefficients, wavelet, mode='per')

    return pd.DataFrame(denoised)

# eeg_denoised = denoise(eeg, wavelet="db8")

## Plotting function

In [None]:
# 2366870, 2259539799
def eeg_info(eeg_id=36718960):
    """Display the label rows of one EEG and plot its sub-samples, Fp1 signal and one spectrogram column.

    Three figures are produced:
      1. one horizontal bar per sub EEG (50 s wide, starting at its label offset), with the
         middle 10 s highlighted and the expert consensus written on the bar;
      2. the whole `Fp1` channel with the start and end of every sub EEG, offsets being
         converted to sample indices at 200 samples/s;
      3. the `LL_0.59` spectrogram column against `time`, with the start and end of every
         600 s spectrogram window.

    Parameters
    ----------
    eeg_id : int
        Value of the `eeg_id` column of `train_df`; also the stem of the EEG parquet file.

    Raises
    ------
    ValueError
        If `train_df` contains no row for `eeg_id`.
    """
    eeg_window_s = 50        # width of one sub EEG, in seconds
    vote_half_width_s = 5    # half-width of the highlighted middle zone, in seconds
    sample_rate = 200        # EEG samples per second
    spec_window_s = 600      # width of one spectrogram sub-sample, in seconds

    eeg_df = train_df[train_df["eeg_id"] == eeg_id].reset_index(drop=True)
    if eeg_df.empty:
        raise ValueError(f'eeg_id {eeg_id} not found in train_df')

    spec_id = eeg_df["spectrogram_id"].values[0]
    spec = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{spec_id}.parquet')
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')

    display(eeg_df)

    # PLOT THE SUB EEG TIME REPRESENTATION
    plt.figure(figsize=(20, min(10, len(eeg_df))))
    plt.barh(y=eeg_df['eeg_sub_id'], width=eeg_window_s, left=eeg_df['eeg_label_offset_seconds'], color='plum')

    y_min, y_max = plt.ylim()
    y_span = abs(y_min) + y_max

    for i, row in eeg_df.iterrows():
        window_mid = row['eeg_label_offset_seconds'] + eeg_window_s/2
        start_mid_range = window_mid - vote_half_width_s
        end_mid_range = window_mid + vote_half_width_s

        # axvspan takes its vertical limits as fractions of the axes height
        ymin, ymax = (y_max - 0.4 - i)/y_span, (y_max + 0.4 - i)/y_span
        plt.axvspan(xmin=start_mid_range, xmax=end_mid_range, ymin=ymin, ymax=ymax, alpha=0.2, color='blue', label='10sec vote zone' if i == 0 else None)
        plt.text(window_mid, i+0.05, f'{row["expert_consensus"]}', color='white', fontweight='bold', horizontalalignment='center')

    plt.gca().invert_yaxis()
    plt.ylabel('EEG sub id')
    plt.xlabel('Time (seconds)')

    max_offset = int(eeg_df["eeg_label_offset_seconds"].max())
    eeg_time = max_offset + eeg_window_s

    plt.title(f'EEG {eeg_id} during {eeg_time} seconds')
    plt.legend()
    plt.show()

    # PLOT THE EEG VOTES
    plt.figure(figsize=(20,5))
    plt.plot(eeg['Fp1'])

    for i, row in eeg_df.iterrows():
        start_sample = row['eeg_label_offset_seconds']*sample_rate
        end_sample = (row['eeg_label_offset_seconds']+eeg_window_s)*sample_rate
        plt.axvline(x=start_sample, color='green', label='Start of subsample' if i == 0 else None)
        plt.axvline(x=end_sample, color='red', label='End of subsample' if i == 0 else None)

    plt.title(f'EEG - Fp1 having {eeg_time}s * {sample_rate} samples/s = {eeg_time*sample_rate} samples')
    plt.xlim(xmin=0)
    plt.grid()
    plt.legend()
    plt.show()

    # PLOT THE SPECTROGRAM
    plt.figure(figsize=(20,5))
    plt.plot(spec['time'], spec['LL_0.59'])

    for i, row in eeg_df.iterrows():
        spec_start = row['spectrogram_label_offset_seconds']
        plt.axvline(x=spec_start, color='green', label='Start of subsample' if i == 0 else None)
        plt.axvline(x=spec_start+spec_window_s, color='red', label='End of subsample' if i == 0 else None)

    plt.title(f'Spectrogram - LL_0.59 during {spec["time"].values[-1]} seconds')
    plt.xlim(xmin=0)
    # NOTE(review): the plotted values are the 'LL_0.59' column - confirm 'Frequency (Hz)' is the intended label.
    plt.ylabel('Frequency (Hz)')
    plt.xlabel('Time (seconds)')
    plt.legend()
    plt.grid()
    plt.show()

# Inspect one example EEG (label rows, sub-sample layout, Fp1 signal, spectrogram column).
eeg_info(2366870)

**Some facts:**<br>
    - Experts vote on the middle 10 seconds of each sub EEG<br>
    - Sub EEGs can overlap<br>
    - Votes seem to be mostly unique to each sub EEG<br>
    - The expert consensus can differ between overlapping sub EEGs for the seconds they share<br>

## Spectrogram work in progress

#### The following code iterates through each row in train and preprocesses 10 minutes of the corresponding spectrogram

In [None]:
# Index along the last axis of X_spec -> name of the spectrogram channel group.
channel_map = {0: "LL", 1: "LP", 2: "RR", 3: "RP"}
# Expert consensus label -> integer class id stored in y_spec.
target_map = {'Seizure': 0, 'LPD': 1, 'GPD': 2, 'LRDA': 3, 'GRDA': 4, 'Other': 5}
i_target_map = {y:x for x,y in target_map.items()} # inverted target map

# Size of one preprocessed spectrogram image (per channel).
IMG_WIDTH = 256
IMG_HEIGHT = 128

# One 4-channel float32 image and one int8 class id per row of `train`;
# both start zero-filled and are populated by generate_spectrograms().
X_spec = np.zeros((len(train), IMG_HEIGHT, IMG_WIDTH, 4), dtype='float32')
y_spec = np.zeros((len(train)), dtype='int8')


def plot_spectrogram(index):
    """Plot the four channel images stored in `X_spec[index]`, one figure per channel.

    Parameters
    ----------
    index : int
        Row of `X_spec` / `y_spec`; it is also the index label of the matching row in
        `train`, because generate_spectrograms() writes each row at its `train` index.
    """
    # `train[index]` would look up a *column* named `index` and raise KeyError;
    # the row has to be addressed through .loc.
    spec_id = train.loc[index, 'spec_id']
    label = i_target_map[int(y_spec[index])]

    for channel in range(4):
        img = X_spec[index,:,:,channel]
        plt.figure(figsize=(10, 2))
        plt.imshow(img, aspect='auto', cmap='viridis', origin='lower')
        plt.xlabel('Time [sec]')
        plt.ylabel('Frequency [Hz]')
        plt.title(f'Spectrogram {spec_id} - {channel_map[channel]} : {label}')
        plt.show()
        
def generate_spectrograms():
    """Fill `X_spec` and `y_spec` with one preprocessed 4-channel spectrogram per row of `train`.

    For every row a random start offset is drawn between `spec_offset_min` and
    `spec_offset_max` (inclusive) and halved to get a row index into the spectrogram.
    For each of the 4 channel groups, 300 rows x 100 columns are extracted, log-scaled,
    z-scored, NaN-filled with 0, cropped to 256 time steps and written into the middle
    100 rows of the 128-row image.

    Reads the module-level `train`, `spectrograms` and `target_map`; writes into the
    module-level `X_spec` and `y_spec`. Returns nothing.

    Note: the offset is drawn with `np.random.randint`, so the result depends on the
    state of NumPy's global random generator.
    """
    # Unwrap the loaded array once instead of once per row.
    spec_by_id = spectrograms.item()

    for row in train.itertuples(): # for each row in train
        i = row.Index # index of the row in train, used as position in X_spec / y_spec
        spec = spec_by_id[row.spec_id]

        # random offset within the labelled range, halved to get the spectrogram row
        r = np.random.randint(row.spec_offset_min, row.spec_offset_max+1)//2

        for k in range(4): # 4 different channel (LL, LP, RR, RP)
            img = spec[r:r+300, k*100:(k+1)*100].T # get 10 random minutes of the spectrogram for each channel

            # log scaling
            img = np.clip(img, np.exp(-4), np.exp(8)) # avoid 0 for log
            img = np.log(img)

            # z-score scaling
            ep = 1e-6
            mean = np.nanmean(img.flatten())
            std = np.nanstd(img.flatten())
            img = (img - mean) / (std + ep)

            # convert nan to value 0
            img = np.nan_to_num(img, nan=0.0)

            # crop to size 256
            img = img[:,22:-22] # 300-22-22=256

            X_spec[i, 14:-14,:, k] = img # 14:-14 to put the IMG_HEIGHT (100) into 128

        # the label does not depend on the channel, so it is written once per row
        y_spec[i] = target_map[row.target]
    
# Populate X_spec / y_spec in place (one pass over every row of `train`).
generate_spectrograms()

**Finally, we have 17 089 spectrograms, each one corresponding to an eeg_id**

In [None]:
# Visual sanity check of the first generated spectrogram (all four channels).
plot_spectrogram(0)

#### Now we will try to create the CNN model

In [None]:
def build_cnn_model(lr=1e-3): # impossible because too much RAM used
    """Build and compile a small CNN classifier for the 4-channel spectrogram images.

    Architecture: two Conv2D(3x3, relu) + BatchNormalization + MaxPooling2D(2x2, stride 1)
    stages, global average pooling, and a softmax layer with one unit per entry of
    `target_map`.

    Parameters
    ----------
    lr : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    The compiled Sequential model (loss: sparse categorical cross-entropy, i.e. integer
    class ids as labels).
    """
    model = Sequential()

    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 4)))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=1))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D((2, 2), strides=1))

    model.add(GlobalAveragePooling2D()) # avoid overfitting and because we have a lot of parameters per image

    # One output per known class: taken from target_map so the layer size does not depend
    # on which labels happen to be present in y_spec.
    model.add(Dense(len(target_map), activation='softmax', dtype='float32')) # to predict probability for each target class

    # `lr` is a deprecated alias that newer Keras versions reject; `learning_rate` is the supported name.
    # NOTE(review): 'KLDivergence' compares probability distributions, while the labels here
    # are integer class ids - confirm this metric is meaningful in this setup.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=lr), metrics=['KLDivergence'])

    return model

# Instantiate the CNN with the default learning rate and print its layer summary.
# NOTE(review): the `strategy` object created earlier is not used here - the model is
# built outside of any `strategy.scope()`; confirm whether that is intended.
model = build_cnn_model()
model.summary()

In [None]:
# Hold out the last 20% of the rows for validation (shuffle=False keeps the original order,
# so random_state has no effect here).
X_train, X_val, y_train, y_val = train_test_split(X_spec, y_spec, test_size=0.2, random_state=42, shuffle=False)

# validation_data must be the held-out (inputs, targets) pair; a bare fraction such as 0.2
# is only valid for the `validation_split` argument.
history = model.fit(X_train, y_train, batch_size=256, epochs=10, validation_data=(X_val, y_val))

## EEG work in progress

Create the dataset for the EEG features

In [None]:
# Build one feature row per labelled sub EEG: the per-channel mean of the samples
# [200*offset + 4000, 200*offset + 6000) of the parent EEG recording.
rows_by_eeg = train_df.groupby('eeg_id')  # group once instead of filtering train_df for every EEG

eeg_feature_frames = []
for eeg_id in train['eeg_id']:
    eeg_df = rows_by_eeg.get_group(eeg_id).reset_index(drop=True)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')

    ranges = [(int(200 * offset + 4000), int(200 * offset + 6000)) for offset in eeg_df['eeg_label_offset_seconds']]
    filtered_eeg = pd.concat([eeg.iloc[s:e].mean().to_frame().T for s, e in ranges]).reset_index(drop=True)

    # The label column keeps the name 'expert_consensus' because the processing and
    # model cells read it under that name (it was previously stored as 'target',
    # which made those cells fail with a KeyError).
    filtered_eeg['expert_consensus'] = eeg_df['expert_consensus']
    filtered_eeg.insert(0, 'eeg_id', eeg_df['eeg_id'])
    filtered_eeg.insert(1, 'spec_id', eeg_df['spectrogram_id'])

    eeg_feature_frames.append(filtered_eeg)

eeg_feature_df = pd.concat(eeg_feature_frames).reset_index(drop=True)
display(eeg_feature_df)

## Processing

In [None]:
# Drop the identifier columns, encode the label as integers, then remove duplicated rows.
le = LabelEncoder()

eeg_feature_df = (
    eeg_feature_df
    .drop(columns=['eeg_id', 'spec_id'])
    .assign(expert_consensus=lambda frame: le.fit_transform(frame['expert_consensus']))
    .drop_duplicates()
)

display(eeg_feature_df)

## Model

In [None]:
# Separate the encoded label from the feature columns.
y = eeg_feature_df["expert_consensus"]
X = eeg_feature_df.drop(columns="expert_consensus")

# Scale every feature column; the fitted scaler is reused later on the test features.
rs = RobustScaler()
X_scaled = pd.DataFrame(rs.fit_transform(X), columns=X.columns)

# Hold out 10% of the rows for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.1,
    random_state=42,
)

# Number of distinct encoded labels, used as the size of the output layer.
num_classes = len(set(y))

In [None]:
def create_nn_model(num_neurons=64, learning_rate=1.0e-02, act='swish', dropout=0.1):
    """Build and compile a dense classifier for the tabular EEG features.

    The input width is read from the module-level `X_train` and the output width from
    the module-level `num_classes`.

    Parameters
    ----------
    num_neurons : int
        Width of the first three hidden layers; the next two use a half and a quarter of it.
    learning_rate : float
        Learning rate of the Adam optimizer.
    act : str
        Activation used by every hidden layer.
    dropout : float
        Dropout rate applied after the second and third hidden layers.

    Returns
    -------
    The compiled Sequential model (loss: sparse categorical cross-entropy, metric: accuracy).
    """
    model = Sequential()

    model.add(Dense(num_neurons, activation=act, input_dim=X_train.shape[1]))

    model.add(Dense(num_neurons, activation=act))
    model.add(Dropout(dropout))
    model.add(BatchNormalization())

    model.add(Dense(num_neurons, activation=act))
    model.add(Dropout(dropout))
    model.add(BatchNormalization())

    model.add(Dense(int(num_neurons//2), activation=act))
    model.add(BatchNormalization())

    model.add(Dense(int(num_neurons//4), activation=act))
    model.add(BatchNormalization())

    model.add(Dense(num_classes, activation='softmax'))

    # `lr` is a deprecated alias that newer Keras versions reject; `learning_rate` is the supported name.
    model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])

    return model

# Build the dense model with its default hyper-parameters; this rebinds `model`,
# which previously referred to the CNN.
model = create_nn_model()

# validation_split=0.2 holds out a fraction of X_train itself for validation.
# NOTE(review): X_val / y_val from the earlier split are not used in this call.
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2)

In [None]:
# Identifiers of the test EEG and test spectrogram files to inspect.
test_egg_id = 3911565283
test_spec_id = 853520

# Plot the Fp1 channel of the test EEG.
eeg_test = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/test_eegs/{test_egg_id}.parquet')    
plt.figure(figsize=(20,5))
plt.plot(eeg_test['Fp1'])

# Plot the LL_0.59 column of the test spectrogram against time.
spec_test = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/test_spectrograms/{test_spec_id}.parquet')    
plt.figure(figsize=(20,5))
plt.plot(spec_test['time'], spec_test['LL_0.59'])

In [None]:
# Denoise every channel of the test EEG, then average rows 4000..5999 - the same
# [4000, 6000) sample window the training features use for a label offset of 0.
# NOTE(review): the training features are means of the raw (non-denoised) EEG, whereas
# the test EEG is denoised first - confirm this mismatch is intended.
eeg_test_denoised = denoise(eeg_test, wavelet="db8")
eeg_test_denoised = eeg_test_denoised.iloc[4000:6000].mean().to_frame().T

# Apply the scaler that was fitted on the training features.
eeg_test_scaled = rs.transform(eeg_test_denoised)

# Softmax output of the model currently bound to `model`, one value per class.
result = model.predict(eeg_test_scaled)
print(result)

In [None]:
# The model was trained on labels encoded by `le`, so output column j is the probability
# of class le.classes_[j]. LabelEncoder sorts its classes, which is NOT the column order
# of train_votes, so each probability is routed to its own '<label>_vote' column by name.
vote_columns = [f'{label.lower()}_vote' for label in le.classes_]
if set(vote_columns) != set(train_votes):
    raise ValueError(f'encoded classes {list(le.classes_)} do not match vote columns {list(train_votes)}')

submission[vote_columns] = result

# index=False: the DataFrame index is not part of the submission format.
submission.to_csv('submission.csv', index=False)