In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import matplotlib.pyplot as plt, gc
import joblib  # to pipeline files reading and transforming
from tqdm.notebook import tqdm  # progress bar
import keras_cv
import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         #print()
#         os.path.join(dirname, filename)

In [None]:
# Sanity check: collect every file name below the dataset directory, echo the
# first two and then print how many files were found in total.
# NOTE(review): this directory differs from the Kaggle input path used by the
# later cells - confirm which location is intended.
dataset_files = [
    filename
    for _dirname, _subdirs, filenames in os.walk('/tmp/dataset/hms-hbac')
    for filename in filenames
]
for filename in dataset_files[:2]:
    print(filename)
i = len(dataset_files)
print(i)

In [None]:
import tensorflow as tf

In [None]:
# Competition input data and the directory where converted spectrograms are cached.
BASE_PATH = "/kaggle/input/hms-harmful-brain-activity-classification"
SPEC_DIR = "/kaggle/working"

# One output folder per split; exist_ok makes re-running the cell harmless.
for split_name in ('train', 'test'):
    os.makedirs(f'{SPEC_DIR}/{split_name}', exist_ok=True)

In [None]:
# Task configuration shared by the data pipeline and the model cells.
class_names = ['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other']
num_classes = 6
image_size = [200, 600]
batch_size = 64

# Two-way lookup between integer class ids and class names.
name2label = {name: idx for idx, name in enumerate(class_names)}
label2name = {idx: name for name, idx in name2label.items()}

# KL divergence; the compile cell uses it both as loss and as metric.
LOSS = tf.keras.losses.KLDivergence()

In [None]:
def _attach_paths(frame, split):
    """Add the raw-parquet and cached-``.npy`` path columns for ``split`` to ``frame`` (in place).

    NOTE(review): the cached spectrogram and the cached EEG image share one
    folder, so a ``spectrogram_id`` equal to an ``eeg_id`` would map to the
    same file - confirm the two id ranges cannot overlap.
    """
    eeg_ids = frame['eeg_id'].astype(str)
    spec_ids = frame['spectrogram_id'].astype(str)
    frame['eeg_path'] = f'{BASE_PATH}/{split}_eegs/' + eeg_ids + '.parquet'
    frame['spec_path'] = f'{BASE_PATH}/{split}_spectrograms/' + spec_ids + '.parquet'
    frame['spec2_path'] = f'{SPEC_DIR}/{split}/' + spec_ids + '.npy'
    frame['eeg_to_spec'] = f'{SPEC_DIR}/{split}/' + eeg_ids + '.npy'
    return frame


# Train + Valid: metadata plus the consensus label as name and integer id
df = _attach_paths(pd.read_csv(f'{BASE_PATH}/train.csv'), 'train')
df['class_name'] = df.expert_consensus.copy()
df['class_label'] = df.expert_consensus.map(name2label)
display(df.head(2))

# Test: same path columns, no labels
test_df = _attach_paths(pd.read_csv(f'{BASE_PATH}/test.csv'), 'test')
display(test_df.head(2))

In [None]:
import librosa

# Short labels for the four electrode chains (presumably index-aligned with FEATS).
NAMES = "LL LP RP RR".split()

# Electrode chains: spectrogram_from_eeg takes the difference of each pair of
# neighbouring electrodes along a chain (5 electrodes -> 4 differences).
FEATS = [
    chain.split()
    for chain in (
        "Fp1 F7 T3 T5 O1",
        "Fp1 F3 C3 P3 O1",
        "Fp2 F8 T4 T6 O2",
        "Fp2 F4 C4 P4 O2",
    )
]


def spectrogram_from_eeg(eeg_id, split="train"):
    """Build a 4-channel log-mel spectrogram image from one EEG recording and cache it.

    The middle 10,000 rows of ``{BASE_PATH}/{split}_eegs/{eeg_id}.parquet`` are
    used (50 s at the 200 Hz rate passed to librosa). For every electrode chain
    in ``FEATS`` the four neighbouring-electrode differences are converted to
    mel spectrograms in dB, rescaled with ``(dB + 40) / 40`` and averaged into
    one image channel.

    Args:
        eeg_id: Id of the recording; also used as the output file name.
        split: ``"train"`` or ``"test"``; selects the input and output folders.

    Returns:
        ``float32`` array of shape ``(150, 200, 4)``. The same array is written
        to ``{SPEC_DIR}/{split}/{eeg_id}.npy`` (``np.save`` appends ``.npy``).
    """
    # LOAD MIDDLE 10,000 ROWS OF THE EEG SERIES
    eeg = pd.read_parquet(f'{BASE_PATH}/{split}_eegs/{eeg_id}.parquet')
    middle = (len(eeg) - 10_000) // 2
    eeg = eeg.iloc[middle:middle + 10_000]

    # VARIABLE TO HOLD SPECTROGRAM: (mel bins, time frames, electrode chains)
    img = np.zeros((150, 200, 4), dtype='float32')

    for k, cols in enumerate(FEATS):
        for first, second in zip(cols[:-1], cols[1:]):
            # COMPUTE PAIR DIFFERENCES
            x = eeg[first].values - eeg[second].values

            # FILL NANS: mean of the valid samples, or zeros when nothing is valid.
            # The all-NaN case is checked first so np.nanmean never sees an
            # all-NaN slice (which would emit a RuntimeWarning).
            if np.isnan(x).all():
                x = np.zeros_like(x)
            else:
                x = np.nan_to_num(x, nan=np.nanmean(x))

            # RAW SPECTROGRAM
            mel_spec = librosa.feature.melspectrogram(y=x, sr=200, hop_length=len(x)//200,
                  n_fft=1024, n_mels=150, fmin=0, fmax=20, win_length=150)

            # LOG TRANSFORM, trimming the frame count down to a multiple of 10
            width = (mel_spec.shape[1]//10)*10
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:, :width]

            # STANDARDIZE TO -1 TO 1 and accumulate into this chain's channel
            img[:, :, k] += (mel_spec_db + 40) / 40

        # AVERAGE THE 4 MONTAGE DIFFERENCES
        img[:, :, k] /= 4.0

    np.save(f'{SPEC_DIR}/{split}/{eeg_id}', img)
    return img

In [None]:
%%time
DISPLAY = 4  # not referenced by the cells visible in this notebook
EEG_IDS = df.eeg_id.unique()
EEG_IDS_TEST = test_df.eeg_id.unique()


def _cache_eeg_spectrogram(eeg_id, split):
    """Compute and save one EEG spectrogram, discarding the returned array.

    spectrogram_from_eeg already writes its result to disk. Returning None here
    keeps joblib from sending every image back to the parent process and
    holding all of them in the result list at the same time.
    """
    spectrogram_from_eeg(eeg_id, split)


# Convert the train EEGs first, then the test EEGs, using all available cores.
for split_name, split_eeg_ids in (("train", EEG_IDS), ("test", EEG_IDS_TEST)):
    joblib.Parallel(n_jobs=-1, backend="loky")(
        joblib.delayed(_cache_eeg_spectrogram)(eeg_id, split_name)
        for eeg_id in tqdm(split_eeg_ids, total=len(split_eeg_ids))
    )

In [None]:
# Convert a single raw spectrogram (identified by its spec_id) into a cached array
def process_spec(spec_id, split="train"):
    """Cache spectrogram ``spec_id`` of ``split`` as a float32 ``.npy`` file.

    Reads ``{BASE_PATH}/{split}_spectrograms/{spec_id}.parquet``, replaces NaN
    with 0, drops the first column and transposes the remaining values before
    writing them to ``{SPEC_DIR}/{split}/{spec_id}.npy``. Returns ``None``.
    """
    source = f"{BASE_PATH}/{split}_spectrograms/{spec_id}.parquet"
    target = f"{SPEC_DIR}/{split}/{spec_id}.npy"

    values = pd.read_parquet(source).fillna(0).values
    # Skip column 0 (presumably the time column - verify against the data),
    # then transpose (Time, Freq) -> (Freq, Time).
    spec = values[:, 1:].T.astype("float32")
    np.save(target, spec)

# Unique spectrogram ids of the train/valid data and of the test data
spec_ids = df["spectrogram_id"].unique()
test_spec_ids = test_df["spectrogram_id"].unique()

# Convert every spectrogram in parallel with joblib: train/valid first, then test
for split_name, split_spec_ids in (("train", spec_ids), ("test", test_spec_ids)):
    _ = joblib.Parallel(n_jobs=-1, backend="loky")(
        joblib.delayed(process_spec)(spec_id, split_name)
        for spec_id in tqdm(split_spec_ids, total=len(split_spec_ids))
    )

In [None]:
def build_augmenter(dim=image_size):
    """Create the batch-level augmentation function used by ``build_dataset``.

    Args:
        dim: Accepted for interface compatibility; not used by the body.

    Returns:
        ``augment(img, label)``, which passes the batch through MixUp and two
        RandomCutout layers, each applied independently with probability 0.5,
        and returns the resulting ``(images, labels)`` pair.
    """
    mixup = keras_cv.layers.MixUp(alpha=2.0)
    # Full-height strip covering 6-10% of the width (labelled "freq-masking" originally)
    tall_strip = keras_cv.layers.RandomCutout(height_factor=(1.0, 1.0),
                                              width_factor=(0.06, 0.1))
    # Full-width strip covering 6-10% of the height (labelled "time-masking" originally)
    wide_strip = keras_cv.layers.RandomCutout(height_factor=(0.06, 0.1),
                                              width_factor=(1.0, 1.0))
    pipeline = (mixup, tall_strip, wide_strip)

    def augment(img, label):
        batch = {"images": img, "labels": label}
        for layer in pipeline:
            # Fresh coin flip per layer decides whether it is applied
            apply_layer = tf.random.uniform([]) < 0.5
            if apply_layer:
                batch = layer(batch, training=True)
        return batch["images"], batch["labels"]

    return augment


def build_decoder(with_labels=True, target_size=image_size, dtype=32):
    """Create the per-sample decode function used by ``build_dataset``.

    Args:
        with_labels: When true the returned function also one-hot encodes the
            label and yields ``(signal, label)``; otherwise only the signal.
        target_size: Accepted for interface compatibility; not used by the body.
        dtype: Divisor for the header skip: the first ``1024 // dtype`` float32
            values of every ``.npy`` file are dropped (32 values = 128 bytes
            with the default).

    Returns:
        ``decode_with_labels(path, path_eeg, offset=None, label=None)`` or
        ``decode_signal(path, path_eeg, offset=None)``.
    """
    # Leading float32 values to drop; assumes np.save wrote a 128-byte
    # header for these arrays - TODO confirm for other shapes/versions.
    header_values = 1024 // dtype

    def read_npy_values(path):
        """Read a float32 ``.npy`` file as a flat tensor without its header."""
        file_bytes = tf.io.read_file(path)
        return tf.io.decode_raw(file_bytes, tf.float32)[header_values:]

    def decode_signal(path, path_eeg, offset=None):
        # Cached competition spectrogram, viewed as 200 rows
        # NOTE(review): process_spec stores the array as (Freq, Time); reshaping
        # to 200 rows folds neighbouring rows together - confirm this is intended.
        sig = tf.reshape(read_npy_values(path), [200, -1])

        # Extract labeled subsample from full spectrogram using "offset"
        if offset is not None:
            sig = sig[:, offset:offset+600]

            # Pad spectrogram to ensure the same input shape of [200, 600]
            pad_size = tf.math.maximum(0, 600 - tf.shape(sig)[1])
            sig = tf.pad(sig, [[0, 0], [0, pad_size]])
            sig = tf.reshape(sig, [200, 600])

        # Log spectrogram
        sig = tf.clip_by_value(sig, tf.math.exp(-4.0), tf.math.exp(8.0)) # avoid 0 in log
        sig = tf.math.log(sig)

        # Normalize spectrogram
        sig -= tf.math.reduce_mean(sig)
        sig /= tf.math.reduce_std(sig) + 1e-6

        # Add the EEG-derived spectrogram. `path_eeg` already ends in ".npy",
        # so it is read as-is (appending another suffix pointed at a missing file).
        # The flat values are reshaped directly; a transpose of a 1-D tensor is a no-op.
        eeg = tf.reshape(read_npy_values(path_eeg), [200, -1])

        # Stack both images along the height axis
        sig = tf.concat([sig, eeg], axis=0)

        # Mono channel to 3 channels to use "ImageNet" weights
        sig = tf.tile(sig[..., None], [1, 1, 3])
        return sig

    def decode_label(label):
        label = tf.one_hot(label, num_classes)
        label = tf.cast(label, tf.float32)
        label = tf.reshape(label, [num_classes])
        return label

    def decode_with_labels(path, path_eeg, offset=None, label=None):
        sig = decode_signal(path, path_eeg, offset)
        label = decode_label(label)
        return (sig, label)

    return decode_with_labels if with_labels else decode_signal


def build_dataset(paths, paths_eeg, offsets=None, labels=None, batch_size=32, cache=True,
                  decode_fn=None, augment_fn=None,
                  augment=False, repeat=True, shuffle=1024,
                  cache_dir="", drop_remainder=False):
    """Assemble the ``tf.data`` input pipeline.

    Stages, in order: decode -> cache -> repeat -> shuffle -> batch ->
    augment -> prefetch.

    Args:
        paths: Paths of the cached spectrogram ``.npy`` files.
        paths_eeg: Paths of the cached EEG-spectrogram ``.npy`` files.
        offsets: Per-sample offsets forwarded to the decoder, or ``None``.
        labels: Integer class labels, or ``None`` for unlabeled data.
        batch_size: Samples per batch.
        cache: Cache decoded samples (in memory, or in ``cache_dir`` if set).
        decode_fn: Custom decoder; defaults to ``build_decoder(labels is not None)``.
        augment_fn: Custom batch augmenter; defaults to ``build_augmenter()``.
        augment: Apply ``augment_fn`` to every batch.
        repeat: Repeat the dataset indefinitely.
        shuffle: Shuffle buffer size; ``0``/``False`` disables shuffling and a
            bare ``True`` selects the default buffer of 1024.
        cache_dir: Directory for the on-disk cache (created when caching).
        drop_remainder: Drop the last incomplete batch.

    Returns:
        The prefetched ``tf.data.Dataset``.
    """
    if cache_dir != "" and cache is True:
        os.makedirs(cache_dir, exist_ok=True)

    if decode_fn is None:
        decode_fn = build_decoder(labels is not None)

    # Only build the augmentation layers when they will actually be used
    if augment and augment_fn is None:
        augment_fn = build_augmenter()

    AUTO = tf.data.experimental.AUTOTUNE
    slices = (paths, paths_eeg, offsets) if labels is None else (paths, paths_eeg, offsets, labels)

    ds = tf.data.Dataset.from_tensor_slices(slices)
    ds = ds.map(decode_fn, num_parallel_calls=AUTO)
    ds = ds.cache(cache_dir) if cache else ds
    ds = ds.repeat() if repeat else ds
    if shuffle:
        # `shuffle` doubles as the buffer size. A bare True would otherwise be
        # handed to ds.shuffle as that size, which at best is a 1-element
        # buffer, i.e. no shuffling at all.
        buffer_size = 1024 if shuffle is True else shuffle
        ds = ds.shuffle(buffer_size, seed=42)
        opt = tf.data.Options()
        opt.experimental_deterministic = False
        ds = ds.with_options(opt)
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    ds = ds.map(augment_fn, num_parallel_calls=AUTO) if augment else ds
    ds = ds.prefetch(AUTO)
    return ds

In [None]:
from sklearn.model_selection import StratifiedGroupKFold

# 5-fold split stratified on the class label and grouped by patient id
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

df["fold"] = -1
df.reset_index(drop=True, inplace=True)  # positional split indices must match the index labels

fold_splits = sgkf.split(df, y=df["class_label"], groups=df["patient_id"])
for fold, (_train_rows, valid_rows) in enumerate(fold_splits):
    # Each row is tagged with the fold in which it is held out for validation
    df.loc[valid_rows, "fold"] = fold

# Row counts per (fold, class), shown as the cell output
df.groupby(["fold", "class_name"])[["eeg_id"]].count().T

In [None]:
# Sample from full data: keep the first row of every spectrogram_id
sample_df = df.groupby("spectrogram_id").head(1).reset_index(drop=True)
train_df = sample_df[sample_df.fold != 0]
valid_df = sample_df[sample_df.fold == 0]
print(f"# Num Train: {len(train_df)} | Num Valid: {len(valid_df)}")

# Train (fold 0 is held out); all inputs are passed as plain NumPy arrays
train_paths = train_df.spec2_path.values
train_eeg_paths = train_df.eeg_to_spec.values
train_offsets = train_df.spectrogram_label_offset_seconds.values.astype(int)
train_labels = train_df.class_label.values
# `shuffle` is the shuffle-buffer size, so pass a real size rather than True
train_ds = build_dataset(train_paths, train_eeg_paths, train_offsets, train_labels,
                         batch_size=batch_size, repeat=True, shuffle=1024,
                         augment=True, cache=True)

# Valid: single pass, fixed order, no augmentation
valid_paths = valid_df.spec2_path.values
valid_eeg_paths = valid_df.eeg_to_spec.values
valid_offsets = valid_df.spectrogram_label_offset_seconds.values.astype(int)
valid_labels = valid_df.class_label.values
valid_ds = build_dataset(valid_paths, valid_eeg_paths, valid_offsets, valid_labels,
                         batch_size=batch_size, repeat=False, shuffle=False,
                         augment=False, cache=True)

In [None]:
# Pull one batch from the training pipeline and show its first sample(s)
imgs, tars = next(iter(train_ds))

num_imgs = 1
fig, ax = plt.subplots(figsize=(16, 20))
for i in range(num_imgs):
    # First of the three identical channels, rescaled to roughly [0, 1] for display
    img = imgs[i].numpy()[..., 0]
    img = img - img.min()
    img = img / (img.max() + 1e-4)
    # Title shows the class with the largest target weight
    tar = label2name[np.argmax(tars[i].numpy())]
    ax.imshow(img)
    ax.set_title(f"Target: {tar}")
    ax.axis('off')

plt.show()

In [None]:
import math

def get_lr_callback(batch_size=8, mode='cos', epochs=8, plot=False):
    """Create a ``LearningRateScheduler`` with a linear ramp-up and a decay phase.

    The rate rises linearly from ``5e-5`` to ``6e-6 * batch_size`` over the
    first 3 epochs and then decays according to ``mode``.

    Args:
        batch_size: Scales the peak learning rate.
        mode: Decay shape after the ramp: ``'exp'``, ``'step'`` or ``'cos'``.
        epochs: Schedule length; used by the cosine decay and by the plot.
        plot: When true, plot the schedule for ``epochs`` epochs.

    Returns:
        ``tf.keras.callbacks.LearningRateScheduler`` wrapping the schedule.

    Raises:
        ValueError: If ``mode`` is not one of the supported decay shapes.
    """
    # Fail fast: an unknown mode would otherwise only surface after the ramp
    # epochs, in the middle of training, as an unbound local variable.
    if mode not in ('exp', 'step', 'cos'):
        raise ValueError(f"mode must be 'exp', 'step' or 'cos', got {mode!r}")

    lr_start, lr_max, lr_min = 5e-5, 6e-6 * batch_size, 1e-5
    lr_ramp_ep, lr_sus_ep, lr_decay = 3, 0, 0.75

    def lrfn(epoch):  # Learning rate update function
        if epoch < lr_ramp_ep:  # linear warm-up
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        if epoch < lr_ramp_ep + lr_sus_ep:  # optional plateau at the peak
            return lr_max

        decay_epoch_index = epoch - lr_ramp_ep - lr_sus_ep
        if mode == 'exp':
            return (lr_max - lr_min) * lr_decay**decay_epoch_index + lr_min
        if mode == 'step':
            return lr_max * lr_decay**(decay_epoch_index // 2)

        # mode == 'cos': half cosine stretched over 3 epochs more than remain
        decay_total_epochs = epochs - lr_ramp_ep - lr_sus_ep + 3
        phase = math.pi * decay_epoch_index / decay_total_epochs
        return (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min

    if plot:  # Plot lr curve if plot is True
        plt.figure(figsize=(10, 5))
        plt.plot(np.arange(epochs), [lrfn(epoch) for epoch in np.arange(epochs)], marker='o')
        plt.xlabel('epoch'); plt.ylabel('lr')
        plt.title('LR Scheduler')
        plt.show()

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)  # Create lr callback

# Cosine schedule for a batch size of 64, plotted over the default 8 epochs.
# NOTE(review): the fit cell trains for 5 epochs - confirm the 8-epoch horizon is intended.
lr_cb = get_lr_callback(batch_size=64, mode='cos', plot=True)

In [None]:
from tensorflow.keras.applications.efficientnet_v2 import EfficientNetV2L

In [None]:
#base_model = EfficientNetV2M(input_shape = (480, 480, 3), include_top = True, weights = 'imagenet')

In [None]:
# ImageNet-pretrained EfficientNetV2-L backbone without its classification top.
# Input is 400x600x3: the decoder stacks two 200-row images and tiles 3 channels.
base_model = EfficientNetV2L(
    include_top=False,
    weights='imagenet',
    input_shape=(400, 600, 3),
)

In [None]:
# Freeze every backbone layer so its pretrained weights are not updated during training
for backbone_layer in base_model.layers:
    backbone_layer.trainable = False

In [None]:
# Classification head on top of the backbone's feature map
x = base_model.output
x = tf.keras.layers.Flatten()(x)
x = tf.keras.layers.Dense(64, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)

# Final softmax layer with one unit per class (a probability distribution,
# as required by the KL-divergence loss)
predictions = tf.keras.layers.Dense(num_classes, activation="softmax")(x)
model_final = tf.keras.models.Model(inputs=base_model.input, outputs=predictions)

In [None]:
# Adam optimizer; KL divergence is both the training objective and the reported metric.
# NOTE(review): presumably the 1e-4 rate is replaced every epoch once lr_cb is
# passed to fit - confirm which value is meant to apply.
adam = tf.keras.optimizers.Adam(learning_rate=1e-4)
model_final.compile(optimizer=adam, loss=LOSS, metrics=[LOSS])

In [None]:
# Save the full model (not just weights) whenever the validation loss reaches a new minimum
ckpt_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="best_model.keras",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

In [None]:
# train_ds repeats indefinitely, so one epoch is defined as a full pass over
# train_df at the configured batch size (kept in sync with the dataset's
# batch size instead of a hard-coded 64).
history = model_final.fit(
    train_ds,
    epochs=5,
    callbacks=[ckpt_cb, lr_cb],
    steps_per_epoch=len(train_df) // batch_size,
    validation_data=valid_ds,
    verbose=1,
)

# Predictions

In [None]:
# Test inputs: no offsets and no labels, single ordered pass without caching
test_paths = test_df.spec2_path.values
test_eeg_paths = test_df.eeg_to_spec.values
# Use the notebook-level `batch_size`; no `CFG` object is defined in this notebook.
test_ds = build_dataset(test_paths, test_eeg_paths,
                        batch_size=min(batch_size, len(test_df)),
                        repeat=False, shuffle=False, cache=False, augment=False)

In [None]:
# Class probabilities for every test sample, one row per sample
preds = model_final.predict(x=test_ds)

In [None]:
# One probability column per class: "<class name in lower case>_vote"
target_cols = [f"{name.lower()}_vote" for name in class_names]
pred_df = test_df[["eeg_id"]].copy()
pred_df[target_cols] = preds.tolist()

# Take the eeg_id rows from the sample submission and left-join the predictions onto them
sub_df = (
    pd.read_csv(f'{BASE_PATH}/sample_submission.csv')[["eeg_id"]]
    .copy()
    .merge(pred_df, on="eeg_id", how="left")
)
sub_df.to_csv("submission.csv", index=False)
sub_df.head()