# Import modules

In [None]:
import os
import shutil
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

# Set global parameters

In [None]:
# Root directory of the competition dataset; all input files are read relative to it.
BASE_DIR = Path("/kaggle/input/hms-harmful-brain-activity-classification")

# One EEG window is EEG_SAMPLING_TIME seconds sampled at EEG_SAMPLING_RATE Hz,
# so EEG_DURATION is the window length expressed in samples (rows).
EEG_SAMPLING_TIME = 50  # seconds
EEG_SAMPLING_RATE = 200 # Hz
EEG_DURATION = EEG_SAMPLING_RATE * EEG_SAMPLING_TIME

In [None]:
# Load the training metadata table and report its row count and total NaN count.
train_df = pd.read_csv(BASE_DIR/"train.csv")
print(f"{len(train_df)=}")
print(f"train df nan:{train_df.isna().sum().sum()}")
# Preview the first 100 rows (shown as the cell output).
train_df.head(100)

In [None]:
# List the distinct values found in the expert_consensus column.
train_df.expert_consensus.unique()

In [None]:
def get_train_eeg(q:dict) -> pd.DataFrame:
    """Return the EEG window that belongs to one training row.

    Reads ``train_eegs/<eeg_id>.parquet`` below ``BASE_DIR`` and slices out
    ``EEG_DURATION`` consecutive rows, starting at the sample index obtained
    from ``q["eeg_label_offset_seconds"]`` times ``EEG_SAMPLING_RATE``.

    Args:
        q: Mapping with at least the keys ``"eeg_id"`` and
            ``"eeg_label_offset_seconds"``.

    Returns:
        The positional slice of the recording; it is shorter than
        ``EEG_DURATION`` rows if the recording ends before the window does.
    """
    recording_path = BASE_DIR/f"train_eegs/{q['eeg_id']}.parquet"
    recording = pd.read_parquet(recording_path)
    first_sample = int(EEG_SAMPLING_RATE * q["eeg_label_offset_seconds"])
    last_sample = first_sample + EEG_DURATION
    return recording.iloc[first_sample:last_sample]

def plot_eeg(df, moving_avg=1):
    """Plot every column of ``df`` as its own subplot on a shared x-axis.

    Each column is drawn in black against the frame's index. Every index
    position where that column is NaN is marked by a red vertical line that
    spans the column's minimum-to-maximum range. Ticks and spines are hidden
    so only the traces and the column names remain.

    Args:
        df: DataFrame whose columns are the signals to draw; one subplot is
            created per column.
        moving_avg: Currently unused; kept so existing callers keep working.

    Returns:
        None. The figure is left open as the current pyplot figure so the
        caller can show, save or close it.
    """
    n_channels = df.shape[1]
    # squeeze=False keeps `axs` two-dimensional even when there is one column.
    fig, axs = plt.subplots(n_channels, 1, figsize=(30, 15), sharex=True, squeeze=False)
    for i, ax in enumerate(axs[:, 0]):
        channel = df.iloc[:, i]
        ax.plot(channel, color="black")
        nan_positions = df.index[channel.isna().to_numpy()]
        if len(nan_positions):
            # The range is the same for every marker, so compute it once and
            # draw all markers of this channel with a single vlines call.
            ax.vlines(nan_positions, channel.min(), channel.max(), color='red')
        ax.set_ylabel(df.columns[i], rotation=0)
        ax.set_yticklabels([])
        ax.set_yticks([])
        ax.set_xticks([])
        ax.spines[["top", "bottom", "left", "right"]].set_visible(False)

In [None]:
def save_all_eeg_img(p_eeg):
    """Plot one EEG parquet file with ``plot_eeg`` and save the figure as PNG.

    Args:
        p_eeg: Location of the parquet file, as ``str`` or ``os.PathLike``.

    The image is written to ``all_eeg_figs/<name>.png``, where ``<name>`` is
    the file name up to its first dot. The output directory is not created
    here. The figure is closed after saving.
    """
    eeg = pd.read_parquet(p_eeg)
    plot_eeg(eeg)
    # Path accepts both str and PathLike inputs and honours the platform's
    # path separator, unlike splitting the raw string on "/".
    fig_name = Path(p_eeg).name.split(".")[0]
    plt.savefig(f'all_eeg_figs/{fig_name}.png')
    plt.close()

In [None]:
import pywt

def maddest(d, axis=None):
    """Return the mean absolute deviation of ``d`` about its mean.

    Args:
        d: Array-like of numbers.
        axis: Axis (or tuple of axes) to reduce over; ``None`` reduces over
            all elements.

    Returns:
        A scalar when ``axis`` is ``None``, otherwise an array with the
        reduced axis removed.

    NOTE(review): the name suggests a *median* absolute deviation, but the
    computation is mean-based. It is kept unchanged so existing results do
    not shift; confirm which estimator is intended.
    """
    d = np.asarray(d)
    # keepdims=True keeps the reduced axis so the centre broadcasts against
    # `d` for any axis. Without it, axis=1 on 2-D input either raises or
    # silently subtracts the wrong values.
    centre = np.mean(d, axis=axis, keepdims=True)
    return np.mean(np.absolute(d - centre), axis=axis)

def denoise(x, wavelet='db8', level=1):
    """Denoise every column of ``x`` by hard-thresholding wavelet coefficients.

    Each column is decomposed with ``pywt.wavedec`` (periodization mode). A
    noise scale is estimated from the detail coefficients ``coeff[-level]``
    via ``maddest`` and turned into the threshold
    ``sigma * sqrt(2 * log(len(x)))``. All detail coefficients are then
    hard-thresholded and the signal is reconstructed with ``pywt.waverec``.

    Author's note (translated): ``dmey`` is best for removing noise from EEG
    signals of epilepsy patients, while ``db8`` is best for removing noise
    from EEG signals of healthy subjects.

    Args:
        x: DataFrame with one signal per column.
        wavelet: Wavelet name passed to PyWavelets.
        level: Which detail band, counted from the finest (``1``), is used
            for the noise estimate.

    Returns:
        DataFrame with the same columns, index and length as ``x``.
    """
    n_samples = len(x)
    denoised = {}
    for pos in x.columns:
        coeff = pywt.wavedec(x[pos], wavelet, mode="per")
        sigma = (1/0.6745) * maddest(coeff[-level])
        uthresh = sigma * np.sqrt(2*np.log(n_samples))
        # Index 0 holds the approximation coefficients and is left untouched.
        coeff[1:] = [pywt.threshold(c, value=uthresh, mode='hard') for c in coeff[1:]]
        # The reconstruction can be one sample longer than an odd-length
        # input, so trim it back to the input length.
        denoised[pos] = pywt.waverec(coeff, wavelet, mode='per')[:n_samples]
    # Reuse the input index so the result stays aligned with `x`.
    return pd.DataFrame(denoised, index=x.index)

# Sample Visualization

In [None]:
# Use the training row at position 10 as the example and show its consensus label.
q = dict(train_df.iloc[10])
q["expert_consensus"]

## raw data

In [None]:
# Extract the example row's EEG window and plot it without any processing.
eeg = get_train_eeg(q)
plot_eeg(eeg)

## db8 is best for removing noise from EEG signals of healthy patients

In [None]:
# Denoise the example window with the db8 wavelet and plot the result.
db8_processed_eeg = denoise(eeg, wavelet="db8")
plot_eeg(db8_processed_eeg)

## dmey is best for removing noise from EEG signals of epilepsy patients

In [None]:
# Denoise the same example window with the dmey wavelet and plot the result.
dmey_processed_eeg = denoise(eeg, wavelet="dmey")
plot_eeg(dmey_processed_eeg)

# Search for EEG indices with missing values

In [None]:
def detect_na(row):
    """Return ``"<eeg_id>-<eeg_sub_id>"`` if the row's EEG window has any NaN.

    The window is obtained with ``get_train_eeg(row)``. When it contains no
    missing value the function returns ``None``.
    """
    window = get_train_eeg(row)
    missing_total = window.isna().sum().sum()
    if not missing_total:
        return None
    return f"{row['eeg_id']}-{row['eeg_sub_id']}"

In [None]:
fig_out_dir = "all_eeg_figs"
os.makedirs(fig_out_dir, exist_ok=True)

# A full pass takes a long time, so the amount of work depends on the Kaggle
# run type: an interactive session only samples the data, a background
# (Batch) run processes everything. To process everything interactively, run
# only the body of the Batch branch.
run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
eeg_paths = glob.glob(os.path.join(BASE_DIR, "train_eegs/*"))
if run_type == 'Interactive':
    print("Running on Interactive Notebook")
    temp_train_df = train_df.sample(100).copy()
    drop_index = joblib.Parallel(n_jobs=-1)(joblib.delayed(detect_na)(row) for i, row in temp_train_df.iterrows())
    joblib.Parallel(n_jobs=-1)(joblib.delayed(save_all_eeg_img)(p_eeg) for p_eeg in eeg_paths[:10])
elif run_type == 'Batch':
    print("Running on Background Notebook")
    joblib.Parallel(n_jobs=-1)(joblib.delayed(save_all_eeg_img)(p_eeg) for p_eeg in eeg_paths)
    drop_index = joblib.Parallel(n_jobs=-1)(joblib.delayed(detect_na)(row) for i, row in train_df.iterrows())
else:
    # Any other run type (e.g. outside Kaggle) used to leave drop_index
    # undefined, so the filter below failed with a NameError.
    print(f"Unrecognised KAGGLE_KERNEL_RUN_TYPE={run_type!r}; skipping the EEG scan")
    drop_index = []

# detect_na returns None for windows without missing values; keep only the hits.
drop_index = list(filter(None, drop_index))

In [None]:
# Report how many windows with missing values were found, then persist their
# "<eeg_id>-<eeg_sub_id>" keys with joblib for later use.
print(f"detect {len(drop_index)=}")
joblib.dump(drop_index, "dropped-eeg-id-sub.joblib")

In [None]:
# For every window flagged in drop_index, collect the per-column NaN counts
# and check whether all columns of a window miss the same number of samples.
unique_missingvalue_set = set()
has_partial_missing = False
for drp_idx in drop_index:
    eeg_id, eeg_sub_id = map(int, drp_idx.split("-"))
    row = dict(train_df.loc[(train_df.eeg_id==eeg_id)&(train_df.eeg_sub_id==eeg_sub_id),:].iloc[0])
    na_eeg = get_train_eeg(row)
    na_counts = set(na_eeg.isna().sum())
    # update() accepts any number of values; add(*values) raised TypeError as
    # soon as a window had more than one distinct count.
    unique_missingvalue_set.update(na_counts)
    if len(na_counts)!=1:
        has_partial_missing = True
        print("partial missing value is exit!")
# A for/else without `break` always runs its else branch, so the "all same"
# message is now tied to an explicit flag.
if not has_partial_missing:
    print("all missing value is same time line")
print("-"*50)
# min()/max() raise on an empty set, which happens when drop_index is empty.
if unique_missingvalue_set:
    print(f"each eeg missing value range: {min(unique_missingvalue_set)} ~ {max(unique_missingvalue_set)}")

In [None]:
# `row` is whatever the preceding loop over drop_index left behind, i.e. the
# last window with missing values; print its label and plot that window.
print(row["expert_consensus"])
na_eeg = get_train_eeg(row)
plot_eeg(na_eeg)

# Search for outlier values