# 🎨 dataset [here](https://www.kaggle.com/datasets/nakagawaren0805/hms-preprocessed-dataset)

Preprocessing HMS Dataset 🚀
===
## content 🧠
- denoise with a wavelet transform (`dmey`; the function default is `db8`) 🌊
- fill missing values with linear interpolation 📈
---
## output 📦
- preprocessed Electroencephalogram (EEG) 🧠✨
- preprocessed train.csv (rows whose EEG still contains missing values after interpolation are discarded) 📑🚫
- preprocess function 🔄
---
## Lemma 📚
- The test dataset doesn't have missing values: [check submission code](https://www.kaggle.com/code/nakagawaren0805/hms-detect-na) ✅
- Use GroupKFold grouped by patient to protect against data leakage! 🤝🔍

# Import modules

In [None]:
import os
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pywt
import torch

# Set parameters

## Parameters you don't need to change

In [None]:
# Location of the competition input data and of the raw training EEG files.
BASE_DIR = Path("/kaggle/input/hms-harmful-brain-activity-classification")
TRAIN_EEG_DIR = BASE_DIR / "train_eegs"

# Length of one labelled EEG window: rate [Hz] x time [s] = number of samples.
EEG_SAMPLING_RATE = 200  # Hz
EEG_SAMPLING_TIME = 50  # seconds
EEG_DURATION = EEG_SAMPLING_TIME * EEG_SAMPLING_RATE

# Number of EEG files kept when running in the development environment.
N_INTERACTIVE = 100

## Parameters you can customize

In [None]:
# Wavelet name handed to denoise() when the EEG files are preprocessed.
WAVELET_TYPE = "dmey"
# Level setting for the wavelet denoising step; equal to the default ``level``
# of denoise().
WAVELET_DECOMPOSITION_TREE_LEVEL = 1


## Set a seed

In [None]:
def set_seed(seed=42):
    """Seed the random number generators used by this notebook.

    Seeds NumPy and PyTorch (CPU and CUDA), configures the cuDNN backend for
    deterministic behaviour, and stores the seed in the ``PYTHONHASHSEED``
    environment variable.

    Args:
        seed: Integer seed value. Defaults to 42.
    """
    # Export the hash seed through the environment.
    os.environ['PYTHONHASHSEED'] = str(seed)

    # cuDNN needs two extra switches: deterministic kernels, no benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Seed NumPy, PyTorch on CPU and PyTorch on GPU, in that order.
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)


# Apply the default seed right away.
set_seed()

# Define preprocess function

In [None]:
def plot_eeg(df, moving_avg=1):
    """Plot the channels of ``df`` as stacked traces and mark missing samples.

    One subplot is drawn per column, for at most the first 20 columns. Every
    row in which a channel is NaN is marked by a red vertical line spanning
    that channel's value range. Ticks and spines are hidden; the column name
    is used as the y-label.

    Args:
        df: DataFrame with one column per channel.
        moving_avg: Currently unused; kept so existing calls keep working.
    """
    n_channels = min(20, df.shape[1])
    if n_channels == 0:
        # Nothing to draw (plt.subplots cannot create zero rows).
        return

    # squeeze=False keeps ``axs`` two-dimensional even for a single channel.
    fig, axs = plt.subplots(
        n_channels, 1, figsize=(30, 15), sharex=True, squeeze=False
    )
    for i, ax in enumerate(axs[:, 0]):
        channel = df.iloc[:, i]
        ax.plot(channel, color="black")

        # Mark all missing samples with a single vlines call; the value range
        # is computed once per channel rather than once per missing sample.
        missing_at = channel.index[channel.isna()]
        if len(missing_at):
            ax.vlines(missing_at, channel.min(), channel.max(), color='red')

        ax.set_ylabel(df.columns[i], rotation=0)
        ax.set_yticklabels([])
        ax.set_yticks([])
        ax.set_xticks([])
        ax.spines[["top", "bottom", "left", "right"]].set_visible(False)

def get_train_eeg(q:dict) -> pd.DataFrame:
    """Load the labelled window of one training EEG.

    Args:
        q: Mapping providing ``"eeg_id"`` (parquet file name without
            extension) and ``"eeg_label_offset_seconds"`` (window start in
            seconds).

    Returns:
        The rows of the EEG parquet file from the label offset up to
        ``EEG_DURATION`` samples later (fewer if the file ends earlier).
    """
    first_row = int(EEG_SAMPLING_RATE * q["eeg_label_offset_seconds"])
    last_row = first_row + EEG_DURATION
    full_eeg = pd.read_parquet(BASE_DIR/f"train_eegs/{q['eeg_id']}.parquet")
    return full_eeg.iloc[first_row:last_row]

def denoise(x, wavelet='db8', level=1):
    """Denoise every column of ``x`` by hard-thresholding wavelet coefficients.

    Each column is decomposed with ``pywt.wavedec`` (periodization mode), all
    detail-coefficient arrays are hard-thresholded at
    ``sigma * sqrt(2 * log(n_rows))``, and the signal is rebuilt with
    ``pywt.waverec``.

    Paper: https://jart.icat.unam.mx/index.php/jart/article/view/339/336
    Original author's note: dmey for seizure-patient signals, db8 for
    healthy-patient signals.

    Args:
        x: DataFrame with one signal per column.
        wavelet: Wavelet name passed to PyWavelets.
        level: Selects ``coeff[-level]`` as the coefficient array from which
            the noise level ``sigma`` is estimated.

    Returns:
        A new DataFrame with the same columns and number of rows as ``x`` and
        a default integer index.
    """
    def _maddest(d, axis=None):
        # Mean absolute deviation around the mean.
        # NOTE(review): 0.6745 below is the factor usually paired with the
        # *median* absolute deviation; this helper uses the mean. Left as is
        # because changing it would alter the produced dataset — confirm.
        return np.mean(np.absolute(d - np.mean(d, axis)), axis)

    n_samples = len(x)
    # Loop-invariant part of the threshold.
    threshold_factor = np.sqrt(2 * np.log(n_samples))

    ret = {}
    for pos in x.columns:
        coeff = pywt.wavedec(x[pos], wavelet, mode="per")
        sigma = (1 / 0.6745) * _maddest(coeff[-level])
        uthresh = sigma * threshold_factor
        # Threshold the detail coefficients only; coeff[0] (approximation)
        # is left untouched.
        coeff[1:] = [
            pywt.threshold(c, value=uthresh, mode='hard') for c in coeff[1:]
        ]
        # Periodization pads an odd-length signal by one sample, so the
        # reconstruction can be one sample longer than the input; trim it
        # back so the output always has as many rows as ``x``.
        ret[pos] = pywt.waverec(coeff, wavelet, mode='per')[:n_samples]
    return pd.DataFrame(ret)

def interpolate(raw_df):
    """Linearly interpolate short gaps of missing values along the rows.

    Only NaNs enclosed by valid values are filled (``limit_area='inside'``),
    and at most one consecutive NaN is filled from each side of a gap
    (``limit=1`` with ``limit_direction="both"``). Longer gaps therefore keep
    NaNs in their middle, and leading/trailing NaNs are left untouched.

    Args:
        raw_df: DataFrame to interpolate; it is not modified.

    Returns:
        A new DataFrame with the short gaps filled.
    """
    options = dict(
        method='linear',
        axis=0,
        limit=1,                 # fill a single value per direction
        limit_direction="both",  # approach each gap from before and after
        limit_area='inside',     # never extrapolate past the valid range
    )
    # DataFrame.interpolate already returns a new object, so the input stays
    # untouched without an explicit copy.
    return raw_df.interpolate(**options)

def replace_outlier(series, bias=1.5, upper=0.95, lower=0.05):
    """Clip values lying far outside the ``lower``..``upper`` quantile band.

    The allowed range is the quantile band widened on both sides by ``bias``
    times the band's width; values beyond it are replaced by the nearest
    bound.

    Args:
        series: pandas Series of numeric values.
        bias: Multiple of the band width added on each side.
        upper: Quantile used as the upper edge of the band.
        lower: Quantile used as the lower edge of the band.

    Returns:
        A new Series with out-of-range values clipped.
    """
    q_low, q_high = series.quantile(lower), series.quantile(upper)
    margin = (q_high - q_low) * bias
    return series.clip(q_low - margin, q_high + margin)

# Note: The Kaggle environment cannot write large output files, so this notebook is only a demonstration.  
# If you want the preprocessed data, you can get it [here](https://www.kaggle.com/datasets/nakagawaren0805/hms-preprocessed-dataset)

In [None]:
train_df = pd.read_csv(BASE_DIR/"train.csv")
# glob returns files in arbitrary order; sort so that the subset selected
# below is the same on every run.
eeg_list = sorted(glob.glob(str(TRAIN_EEG_DIR/"*.parquet")))

# The reduced "development" run is forced regardless of the kernel run type
# (this notebook is a demonstration). Set the flag to False to let
# KAGGLE_KERNEL_RUN_TYPE decide.
FORCE_DEVELOPMENT_RUN = True
kaggle_run_type = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
if FORCE_DEVELOPMENT_RUN or kaggle_run_type == 'Interactive':
    print("Running on development environment")
    # Only the first N_INTERACTIVE EEG files are preprocessed.
    eeg_list = eeg_list[:N_INTERACTIVE]
elif kaggle_run_type == 'Batch':
    print("Running on production environment")

# Preprocess EEGs

In [None]:
# Output directory for the preprocessed EEG parquet files.
preprocessed_eeg_dir = "train_eegs"
os.makedirs(preprocessed_eeg_dir, exist_ok=True)

# EEG ids that still contain missing values after interpolation; they are
# skipped here and later removed from train.csv.
ignore_parquet_eeg_ids = []
for p_eeg in eeg_list:
    # The file name (without extension) is the EEG id.
    eeg_id = int(Path(p_eeg).stem)
    eeg = interpolate(pd.read_parquet(p_eeg))
    if eeg.isna().any().any():  # discard EEG (still contains missing values)
        ignore_parquet_eeg_ids.append(eeg_id)
        continue
    # Pass both customizable wavelet parameters so that changing
    # WAVELET_DECOMPOSITION_TREE_LEVEL actually takes effect.
    eeg = denoise(
        eeg,
        wavelet=WAVELET_TYPE,
        level=WAVELET_DECOMPOSITION_TREE_LEVEL,
    )
    eeg.to_parquet(os.path.join(preprocessed_eeg_dir, f"{eeg_id}.parquet"))

ignore_parquet_num = len(ignore_parquet_eeg_ids)
print(f"{ignore_parquet_num=}/{len(eeg_list)}")

# Preprocess train.csv

In [None]:
# Drop the label rows whose EEG was discarded during preprocessing.
# NOTE(review): rows whose EEG was never processed (files outside eeg_list in
# the reduced run) are not removed here — confirm this is intended.
preprocessed_traindf = train_df.loc[
    ~train_df["eeg_id"].isin(ignore_parquet_eeg_ids)
].copy()
# index=False: do not write the pandas index as an extra unnamed first column,
# so the file keeps the same columns as the original train.csv.
preprocessed_traindf.to_csv("train.csv", index=False)

# That's all for preprocessing.
# You can use the preprocessed dataset just like the original train dataset!
# Tell me if you notice any problems, no matter how trivial!!

# Note: The Kaggle environment cannot write large output files, so this notebook is only a demonstration.  
# If you want the preprocessed data, you can get it [here](https://www.kaggle.com/datasets/nakagawaren0805/hms-preprocessed-dataset)