In [None]:
import os, gc
import pandas as pd, numpy as np
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

# intro
* Hi! In this notebook, I will demonstrate how to extract signal features from EEG signals and perform basic exploratory data analysis (EDA).
* The notebook focuses on obtaining 9 key signal features: 'peak_to_peak', 'kurtosis', 'crest_factor', 'zero_cross_ratio', 'shape_factor', 'impulse_factor', 'clearance_factor', 'skewness', and 'rms'.
* Additionally, basic EDA will be conducted, including the creation of a box plot for visualization.
* If you're interested in exploring other signal features or enhancing your EDA, this notebook serves as a helpful guide.
* I recommend creating multimodal analyses using these features with spectrograms for a more comprehensive understanding.

## Create dataset
* We will load all unique EEG signals into memory, slice EEG sub-data, and calculate features for each EEG signal.
* For this task, we will utilize the Numba JIT compiler, as calculating features for 100,000 EEG sub-signals, each with 20 signal channels and 9 features, can be time-consuming.
* Numba significantly accelerates this process, making it 10 times faster.

#### load data

In [None]:
# Root folder of the competition data and the training metadata table.
dataset_path = '/kaggle/input/hms-harmful-brain-activity-classification'
train_csv_path = dataset_path + '/train.csv'
training_meta = pd.read_csv(train_csv_path)
# Preview the first rows.
training_meta.head()

In [None]:
# Sorted list of the distinct EEG ids. Read straight from the column instead
# of via groupby(...).sum(), which aggregated the other columns only to read
# the (sorted) group keys back from the resulting index.
unique_eeg_id_list = np.sort(training_meta["eeg_id"].unique()).tolist()

In [None]:
%%time
# Load every unique EEG recording into memory once, keyed by eeg_id.
# tqdm wraps the list itself rather than an enumerate() generator, so it knows
# the total and can render a real progress bar; the unused counter is dropped.
eeg_arr = {}
for eeg_id in tqdm(unique_eeg_id_list):
    eeg_arr[eeg_id] = pd.read_parquet(dataset_path+f'/train_eegs/{eeg_id}.parquet')
    
    

#### get features

In [None]:
# Names of the per-channel statistics, in the order get_features returns them.
feature_name = [
    'peak_to_peak',
    'kurtosis',
    'crest_factor',
    'zero_cross_ratio',
    'shape_factor',
    'impulse_factor',
    'clearance_factor',
    'skewness',
    'rms',
]
features_n = len(feature_name)  # statistics computed per channel
signal_n = 20                   # signal channels per EEG
sampling_rate = 200             # samples per second

In [None]:
from numba import jit

@jit(nopython=True)
def get_features(signal: np.ndarray):
    """Compute nine time-domain statistics of a 1-D signal.

    Returns a list ordered as: peak_to_peak, kurtosis, crest_factor,
    zero_cross_ratio, shape_factor, impulse_factor, clearance_factor,
    skewness, rms.

    The kurtosis is the plain fourth standardized moment (no "- 3" excess
    correction). Several terms divide by the standard deviation, the RMS or
    the mean absolute value, and nothing here guards against a zero
    denominator.
    """
    n = len(signal)
    magnitude = np.abs(signal)
    peak = np.max(magnitude)
    mean = np.mean(signal)
    std = np.std(signal)
    centered = signal - mean
    mean_abs = np.sum(magnitude) / n

    # indices where two consecutive samples differ in sign
    sign_changes = np.where(np.diff(np.sign(signal)))[0]

    peak_to_peak = np.float32(np.max(signal) - np.min(signal))
    kurtosis = np.float32(np.sum(centered ** 4) / (n * std ** 4))
    rms = np.float32(np.sqrt(np.sum(signal**2) / n))
    crest_factor = np.float32(peak / rms)
    zero_cross_ratio = np.float32(len(sign_changes) / (n - 1))
    shape_factor = np.float32(rms / mean_abs)
    impulse_factor = np.float32(peak / mean_abs)
    # these two are intentionally left uncast, exactly as before
    clearance_factor = peak / np.float32((np.sum(np.sqrt(magnitude)) / n)**2)
    skewness = (np.sum(centered**3) / n) / (std**3)

    return [peak_to_peak, kurtosis, crest_factor, zero_cross_ratio, shape_factor, impulse_factor, clearance_factor, skewness, rms]

In [None]:
%%time

# Each training row describes one 50-second EEG window that starts at
# `eeg_label_offset_seconds` inside its recording.
window_seconds = 50
window_len = window_seconds * sampling_rate

features = np.zeros([len(training_meta), features_n * signal_n])

# Small jitter added to every channel so flat signals do not cause a division
# by zero inside get_features. It is drawn from a dedicated fixed-seed
# generator so the extracted features are the same on every run and NumPy's
# global random state is left untouched. Its length is derived from the window
# size instead of being a separate hard-coded number.
noise = np.random.default_rng(0).normal(0, 0.001, window_len)

# Channel names, read once up front instead of relying on a variable leaking
# out of the loop. NOTE(review): assumes every recording stores the same
# channels in the same order - confirm.
columns = eeg_arr[training_meta['eeg_id'].iloc[0]].columns

eeg_ids = training_meta['eeg_id'].to_numpy()
offsets_sec = training_meta['eeg_label_offset_seconds'].to_numpy()

# Iterate by position: `features` is a plain array, so its rows are addressed
# by position rather than by the frame's index labels.
for row_pos, (eeg_id, offset_sec) in enumerate(tqdm(zip(eeg_ids, offsets_sec), total=len(training_meta))):
    start_ind_sub_data = int(offset_sec * sampling_rate)
    end_ind_sub_data = start_ind_sub_data + window_len
    eeg_signals = eeg_arr[eeg_id].iloc[start_ind_sub_data: end_ind_sub_data].values

    for i in range(len(columns)):
        features[row_pos, i*features_n : (i+1)*features_n] = get_features(eeg_signals[:,i] + noise)

# Append the statistics to the metadata, channel by channel, as
# "<feature>_<channel>" columns.
for i, colname in enumerate(columns):
    feature_col_names = [feature + "_" + colname for feature in feature_name]
    training_meta.loc[:,feature_col_names] = features[:, i*features_n : (i+1)*features_n]

### basic eda using boxplot

In [None]:
# Feature column names, rebuilt the same way they were created
# (channel-major, "<feature>_<channel>") instead of being sliced off the frame
# with a hard-coded count of metadata columns.
features_col_list = [feature + "_" + colname for colname in columns for feature in feature_name]

# One box plot per (channel, feature): rows are channels, columns are features.
fig, axes = plt.subplots(signal_n, features_n, figsize=[40,120], squeeze=False)
for i in range(signal_n):
    for j in range(features_n):
        feature_name_i = features_col_list[j + features_n * i]
        # outliers are not drawn
        sns.boxplot(x='expert_consensus', y=feature_name_i, data=training_meta,
                    showfliers=False, ax=axes[i, j])
    

### train classifier

In [None]:
# Label column followed by every extracted feature column.
training_meta[['expert_consensus', *features_col_list]]

In [None]:
def encode_label(target_arr):
    """Map class names to integer class ids.

    Parameters
    ----------
    target_arr : iterable of rows
        Only the first element of each row (the class name) is read.

    Returns
    -------
    numpy.ndarray
        Column vector of shape (n, 1) with Seizure=0, LPD=1, GPD=2, LRDA=3,
        GRDA=4, Other=5.

    Raises
    ------
    KeyError
        If a class name is not one of the six listed above.
    """
    class_ids = dict(zip(('Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other'), range(6)))
    codes = []
    for row in target_arr:
        codes.append(class_ids[row[0]])
    return np.array(codes).reshape(-1, 1)

In [None]:
# train set
from sklearn.model_selection import GroupShuffleSplit, train_test_split

X = training_meta.loc[:,features_col_list].values
y = encode_label(training_meta.loc[:,['expert_consensus']].values)

# Several rows share one eeg_id and are windows cut from the same recording,
# so a purely row-wise random split would place closely related windows on
# both sides and make the validation scores look better than they are.
# Hold out ~10% of the recordings instead, keeping each recording entirely in
# either the training or the validation part.
# NOTE(review): if a patient identifier is available, grouping by patient
# would be stricter still - confirm.
groups = training_meta['eeg_id'].values
splitter = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=7)
train_idx, valid_idx = next(splitter.split(X, y, groups=groups))
X_train, X_valid = X[train_idx], X[valid_idx]
y_train, y_valid = y[train_idx], y[valid_idx]
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

In [None]:
from xgboost import XGBClassifier

# Multi-class classifier with the 'multi:softprob' objective; the validation
# split is passed as eval_set with both metrics and per-round logging enabled.
eval_metrics = ['merror','mlogloss']
model = XGBClassifier(objective='multi:softprob', eval_metric=eval_metrics)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=True,
)

### feature importance (SHAP)

In [None]:
import shap

In [None]:
%%time
# Build a tree-model SHAP explainer for the fitted classifier.
explainer = shap.TreeExplainer(model)
# Explain every row of the training matrix; the result is later indexed as
# [row, feature, class].
shap_values = explainer(X_train)

In [None]:
 #'Seizure' class feature importance
# The last axis of shap_values selects the class; index 0 is 'Seizure' in encode_label.
shap.summary_plot(shap_values[:,:,0], X_train, feature_names = features_col_list)

In [None]:
 #'LPD' class feature importance
# The last axis of shap_values selects the class; 'LPD' is encoded as 1 in
# encode_label (this cell previously re-plotted class 0, i.e. 'Seizure').
shap.summary_plot(shap_values[:,:,1], X_train, feature_names = features_col_list)

In [None]:
 #'GPD' class feature importance
# The last axis of shap_values selects the class; 'GPD' is encoded as 2 in
# encode_label (this cell previously re-plotted class 0, i.e. 'Seizure').
shap.summary_plot(shap_values[:,:,2], X_train, feature_names = features_col_list)

In [None]:
 #'LRDA' class feature importance
# The last axis of shap_values selects the class; 'LRDA' is encoded as 3 in
# encode_label (this cell previously re-plotted class 0, i.e. 'Seizure').
shap.summary_plot(shap_values[:,:,3], X_train, feature_names = features_col_list)

In [None]:
 #'GRDA' class feature importance
# The last axis of shap_values selects the class; 'GRDA' is encoded as 4 in
# encode_label (this cell previously re-plotted class 0, i.e. 'Seizure').
shap.summary_plot(shap_values[:,:,4], X_train, feature_names = features_col_list)

In [None]:
 #'Other' class feature importance
# The last axis of shap_values selects the class; 'Other' is encoded as 5 in
# encode_label (this cell previously re-plotted class 0, i.e. 'Seizure').
shap.summary_plot(shap_values[:,:,5], X_train, feature_names = features_col_list)

### prediction

In [None]:
# Load the test metadata table.
test_csv_path = dataset_path + '/test.csv'
test_meta = pd.read_csv(test_csv_path)

In [None]:
# Display the test metadata table.
test_meta

In [None]:
# get test features
# The whole test file is treated as one window, so the jitter must have the
# same number of samples. NOTE(review): assumes every test recording holds
# exactly 50 s * sampling_rate samples - confirm.
window_len = 50 * sampling_rate

features = np.zeros([len(test_meta), features_n * signal_n])

# Small jitter so flat channels do not cause a division by zero inside
# get_features. A dedicated fixed-seed generator keeps the features identical
# on every run and leaves NumPy's global random state untouched.
noise = np.random.default_rng(0).normal(0, 0.001, window_len)

# Channel names are captured explicitly from the first file rather than
# leaking out of the loop. NOTE(review): assumes all files share the same
# channels in the same order - confirm.
columns = None

# Iterate by position: `features` is a plain array, so its rows are addressed
# by position rather than by the frame's index labels.
for row_pos, eeg_id in enumerate(tqdm(test_meta['eeg_id'].to_numpy())):
    eeg_frame = pd.read_parquet(dataset_path+f'/test_eegs/{eeg_id}.parquet')
    if columns is None:
        columns = eeg_frame.columns
    eeg_signals = eeg_frame.values

    for i in range(len(columns)):
        features[row_pos, i*features_n : (i+1)*features_n] = get_features(eeg_signals[:,i] + noise)

# Append the statistics to the metadata, channel by channel, as
# "<feature>_<channel>" columns.
for i, colname in enumerate(columns):
    feature_col_names = [feature + "_" + colname for feature in feature_name]
    test_meta.loc[:,feature_col_names] = features[:, i*features_n : (i+1)*features_n]

In [None]:
# Display the test metadata again, now with the extracted feature columns attached.
test_meta

In [None]:
# Select the feature columns by name, in the order given by features_col_list.
X_test = test_meta[features_col_list].to_numpy()
print(X_test.shape)

In [None]:
# Per-class probabilities for every test row, stored as float32.
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities.astype(np.float32)
print(y_pred_proba.shape)

In [None]:
# Submission columns, listed in the same order as the integer class ids used
# for training (Seizure=0, LPD=1, GPD=2, LRDA=3, GRDA=4, Other=5).
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
sub = pd.DataFrame({'eeg_id': test_meta.eeg_id.values})
sub[TARGETS] = y_pred_proba
# The former `sub.iloc[0,1:] = sub.iloc[0,1:].values` line wrote the first row
# back onto itself and changed nothing, so it has been removed.
sub.to_csv('submission.csv',index=False)
print(f'Submission shape: {sub.shape}')
sub.head()

In [None]:
# Sum of the six class probabilities in the first submission row.
sub.iloc[0, 1:].to_numpy().sum()