# Analysis Goal
In this competition, our aim is to classify brain activity using EEG and spectrogram recordings. The train.csv file includes segment IDs for both EEG and spectrogram data, along with expert consensus on brain states during specific periods. This notebook focuses on transforming EEG signals into images by extracting features like PSD, band power, spectral centroid, and a custom spectrogram (distinct from the dataset's provided spectrogram). We plan to use a pretrained CNN model on these EEG-derived images for classification. Additionally, we will train a separate CNN solely on the dataset's spectrograms to create a composite voting classifier for the final submission.

# Load Data
Here, we provide a brief overview of the data structure.

In [None]:
import pandas as pd
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pickle
import joblib
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, Dropout, GlobalAveragePooling2D
from sklearn.preprocessing import StandardScaler, LabelEncoder, normalize, MinMaxScaler
from tensorflow.keras.applications import InceptionResNetV2
from sklearn.metrics import confusion_matrix, accuracy_score
from tensorflow.keras.utils import Sequence
import scipy.signal
from scipy.signal import coherence, cwt, ricker
from sklearn.decomposition import PCA

# Point at the competition input directory and read the training metadata table
root = '/kaggle/input/hms-harmful-brain-activity-classification/'
train_csv_path = os.path.join(root, 'train.csv')
data = pd.read_csv(train_csv_path)
data

In [None]:
# Peek at one raw EEG recording. The path is built from `root` so the dataset
# location is spelled out in a single place.
sample_train_eeg = pd.read_parquet(root + "train_eegs/1000913311.parquet")
sample_train_eeg

In [None]:
# Peek at one of the spectrograms shipped with the dataset. The path is built
# from `root` so the dataset location is spelled out in a single place.
sample_train_spec = pd.read_parquet(root + "train_spectrograms/1000086677.parquet")
sample_train_spec

# Feature Engineering
To create manageable-sized images from EEG time series for CNN training, we have developed several lightweight feature extraction functions. You can find more details in this [tutorial](https://towardsdatascience.com/cnns-for-audio-classification-6244954665ab).

In [None]:
# Sampling frequency of the EEG signals in Hz. The feature functions below each
# take their own `fs` argument (default 200) rather than reading this global.
fs = 200  # Sampling frequency

def calculate_psd(eeg_signal, fs=200, nfft=260):
    """Return the leading periodogram power values of a signal.

    Args:
        eeg_signal: 1-D sequence of samples.
        fs: Sampling frequency in Hz.
        nfft: FFT length handed to ``scipy.signal.periodogram``.

    Returns:
        The first 130 entries of the power spectral density estimate (fewer
        when the periodogram itself has fewer than 130 bins).
    """
    _, power_density = scipy.signal.periodogram(eeg_signal, fs=fs, nfft=nfft)
    leading_bins = slice(0, 130)
    return power_density[leading_bins]

def calculate_band_power(eeg_signal, band, fs=200, window_size=200, step_size=77):
    """Compute the power inside a frequency band over sliding windows.

    For every window the Welch power spectral density is estimated and
    integrated (trapezoidal rule) over the frequencies lying in ``band``.

    Args:
        eeg_signal: 1-D sequence of samples.
        band: Pair ``(low, high)`` of band edges in Hz, both inclusive.
        fs: Sampling frequency in Hz.
        window_size: Number of samples per window (also used as ``nperseg``).
        step_size: Number of samples between consecutive window starts.

    Returns:
        1-D array with one band-power value per window; empty when the signal
        is shorter than ``window_size``.
    """
    # np.trapz was renamed to np.trapezoid in NumPy 2.0 and the old name is
    # deprecated; prefer the new name and fall back on older NumPy versions.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz

    # Work on a plain array so window slicing is purely positional.
    samples = np.asarray(eeg_signal)
    low, high = band[0], band[1]

    band_powers = []
    for start in range(0, len(samples) - window_size + 1, step_size):
        segment = samples[start:start + window_size]
        f, Pxx = scipy.signal.welch(segment, fs, nperseg=window_size)
        in_band = np.logical_and(f >= low, f <= high)  # built once, used for both arrays
        band_powers.append(integrate(Pxx[in_band], f[in_band]))
    return np.array(band_powers)

def calculate_spectral_centroid(eeg_signal, fs=200, window_size=200, step_size=77):
    """Compute the spectral centroid of a signal over sliding windows.

    The centroid of a window is the power-weighted mean frequency of its
    periodogram; a window whose total power is exactly zero yields 0.

    Args:
        eeg_signal: 1-D sequence of samples.
        fs: Sampling frequency in Hz.
        window_size: Number of samples per window.
        step_size: Number of samples between consecutive window starts.

    Returns:
        1-D array with one centroid value per window.
    """
    def window_centroid(offset):
        window = eeg_signal[offset:offset + window_size]
        freqs, power = scipy.signal.periodogram(window, fs)
        total_power = power.sum()
        if total_power == 0:
            return 0
        return (freqs * power).sum() / total_power

    last_start = len(eeg_signal) - window_size
    return np.array([
        window_centroid(offset)
        for offset in range(0, last_start + 1, step_size)
    ])

def calculate_eeg_spectrogram(eeg_signal, fs=200):
    """Return the spectrogram of a signal using 256-sample windows.

    Consecutive windows overlap by half their length and the FFT length equals
    the window length.

    Args:
        eeg_signal: 1-D sequence of samples.
        fs: Sampling frequency in Hz.

    Returns:
        2-D array of spectral power as produced by
        ``scipy.signal.spectrogram`` (frequency bins by time segments).
    """
    window_length = 256
    settings = dict(
        fs=fs,
        nperseg=window_length,
        noverlap=window_length // 2,  # 50% overlap
        nfft=window_length,
    )
    # Only the power matrix is needed; frequencies and times are discarded.
    return scipy.signal.spectrogram(eeg_signal, **settings)[2]

def padding(array, xx, yy):
    """Zero-pad a 2-D array up to ``xx`` rows by ``yy`` columns.

    The original content is placed roughly in the middle; when the amount of
    padding along an axis is odd, the extra row/column goes at the end. An
    axis that is already at least as long as its target is left untouched
    (nothing is cropped).

    Args:
        array: 2-D array to pad.
        xx: Target number of rows.
        yy: Target number of columns.

    Returns:
        The padded array.
    """
    def split_padding(target, current):
        before = max((target - current) // 2, 0)
        after = max(target - before - current, 0)
        return before, after

    row_pad = split_padding(xx, array.shape[0])
    col_pad = split_padding(yy, array.shape[1])
    return np.pad(array, pad_width=(row_pad, col_pad), mode='constant')

## Data Generator
Since EEG data is not directly available in `train.csv`, a data generator is defined to fetch the EEG data corresponding to each `train.csv` entry. This generator also performs feature engineering, ultimately producing images for training. Each generated image starts as a tensor of depth 20, because the EEG recordings contain 19 channels at different locations as well as an EKG channel, which may contain useful cardiac information. The tensor is then transformed into a 3-channel tensor by PCA to reduce the data dimension and meet the input requirement of the pretrained model.

In [None]:
class EEGDataGenerator(Sequence):
    """Keras ``Sequence`` that turns rows of a label table into EEG feature images.

    Every row must provide ``eeg_id`` and ``eeg_label_offset_seconds`` plus the
    vote columns named in the module-level ``vote_list``. The matching recording
    is read from ``<root>train_eegs/<eeg_id>.parquet``.

    NOTE: ``__getitem__`` reads the module-level names ``eeg_samples_per_second``,
    ``eeg_seconds_per_subsample`` and ``vote_list``; they must be defined before
    the first batch is requested.
    """

    def __init__(self, df, batch_size, root, max_size=130, scaler=None, **kwargs):
        """Store the generator configuration.

        Args:
            df: Table with one row per labelled EEG segment.
            batch_size: Number of rows consumed per batch.
            root: Path prefix of the dataset (``'train_eegs/'`` is appended to it).
            max_size: Height every per-channel feature block is padded to.
            scaler: Optional scaler object; stored on the instance but not used
                by the feature pipeline in this class.
            **kwargs: Forwarded to the ``Sequence`` base initializer.
        """
        # Keras 3 expects Sequence (PyDataset) subclasses to run the base
        # initializer; skipping it produces a warning and ignores worker
        # settings.
        super().__init__(**kwargs)
        self.df = df
        self.batch_size = batch_size
        self.root = root
        self.max_size = max_size
        self.scaler = scaler if scaler is not None else StandardScaler()

    def _channel_features(self, channel_data):
        """Build the padded, min-max scaled feature block of a single channel."""
        scaler = MinMaxScaler()

        # Calculate features
        psd = calculate_psd(channel_data)
        band_power_ts = calculate_band_power(channel_data, (8, 12), fs=200, window_size=256, step_size=128)
        spectral_centroid_ts = calculate_spectral_centroid(channel_data, fs=200, window_size=256, step_size=128)
        spectrogram = calculate_eeg_spectrogram(channel_data)

        # The 1-D features become single columns; the spectrogram stays 2-D and
        # is therefore scaled column by column by MinMaxScaler.
        blocks = [
            scaler.fit_transform(feature.reshape(-1, 1))
            for feature in (psd, band_power_ts, spectral_centroid_ts)
        ]
        blocks.append(scaler.fit_transform(spectrogram))

        # Pad only the height; each block keeps its own width.
        return np.hstack([
            padding(block, self.max_size, block.shape[1]) for block in blocks
        ])

    def generate_eeg_features(self, eeg_data):
        """Convert one EEG segment into a 3-channel feature image.

        Args:
            eeg_data: DataFrame whose columns are the recorded channels.

        Returns:
            Array of shape ``(height, width, 3)``: the per-channel feature
            blocks stacked along the last axis and reduced to 3 components by a
            PCA fitted on this segment alone.
        """
        channel_features_list = [
            self._channel_features(eeg_data[channel]) for channel in eeg_data.columns
        ]

        # Stack all channel features depth-wise to create a 3D tensor
        eeg_tensor = np.stack(channel_features_list, axis=-1)

        # Flatten the spatial axes so each pixel is one PCA sample
        height, width, channels = eeg_tensor.shape
        eeg_tensor_reshaped = eeg_tensor.reshape(-1, channels)  # (height * width, channels)

        # Replace NaN values, which PCA cannot handle
        eeg_tensor_reshaped = np.nan_to_num(eeg_tensor_reshaped)

        # Apply PCA to reduce to 3 channels
        pca = PCA(n_components=3)
        eeg_tensor_pca = pca.fit_transform(eeg_tensor_reshaped)  # (height * width, 3)

        # Reshape back to image layout
        return eeg_tensor_pca.reshape(height, width, 3)

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Rows whose EEG segment is shorter than the expected length are skipped,
        so a batch may hold fewer than ``batch_size`` samples.

        Returns:
            Tuple ``(X, y)`` of feature images and vote fractions per row.
        """
        batch_df = self.df.iloc[idx * self.batch_size:(idx + 1) * self.batch_size]
        segment_length = eeg_samples_per_second * eeg_seconds_per_subsample
        X, y = [], []

        for _, row in batch_df.iterrows():
            eeg_id = row['eeg_id']
            eeg_offset_seconds = row['eeg_label_offset_seconds']
            eeg_data = pd.read_parquet(self.root + 'train_eegs/' + str(eeg_id) + '.parquet')

            # Segment the EEG data
            start_ind = int(eeg_samples_per_second * eeg_offset_seconds)
            subsample_eeg_data = eeg_data.iloc[start_ind:start_ind + segment_length]

            # Skip incomplete subsamples
            if len(subsample_eeg_data) != segment_length:
                continue

            X.append(self.generate_eeg_features(subsample_eeg_data))

            # Labels are the fraction of votes each class received
            votes = row[vote_list]
            total_votes = votes.sum()
            y.append([vote / total_votes for vote in votes])

        return np.array(X), np.array(y)

The dimensions of the images generated by the data generator are verified with a dummy EEG segment that has the same number of samples as a real one. Each channel yields one image, and all images are stacked along the depth direction, forming a 3D tensor for CNN training.

In [None]:
# A generator built on an empty table: only generate_eeg_features is used here,
# so neither the rows nor the root path are ever read.
dummy_df = pd.DataFrame()
dummy_batch_size = 1
eeg_generator = EEGDataGenerator(dummy_df, dummy_batch_size, '')

# Random stand-in for one EEG segment: 10000 samples across 20 named channels
channel_names = [f'Channel_{i}' for i in range(1, 21)]
sample_eeg_data = pd.DataFrame(
    np.random.randn(10000, len(channel_names)),
    columns=channel_names,
)

# Run the feature pipeline once and report the resulting image shape
eeg_image = eeg_generator.generate_eeg_features(sample_eeg_data)
print("Shape of the generated EEG image:", eeg_image.shape)

# Model Training
We start by defining constants for the data generator, setting the stage for training.

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Constants
eeg_samples_per_second = 200
eeg_seconds_per_subsample = 50

# Load the labels
vote_list = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

# Create data generator
batch_size = 32
data_generator = EEGDataGenerator(data, batch_size, root)

# Split data into training and validation sets.
# Several rows can point at the same recording (same eeg_id, different offset),
# so a plain row-level split would put windows of one recording on both sides
# and inflate the validation score. Splitting by eeg_id keeps each recording
# entirely in one set.
# NOTE(review): if the table also carries a patient-level identifier, grouping
# by it would be stricter still - confirm against train.csv.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(data, groups=data['eeg_id']))

# GroupShuffleSplit returns indices in table order; shuffle the training rows
# so a batch is not made up of neighbouring rows only.
train_df = data.iloc[train_idx].sample(frac=1, random_state=42)
val_df = data.iloc[val_idx]

train_generator = EEGDataGenerator(train_df, batch_size, root)
val_generator = EEGDataGenerator(val_df, batch_size, root)

print(train_df.shape, val_df.shape)

## Using a pre-trained InceptionResNetV2

In [None]:
# Load the pre-trained InceptionResNetV2 model (convolutional part only)
base_model = InceptionResNetV2(include_top=False, weights='imagenet', input_shape=eeg_image.shape)

# Freeze the layers of the base model so only the new head is trained
for layer in base_model.layers:
    layer.trainable = False

# Add custom layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
# The targets are vote fractions that sum to 1 for every sample, so the head
# must output a probability distribution: softmax instead of independent
# sigmoids.
predictions = Dense(len(vote_list), activation='softmax')(x)

# Create the final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model.
# KL divergence compares the predicted distribution with the soft vote targets;
# binary cross-entropy would treat the six classes as independent yes/no
# problems. The 'accuracy' metric name is kept so the history keys used in the
# plot below stay the same.
model.compile(optimizer='adam', loss=tf.keras.losses.KLDivergence(), metrics=['accuracy'])

# Train the model
history = model.fit(train_generator, epochs=10, validation_data=val_generator)

# Plot training results
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Next
CNN for spectrogram

# Submission

In [None]:
# Test metadata and recordings.
# NOTE(review): assumed to sit next to the training files under `root`
# (test.csv and test_eegs/) - confirm against the competition data layout.
test_df = pd.read_csv(root + 'test.csv')
test_eeg_dir = root + 'test_eegs/'

# Number of samples fed to the feature pipeline, same as during training
segment_length = eeg_samples_per_second * eeg_seconds_per_subsample

test_eeg_data_list = []
for eeg_id in test_df['eeg_id']:
    eeg_data = pd.read_parquet(os.path.join(test_eeg_dir, str(eeg_id) + '.parquet'))

    # Test segments are taken from the start of each recording
    subsample_eeg_data = eeg_data.iloc[:segment_length]

    # Generate features for this subsample
    feature_image = eeg_generator.generate_eeg_features(subsample_eeg_data)

    # Add a leading batch dimension so the images can be concatenated
    test_eeg_data_list.append(feature_image[np.newaxis, ...])

# Predict using the CNN model: one batched call instead of one call per sample
if test_eeg_data_list:
    predictions = model.predict(np.concatenate(test_eeg_data_list, axis=0))
    # Rescale each row to sum to 1 so every prediction is a distribution over
    # vote_list, whatever the model's final activation is.
    predictions = predictions / predictions.sum(axis=1, keepdims=True)
else:
    predictions = np.empty((0, len(vote_list)))

solution_list = [
    {'eeg_id': eeg_id, **dict(zip(vote_list, row_predictions))}
    for eeg_id, row_predictions in zip(test_df['eeg_id'], predictions)
]

# Explicit columns keep the header intact even when there are no test rows
solution_df = pd.DataFrame(solution_list, columns=['eeg_id'] + vote_list)
solution_df.to_csv("submission.csv", index=False)
print(solution_df)