## HMS - Harmful Brain Activity Classification

## 1. Setup

In [None]:
import os
from tqdm import tqdm
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root directory of the competition files; every other path in the notebook is built from it
competition_dataset_directory = Path('/kaggle/input') / 'hms-harmful-brain-activity-classification'

In [None]:
def visualize_categorical_column_distribution(df, column, title, path=None):

    """
    Draw a bar chart of how often each distinct value of a categorical column occurs

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe that contains the categorical column

    column: str
        Name of the categorical column to count

    title: str
        Text written above the plot

    path: path-like str or None
        Output file of the figure (if path is None, the figure is shown with the selected backend instead)
    """

    value_counts = df[column].value_counts()
    n_categories = value_counts.shape[0]
    bar_positions = np.arange(n_categories)
    # Every tick label carries the category value together with its thousands-separated count
    bar_labels = [f'{value} ({count:,})' for value, count in value_counts.items()]

    # Figure height depends on the number of distinct values
    fig, ax = plt.subplots(figsize=(24, n_categories + 4), dpi=100)
    ax.bar(x=bar_positions, height=value_counts.values)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticks(bar_positions, bar_labels)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=15, pad=10)
    ax.set_title(title, size=20, pad=15)

    if path is not None:
        plt.savefig(path)
        plt.close(fig)
    else:
        plt.show()


def visualize_continuous_column_distribution(df, column, title, path=None):

    """
    Draw a 16 bin histogram of a continuous column and print its summary statistics in the title

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe that contains the continuous column

    column: str
        Name of the continuous column to plot

    title: str
        First line of the plot title (mean, median, std, min and max are appended below it)

    path: path-like str or None
        Output file of the figure (if path is None, the figure is shown with the selected backend instead)
    """

    values = df[column]

    fig, ax = plt.subplots(figsize=(24, 6), dpi=100)
    ax.hist(values, bins=16)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=15)
    ax.set_xlabel('')
    ax.set_ylabel('')

    # Whitespace inside the template is part of the rendered title, so it is kept as it is
    title_with_statistics = title + f'''
        Mean: {np.mean(values):.2f} Median: {np.median(values):.2f} Std: {np.std(values):.2f}
        Min: {np.min(values):.2f} Max: {np.max(values):.2f}
        '''
    ax.set_title(title_with_statistics, size=15, pad=12.5, loc='center', wrap=True)

    if path is not None:
        plt.savefig(path, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()

        
def visualize_correlations(df, columns, title, path=None):

    """
    Draw an annotated heatmap of the pairwise correlations between the given columns

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe that contains the given columns

    columns: list
        Names of the columns whose correlations are computed

    title: str
        Text written above the plot

    path: path-like str or None
        Output file of the figure (if path is None, the figure is shown with the selected backend instead)
    """

    correlations = df[columns].corr()

    fig, ax = plt.subplots(figsize=(20, 20), dpi=100)
    # The heatmap is drawn on the axes created above, which is also the current axes
    sns.heatmap(
        correlations,
        annot=True,
        fmt='.2f',
        annot_kws={'size': 12},
        cmap='coolwarm',
        square=True,
        ax=ax
    )
    # Colorbar of the heatmap is reachable through the first collection of the axes
    colorbar = ax.collections[0].colorbar
    colorbar.ax.tick_params(labelsize=15)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=10)
    ax.set_title(title, size=20, pad=15)

    if path is not None:
        plt.savefig(path)
        plt.close(fig)
    else:
        plt.show()


## 2. Introduction

The goal of this competition is to detect and classify seizures and other types of harmful brain activity in electroencephalography (EEG) data.

The dataset consists of EEG and spectrogram data. Each row of the training set represents:
* **50 second** long EEG sample
* Matched spectrogram of the EEG that covers a **10 minute** window

Both the EEG and its matched spectrogram are centered at the same time. There are more EEG files than spectrogram files because many of the raw samples were overlapping and some of them were merged into single files. Metadata of the training set allows you to extract the original subsets using the
`eeg_label_offset_seconds` and `spectrogram_label_offset_seconds` columns.

In [None]:
eeg_directory = competition_dataset_directory / 'train_eegs'
spectrogram_directory = competition_dataset_directory / 'train_spectrograms'
df_train = pd.read_csv(competition_dataset_directory / 'train.csv')

# Count the files on disk so that they can be compared with the number of metadata rows
eeg_file_count = len(os.listdir(eeg_directory))
spectrogram_file_count = len(os.listdir(spectrogram_directory))
print(f'Training Set Shape: {df_train.shape} - EEG Files: {eeg_file_count} - Spectrogram Files: {spectrogram_file_count}')

df_train

## 3. Targets

Target columns in training set are:

* `seizure_vote`: Seizure
* `lpd_vote`: Lateralized periodic discharges
* `gpd_vote`: Generalized periodic discharges
* `lrda_vote`: Lateralized rhythmic delta activity
* `grda_vote`: Generalized rhythmic delta activity
* `other_vote`: Other

Those target columns represent counts of annotator votes for a given brain activity class. There is also another column named `expert_consensus` which is the argmax of previously mentioned target columns. It is provided for convenience only. Value counts of `expert_consensus` are balanced to some extent.

In [None]:
# Bar chart of the number of rows per expert_consensus label
visualize_categorical_column_distribution(df_train, 'expert_consensus', 'expert_consensus Counts')

Number of annotators varies between 1 and 28 with an average of **7.26**. Since the number of annotators is not consistent in training samples, target columns should be evaluated based on vote ratios rather than counts.

In [None]:
target_columns = [
    'seizure_vote',
    'lpd_vote',
    'gpd_vote',
    'lrda_vote',
    'grda_vote',
    'other_vote',
]
# Row-wise sum of the vote columns is the number of votes given to that row
df_train['total_vote'] = df_train[target_columns].sum(axis='columns')

visualize_continuous_column_distribution(df_train, 'total_vote', 'total_vote Histogram')

Histograms of normalized target columns are visualized below. Target columns with high 0 and 0.7-1 bins indicate that annotators had fewer disagreements while labeling those brain activities. On the other hand, target columns with high 0.1-0.6 bins indicate that the brain activity is hard to detect and annotators had more disagreements.

In [None]:
normalized_target_columns = [f'{column}_normalized' for column in target_columns]
# Column vector of vote totals so that every vote column of a row is divided by the total of that row
total_votes = df_train['total_vote'].values[:, np.newaxis]
df_train[normalized_target_columns] = df_train[target_columns] / total_votes

for normalized_target_column in normalized_target_columns:
    visualize_continuous_column_distribution(
        df_train,
        normalized_target_column,
        f'Normalized Target Column {normalized_target_column}'
    )

Roughly half of the training set votes are unanimous. The count of samples with unanimous votes is **51037** and the count of samples with vote disagreements is **55763**.

In [None]:
# A row is unanimous when one of its normalized vote columns holds all of the votes
has_full_vote_share = df_train[normalized_target_columns].eq(1.0).any(axis=1)
df_train['unanimous_vote'] = has_full_vote_share.astype(np.uint8)

visualize_categorical_column_distribution(df_train, 'unanimous_vote', 'unanimous_vote Counts')

Annotator disagreements are very common in the medical domain because tasks are challenging even for the experts. However, there could be underlying patterns in their annotations. For example, targets don't have a similar number of unanimous votes, which clearly shows that experts had more disagreements on certain brain activities such as lateralized and generalized periodic discharges. On the contrary, experts had fewer disagreements while they were detecting seizures.

In [None]:
# Only the rows flagged as unanimous are counted per expert_consensus label
unanimous_mask = df_train['unanimous_vote'] == 1
visualize_categorical_column_distribution(
    df_train.loc[unanimous_mask],
    'expert_consensus',
    'Unanimous expert_consensus Counts'
)

Correlations of normalized target columns can be seen below. Target columns like `Seizure`, `GRDA` and `Other` have more blue intensity (stronger negative correlations with the other targets), thus their vote percentages are higher.

In [None]:
# Correlation heatmap of the normalized vote columns
visualize_correlations(df_train, normalized_target_columns, 'Normalized Target Columns Correlations')

## 4. EEGs

EEG stands for electroencephalogram. It is a test that measures and records the electrical activity in the brain. The brain cells communicate with each other through electrical impulses, and these electrical signals can be detected and recorded using electrodes placed on the scalp.

EEG is commonly used in clinical settings to diagnose and monitor various neurological disorders. In this case, the neurological disorders are seizure, generalized/lateralized periodic discharges, lateralized/generalized rhythmic delta activity and other.

There are **1950** patients in training set. Each patient has **8.76** EEGs on average and each EEG has **6.25** subsamples on average which adds up to **106800** labeled 50 second long EEG subsamples.

In [None]:
# Number of distinct EEGs per patient
df_patient_id_eeg_id_unique_counts = df_train.groupby('patient_id')[['eeg_id']].nunique()
# Number of distinct subsamples per EEG
df_eeg_id_eeg_sub_id_unique_counts = df_train.groupby('eeg_id')[['eeg_sub_id']].nunique()

visualize_continuous_column_distribution(
    df_patient_id_eeg_id_unique_counts,
    'eeg_id',
    'patient_id eeg_id nunique Distribution'
)
visualize_continuous_column_distribution(
    df_eeg_id_eeg_sub_id_unique_counts,
    'eeg_sub_id',
    'eeg_id eeg_sub_id nunique Distribution'
)

EEGs in this dataset use 10-20 system. The 10-20 system is a standardized method for electrode placement. It is widely used to ensure consistency and accuracy in EEG recordings across different laboratories and healthcare settings. The name "10-20" refers to the distances between adjacent electrode placements, which are either 10% or 20% of the total front-back or right-left distance of the skull.

The 10-20 system is widely used in clinical and research settings because it provides a standardized and reproducible way to place electrodes on the scalp. This consistency is essential for comparing EEG recordings across different individuals and studies.

Other variations of the system, such as the 10-10 system, involve additional electrode placements for more detailed mapping of the scalp. The choice of system depends on the specific requirements of the EEG recording and the clinical or research goals.

Electrode placements of 10-20 system can be seen on the image below.

![1020system](https://i.ibb.co/KrL8mGp/Screenshot-from-2024-01-13-13-34-15.png)

However, signals of A1 and A2 electrodes are not included but instead an additional EKG signal is added in this dataset. The EKG is for an electrocardiogram lead that records data from the heart. All of the columns in EEGs represent electrical signal on corresponding electrodes and `EKG` column represents the electrical signal on heart over time.

In [None]:
# Raw EEG file that is used as the running example in this section
example_eeg_path = eeg_directory / '1628180742.parquet'
df_eeg = pd.read_parquet(example_eeg_path)
df_eeg

All of the EEG data (for both train and test) was collected at a frequency of 200 samples per second so EEG `1628180742` is 90 seconds long. A `second` identifier column can be created by repeating each second 200 times.

In [None]:
sample_per_second = 200
# Length of the recording in seconds (fractional if the row count is not a multiple of the rate)
seconds = df_eeg.shape[0] / sample_per_second

# Second identifier is derived from the sample position itself, so it always has one value per row
# (repeating whole seconds raises a length mismatch when the last second is incomplete) and it
# follows sample_per_second instead of a second hard-coded 200. Values are integer second indices.
df_eeg['second'] = np.arange(df_eeg.shape[0]) // sample_per_second

The entire EEG signal of `1628180742` is visualized below.

In [None]:
def visualize_eeg_signal(df, title, path=None):

    """
    Plot the EEG channels stacked on a shared axes and the EKG signal on a second axes below them

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe with the 19 EEG channel columns listed below and the EKG column

    title: str
        Title of the EEG axes

    path: path-like str or None
        Output file of the figure (if path is None, the figure is shown with the selected backend instead)
    """

    eeg_columns = [
        'Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1',
        'Fz', 'Cz', 'Pz',
        'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2',
    ]
    ekg_column = 'EKG'
    eeg_spacing = 500
    time_steps = np.arange(0, df.shape[0])

    fig, axes = plt.subplots(figsize=(24, 24), nrows=2, height_ratios=[10, 1], dpi=100)
    eeg_ax, ekg_ax = axes[0], axes[1]

    # Each channel is shifted upwards by a multiple of the spacing so that the traces are drawn one above the other
    for column_idx, column in enumerate(eeg_columns):
        eeg_ax.plot(time_steps, df[column] + (eeg_spacing * column_idx), linewidth=0.5, color='black')

    # One tick per channel, placed 100 units below the vertical shift of that channel
    eeg_ax.set_yticks(np.arange(0, len(eeg_columns)) * eeg_spacing - 100)
    eeg_ax.set_yticklabels(eeg_columns)
    for axis_name in ('x', 'y'):
        eeg_ax.tick_params(axis=axis_name, labelsize=15)
    eeg_ax.set_xlabel('')
    eeg_ax.set_ylabel('')
    eeg_ax.set_title(title, size=15, pad=12.5, loc='center')

    ekg_ax.plot(time_steps, df[ekg_column], linewidth=0.5, color='black')
    # Automatically chosen tick positions are scaled by 1.5
    ekg_ax.set_yticks(np.array(ekg_ax.get_yticks()) * 1.5)
    for axis_name in ('x', 'y'):
        ekg_ax.tick_params(axis=axis_name, labelsize=12.5)
    ekg_ax.set_xlabel('')
    ekg_ax.set_ylabel('')
    ekg_ax.set_title('EKG', size=15, pad=12.5, loc='center')

    if path is not None:
        plt.savefig(path, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()


# Plot the whole example recording
visualize_eeg_signal(df_eeg, 'EEG 1628180742')

There are 9 subsamples for EEG `1628180742`. They can be retrieved using the metadata from the training set. `eeg_label_offset_seconds` is the start of that subsample and each subsample is 50 seconds long. Since the **central 10 seconds** are labeled for both EEGs and spectrograms, other parts may overlap with other subsamples.

In [None]:
# Training set rows that belong to the example EEG
df_train.loc[df_train['eeg_id'].eq(1628180742)]

All subsamples of EEG `1628180742` are visualized below.

In [None]:
subsample_seconds = 50
sampling_rate = 200
df_example_eeg_rows = df_train.loc[df_train['eeg_id'] == 1628180742].reset_index(drop=True)

for idx, row in df_example_eeg_rows.iterrows():

    # Offset of the subsample is given in seconds, so it is converted to row positions of the EEG
    offset_seconds = row['eeg_label_offset_seconds']
    start_idx = int(offset_seconds * sampling_rate)
    end_idx = int((offset_seconds + subsample_seconds) * sampling_rate)

    visualize_eeg_signal(
        df_eeg.iloc[start_idx:end_idx],
        f'EEG {row["eeg_id"]} - Subsample {row["eeg_sub_id"]} ({start_idx}-{end_idx}) - Label {row["expert_consensus"]}'
    )


Overlapping EEG subsamples might be problematic if they have inconsistent annotations. They can be found by checking the difference of `eeg_label_offset_seconds` and `expert_consensus`.

In [None]:
# Labels are encoded as consecutive integers starting from 1
expert_consensus_codes = {
    label: code
    for code, label in enumerate(['Seizure', 'LPD', 'GPD', 'LRDA', 'GRDA', 'Other'], start=1)
}
df_train['expert_consensus_encoded'] = df_train['expert_consensus'].map(expert_consensus_codes)

# Differences are taken between consecutive rows of the same EEG (first row of every EEG gets NaN)
eeg_groups = df_train.groupby('eeg_id')
eeg_label_offset_seconds_diff = eeg_groups['eeg_label_offset_seconds'].diff()
expert_consensus_encoded_diff = eeg_groups['expert_consensus_encoded'].diff()
df_train['eeg_label_offset_seconds_diff'] = eeg_label_offset_seconds_diff
df_train['expert_consensus_encoded_diff'] = expert_consensus_encoded_diff

There are 1551 subsamples that have different labels while overlapping by between 2 and 48 seconds. `eeg_label_offset_seconds_diff == x` means that the row overlaps another row by `50 - x` seconds and their targets are different. Since the subsample labels are based on the center 10 seconds, overlaps between 2 and 40 seconds don't cause any inconsistent labels. Target counts of overlapping 50 second subsamples are consistent with the entire dataset.

In [None]:
# Rows that start less than 50 seconds after the previous row of the same EEG and have a different label
starts_within_50_seconds = df_train['eeg_label_offset_seconds_diff'].lt(50)
label_changed = df_train['expert_consensus_encoded_diff'].ne(0)
condition = starts_within_50_seconds & label_changed

visualize_categorical_column_distribution(
    df_train.loc[condition],
    'expert_consensus',
    'Overlapping 50 Second Subsamples expert_consensus Counts'
)

Label inconsistency arises when subsamples have more than 40 seconds of overlap because their center 10 seconds start to overlap after that point. There are **594** subsamples that have more than **40** seconds of overlap with different target values. **109** of those subsamples even have 48 seconds of overlap with different target values. Those inconsistencies should be dealt with accordingly.

In [None]:
# Rows that start less than 10 seconds after the previous row of the same EEG and have a different label
starts_within_10_seconds = df_train['eeg_label_offset_seconds_diff'].lt(10)
label_changed = df_train['expert_consensus_encoded_diff'].ne(0)
condition = starts_within_10_seconds & label_changed

visualize_categorical_column_distribution(
    df_train.loc[condition],
    'eeg_label_offset_seconds_diff',
    'Overlapping 10 Second Subsamples eeg_label_offset_seconds_diff Counts'
)

## 5. EEG Metadata

Data that comes from multiple sources is very common in the medical domain and that could be the case for EEGs. Patients, EEG machines, configurations, etc. can lead to inconsistencies in the data collection process. The code below is used for metadata extraction in order to compare EEG subsamples.

In [None]:
df_eeg_metadata = []

for eeg_id, df_train_eeg in tqdm(df_train.groupby('eeg_id'), total=df_train['eeg_id'].nunique()):

    # Every EEG file is read once and all of its subsamples are cut from it
    df_eeg = pd.read_parquet(eeg_directory / f'{eeg_id}.parquet')

    for _, row in df_train_eeg.iterrows():

        # 50 second window that starts at the label offset (200 rows per second)
        offset_seconds = row['eeg_label_offset_seconds']
        start_idx = int(offset_seconds * 200)
        end_idx = int((offset_seconds + 50) * 200)
        df_eeg_subsample = df_eeg.iloc[start_idx:end_idx].reset_index(drop=True)

        # Per column statistics of the subsample, keyed by the suffix of the metadata column
        subsample_statistics = {
            'nan_count': df_eeg_subsample.isnull().sum(),
            'mean': df_eeg_subsample.mean(axis=0),
            'std': df_eeg_subsample.std(axis=0),
        }

        metadata_dict = {
            'eeg_id': eeg_id,
            'eeg_sub_id': row['eeg_sub_id'],
            'row_count': df_eeg_subsample.shape[0],
            'column_count': df_eeg_subsample.shape[1]
        }
        for suffix, statistic in subsample_statistics.items():
            metadata_dict.update({f'{k}_{suffix}': v for k, v in statistic.to_dict().items()})

        df_eeg_metadata.append(metadata_dict)

df_eeg_metadata = pd.DataFrame(df_eeg_metadata)

All of the EEG subsamples have consistent number of channels and timesteps.

In [None]:
# Row and column counts of the subsamples are plotted one after the other
for shape_column in ('row_count', 'column_count'):
    visualize_categorical_column_distribution(
        df_eeg_metadata,
        shape_column,
        f'EEG Metadata {shape_column} Counts'
    )

Mean and standard deviation histograms of EEG subsamples suggest that EEGs are on different scales. Instead of normalizing with dataset statistics, instance normalization makes more sense.

Missing count histograms of EEG subsamples are identical on all channels, so missing values either exist on all channels at the same time or they don't exist at all. Missing values could be related to EEG failures or it could be some kind of an anomaly.

In [None]:
def visualize_eeg_metadata_distribution(df, eeg_column, title, path=None):

    """
    Plot side by side histograms of the mean, std and nan_count metadata of an EEG column

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe with `{eeg_column}_mean`, `{eeg_column}_std` and `{eeg_column}_nan_count` columns

    eeg_column: str
        Name of the EEG column whose metadata is plotted

    title: str
        Title written above all three histograms

    path: path-like str or None
        Output file of the figure (if path is None, the figure is shown with the selected backend instead)
    """

    metadata_names = ['mean', 'std', 'nan_count']
    fig, axes = plt.subplots(figsize=(24, 6), ncols=3, dpi=100)

    for ax, metadata in zip(axes, metadata_names):
        values = df[f'{eeg_column}_{metadata}']
        ax.hist(values, bins=16)
        ax.tick_params(axis='x', labelsize=15)
        ax.tick_params(axis='y', labelsize=15)
        ax.set_xlabel('')
        ax.set_ylabel('')
        # Whitespace inside the template is part of the rendered title, so it is kept as it is
        ax.set_title(
            f'''
            {metadata}
            Mean: {np.mean(values):.2f} Median: {np.median(values):.2f} Std: {np.std(values):.2f}
            Min: {np.min(values):.2f} Max: {np.max(values):.2f}
            ''',
            size=15,
            pad=12.5,
            loc='center',
            wrap=True
        )

    fig.suptitle(title, fontsize=20, y=1.2)

    if path is not None:
        plt.savefig(path, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()


# EEG channels followed by the EKG column
eeg_ekg_columns = [
    'Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1',
    'Fz', 'Cz', 'Pz',
    'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2',
    'EKG'
]

for eeg_ekg_column in eeg_ekg_columns:
    visualize_eeg_metadata_distribution(df_eeg_metadata, eeg_ekg_column, f'EEG {eeg_ekg_column} Metadata')


## 6. Spectrograms

A spectrogram is a detailed view of an EEG that is able to represent time, frequency, and amplitude at the same time. It is a way to analyze the frequency content of a signal over time. Spectrograms are commonly used in signal processing, audio analysis, and other fields where understanding the frequency distribution of a signal is important.

A raw EEG is a waveform that displays changes in a signal’s amplitude over time. A spectrogram, however, displays changes in the frequencies in a signal over time. Amplitude is then represented as variable intensity.

There are **1950** patients in training set. Each patient has **5.71** spectrograms on average and each spectrogram has **9.59** subsamples on average which adds up to 106800 labeled 10 minute long spectrogram subsamples.

In [None]:
# Number of distinct spectrograms per patient
df_patient_id_spectrogram_id_unique_counts = df_train.groupby('patient_id')[['spectrogram_id']].nunique()
# Number of distinct subsamples per spectrogram
df_spectromram_id_spectrogram_sub_id_unique_counts = df_train.groupby('spectrogram_id')[['spectrogram_sub_id']].nunique()

visualize_continuous_column_distribution(
    df_patient_id_spectrogram_id_unique_counts,
    'spectrogram_id',
    'patient_id spectrogram_id nunique Distribution'
)
visualize_continuous_column_distribution(
    df_spectromram_id_spectrogram_sub_id_unique_counts,
    'spectrogram_sub_id',
    'spectrogram_id spectrogram_sub_id nunique Distribution'
)

Spectrograms in this dataset are montages of **10 minute** long EEGs, not the **50 second** long ones that are given. Thus, they are zoomed out versions of given raw EEGs.

Montages are arrangements of signals that are created to display activity over the entire head and to provide lateralizing and localizing information. There are two categories of montages which are bipolar and referential. There are also multiple types of bipolar montages, but the most common one is the double banana, in which each electrode is linked and compared to the one behind it.

In the double banana montage, there are two chains per side

* Left outside temporal chain involving Fp1 → F7 → T3 → T5 → O1
* Left inside parasagittal chain involving Fp1 → F3 → C3 → P3 → O1
* Right outside temporal chain involving Fp2 → F8 → T4 → T6 → O2
* Right inside parasagittal chain involving Fp2 → F4 → C4 → P4 → O2

Finally, the "z" electrodes Fz → Cz → Pz form a small central chain.

In [None]:
# Spectrogram file that is used as the running example in this section
example_spectrogram_path = spectrogram_directory / '353733.parquet'
df_spectrogram = pd.read_parquet(example_spectrogram_path)
df_spectrogram

Spectrogram files have **400 signal** and **1 time** columns in total.

Signal column names start with `LL` (left temporal), `LP` (left parasagittal), `RL` (right temporal) and `RP` (right parasagittal), which correspond to the chains that are listed above. The small central chain and the EKG signal are not included.

Signal column names end with a floating point number that represents the frequency in hertz (Hz). Those numbers are between 0.59 and 19.92 with a step size of 0.19/0.20.

Finally, there is a column named `time` which represents the time in **2** second steps, so a spectrogram with 300 rows is 600 seconds (10 minutes) long.

In [None]:
# Frequency is the part after the underscore in the names of the 100 columns that follow the first column
first_chain_columns = df_spectrogram.columns[1:101]
signal_column_frequencies = [float(column_name.split('_')[1]) for column_name in first_chain_columns]
print(signal_column_frequencies)

Spectrogram subsamples can be retrieved by taking 600 seconds after the start time (`spectrogram_label_offset_seconds` column).

The function below can be used to get spectrogram subsamples by passing the spectrogram dataframe and start time. First column (`time`) is not taken and data is transposed. Returned spectrogram array has frequency on the first axis and time on the second axis. Different chains are stacked on the channel axis.

In [None]:
def get_spectrogram(df_spectrogram, start_time, fill_na=False, log_scale=False):

    """
    Cut a 600 second window from the spectrogram dataframe and return it as an array

    Parameters
    ----------
    df_spectrogram: pandas.DataFrame
        Dataframe whose first column is `time` and is followed by 400 signal columns

    start_time: int
        Spectrogram offset seconds (an even value is incremented by one before the window is selected)

    fill_na: bool
        Whether to replace missing values with 0 or not

    log_scale: bool
        Whether to apply log(1 + x) to the values or not

    Returns
    -------
    spectrogram: numpy.ndarray of shape (n_frequencies, n_time_steps, n_channels)
        Array of spectrogram in which each group of 100 consecutive signal columns is one channel
    """

    # Even offsets are shifted to the next odd second
    if start_time % 2 == 0:
        start_time += 1
    end_time = start_time + 598

    # Both ends of the window are inclusive and the time column itself is dropped afterwards
    is_in_window = df_spectrogram['time'].between(start_time, end_time)
    df_spectrogram_subsample = df_spectrogram.loc[is_in_window].iloc[:, 1:]

    if fill_na:
        df_spectrogram_subsample = df_spectrogram_subsample.fillna(0)

    # Signals are transposed so that signal columns are on the first axis and time is on the second axis
    signals = df_spectrogram_subsample.values.T
    spectrogram = np.stack(
        [signals[chain_start:chain_start + 100, :] for chain_start in range(0, 400, 100)],
        axis=-1
    )

    return np.log1p(spectrogram) if log_scale else spectrogram


# Window at the beginning of the example spectrogram; missing values are kept and no log transform is applied
spectrogram = get_spectrogram(df_spectrogram, 0, fill_na=False, log_scale=False)

A spectrogram with seizure label is visualized below. Since the center 10 seconds are labeled, high amplitude in the center of right chains can be seen clearly. However, similar kind of high amplitude can't be seen on the left chains, so it means that activations can be anywhere including single electrodes.

In [None]:
def visualize_spectrogram(spectrogram, frequencies, channel_names, path=None):

    """
    Draw every channel of a spectrogram as an image on its own row

    Parameters
    ----------
    spectrogram: numpy.ndarray of shape (n_frequencies, n_time_steps, n_channels)
        Array of spectrogram

    frequencies: list of shape (n_frequencies)
        Frequencies that are used as y axis tick labels

    channel_names: list of shape (n_channels)
        Names that are used in the titles of the channels

    path: path-like str or None
        Output file of the figure (if path is None, the figure is shown with the selected backend instead)
    """

    n_channels = spectrogram.shape[2]
    fig, axes = plt.subplots(figsize=(32, 32), nrows=n_channels)

    # Only every fifth frequency gets a tick and a label
    tick_positions = np.arange(0, spectrogram.shape[0], 5)

    for channel in range(n_channels):
        ax = axes[channel]
        ax.imshow(spectrogram[:, :, channel], cmap='inferno')
        ax.set_yticks(tick_positions)
        ax.set_yticklabels(frequencies[::5])
        for axis_name in ('x', 'y'):
            ax.tick_params(axis=axis_name, labelsize=15)
        ax.set_xlabel('Time', size=15, labelpad=12.5)
        ax.set_ylabel('Frequency (Hz)', size=15, labelpad=12.5)
        ax.set_title(f'{channel_names[channel]} Spectrogram', size=15, pad=12.5, loc='center')

    if path is not None:
        plt.savefig(path, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()
        

# Titles of the four channels of the spectrogram array, in the order they are stacked
channel_names = [
    f'{chain} ({description} chain)'
    for chain, description in [
        ('LL', 'left temporal'),
        ('RL', 'right temporal'),
        ('LP', 'left parasagittal'),
        ('RP', 'right parasagittal'),
    ]
]

visualize_spectrogram(
    spectrogram=spectrogram,
    frequencies=signal_column_frequencies,
    channel_names=channel_names
)

## 7. Spectrogram Metadata

The code below is used for metadata extraction in order to compare spectrogram subsamples.

In [None]:
df_spectrogram_metadata = []

for spectrogram_id, df_train_spectrogram in tqdm(df_train.groupby('spectrogram_id'), total=df_train['spectrogram_id'].nunique()):

    # Every spectrogram file is read once and all of its subsamples are cut from it
    df_spectrogram = pd.read_parquet(spectrogram_directory / f'{spectrogram_id}.parquet')

    for _, row in df_train_spectrogram.iterrows():

        # Even offsets are shifted to the next odd second and the window ends 598 seconds later
        start_time = row['spectrogram_label_offset_seconds']
        if start_time % 2 == 0:
            start_time += 1
        end_time = start_time + 598

        # Both ends of the window are inclusive and the time column itself is dropped afterwards
        is_in_window = df_spectrogram['time'].between(start_time, end_time)
        df_spectrogram_subsample = df_spectrogram.loc[is_in_window].iloc[:, 1:]

        # Per column statistics of the subsample, keyed by the suffix of the metadata column
        subsample_statistics = {
            'nan_count': df_spectrogram_subsample.isnull().sum(),
            'mean': df_spectrogram_subsample.mean(axis=0),
            'std': df_spectrogram_subsample.std(axis=0),
        }

        metadata_dict = {
            'spectrogram_id': spectrogram_id,
            'spectrogram_sub_id': row['spectrogram_sub_id'],
            'row_count': df_spectrogram_subsample.shape[0],
            'column_count': df_spectrogram_subsample.shape[1]
        }
        for suffix, statistic in subsample_statistics.items():
            metadata_dict.update({f'{k}_{suffix}': v for k, v in statistic.to_dict().items()})

        df_spectrogram_metadata.append(metadata_dict)

df_spectrogram_metadata = pd.DataFrame(df_spectrogram_metadata)

All of the spectrogram subsamples have consistent number of frequencies and timesteps.

In [None]:
# Row and column counts of the subsamples are plotted one after the other
for shape_column in ('row_count', 'column_count'):
    visualize_categorical_column_distribution(
        df_spectrogram_metadata,
        shape_column,
        f'Spectrogram Metadata {shape_column} Counts'
    )


Mean and standard deviation histograms of spectrogram subsamples show that frequencies are on different scales. Lower frequencies have higher amplitudes, so they also have higher mean and standard deviations. As frequencies increase, their statistics decrease consistently but there are some minor exceptions.

Missing count histograms of spectrogram subsamples are same on all chains and frequencies, so missing values always occur at the same time or they don't exist at all just like EEGs.

In [None]:
def visualize_spectrogram_metadata_distribution(df, spectrogram_column, title, path=None):

    """
    Visualize mean, std and nan_count metadata distributions of the given spectrogram column

    Parameters
    ----------
    df: pandas.DataFrame
        Dataframe with {spectrogram_column}_mean, {spectrogram_column}_std and {spectrogram_column}_nan_count columns

    spectrogram_column: str
        Name of the spectrogram column (prefix of the metadata columns that are read)

    title: str
        Title of the figure

    path: path-like str or None
        Path of the output file or None (if path is None, plot is displayed with selected backend)
    """

    statistics = ('mean', 'std', 'nan_count')
    # Every line of the subplot titles starts with this literal run of spaces
    title_indent = ' ' * 12

    fig, axes = plt.subplots(figsize=(24, 6), ncols=len(statistics), dpi=100)

    # One histogram per metadata statistic, each annotated with its own summary statistics
    for ax, statistic in zip(axes, statistics):
        values = df[f'{spectrogram_column}_{statistic}']
        ax.hist(values, bins=16)

        for axis_name in ('x', 'y'):
            ax.tick_params(axis=axis_name, labelsize=15)
        ax.set_xlabel('')
        ax.set_ylabel('')

        subplot_title = (
            f'\n{title_indent}{statistic}\n'
            f'{title_indent}Mean: {np.mean(values):.2f} Median: {np.median(values):.2f} Std: {np.std(values):.2f}\n'
            f'{title_indent}Min: {np.min(values):.2f} Max: {np.max(values):.2f}\n'
            f'{title_indent}'
        )
        ax.set_title(subplot_title, size=15, pad=12.5, loc='center', wrap=True)

    # The figure title is placed above the multi-line subplot titles
    fig.suptitle(title, fontsize=20, y=1.2)

    if path is not None:
        plt.savefig(path, bbox_inches='tight')
        plt.close(fig)
    else:
        plt.show()


# Spectrogram column names follow the {chain}_{frequency} pattern and are listed chain by chain
spectrogram_columns = [
    f'{chain}_{frequency}'
    for chain in ['LL', 'RL', 'LP', 'RP']
    for frequency in signal_column_frequencies
]

# Plot the metadata distributions of every spectrogram column
for column in spectrogram_columns:
    visualize_spectrogram_metadata_distribution(
        df=df_spectrogram_metadata,
        spectrogram_column=column,
        title=f'Spectrogram {column} Metadata'
    )

## 8. Missing Values

Missing values exist in a small subset of both EEG and spectrogram subsamples. They always occur together across EEG channels or spectrogram chains and frequencies, so they are consistent across time steps.

In [None]:
# Metadata columns that hold the missing value counts of each EEG channel / spectrogram column
eeg_nan_count_columns = [column for column in df_eeg_metadata.columns.tolist() if 'nan_count' in column]
spectrogram_nan_count_columns = [column for column in df_spectrogram_metadata.columns.tolist() if 'nan_count' in column]

# Keep only the EEG subsamples with at least 1 missing value in any channel, largest counts first
eeg_has_missing_values = (df_eeg_metadata[eeg_nan_count_columns] > 0).any(axis=1)
df_na_eegs = df_eeg_metadata.loc[eeg_has_missing_values, ['eeg_id', 'eeg_sub_id'] + eeg_nan_count_columns]
df_na_eegs = df_na_eegs.sort_values(by='Fp1_nan_count', ascending=False).reset_index(drop=True)

# Keep only the spectrogram subsamples with at least 1 missing value in any column, largest counts first
spectrogram_has_missing_values = (df_spectrogram_metadata[spectrogram_nan_count_columns] > 0).any(axis=1)
df_na_spectrograms = df_spectrogram_metadata.loc[spectrogram_has_missing_values, ['spectrogram_id', 'spectrogram_sub_id'] + spectrogram_nan_count_columns]
df_na_spectrograms = df_na_spectrograms.sort_values(by='LL_0.59_nan_count', ascending=False).reset_index(drop=True)

# Number of unique EEGs and spectrograms that the affected subsamples belong to
na_eeg_count = df_na_eegs['eeg_id'].nunique()
na_spectrogram_count = df_na_spectrograms['spectrogram_id'].nunique()

# Percentages are over unique EEGs and spectrograms in df_train, not over subsamples
na_eeg_percentage = na_eeg_count / df_train['eeg_id'].nunique() * 100
na_spectrogram_percentage = na_spectrogram_count / df_train['spectrogram_id'].nunique() * 100

print(f'There are {df_na_eegs.shape[0]} EEG and {df_na_spectrograms.shape[0]} spectrogram subsamples with at least 1 missing value')
print(f'Subsamples are from {na_eeg_count} EEGs and {na_spectrogram_count} spectrograms')
print(f'{na_eeg_percentage:.2f}% of EEGs and {na_spectrogram_percentage:.2f}% of spectrograms have at least 1 subsample with a missing value')

Missing value count statistics are calculated on subsamples with at least 1 missing value. The average missing value count is **170.75** on EEGs and **75.63** on spectrograms. The maximum number of missing values is **5197** for EEGs and **296** for spectrograms.

Since spectrograms have fewer time steps, missing values make up a larger proportion of them compared to EEGs. Besides, the number of spectrogram subsamples with at least 1 missing value is more than twice the number of EEG subsamples with at least 1 missing value.

The reason for this is probably that spectrogram subsamples cover 10 minutes while EEG subsamples cover 50 seconds. Missing values are more likely to occur in larger time frames.

In [None]:
# Plot the missing value count distributions of the EEG and spectrogram subsamples with missing values
for df_na, nan_count_column, nan_count_title in (
    (df_na_eegs, 'Fp1_nan_count', 'EEG Subsamples NA Counts'),
    (df_na_spectrograms, 'LL_0.59_nan_count', 'Spectrogram Subsamples NA Counts'),
):
    visualize_continuous_column_distribution(
        df=df_na,
        column=nan_count_column,
        title=nan_count_title,
    )

The 5 EEG subsamples with the most missing values are visualized below. In all of them, the missing values are located either on the leftmost or the rightmost side. They aren't randomly scattered around.

EEG 1593385762 also shows that there are different kinds of anomalies that should be handled properly.

In [None]:
# (eeg_id, eeg_label_offset_seconds) pairs of the EEG subsamples to plot
for eeg_id, eeg_label_offset_seconds in [(3289898692, 36.0), (3931449367, 0), (1593385762, 0), (2190373347, 0), (975631111, 0)]:

    # Offsets in seconds are converted to row positions with a factor of 200 and the window spans 50 seconds
    subsample_rows = slice(
        int(eeg_label_offset_seconds * 200),
        int((eeg_label_offset_seconds + 50) * 200)
    )

    df_eeg = pd.read_parquet(eeg_directory / f'{eeg_id}.parquet')

    # First matching row of df_na_eegs provides the missing value count shown in the title
    missing_value_count = df_na_eegs.loc[df_na_eegs['eeg_id'] == eeg_id, 'Fp1_nan_count'].values[0]

    visualize_eeg_signal(
        df=df_eeg.iloc[subsample_rows],
        title=f'EEG {eeg_id} - {missing_value_count} Missing Values'
    )


The 5 spectrogram subsamples with the most missing values are visualized below. Unlike EEGs, all of the missing values are located on both sides of the center 10 seconds, which is the labeled part. It could be an indication of the EEG device being inactive, thus no amplitude was present.

In [None]:
# (spectrogram_id, spectrogram_label_offset_seconds) pairs of the spectrogram subsamples to plot
for spectrogram_id, spectrogram_label_offset_seconds in [(1925646794, 0), (314642970, 0), (1152072732, 0), (1250083997, 0), (1190299410, 0)]:

    spectrogram_path = spectrogram_directory / f'{spectrogram_id}.parquet'
    df_spectrogram = pd.read_parquet(spectrogram_path)

    # Missing values are left unfilled and no log scaling is requested for these plots
    visualize_spectrogram(
        spectrogram=get_spectrogram(
            df_spectrogram=df_spectrogram,
            start_time=spectrogram_label_offset_seconds,
            fill_na=False,
            log_scale=False
        ),
        frequencies=signal_column_frequencies,
        channel_names=channel_names
    )


## 9. Signal to Spectrogram

In [None]:
# To Be Continued