In [None]:
!pip install \
   --requirement /kaggle/input/hms-hbac-offline-libs/requirements.txt \
   --no-index \
   --find-links file:///kaggle/input/hms-hbac-offline-libs/wheels

In [None]:
# All imports in this code block

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import webcolors as wc
import math

from koilerplate import INPUT_ROOT, WORKING_ROOT, TEMP_ROOT
from pathlib import Path
from enum import Enum
from typing import List, Tuple
from dataclasses import dataclass

from eeglib.helpers import Helper
from eeglib.eeg import EEG


In [None]:
# Resolve the competition data directory underneath the input root
INPUT_PATH = Path(INPUT_ROOT)
COMPETITION_DATA_PATH = INPUT_PATH.joinpath("hms-harmful-brain-activity-classification")
COMPETITION_DATA_PATH

In [None]:
# Read the training set CSV into a dataframe
train_info = pd.read_csv(COMPETITION_DATA_PATH.joinpath("train.csv"))
train_info

## Basic data visualisation

There's not a lot to look at in the basic training csv file. Just look at the expert categories to see if they are roughly the same order of magnitude each.

### Expert consensus

Just look at the raw data from the training file.

In [None]:
# Histogram of how often each expert consensus category occurs
sns.histplot(data=train_info, x="expert_consensus")

### Votes cast

We only know that there is a panel of experts, some presumably will not have voted and some may
vote more based on either the spectrogram or the raw eeg.

In [None]:
# Do we know how many experts looked at each eeg trace?
# Begin with the per-row total of the votes over all six categories.
vote_columns = [
    "seizure_vote",
    "lpd_vote",
    "gpd_vote",
    "lrda_vote",
    "grda_vote",
    "other_vote",
]
# skipna=False keeps the semantics of adding the columns with "+":
# a missing vote makes the whole row total missing.
train_info["vote_count"] = train_info[vote_columns].sum(axis=1, skipna=False)
train_info

In [None]:
# For every row, record the largest vote count seen within its EEG ID group
# and within its spectrogram ID group
for id_column, max_column in (
    ("eeg_id", "max_vote_count_eeg"),
    ("spectrogram_id", "max_vote_count_spectrogram"),
):
    votes_by_id = train_info.groupby(id_column)["vote_count"]
    train_info[max_column] = votes_by_id.transform("max")
train_info

In [None]:
# Count the rows whose own vote count is below the group maximum, and the rows
# where the EEG-group maximum and the spectrogram-group maximum disagree
total_rows = len(train_info)

below_eeg_max = train_info["vote_count"] < train_info["max_vote_count_eeg"]
fewer_eeg_votes = train_info.loc[below_eeg_max]
fewer_eeg_votes_rows = len(fewer_eeg_votes)

below_spectrogram_max = train_info["vote_count"] < train_info["max_vote_count_spectrogram"]
fewer_spectrogram_votes = train_info.loc[below_spectrogram_max]
fewer_spectrogram_votes_rows = len(fewer_spectrogram_votes)

maxima_differ = train_info["max_vote_count_eeg"] != train_info["max_vote_count_spectrogram"]
inconsistent_max_votes = train_info.loc[maxima_differ]
inconsistent_max_votes_rows = len(inconsistent_max_votes)

print(f"{fewer_eeg_votes_rows=}/{total_rows}; {fewer_eeg_votes_rows/total_rows : .2%}")
print(f"{fewer_spectrogram_votes_rows=}/{total_rows}; {fewer_spectrogram_votes_rows/total_rows : .2%}")
print(f"{inconsistent_max_votes_rows=}/{total_rows}; {inconsistent_max_votes_rows/total_rows : .2%}")

In [None]:
# Among the rows with inconsistent maxima, see which grouping has the smaller maximum
eeg_max_is_smaller = (
    inconsistent_max_votes["max_vote_count_eeg"] < inconsistent_max_votes["max_vote_count_spectrogram"]
)
inconsistent_fewer_eeg_votes = inconsistent_max_votes.loc[eeg_max_is_smaller]
inconsistent_fewer_eeg_votes_rows = len(inconsistent_fewer_eeg_votes)

spectrogram_max_is_smaller = (
    inconsistent_max_votes["max_vote_count_spectrogram"] < inconsistent_max_votes["max_vote_count_eeg"]
)
inconsistent_fewer_spectrogram_votes = inconsistent_max_votes.loc[spectrogram_max_is_smaller]
inconsistent_fewer_spectrogram_votes_rows = len(inconsistent_fewer_spectrogram_votes)

print(
    f"{inconsistent_fewer_eeg_votes_rows=}/{inconsistent_max_votes_rows}; "
    f"{inconsistent_fewer_eeg_votes_rows/inconsistent_max_votes_rows : .2%}"
)
print(
    f"{inconsistent_fewer_spectrogram_votes_rows=}/{inconsistent_max_votes_rows}; "
    f"{inconsistent_fewer_spectrogram_votes_rows/inconsistent_max_votes_rows : .2%}"
)

#### Discussion

So, what do we know:
1. Sometimes experts will not agree on a label and will not cast a vote.
2. The spectrogram IDs have more votes associated with them than the eeg ids.
   This is probably to be expected, as the spectrograms cover a longer period of time
   than the EEG traces.
3. This probably means that the size of the panel could be derived from the max
   number of votes associated with the spectrogram ID rather than the eeg ID.
   
Therefore, to convert our votes to probabilities we use the max count of spectrogram ID votes.

### Probabilities

We'll divide the vote numbers by the maximum number of votes grouped by spectrogram IDs

In [None]:
# Turn each vote column into a probability column by dividing by the largest
# vote count seen for the row's spectrogram ID
vote_to_probability = {
    "seizure_vote": "P_sz",
    "lpd_vote": "P_lpd",
    "gpd_vote": "P_gpd",
    "lrda_vote": "P_lrda",
    "grda_vote": "P_grda",
    "other_vote": "P_other",
}
for vote_column, probability_column in vote_to_probability.items():
    train_info[probability_column] = train_info[vote_column] / train_info["max_vote_count_spectrogram"]

In [None]:
# Average each probability column over every row of the training set
probability_totals = [
    train_info[column].sum()
    for column in ("P_sz", "P_lpd", "P_gpd", "P_lrda", "P_grda", "P_other")
]
P_all = np.asarray(probability_totals) / total_rows
P_all

In [None]:
# Show the same numbers as a one-row table
probability_labels = ["P_sz", "P_lpd", "P_gpd", "P_lrda", "P_grda", "P_other"]
P_df = pd.DataFrame(data=P_all.reshape(1, -1), columns=probability_labels)
P_df

In [None]:
# Total of the averaged probabilities; not expected to come out as 1.0
np.sum(P_all)

### Level of agreement

From the competition overview:

- 'idealized': High level of expert agreement
- 'proto patterns':  Cases where ~1/2 of experts give a label as “other” and ~1/2
   give one of the remaining five labels.
- 'edge cases': Where experts are approximately split between 2 of the 5 named patterns

> Not that easy to program!

In [None]:
# My stab at categorizing the agreement based on the vague description
def compute_agreement(P_sz:float, P_lpd:float, P_gpd:float, P_lrda:float, P_grda:float, P_other:float) -> str:
    """Classify the level of expert agreement for one set of category probabilities.

    The six inputs are first normalised to sum to 1.0, then only the three
    largest are kept and re-normalised among themselves. The result is:

    - "idealized":     the top category holds more than 75% and is not "other".
    - "proto-pattern": the top two together hold more than 75% and one of them is "other".
    - "edge-case":     the top two together hold more than 75% and neither is "other".
    - "none":          anything else, including a dominant "other" and rows that
                       carry no usable votes (total of zero or NaN).

    Ties are broken by argument order (sz, lpd, gpd, lrda, grda, other).
    """
    p_dict = { "sz": P_sz, "lpd": P_lpd, "gpd": P_gpd, "lrda": P_lrda, "grda": P_grda, "other": P_other}
    # The probabilities of a row are not guaranteed to add up to 1.0, so rescale.
    p_tot = sum(p_dict.values())
    # "not (x > 0)" is also true for NaN: a row without votes has no agreement
    # to classify and must not end in a division by zero.
    if not (p_tot > 0):
        return "none"
    # Rank by normalised probability, largest first. The sort is stable, so
    # equal probabilities keep the argument order.
    ranked = sorted(
        ((name, value / p_tot) for name, value in p_dict.items()),
        key=lambda kv: kv[1],
        reverse=True,
    )
    # Ignore all but the three highest probabilities and rescale those three.
    (first, p_first), (second, p_second), (_, p_third) = ranked[:3]
    p_top_3 = p_first + p_second + p_third
    p_first /= p_top_3
    p_second /= p_top_3
    if p_first > 0.75:
        # A dominant "other" is not agreement on a pattern
        return "idealized" if first != "other" else "none"
    if (p_first + p_second) > 0.75:
        # The first two categories combined are significant
        if first == "other" or second == "other":
            return "proto-pattern"
        return "edge-case"
    return "none"

In [None]:
# Label every training row with its agreement category
probability_columns = ["P_sz", "P_lpd", "P_gpd", "P_lrda", "P_grda", "P_other"]
train_info["agreement"] = train_info.apply(
    lambda row: compute_agreement(*(row[column] for column in probability_columns)),
    axis=1,
)
train_info

In [None]:
# Histogram of how often each agreement category occurs
sns.histplot(data=train_info, x="agreement")

## EEG Data exploration

Load in an arbitrary sample and have a look at the data

In [None]:
# Pick one arbitrary row of the training table to inspect (positional index)
SAMPLE = 7_777
sample_info = train_info.iloc[SAMPLE, :]
sample_info

In [None]:
# Read the parquet file of the EEG that the chosen row refers to
sample_eeg_path = COMPETITION_DATA_PATH / "train_eegs" / f"{sample_info['eeg_id']}.parquet"
sample_eeg = pd.read_parquet(sample_eeg_path)
sample_eeg

In [None]:
# Derive a time axis (in seconds) from the row number.
# The sampling rate is given in the competition data set info.
EEG_SAMPLING_RATE = 200
sample_period = 1 / EEG_SAMPLING_RATE
sample_eeg["time_offset"] = sample_eeg.index * sample_period
sample_eeg

In [None]:
# Work out the bounds of a 10 second window centred on the event location
# NOTE(review): the window is positioned on the EEG time axis using
# "spectrogram_label_offset_seconds"; presumably "eeg_label_offset_seconds" is
# the offset that belongs to the EEG file - confirm against the data description.
EXTRACT_TIME = 10
half_window = EXTRACT_TIME / 2
eeg_label_begin = sample_info["spectrogram_label_offset_seconds"] - half_window
eeg_label_end = eeg_label_begin + EXTRACT_TIME
print(f"Extracting sample: {eeg_label_begin} <= time_offset < {eeg_label_end}.")

In [None]:
# Keep only the samples inside the window (copy, so the full recording is not
# touched) and shift the time axis so that the label offset sits at zero
in_window = sample_eeg["time_offset"].ge(eeg_label_begin) & sample_eeg["time_offset"].lt(eeg_label_end)
extracted_eeg = sample_eeg.loc[in_window].copy()
extracted_eeg["time_offset"] = extracted_eeg["time_offset"] - sample_info["spectrogram_label_offset_seconds"]
extracted_eeg

## Single ended vs differential

Looks like the data is single-ended (which is good) but we probably need to make it differential to produce the same type of plots in the sample data.

In [None]:
# List the column names of the recording, one per line
for column_name in sample_eeg.columns.tolist():
    print(column_name)

In [None]:
# Electrode pairs ("A-B" means A minus B), grouped by region of the head
LL_EEG_CHANNELS = ["Fp1-F7", "F7-T3", "T3-T5", "T5-O1"]  # left lateral (LL)
LP_EEG_CHANNELS = ["Fp1-F3", "F3-C3", "C3-P3", "P3-O1"]  # left parasagittal (LP)
CC_EEG_CHANNELS = ["Fz-Cz", "Cz-Pz"]                     # central
RP_EEG_CHANNELS = ["Fp2-F4", "F4-C4", "C4-P4", "P4-O2"]  # right parasagittal (RP)
RL_EEG_CHANNELS = ["Fp2-F8", "F8-T4", "T4-T6", "T6-O2"]  # right lateral (RL)
# Columns that are carried along unchanged
AUX_EEG_COLUMNS = ["EKG", "time_offset"]

# Column layout of a differential EEG frame: regions left to right, auxiliaries last
DIFF_EEG_COLUMNS = [
    *LL_EEG_CHANNELS,
    *LP_EEG_CHANNELS,
    *CC_EEG_CHANNELS,
    *RP_EEG_CHANNELS,
    *RL_EEG_CHANNELS,
    *AUX_EEG_COLUMNS,
]
DIFF_EEG_COLUMNS

In [None]:
# Make a new dataframe with differential channels, rather than single ended
def make_differential(df:pd.DataFrame, columns:List[str] = DIFF_EEG_COLUMNS) -> pd.DataFrame:
    """Return a new dataframe holding exactly `columns`, in that order.

    A column name of the form "A-B" is computed as ``df["A"] - df["B"]`` when
    both source columns are present in `df`. Every other requested column
    (names without a "-", or "A-B" columns that already exist while their
    sources do not) is taken from `df` unchanged, so the function can be
    re-applied to a frame that is already differential.

    The input frame is never modified.

    Raises:
        KeyError: if a requested column can neither be computed nor found in `df`.
    """
    differential = {}
    for column in columns:
        electrodes = column.split("-")
        if len(electrodes) == 2 and all(name in df.columns for name in electrodes):
            differential[column] = df[electrodes[0]] - df[electrodes[1]]
    # assign() builds a new frame, so no defensive copy and no in-place drop of
    # the single-ended columns is needed: the final selection leaves them out.
    return df.assign(**differential)[columns]

In [None]:
# Convert the extracted window from single-ended to differential channels
extracted_diff_eeg = make_differential(extracted_eeg, columns=DIFF_EEG_COLUMNS)
extracted_diff_eeg

## Plotting

We're going to define a plotting process that can plot the dataframe columns as a grouped, stacked line plot using a combination of the seaborn and matplotlib libraries.

In [None]:
# Define a dataclass that defines a group of channels to be plotted
@dataclass
class PlotGroup:
    """A set of channels that are plotted with one shared colour scheme."""
    # Names of the dataframe columns to plot, one axes per channel
    channels: List[str]
    # Line colour used for every trace of the group
    fg_color: str
    # Face (background) colour of every axes of the group
    bg_color: str

In [None]:
# This function will plot a group of channels from the dataframe
def eeg_group_plot(df: pd.DataFrame, x: str, plot_group: PlotGroup, axes:List[plt.Axes], xlim:Tuple[float, float] = (-5.0, +5.0)) -> None:
    """Draw every channel of `plot_group` as a line plot on its own axes.

    Args:
        df: dataframe holding the `x` column and one column per channel.
        x: name of the column used for the horizontal axis.
        plot_group: channels to draw plus their line and background colours.
        axes: one axes per channel, in channel order. Channels and axes are
            paired with zip(), so surplus entries on either side are ignored.
        xlim: limits applied to the horizontal axis of every axes.
    """
    for channel, ax in zip(plot_group.channels, axes):
        sns.lineplot(data=df, x=x, y=channel, ax=ax, color=plot_group.fg_color)
        ax.set_facecolor(plot_group.bg_color)
        ax.set_xlim(xlim)
        # Horizontal label to the left of the trace, so stacked axes stay readable
        ax.set_ylabel(channel, rotation=0, fontsize=12, horizontalalignment='right', verticalalignment='center')

In [None]:
# This function will plot and stack multiple groups together
def eeg_plot(df: pd.DataFrame, x: str, plot_groups: "List[PlotGroup]", fig_height=12, fig_width=12) -> None:
    """Plot all channels of all groups as one vertical stack of line plots.

    Args:
        df: dataframe holding the `x` column and every channel column.
        x: name of the column used for the horizontal axis.
        plot_groups: groups to draw, top to bottom; each channel gets its own axes.
        fig_height: figure height passed to matplotlib.
        fig_width: figure width passed to matplotlib.
    """
    num_channels = sum(len(plot_group.channels) for plot_group in plot_groups)
    # squeeze=False always returns a 2-D array of axes. Without it a stack of
    # exactly one channel yields a bare Axes object that cannot be sliced.
    figure, axes_grid = plt.subplots(num_channels, 1, squeeze=False)
    axes = axes_grid[:, 0]
    figure.subplots_adjust(hspace=0)
    figure.set_figheight(fig_height)
    figure.set_figwidth(fig_width)
    axis_num = 0
    for plot_group in plot_groups:
        # Hand each group the next run of axes, one per channel
        num_group_chans = len(plot_group.channels)
        group_axes = axes[axis_num:axis_num+num_group_chans]
        eeg_group_plot(df, x, plot_group, group_axes)
        axis_num += num_group_chans
    figure.show()

In [None]:
# Now define what the plot groupings are and what the colours are to use.
# We'll plot from the left side of the head to the right side, front to back.
# We'll use red for left side and green for right (as per international navigation lights!)
# EKG will be last channel
# The channel lists are copies of the region constants, so the two cannot drift
# apart. wc.name_to_hex() is the lookup function of webcolors; recent releases
# no longer expose the CSS3_NAMES_TO_HEX mapping.
DIFF_PLOT_GROUPS = [
    PlotGroup(list(LL_EEG_CHANNELS), 'red', wc.name_to_hex("lightpink")),
    PlotGroup(list(LP_EEG_CHANNELS), 'red', wc.name_to_hex("lightsalmon")),
    PlotGroup(list(CC_EEG_CHANNELS), 'black', wc.name_to_hex("gainsboro")),
    PlotGroup(list(RP_EEG_CHANNELS), 'green', wc.name_to_hex("darkseagreen")),
    PlotGroup(list(RL_EEG_CHANNELS), 'green', wc.name_to_hex("lightseagreen")),
    PlotGroup(["EKG"], 'blue', wc.name_to_hex("powderblue")),
]

In [None]:
# Stacked plot of the differential channels before any preprocessing
eeg_plot(extracted_diff_eeg, x='time_offset', plot_groups=DIFF_PLOT_GROUPS)

## EEGLIB preprocessing

eeglib has sparse documentation and may/may not be useful. Let's just explore what can be done with the preprocessing functions it provides.

### eeglib Helper class

Looks like data has to start in a `Helper` object and we need to create this from a 
numpy array (as there's no way to directly import a parquet file).

**However:** If you simply create a `Helper` object from a numpy array the `Helper` 
only saves the reference to the data and then, potentially, modifies it during 
pre-processing. This is not a problem in a forward-running pipeline but when messing
around in the Notebook this can have unintended side-effects.

Therefore we create a helper function that safely creates the helper from the
DataFrame without danger of modifying the input dataframe.

In [None]:
# Cut-off settings of an optional band-pass filter; None (or 0) for a field
# means that side of the band is not filtered (see df_to_eeglib_helper).
# Units are presumably Hz - TODO confirm against the eeglib documentation.
@dataclass
class BandPass:
    # Lower edge of the pass band; forwarded to eeglib as its `highpass` argument
    low_cutoff: float|None
    # Upper edge of the pass band; forwarded to eeglib as its `lowpass` argument
    high_cutoff: float|None

In [None]:
# Makes an eeglib helper without potential side effects on the input dataframe
# Returns the helper and the data dropped when we made the helper
def df_to_eeglib_helper(
    df: pd.DataFrame,
    columns: List[str]=DIFF_EEG_COLUMNS,
    drop_columns: List[str]=AUX_EEG_COLUMNS,
    sample_rate:int=EEG_SAMPLING_RATE,
    window_size:int|None=None,
    band_pass:BandPass|None=None,
    normalize:bool=False,
    ica:bool=False
) -> Tuple[Helper, pd.DataFrame]:
    """Build an eeglib Helper from a dataframe without exposing the frame to eeglib.

    Args:
        df: source data; must contain every name in `columns` and `drop_columns`.
        columns: columns of interest; those not in `drop_columns` become the
            channels of the helper, in this order.
        drop_columns: columns that are not handed to eeglib.
        sample_rate: forwarded to Helper as `sampleRate`.
        window_size: forwarded to Helper as `windowSize`.
        band_pass: optional filter settings. `low_cutoff` is forwarded as
            `highpass` and `high_cutoff` as `lowpass`; a missing band_pass or a
            cutoff of None/0 is forwarded as None.
        normalize: forwarded to Helper as `normalize`.
        ica: forwarded to Helper as `ICA`.

    Returns:
        A tuple (helper, dropped_df) where dropped_df is a copy of
        ``df[drop_columns]``.
    """
    required_cols = [col for col in columns if col not in drop_columns]
    # Here is the all important copy: eeglib gets its own array to mutate
    data = df[required_cols].copy().to_numpy().transpose()
    dropped_df = df[drop_columns].copy()
    # "or None" turns a cutoff of 0 into "no filter", as the original check did
    highpass = (band_pass.low_cutoff or None) if band_pass else None
    lowpass = (band_pass.high_cutoff or None) if band_pass else None
    helper = Helper(
        data,
        sampleRate=sample_rate,
        names=required_cols,
        windowSize=window_size,
        highpass=highpass,
        lowpass=lowpass,
        normalize=normalize,
        ICA=ica,
    )
    return helper, dropped_df

In [None]:
# Convert a eeglib EEG object back into a standard pandas dataframe
def eeg_to_df(
    eeg: EEG,
    eeg_channels:List[str],
    restore_df:pd.DataFrame|None,
    columns:List[str]=DIFF_EEG_COLUMNS
) -> pd.DataFrame:
    """Copy the current window of `eeg` into a dataframe.

    Args:
        eeg: eeglib EEG object; ``eeg.window.window`` is read and transposed
            so that each channel becomes a column.
        eeg_channels: column names for the channels, in channel order.
        restore_df: optional extra columns to put back next to the channels.
            They are matched to the window rows by position.
        columns: columns of the result, in order.

    Returns:
        A new dataframe with `columns`. When `restore_df` is None only the
        requested columns that exist among the channels are returned, so the
        default column list (which contains auxiliary columns) still works.
    """
    df = pd.DataFrame(eeg.window.window.transpose(), columns=eeg_channels, copy=True)
    if restore_df is None:
        return df[[column for column in columns if column in df.columns]]
    # drop=True: only the positional alignment is wanted, not the old index
    # as an additional "index" column.
    df = df.join(restore_df.reset_index(drop=True))
    return df[columns]

In [None]:
# Run the extracted window through eeglib with band-pass filtering and
# normalisation enabled; aux_df keeps the columns that were not handed to eeglib
helper, aux_df = df_to_eeglib_helper(
    extracted_diff_eeg, 
    window_size=EEG_SAMPLING_RATE*EXTRACT_TIME,
    band_pass=BandPass(0.8, 40.0),
    normalize=True,
)
# Now we need to get an EEG object
# Note, looks like a bug in eeglib, can only iterate once so we'll collect the eegs in a list.
# There is only one eeg as we've told it the window size is the entire eeg
eegs = [eeg for eeg in helper]
eeg = eegs[0]
# Back to a dataframe (auxiliary columns restored) and plot the processed traces
eeglib_modified_df = eeg_to_df(eeg, helper.names, aux_df)
eeg_plot(eeglib_modified_df, 'time_offset', DIFF_PLOT_GROUPS)

### eeglib data input summary

So, we've ingested some of the EEG data and it's been pre-processed (basically filtered and normalized).

> Note that I can't get the ICA option to work, ignoring this!

It looks reasonably sensible, although there are definite edge effects (look at the trailing edge) caused by the
band-pass filter.

## eeglib features

So, why might we use eeglib?? The main reason appears to be that it can decompose
the time sequence eeg into a set of features. 
There is a list in the [eeglib features documentation](https://eeglib.readthedocs.io/en/latest/features.html)

There _may_ be some visual correlations discernible between the features it produces
and the expert votes in the training data.

So how to do this:

1. Read an EEG completely into an eeglib helper.
2. Decompose this into windows, by default eeglib will make 1 second windows.
3. Generate a feature set, per channel, for each of the 1 second segments.
4. Do some data exploration for correlation between expert votes and eeglib features.


In [None]:
# The complete raw EEG recording is still available in sample_eeg;
# convert all of it to differential channels
sample_eeg_diff = make_differential(sample_eeg, columns=DIFF_EEG_COLUMNS)
sample_eeg_diff

## Feature generation

Don't really know much about any of these feature types but we'll generate the single channel
features available in eeglib for the entire sample sliced into one second chunks

In [None]:
# Walk over the eeglib windows of the whole recording and collect every
# single-channel feature. Each list gets one entry per window, and each entry
# holds one value per channel.
feature_helper, _ = df_to_eeglib_helper(
    sample_eeg_diff,
    normalize=True,
    band_pass=BandPass(0.8, 40.0),
)
bp_alpha, bp_beta, bp_delta, bp_theta = [], [], [], []
pfd, hfd = [], []
hjorth_activity, hjorth_mobility, hjorth_complexity = [], [], []
samp_en, lzc, dfa = [], [], []
band_power_lists = (
    ("alpha", bp_alpha),
    ("beta", bp_beta),
    ("delta", bp_delta),
    ("theta", bp_theta),
)
for eeg in feature_helper:
    bp = eeg.bandPower()
    # Split the band power per band and convert it to dB
    for band, collected in band_power_lists:
        collected.append([10.0 * math.log10(ch[band]) for ch in bp])
    pfd.append(eeg.PFD())
    hfd.append(eeg.HFD())
    hjorth_activity.append(eeg.hjorthActivity())
    hjorth_mobility.append(eeg.hjorthMobility())
    hjorth_complexity.append(eeg.hjorthComplexity())
    samp_en.append(eeg.sampEn())
    lzc.append(eeg.LZC())
    dfa.append(eeg.DFA())
print("Done generating features")

In [None]:
# Assemble the collected features into a dataframe with one row per window and
# one "<channel>.<feature>" column per channel/feature combination
collected_features = {
    "bp_alpha": bp_alpha,
    "bp_beta": bp_beta,
    "bp_delta": bp_delta,
    "bp_theta": bp_theta,
    "pfd": pfd,
    "hfd": hfd,
    "hjorth_activity": hjorth_activity,
    "hjorth_mobility": hjorth_mobility,
    "hjorth_complexity": hjorth_complexity,
    "samp_en": samp_en,
    "lzc": lzc,
    "dfa": dfa,
}
df_as_dict = {}
for chan_idx, col_name in enumerate(feature_helper.names):
    for feature_label, per_window_values in collected_features.items():
        df_as_dict[f"{col_name}.{feature_label}"] = [
            feature_values[chan_idx] for feature_values in per_window_values
        ]
feature_df = pd.DataFrame.from_dict(df_as_dict)
feature_df


In [None]:
# Rescale every column to the range 0..1 (min-max normalization), which suits
# dB values and feature values better than z-scoring
column_min = feature_df.min()
column_max = feature_df.max()
feature_df = (feature_df - column_min) / (column_max - column_min)
feature_df

In [None]:
# Add a time column: the row number as a float
# (presumably seconds, as the windows are one second long - TODO confirm)
feature_df = feature_df.copy()  # copy first, pandas warns that the dataframe is fragmented
feature_df["time_offset"] = feature_df.index.astype(float)
feature_df

In [None]:
# That is a lot of numbers! Condense them into one mean value per region and feature.
EEG_REGIONS = [
    ("LL", LL_EEG_CHANNELS),
    ("LP", LP_EEG_CHANNELS),
    ("CC", CC_EEG_CHANNELS),
    ("RP", RP_EEG_CHANNELS),
    ("RL", RL_EEG_CHANNELS),
]
FEATURE_NAMES = [
    "bp_alpha", "bp_beta", "bp_delta", "bp_theta",
    "pfd", "hfd",
    "hjorth_activity", "hjorth_mobility", "hjorth_complexity",
    "samp_en", "lzc", "dfa",
]
for region_name, region_chans in EEG_REGIONS:
    for feature_name in FEATURE_NAMES:
        # Average the feature over all channels that belong to the region
        member_columns = [".".join((chan, feature_name)) for chan in region_chans]
        region_mean = feature_df.loc[:, member_columns].mean(axis=1)
        feature_df[f"{region_name}.{feature_name}"] = region_mean
feature_df

## Pivot the training data on time axis

In order to do some visualisation of the features we need to map the training
data into a time series.

In [None]:
# Collect the label columns of every training row that belongs to the sample EEG
required_columns = [
    "eeg_label_offset_seconds",
    "expert_consensus",
    "P_sz", "P_lpd", "P_gpd", "P_lrda", "P_grda", "P_other",
    "agreement",
]
belongs_to_sample = train_info["eeg_id"] == sample_info["eeg_id"]
sample_train_info = train_info.loc[belongs_to_sample, required_columns].copy()
sample_train_info

In [None]:
# Attach the labels to the feature rows whose time offset equals the label
# offset; join() returns a new frame, so feature_df itself stays untouched
labels_by_offset = sample_train_info.set_index("eeg_label_offset_seconds")
labelled_features = feature_df.join(labels_by_offset, on="time_offset", rsuffix="_label")
labelled_features

In [None]:
# Rows without a label got NaN from the join: use probability 0 and category "na"
probabilities = ["P_sz", "P_lpd", "P_gpd", "P_lrda", "P_grda", "P_other"]
categories = ["expert_consensus", "agreement"]
for columns_to_fill, fill_value in ((probabilities, 0), (categories, "na")):
    labelled_features[columns_to_fill] = labelled_features[columns_to_fill].replace(np.nan, fill_value)
labelled_features

In [None]:
# Short names of the regions, in the order in which they are defined
REGION_MNEMONICS = [region[0] for region in EEG_REGIONS]
REGION_MNEMONICS

In [None]:
def feature_violinplot(
    data: pd.DataFrame,
    feature_name:str,
    x:str="expert_consensus",
    regions:List[str]=REGION_MNEMONICS,
    fig_height:float=4,
    fig_width:float=12
) -> None:
    """Draw one violin plot per region for a single feature, side by side.

    Args:
        data: dataframe holding the `x` column and one
            "<region>.<feature_name>" column per region.
        feature_name: feature to plot.
        x: column whose categories are spread along the horizontal axis.
        regions: region mnemonics, one subplot each, left to right.
        fig_height: figure height passed to matplotlib.
        fig_width: figure width passed to matplotlib.
    """
    # squeeze=False always returns a 2-D array of axes. Without it a single
    # region yields a bare Axes object that cannot be iterated.
    figure, axes_grid = plt.subplots(1, len(regions), squeeze=False)
    figure.subplots_adjust(wspace=0)
    figure.set_figheight(fig_height)
    figure.set_figwidth(fig_width)
    for position, (region, ax) in enumerate(zip(regions, axes_grid[0])):
        sns.violinplot(data=data, x=x, y=f"{region}.{feature_name}", ax=ax)
        # The plots share the fixed range 0..1, so only the leftmost one
        # needs a visible y axis.
        ax.set_ylim((0.0, 1.0))
        ax.title.set_text(region)
        ax.yaxis.set_visible(position == 0)
        ax.set_xlabel(None)
        ax.set_ylabel(None)
    figure.supylabel(f"{feature_name.upper()}\nby region")
    figure.show()

In [None]:
# One row of violin plots per feature
for feature_name in FEATURE_NAMES:
    feature_violinplot(data=labelled_features, feature_name=feature_name, fig_height=2)

#### Discussion

So, can we see any correlation? I'm not sure; however, what we are looking at is
statistically insignificant - we're only looking at a single EEG sequence from
a single patient.

The next step would be to feature-ise the entire training data and build a
dataset containing all of the features across all of the traces.

Then, maybe, we'll see some sort of correlation...