In [None]:
import pandas as pd
import pathlib

import matplotlib.pyplot as plt
import numpy as np

# Root of the competition dataset. PurePath only manipulates the path text;
# the resulting objects are handed to the pandas readers in later cells.
dset_path = pathlib.PurePath("/kaggle/input/hms-harmful-brain-activity-classification/")
# Directory holding one EEG parquet file per eeg_id (read by paired_eeg).
train_eegs = dset_path/"train_eegs"
# Directory holding one spectrogram parquet file per spectrogram_id (read by paired_spectrogram).
train_specs = dset_path/"train_spectrograms"

In [None]:
# Load the training metadata. The two label-offset columns are read as nullable
# integers ("Int64") and the consensus label as a categorical.
train_meta = pd.read_csv(dset_path/"train.csv", dtype={"eeg_label_offset_seconds": "Int64",
                                                       "spectrogram_label_offset_seconds": "Int64",
                                                       "expert_consensus": "category"})
# Print column dtypes and non-null counts as a quick sanity check.
train_meta.info()

In [None]:
def paired_eeg(eeg_id, offset, sample_rate=200, duration_seconds=50):
    """Load the EEG segment that a metadata row points at.

    Reads ``train_eegs/<eeg_id>.parquet`` and returns the rows covering
    ``duration_seconds`` seconds, starting ``offset`` seconds into the file.

    Parameters
    ----------
    eeg_id
        Identifier formatted into the parquet file name.
    offset : int-like
        Start of the segment, in seconds from the beginning of the file.
    sample_rate : int, default 200
        Number of rows that make up one second of recording.
    duration_seconds : int, default 50
        Length of the returned segment in seconds.

    Returns
    -------
    pandas.DataFrame
        Positional slice of the file; it is shorter than requested when the
        file ends before ``offset + duration_seconds``.
    """
    consolidated_eeg = pd.read_parquet(train_eegs/f"{eeg_id}.parquet")
    # iloc only accepts plain integers, so coerce the offset in case it
    # arrives as a float or a pandas/numpy scalar.
    start = int(offset) * sample_rate
    end = start + sample_rate * duration_seconds
    return consolidated_eeg.iloc[start:end]

def paired_spectrogram(spec_id, offset):
    """Load the spectrogram rows that a metadata row points at.

    Reads ``train_specs/<spec_id>.parquet`` and keeps the rows whose ``time``
    value lies in the closed interval ``[offset, offset + 600]``.
    """
    spec_file = train_specs/f"{spec_id}.parquet"
    full_spec = pd.read_parquet(spec_file)
    window_end = offset + 600
    # between() is inclusive on both ends by default, matching >= / <=.
    in_window = full_spec["time"].between(offset, window_end)
    return full_spec[in_window]

In [None]:
# Draw one metadata row to use as the running example for the plots below.
# No random_state is passed, so a different row is drawn on every run.
row = train_meta.sample(n=1).iloc[0,]
row

# Plotting
## EEG

In [None]:
# Load the EEG segment paired with the sampled row and display it.
eeg_df = paired_eeg(row["eeg_id"], row["eeg_label_offset_seconds"])
eeg_df

The EEG dataframe contains 50 seconds of data. To roughly replicate the plots in example_figures, we should subset it to the central 10 seconds.

In [None]:
def central_window_eeg(df, window_seconds=10, sample_rate=200):
    """Return the middle ``window_seconds`` seconds of an EEG segment.

    The window is located purely by row position, so the frame's index may
    start anywhere and does not have to be integer-valued.

    Parameters
    ----------
    df : pandas.DataFrame
        EEG segment with one row per sample.
    window_seconds : int or float, default 10
        Length of the window to keep, in seconds.
    sample_rate : int, default 200
        Number of rows per second.

    Returns
    -------
    pandas.DataFrame
        The centred slice of ``df``. When ``df`` holds fewer rows than the
        requested window, all of its rows are returned.
    """
    window_rows = int(window_seconds * sample_rate)
    # Clamp at 0: a negative start would make iloc count from the end of the
    # frame and return only its tail.
    new_start = max((len(df) - window_rows) // 2, 0)
    return df.iloc[new_start:new_start + window_rows]
    
# Preview the central 10 seconds of the loaded segment.
central_window_eeg(eeg_df, 10)

In [None]:
def plot_eeg(df, moving_avg=1):
    """Plot every column of an EEG frame as its own stacked, axis-free trace.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per channel; each column gets its own row in the figure,
        labelled with the column name.
    moving_avg : int, default 1
        Currently unused; kept so calls that pass it continue to work.

    Notes
    -----
    A later cell in this notebook defines ``plot_eeg`` again, so after a full
    run the name refers to that later definition.
    """
    n_channels = df.shape[1]
    # squeeze=False always yields a 2-D array of axes, even for one channel.
    fig, axs = plt.subplots(n_channels, 1, figsize=(15, 10), sharex=True, squeeze=False)
    for i, ax in enumerate(axs[:, 0]):
        ax.plot(df.iloc[:, i], color="black")
        ax.set_ylabel(df.columns[i], rotation=0)
        # Strip ticks and frame so only the trace and its label remain.
        ax.set_yticks([])
        ax.set_xticks([])
        ax.spines[["top", "bottom", "left", "right"]].set_visible(False)

# Plot the central 10 seconds of the segment, one row per column.
plot_eeg(central_window_eeg(eeg_df, 10))

There is a lot of noise in this plot, even when only plotting 10 seconds, especially for EKG.

In [None]:
# Spectrogram of the EKG column over the whole loaded segment. Fs=200 matches
# the 200-rows-per-second sampling; the return value is discarded via `_`.
_ = plt.specgram(eeg_df["EKG"], Fs=200)

Looking at a spectrogram of the data, the noise seems to be concentrated at 60 Hz and 0 Hz. https://www.gehealthcare.co.uk/insights/article/a-guide-to-ecg-signal-filtering confirms that this is common for this kind of data.

It seems to be relatively common to filter EEG data using a band-pass filter of 1-40 Hz or 1-30 Hz.

In [None]:
from scipy import signal

# Sampling rate of the EEG recordings in Hz (200 rows per second).
fs = 200
# Butterworth band-pass (N=4) keeping 1-40 Hz, expressed as second-order
# sections. An earlier experiment combined a 60 Hz notch (iirnotch) with a
# 10 Hz low-pass; the single band-pass below replaced it.
sos = signal.butter(4, [1, 40], btype="band", fs=fs, output="sos")

def filt(data):
    """Band-pass filter a signal with the module-level ``sos`` filter.

    ``sosfiltfilt`` runs the filter forwards and backwards, so the result has
    no phase shift relative to the input.

    Parameters
    ----------
    data : array-like
        Samples to filter; filtering is applied along the last axis.

    Returns
    -------
    numpy.ndarray
        Filtered samples with the same shape as ``data``.
    """
    return signal.sosfiltfilt(sos, data)

# Compare the EKG column before and after filtering, side by side with a shared y-axis.
to_filter = eeg_df["EKG"]
fig, axs = plt.subplots(1, 2, figsize=(15, 8), sharey=True)
# Left panel: the column exactly as loaded.
axs[0].plot(to_filter)
axs[0].set_title("Original signal")
# Right panel: the output of filt(), plotted without explicit x values.
axs[1].plot(filt(to_filter))
axs[1].set_title("Filtered")
In [None]:
def plot_eeg(df):
    """Plot every column of an EEG frame, band-pass filtered, as stacked traces.

    Each column is passed through ``filt`` before plotting and gets its own
    row in the figure, labelled with the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per channel.

    Notes
    -----
    This replaces the ``plot_eeg`` defined in an earlier cell of this
    notebook, which plotted the unfiltered columns.
    """
    n_channels = df.shape[1]
    # squeeze=False always yields a 2-D array of axes, even for one channel.
    fig, axs = plt.subplots(n_channels, 1, figsize=(15, 10), sharex=True, squeeze=False)
    for i, ax in enumerate(axs[:, 0]):
        ax.plot(filt(df.iloc[:, i]), color="black")
        ax.set_ylabel(df.columns[i], rotation=0)
        # Strip ticks and frame so only the trace and its label remain.
        ax.set_yticks([])
        ax.set_xticks([])
        ax.spines[["top", "bottom", "left", "right"]].set_visible(False)

# Same central 10-second window as before, now drawn with the filtering plot_eeg.
plot_eeg(central_window_eeg(eeg_df, 10))

## Spectrogram

In [None]:
# Load the spectrogram rows paired with the sampled metadata row and display them.
spec = paired_spectrogram(row["spectrogram_id"], row["spectrogram_label_offset_seconds"])
spec

~~This dataframe contains 10 minutes of data, for the purpose of replicating the plots we only need the central 10 seconds, though for making decisions we probably want to use a wider window.~~

Never mind: I misread the plot. The plot in the examples uses the whole 10 minutes, but the central window function may still be useful.

In [None]:
def central_window_spec(df, window_seconds=10):
    """Return the rows of a spectrogram frame in its central time window.

    The window is centred on the midpoint between the first and last value
    of the ``time`` column and selected by time, so it does not depend on the
    spacing between rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Spectrogram rows with a ``time`` column, ordered by time.
    window_seconds : int or float, default 10
        Width of the window, in the units of the ``time`` column.

    Returns
    -------
    pandas.DataFrame
        Rows with ``mid - window_seconds/2 <= time < mid + window_seconds/2``.
        An empty frame is returned unchanged.
    """
    if df.empty:
        return df
    times = df["time"]
    mid = (times.iloc[0] + times.iloc[-1]) / 2
    new_start = mid - window_seconds / 2
    new_end = mid + window_seconds / 2
    # Half-open interval: a window of N seconds never picks up one extra row
    # sitting exactly on the upper edge.
    return df[(times >= new_start) & (times < new_end)]

# Preview the central window of the loaded spectrogram with window_seconds=30.
central_window_spec(spec, 30)

In [None]:
from scipy import signal

def plot_spec(df):
    """Draw the four spectrogram panels (LL, RL, LP, RP) of a frame.

    For each prefix, the columns whose names start with ``<prefix>_`` are
    shown as a log-scaled image with time on the x-axis and one image row per
    column on the y-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Spectrogram rows with a ``time`` column plus the ``LL_``/``RL_``/
        ``LP_``/``RP_`` columns.
    """
    fig, axs = plt.subplots(4, 1, figsize=(15, 10), sharey=True)

    prefixes = ['LL', 'RL', 'LP', 'RP']

    # Five evenly spaced x ticks, labelled from the time span of the frame
    # being plotted. linspace always yields exactly five labels and accepts
    # float times as well as integers.
    x_ticks = np.linspace(0, len(df), 5)
    first_time = float(df["time"].iloc[0])
    last_time = float(df["time"].iloc[-1])
    x_labels = [f"{t:.0f}" for t in np.linspace(first_time, last_time, 5)]

    for ax, prefix in zip(axs, prefixes):
        cols = df.filter(regex=f"^{prefix}_").columns
        # Plot the frame passed in as `df`, not a notebook-level variable.
        ax.imshow(df[cols].T, origin="lower", norm="log", cmap="plasma", interpolation="none")
        ax.set_title(prefix)
        ax.set_yticks(np.arange(0, 101, 25.))
        ax.set_yticklabels([0, 5, 10, 15, 20])
        ax.set_ylabel("Freq")
        ax.set_xticks(x_ticks)
        ax.set_xticklabels(x_labels)
        ax.set_xlabel("Seconds")

    plt.tight_layout()
    
# Plot the spectrogram loaded for the sampled row.
plot_spec(spec)

# Some EDA

In [None]:
# Number of rows per expert_consensus label, as a bar chart.
train_meta["expert_consensus"].value_counts().plot(kind="bar")

In [None]:
# Row counts for the 30 patients with the most rows, as a bar chart.
train_meta["patient_id"].value_counts().head(30).plot(kind="bar")

In [None]:
# Mean and median number of rows per patient.
print(train_meta["patient_id"].value_counts().mean())
print(train_meta["patient_id"].value_counts().median())

The number of examples per patient varies a lot. When training, it may be worth using fewer examples from the most frequent patients. I am assuming that the test/leaderboard data will include new patients, so trying to improve performance on unseen patients could be useful.

In [None]:
# IDs of the 50 patients with the most rows.
top_ids = train_meta["patient_id"].value_counts().head(50).index
# True for rows that do NOT belong to one of those patients.
mask = ~train_meta["patient_id"].isin(top_ids)
filtered_train_meta = train_meta[mask]
# Label distribution of what remains after dropping those patients.
filtered_train_meta["expert_consensus"].value_counts().plot(kind="bar")

However, simply removing the most frequently occurring patient IDs (or downsampling them) makes the imbalance in expert consensus a bigger issue.

## Spectrogram sums

In [None]:
# Number of metadata rows drawn per class for the summed-spectrogram plots.
sample = 500
def plot_summed_specs(df, sample):
    """Sum the spectrograms of a random subset of rows and plot the result.

    Each sampled row's spectrogram window is loaded with
    ``paired_spectrogram``, missing values are replaced by 0, and the windows
    are added together row by row (first row of each window with first row of
    every other window, and so on). The total is passed to ``plot_spec``.

    Parameters
    ----------
    df : pandas.DataFrame
        Metadata rows with ``spectrogram_id`` and
        ``spectrogram_label_offset_seconds`` columns.
    sample : int
        Number of rows to draw at random; capped at ``len(df)``.
    """
    n_draws = min(sample, len(df))
    acc = None
    rel_time = None
    for _, row in df.sample(n=n_draws).iterrows():
        offset = row["spectrogram_label_offset_seconds"]
        # reset_index makes every window start at row 0; otherwise add()
        # would align on each file's own row numbers and windows taken at
        # different offsets would not overlap.
        spec = paired_spectrogram(row["spectrogram_id"], offset).fillna(0).reset_index(drop=True)
        values = spec.drop(columns="time")
        acc = values if acc is None else acc.add(values, fill_value=0)
        # Keep a time axis relative to the window start, taken from the
        # longest window seen, instead of summing the time column.
        if rel_time is None or len(spec) > len(rel_time):
            rel_time = spec["time"] - offset
    if acc is None:
        # Nothing was sampled (empty input), so there is nothing to plot.
        return
    acc.insert(0, "time", rel_time)
    plot_spec(acc)

In [None]:
# Summed spectrograms for rows whose expert consensus is "Seizure".
selected = train_meta[train_meta["expert_consensus"] == "Seizure"]
plot_summed_specs(selected, sample)

In [None]:
# Summed spectrograms for rows whose expert consensus is "LPD".
selected = train_meta[train_meta["expert_consensus"] == "LPD"]
plot_summed_specs(selected, sample)

In [None]:
# Summed spectrograms for rows whose expert consensus is "GPD".
selected = train_meta[train_meta["expert_consensus"] == "GPD"]
plot_summed_specs(selected, sample)

In [None]:
# Summed spectrograms for rows whose expert consensus is "LRDA".
selected = train_meta[train_meta["expert_consensus"] == "LRDA"]
plot_summed_specs(selected, sample)

In [None]:
# Summed spectrograms for rows whose expert consensus is "GRDA".
selected = train_meta[train_meta["expert_consensus"] == "GRDA"]
plot_summed_specs(selected, sample)

In [None]:
# Summed spectrograms for rows whose expert consensus is "Other".
selected = train_meta[train_meta["expert_consensus"] == "Other"]
plot_summed_specs(selected, sample)