# Setup

In [None]:
# True only when this file is executed as the entry script; the exploratory
# cells below are guarded by it so they are skipped when the file is imported.
is_main = (__name__ == '__main__')

In [None]:
import numpy as np
import polars as pl
from pathlib import Path

# Load dataset

In [None]:
if is_main:
    # Read the competition's train.csv into a polars DataFrame.
    df_train = pl.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')

In [None]:
if is_main:
    # Show the loaded training table in the notebook output.
    display(df_train)

In [None]:
def scan_eeg_subsamples(
    eeg_id,
    offset,
    sampling_rate = 200,
    duration = 50,
    eeg_dir = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs',
):
    """Lazily select a fixed-length window of rows from one EEG parquet file.

    Args:
        eeg_id: Identifier of the recording; the file read is
            `<eeg_dir>/<eeg_id>.parquet`.
        offset: Start of the window in seconds. It is converted to a row
            index as `int(offset * sampling_rate)`, i.e. truncated toward zero.
        sampling_rate: Rows per second of the recording (default 200).
        duration: Window length in seconds (default 50, i.e. 10,000 rows at
            the default sampling rate).
        eeg_dir: Directory holding the per-recording parquet files. Defaults
            to the competition's training EEG folder.

    Returns:
        A polars LazyFrame restricted to the requested rows; nothing is read
        until it is collected.
    """
    first_row = int(offset * sampling_rate)
    n_rows = int(duration * sampling_rate)
    parquet_path = Path(eeg_dir) / f'{eeg_id}.parquet'
    return pl.scan_parquet(parquet_path).slice(first_row, n_rows)

def read_eeg_subsamples(eeg_id, offset):
    """Load the EEG window selected by `scan_eeg_subsamples` into memory.

    Args:
        eeg_id: Identifier of the recording to read.
        offset: Start of the window in seconds.

    Returns:
        The result of collecting the lazy query built by
        `scan_eeg_subsamples(eeg_id, offset)`.
    """
    lazy_window = scan_eeg_subsamples(eeg_id, offset)
    return lazy_window.collect()

In [None]:
if is_main:
    # Sample recording used by every example below: the window starting at
    # offset 0 of EEG 1628180742.
    df_eeg_0 = read_eeg_subsamples(1628180742, 0)

In [None]:
if is_main:
    # Show the sample EEG window in the notebook output.
    display(df_eeg_0)

# Butter lowpass filter

Learned from <https://www.kaggle.com/code/cdeotte/wavenet-starter-lb-0-52#Butter-Low-Pass-Filter>.

In [None]:
import scipy.signal

def butter_lowpass_filter(data, cutoff_freq = 20, sampling_rate = 200, order = 4):
    """Low-pass filter `data` along axis 0 with a digital Butterworth filter.

    Args:
        data: Array-like signal; filtering runs along axis 0.
        cutoff_freq: Cutoff frequency, in the same unit as `sampling_rate`.
        sampling_rate: Sampling frequency of `data`.
        order: Order of the Butterworth filter.

    Returns:
        The filtered signal as returned by `scipy.signal.lfilter`.
    """
    # scipy.signal.butter (without `fs`) expects the cutoff as a fraction of
    # the Nyquist frequency, which is half of the sampling rate.
    cutoff_fraction = cutoff_freq / (sampling_rate / 2)
    numerator, denominator = scipy.signal.butter(
        N = order,
        Wn = cutoff_fraction,
        btype = 'low',
        analog = False,
    )
    # lfilter applies the filter in a single forward pass.
    return scipy.signal.lfilter(numerator, denominator, data, axis = 0)

In [None]:
if is_main:
    # Show the filter output for the sample window, using the default settings.
    display(butter_lowpass_filter(df_eeg_0))

In [None]:
if is_main:
    # Plotting dependencies are only used by the exploratory cells, so they
    # are imported under the same guard as those cells.
    import seaborn as sns
    import matplotlib.pyplot as plt
    import pandas as pd

In [None]:
if is_main:
    # Whole window: raw signal on top, low-pass filtered signal below, both
    # with the same fixed y-limits so the panels can be compared directly.
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(data = df_eeg_0.to_pandas(), ax = axes[0])
    axes[0].set_ylim(-1600, 300)
    sns.lineplot(
        data = pd.DataFrame(butter_lowpass_filter(df_eeg_0), columns = df_eeg_0.columns),
        ax = axes[1],
    )
    axes[1].set_ylim(-1600, 300)

In [None]:
if is_main:
    # Zoom on rows 6000-7999: raw signal on top, low-pass filtered signal
    # below, with the same fixed y-limits on both panels.
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(data = df_eeg_0.slice(6000, 2000).to_pandas(), ax = axes[0])
    axes[0].set_ylim(-1600, 300)
    sns.lineplot(
        data = pd.DataFrame(butter_lowpass_filter(df_eeg_0)[6000:8000], columns = df_eeg_0.columns),
        ax = axes[1],
    )
    axes[1].set_ylim(-1600, 300)

# Downsampling

From discussion <https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/468684>:

> In this competition, we (most likely) are only concerned with brain waves of 20Hz and below.

So downsampling from 200Hz to 40Hz would reduce the data size without losing much information.

In [None]:
import scipy.signal

def downsample(data, q = 5, zero_phase = False):
    """Reduce the sampling rate of `data` along axis 0 by the integer factor `q`.

    Delegates to `scipy.signal.decimate`, which applies an anti-aliasing
    filter before downsampling.

    Args:
        data: Array-like signal; decimation runs along axis 0.
        q: Downsampling factor (5 by default).
        zero_phase: Forwarded to `scipy.signal.decimate`. Defaults to False
            here, whereas SciPy's own default is True.

    Returns:
        The decimated signal as returned by `scipy.signal.decimate`.
    """
    return scipy.signal.decimate(
        x = data,
        q = q,
        axis = 0,
        zero_phase = zero_phase,
    )

In [None]:
if is_main:
    # Compare the row count before and after decimation with the default q.
    print('Full shape', df_eeg_0.shape)
    print('Downsampled shape', downsample(df_eeg_0).shape)

In [None]:
if is_main:
    # Whole window: raw signal on top, SciPy-decimated signal below, both
    # with the same fixed y-limits.
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(data = df_eeg_0.to_pandas(), ax = axes[0])
    axes[0].set_ylim(-1600, 300)
    sns.lineplot(
        data = pd.DataFrame(downsample(df_eeg_0), columns = df_eeg_0.columns),
        ax = axes[1],
    )
    axes[1].set_ylim(-1600, 300)

In [None]:
if is_main:
    # Zoom on one span: raw rows 6000-7999 on top and decimated rows
    # 1200-1599 below (the same span after downsampling by the default q = 5).
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(data = df_eeg_0.slice(6000, 2000).to_pandas(), ax = axes[0])
    axes[0].set_ylim(-1600, 300)
    sns.lineplot(
        data = pd.DataFrame(downsample(df_eeg_0)[1200:1600], columns = df_eeg_0.columns),
        ax = axes[1],
    )
    axes[1].set_ylim(-1600, 300)

In [None]:
if is_main:
    # Same span from both methods: low-pass filtered rows 6000-7999 on top,
    # decimated rows 1200-1599 below, each panel titled with its method.
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(
        data = pd.DataFrame(butter_lowpass_filter(df_eeg_0)[6000:8000], columns = df_eeg_0.columns),
        ax = axes[0],
    )
    axes[0].set_ylim(-1600, 300)
    axes[0].set_title('Lowpass')
    sns.lineplot(
        data = pd.DataFrame(downsample(df_eeg_0)[1200:1600], columns = df_eeg_0.columns),
        ax = axes[1],
    )
    axes[1].set_ylim(-1600, 300)
    axes[1].set_title('Downsample')

Alternative way to downsample: averaging every 5 samples.

In [None]:
def downsample_pl(x, q = 5):
    """Downsample a polars frame by averaging each run of `q` consecutive rows.

    Args:
        x: Frame whose rows are consecutive samples.
        q: Number of consecutive rows averaged into one output row. The last
            group holds fewer rows when the row count is not a multiple of `q`.

    Returns:
        A frame with the per-group mean of every column of `x`, in the
        original row order.
    """
    numbered = x.with_row_count('row_count')
    # Integer-dividing the row number by q gives every run of q consecutive
    # rows the same group key; the key column keeps the name 'row_count'.
    group_key = pl.col('row_count') // q
    averaged = numbered.group_by(group_key).mean()
    # Sort by the group key to put the groups back in row order, then drop
    # the helper column.
    return averaged.sort('row_count').drop('row_count')

In [None]:
if is_main:
    # Show the averaged (polars) downsampling of the sample window.
    display(downsample_pl(df_eeg_0))

In [None]:
if is_main:
    # Whole window: raw signal on top, averaged (polars) downsampling below,
    # both with the same fixed y-limits.
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(data = df_eeg_0.to_pandas(), ax = axes[0])
    axes[0].set_ylim(-1600, 300)
    sns.lineplot(data = downsample_pl(df_eeg_0).to_pandas(), ax = axes[1])
    axes[1].set_ylim(-1600, 300)

In [None]:
if is_main:
    # Zoom on one span: raw rows 6000-7999 on top and averaged rows
    # 1200-1599 below (the same span after averaging with the default q = 5).
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(data = df_eeg_0.slice(6000, 2000).to_pandas(), ax = axes[0])
    axes[0].set_ylim(-1600, 300)
    sns.lineplot(data = downsample_pl(df_eeg_0).slice(1200, 400).to_pandas(), ax = axes[1])
    axes[1].set_ylim(-1600, 300)

In [None]:
if is_main:
    # Rows 1200-1599 from both downsampling methods: SciPy decimation on top,
    # polars averaging below, each panel titled with its method.
    fig, axes = plt.subplots(nrows = 2, figsize = (64, 16))
    sns.lineplot(
        data = pd.DataFrame(downsample(df_eeg_0)[1200:1600], columns = df_eeg_0.columns),
        ax = axes[0],
    )
    axes[0].set_ylim(-1600, 300)
    axes[0].set_title('SciPy downsample')
    sns.lineplot(data = downsample_pl(df_eeg_0).slice(1200, 400).to_pandas(), ax = axes[1])
    axes[1].set_ylim(-1600, 300)
    axes[1].set_title('Polars downsample')