# Single numpy file with spectrograms

v1: using only F3 and F4, as two channels.


There is room for tuning when producing the spectrograms: 
  - see influence of window shape and size: https://www.audiolabs-erlangen.de/resources/MIR/FMP/C2/C2_STFT-Window.html  
  - apply logarithm

To modify train/val/test sets, go to keras/03_stratified.ipynb

- Band-pass filtering before the STFT: not needed. This was checked by comparing band-pass filtering before the STFT against simply slicing the spectrogram of the unfiltered signal.


In [None]:
!pip install --upgrade /kaggle/input/hms-libraries/scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import ShortTimeFFT
from scipy.signal.windows import gaussian
import random

# Setup cell: locate the data, load train.csv once, attach an integer class
# code and per-row vote fractions, then select the train/val/test rows from
# the precomputed index files.

# Alternative (presumably local) locations are kept commented out.
# base_dir = "../../kaggle_data/hms"
# base_dir = "../../data/hms"
base_dir = "/kaggle/input/hms-harmful-brain-activity-classification"

# data_dir = '../data'
data_dir = '/kaggle/input/hms-indices-train-val-test-v1'

# output_dir = '../data/'
output_dir = ''

fs = 200  # Sample rate.

# Vote columns of train.csv; their order matches the class codes below.
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

# expert_consensus label -> class code stored in the 'target' column.
consensus_codes = {
    'Seizure': 0,
    'LPD': 1,
    'GPD': 2,
    'LRDA': 3,
    'GRDA': 4,
    'Other': 5,
}

df_traincsv = pd.read_csv(f'{base_dir}/train.csv')

# Labels missing from the mapping become NaN; the column is kept as float so
# its dtype is the same whether or not every label is matched.
df_traincsv['target'] = df_traincsv.expert_consensus.map(consensus_codes).astype(float)

# Transform votes into percentages.
# skipna=False keeps the NaN propagation of a plain column-by-column addition.
df_traincsv['sum_votes'] = df_traincsv[TARGETS].sum(axis=1, skipna=False)
df_traincsv[TARGETS] = df_traincsv[TARGETS].div(df_traincsv['sum_votes'], axis=0)

# The index files hold row labels of df_traincsv for each split.
idxs_train = np.load(f'{data_dir}/03_stratified_v1_idxs_train.npy')
idxs_val = np.load(f'{data_dir}/03_stratified_v1_idxs_val.npy')
idxs_test = np.load(f'{data_dir}/03_stratified_v1_idxs_test.npy')
df_train = df_traincsv.loc[idxs_train]
df_val = df_traincsv.loc[idxs_val]
df_test = df_traincsv.loc[idxs_test]

print("Added target column. Transformed into percentages.")
print("Train:", len(df_train))
print("Val:", len(df_val))
print("Test:", len(df_test))


In [None]:
# Sanity check: a patient must not appear in more than one split.
# Prints four counts, in this order: val rows sharing a patient with train,
# test rows sharing a patient with train, train rows sharing a patient with
# val, test rows sharing a patient with val. All four should be 0.
for reference, others in ((df_train, (df_val, df_test)),
                          (df_val, (df_train, df_test))):
    ids = np.unique(reference['patient_id'])
    for other in others:
        shared = other['patient_id'].isin(ids)
        print(len(other.loc[shared]))


# Version v1

Using only F3 and F4, in two channels.

In [None]:
#
# Training set
#
# For every row of df_train: load the EEG recording, cut the 10 s segment
# that starts 20 s after the row's label offset, compute one spectrogram per
# channel (F3, F4), and store the rows below max_freq. The spectrograms and a
# per-row metadata table are saved as two .npy files.

channels = ('F3', 'F4')
n_channels = len(channels)
max_freq = 20  # Only keep freqs below this number.

segment_start_s = 20  # Segment start, in seconds after the label offset.
segment_len_s = 10    # Segment length in seconds.
n_samples = segment_len_s * fs

#
# SFT setup: some tuning may be applied.
#
g_std = 24  # standard deviation for Gaussian window in samples
hop = 3
# NOTE: an earlier comment advised an odd width, but the value in use is even;
# it is kept unchanged so the saved arrays stay identical.
win_width = 48
mfft = 400
win = gaussian(win_width, std=g_std, sym=True)  # symmetric Gaussian wind.
SFT = ShortTimeFFT(win, hop=hop, fs=fs, mfft=mfft)

# Index of the first frequency bin that is not kept. Rows 1..n2-1 of each
# spectrogram are stored, i.e. row 0 is dropped as well.
n2 = int(max_freq/SFT.delta_f)
# Spectrogram dimensions, derived from the settings instead of hard-coded
# (39 x 682 for the values above).
dim1 = n2 - 1
dim2 = SFT.p_num(n_samples)
specs = np.empty((len(df_train), dim1, dim2, n_channels))

# item: [eeg_id, eeg_sub_id, row index into specs, target,
#       seizure_vote, lpd_vote, gpd_vote, lrda_vote,
#       grda_vote, other_vote]
# Preallocated: growing the table with np.concatenate on every iteration
# copies all previous rows each time.
items = np.empty((len(df_train), 10), dtype=float)

for i, item in enumerate(df_train.itertuples(index=False)):
    if i%400 == 0:
        print(f'{i} files loaded.', end='\r')
    # Only the two channels in use are read from disk.
    eeg = pd.read_parquet(f'{base_dir}/train_eegs/{item.eeg_id}.parquet',
                          columns=list(channels))
    # Fill gaps over the whole recording before slicing.
    eeg = eeg.interpolate(limit_direction='both')

    # 10 second eeg sub sample.
    offset = int(item.eeg_label_offset_seconds)
    start = (offset + segment_start_s) * fs
    segment = eeg.iloc[start:start + n_samples]

    for c, channel in enumerate(channels):
        Sx = SFT.spectrogram(segment[channel].values)  # absolute square of STFT
        specs[i, :, :, c] = Sx[1:n2, :]

    items[i] = [item.eeg_id, item.eeg_sub_id, i, item.target,
                item.seizure_vote, item.lpd_vote, item.gpd_vote,
                item.lrda_vote, item.grda_vote, item.other_vote]

filename = '03_single_spectrograms_v1_train'
print(f'Saving to {filename}.npy')
print(f'Saving to {filename}_items.npy')
np.save(f'{output_dir}{filename}.npy', specs)
np.save(f'{output_dir}{filename}_items.npy', items)


In [None]:
#
# Validation set
#
# For every row of df_val: load the EEG recording, cut the 10 s segment that
# starts 20 s after the row's label offset, compute one spectrogram per
# channel (F3, F4), and store the rows below max_freq. The spectrograms and a
# per-row metadata table are saved as two .npy files.

channels = ('F3', 'F4')
n_channels = len(channels)
max_freq = 20  # Only keep freqs below this number.

segment_start_s = 20  # Segment start, in seconds after the label offset.
segment_len_s = 10    # Segment length in seconds.
n_samples = segment_len_s * fs

#
# SFT setup: some tuning may be applied.
#
g_std = 24  # standard deviation for Gaussian window in samples
hop = 3
# NOTE: an earlier comment advised an odd width, but the value in use is even;
# it is kept unchanged so the saved arrays stay identical.
win_width = 48
mfft = 400
win = gaussian(win_width, std=g_std, sym=True)  # symmetric Gaussian wind.
SFT = ShortTimeFFT(win, hop=hop, fs=fs, mfft=mfft)

# Index of the first frequency bin that is not kept. Rows 1..n2-1 of each
# spectrogram are stored, i.e. row 0 is dropped as well.
n2 = int(max_freq/SFT.delta_f)
# Spectrogram dimensions, derived from the settings instead of hard-coded
# (39 x 682 for the values above).
dim1 = n2 - 1
dim2 = SFT.p_num(n_samples)
specs = np.empty((len(df_val), dim1, dim2, n_channels))

# item: [eeg_id, eeg_sub_id, row index into specs, target,
#       seizure_vote, lpd_vote, gpd_vote, lrda_vote,
#       grda_vote, other_vote]
# Preallocated: growing the table with np.concatenate on every iteration
# copies all previous rows each time.
items = np.empty((len(df_val), 10), dtype=float)

for i, item in enumerate(df_val.itertuples(index=False)):
    if i%400 == 0:
        print(f'{i} files loaded.', end='\r')
    # Only the two channels in use are read from disk.
    eeg = pd.read_parquet(f'{base_dir}/train_eegs/{item.eeg_id}.parquet',
                          columns=list(channels))
    # Fill gaps over the whole recording before slicing.
    eeg = eeg.interpolate(limit_direction='both')

    # 10 second eeg sub sample.
    offset = int(item.eeg_label_offset_seconds)
    start = (offset + segment_start_s) * fs
    segment = eeg.iloc[start:start + n_samples]

    for c, channel in enumerate(channels):
        Sx = SFT.spectrogram(segment[channel].values)  # absolute square of STFT
        specs[i, :, :, c] = Sx[1:n2, :]

    items[i] = [item.eeg_id, item.eeg_sub_id, i, item.target,
                item.seizure_vote, item.lpd_vote, item.gpd_vote,
                item.lrda_vote, item.grda_vote, item.other_vote]

filename = '03_single_spectrograms_v1_val'
print(f'Saving to {filename}.npy')
print(f'Saving to {filename}_items.npy')
np.save(f'{output_dir}{filename}.npy', specs)
np.save(f'{output_dir}{filename}_items.npy', items)

In [None]:
#
# Testing set
#
# For every row of df_test: load the EEG recording, cut the 10 s segment that
# starts 20 s after the row's label offset, compute one spectrogram per
# channel (F3, F4), and store the rows below max_freq. The spectrograms and a
# per-row metadata table are saved as two .npy files.

channels = ('F3', 'F4')
n_channels = len(channels)
max_freq = 20  # Only keep freqs below this number.

segment_start_s = 20  # Segment start, in seconds after the label offset.
segment_len_s = 10    # Segment length in seconds.
n_samples = segment_len_s * fs

#
# SFT setup: some tuning may be applied.
#
g_std = 24  # standard deviation for Gaussian window in samples
hop = 3
# NOTE: an earlier comment advised an odd width, but the value in use is even;
# it is kept unchanged so the saved arrays stay identical.
win_width = 48
mfft = 400
win = gaussian(win_width, std=g_std, sym=True)  # symmetric Gaussian wind.
SFT = ShortTimeFFT(win, hop=hop, fs=fs, mfft=mfft)

# Index of the first frequency bin that is not kept. Rows 1..n2-1 of each
# spectrogram are stored, i.e. row 0 is dropped as well.
n2 = int(max_freq/SFT.delta_f)
# Spectrogram dimensions, derived from the settings instead of hard-coded
# (39 x 682 for the values above).
dim1 = n2 - 1
dim2 = SFT.p_num(n_samples)
specs = np.empty((len(df_test), dim1, dim2, n_channels))

# item: [eeg_id, eeg_sub_id, row index into specs, target,
#       seizure_vote, lpd_vote, gpd_vote, lrda_vote,
#       grda_vote, other_vote]
# Preallocated: growing the table with np.concatenate on every iteration
# copies all previous rows each time.
items = np.empty((len(df_test), 10), dtype=float)

for i, item in enumerate(df_test.itertuples(index=False)):
    if i%400 == 0:
        print(f'{i} files loaded.', end='\r')
    # Only the two channels in use are read from disk.
    eeg = pd.read_parquet(f'{base_dir}/train_eegs/{item.eeg_id}.parquet',
                          columns=list(channels))
    # Fill gaps over the whole recording before slicing.
    eeg = eeg.interpolate(limit_direction='both')

    # 10 second eeg sub sample.
    offset = int(item.eeg_label_offset_seconds)
    start = (offset + segment_start_s) * fs
    segment = eeg.iloc[start:start + n_samples]

    for c, channel in enumerate(channels):
        Sx = SFT.spectrogram(segment[channel].values)  # absolute square of STFT
        specs[i, :, :, c] = Sx[1:n2, :]

    items[i] = [item.eeg_id, item.eeg_sub_id, i, item.target,
                item.seizure_vote, item.lpd_vote, item.gpd_vote,
                item.lrda_vote, item.grda_vote, item.other_vote]

filename = '03_single_spectrograms_v1_test'
print(f'Saving to {filename}.npy')
print(f'Saving to {filename}_items.npy')
np.save(f'{output_dir}{filename}.npy', specs)
np.save(f'{output_dir}{filename}_items.npy', items)