It is always important to look at the data and figure out what kind of data we are going to be working with.

# Prep

In [None]:
import numpy as np
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import matplotlib.cm as cm

In [None]:
# Show an EEG window and its spectrogram window one above the other
def show_eeg_spctr(df_eeg, df_spctr, title='', eeg_hz=200, spctr_step_s=2):
    """Plot an EEG window and a spectrogram window in two stacked panels.

    Parameters
    ----------
    df_eeg : pandas.DataFrame
        EEG samples, one row per sample; every column is drawn as one trace.
    df_spctr : pandas.DataFrame
        Spectrogram rows; the first column is skipped (treated as the time
        column) and every remaining column is drawn as one trace.
    title : str
        Prefix used for both panel titles.
    eeg_hz : float
        EEG sampling rate in samples per second. The competition data
        description gives 200 samples per second.
    spctr_step_s : float
        Seconds between consecutive spectrogram rows (2 s -> 30 rows per minute).

    The font size is raised to 32 only while the figure is built and shown;
    the previous rcParams are restored afterwards, even if plotting fails.
    """
    # Build the time axes from integer row indices so their length always
    # equals the number of rows (np.arange with a float step can overshoot
    # by one element, which makes ax.plot raise a shape mismatch).
    t_eeg = np.arange(df_eeg.shape[0]) / eeg_hz                   # seconds
    t_spctr = np.arange(df_spctr.shape[0]) * spctr_step_s / 60.0  # minutes

    spctr_cols = df_spctr.columns[1:]  # first column: time
    # Divide by the number of plotted columns so the colormap spans all of
    # them; a fixed divisor saturates once there are more columns than that.
    n_spctr_cols = max(len(spctr_cols), 1)

    with plt.rc_context({"font.size": 32}):
        # Make fig and ax
        fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(30, 30), tight_layout=True)

        # EEG plotting: one trace per column, coloured along the hsv gradient
        for i, col in enumerate(df_eeg):
            ax1.plot(t_eeg, df_eeg[col], color=cm.hsv(i / df_eeg.shape[1]), label=col, alpha=0.3)
        ax1.set(title=f'{title}_EEG', xlabel='Time (s)', ylabel='intensity', ylim=(-250, 250))
        ax1.legend(loc='center left', bbox_to_anchor=(1, .5), ncols=1)  # legend outside, right of the axes

        # Spectrogram plotting: only every 25th column gets a legend entry
        for i, col in enumerate(spctr_cols):
            colour = cm.hsv(i / n_spctr_cols)
            if i % 25 == 0:
                ax2.plot(t_spctr, df_spctr[col], color=colour, label=col, alpha=0.3)
            else:
                ax2.plot(t_spctr, df_spctr[col], color=colour, alpha=0.15)
        ax2.set(title=f'{title}_Spectrogram', xlabel='Time (min)', ylabel='intensity', ylim=(0, 500))
        ax2.legend(loc='center left', bbox_to_anchor=(1, .5), ncols=1)  # legend outside, right of the axes

        plt.show()

# Test_csv
> Only **spectrogram_id**, **eeg_id** and **patient_id** are given, because the test EEGs and spectrograms contain no overlapping subsamples (so no sub-IDs or offsets are needed).

In [None]:
# Read the test-set metadata table and display it.
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/hms-harmful-brain-activity-classification/test.csv',
)
test

# Train_csv
> "Metadata for the train set. The expert annotators reviewed 50 second long EEG samples plus matched spectrograms covering a 10 minute window centered at the same time and labeled the central 10 seconds. Many of these samples overlapped and have been consolidated. train.csv provides the metadata that allows you to extract the original subsets that the raters annotated."

In [None]:
# Read the train-set metadata table (one row per labelled subsample) and display it.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/hms-harmful-brain-activity-classification/train.csv',
)
train

**unique values counter**

In [None]:
# Count the distinct values in every column except the last six
# (presumably the six *_vote columns -- verify against train.columns).
unique_values = (
    train.iloc[:, :-6]
    .nunique(dropna=False)  # dropna=False counts NaN as a value, like len(Series.unique())
    .to_frame('unique_count')
    .T                      # one row ('unique_count'), one column per train column
)
unique_values.astype(int)

### eeg_id 
> **"A unique identifier for the entire EEG recording."**  - Correspond to train_eegs

In [None]:
# Load the whole EEG recording referenced by the first row of train.csv.
eeg0_id = train.at[0, 'eeg_id']
eeg0 = pd.read_parquet(
    path=f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg0_id}.parquet'
)
eeg0

### eeg_sub_id
> **"An ID for the specific 50 second long subsample this row's labels apply to."** - some train_eegs recordings are longer than 50 seconds, so several subsamples are extracted from them.

In [None]:
# How many EEG recordings contain exactly k labelled subsamples?
# Counting per recording keeps the recordings with the largest subsample
# count (a diff-based derivation leaves NaN for them) and makes the index
# the subsample count itself, so it matches the x-axis label.
subsamples_per_eeg = train.groupby('eeg_id')['eeg_sub_id'].nunique()
eeg_sub_id_num = (
    subsamples_per_eeg.value_counts()
    .sort_index()
    .rename_axis('subsamples_in_recording')
    .to_frame('number_of_ids')
)

plt.bar(eeg_sub_id_num.index, eeg_sub_id_num['number_of_ids'], log=True)
plt.ylabel('Frequency')
plt.xlabel('Subsamples in one recording')
plt.title("Bar of duplicated EEGs")
plt.show()

In [None]:
# Display the table that backs the bar chart drawn in the previous cell.
eeg_sub_id_num

### eeg_label_offset_seconds 
> **"The time between the beginning of the consolidated EEG and this subsample."** - used for cropping the 50 s EEG subset

In [None]:
# Crop the 50-second EEG subsample that row `i` of train.csv refers to.
EEG_HZ = 200       # EEG sampling rate in samples per second (per the competition data description)
EEG_WINDOW_S = 50  # length of one labelled EEG subsample in seconds

i = 2024

eeg1_id = train['eeg_id'][i]
offset_idx = int(train['eeg_label_offset_seconds'][i] * EEG_HZ)  # seconds -> row index

eeg1 = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg1_id}.parquet')
eeg1 = eeg1.iloc[offset_idx:offset_idx + EEG_WINDOW_S * EEG_HZ]  # 50 s * 200 rows/s = 10000 rows
print(f"offset_sec = {train['eeg_label_offset_seconds'][i]}")
eeg1

### spectrogram_id
>**"A unique identifier for the entire EEG recording."** - Correspond to train_spectrograms

In [None]:
# Load the whole spectrogram referenced by the first row of train.csv.
spc0_id = train.at[0, 'spectrogram_id']
spc0 = pd.read_parquet(
    path=f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{spc0_id}.parquet'
)
spc0

### spectrogram_sub_id
> **"An ID for the specific 10 minute subsample this row's labels apply to."** - some train_spectrograms recordings are longer than 10 minutes, so several subsamples are extracted from them (same as for the EEGs).

In [None]:
# How many spectrograms contain exactly k labelled subsamples?
# Counting per spectrogram keeps the spectrograms with the largest subsample
# count (a diff-based derivation leaves NaN for them) and makes the index
# the subsample count itself, so it matches the x-axis label.
subsamples_per_spctr = train.groupby('spectrogram_id')['spectrogram_sub_id'].nunique()
spctr_sub_id_num = (
    subsamples_per_spctr.value_counts()
    .sort_index()
    .rename_axis('subsamples_in_recording')
    .to_frame('number_of_ids')
)

plt.bar(spctr_sub_id_num.index, spctr_sub_id_num['number_of_ids'], log=True)
plt.ylabel('Frequency')
plt.xlabel('Subsamples in one recording')
plt.title("Bar of duplicated spectrograms")
plt.show()

### spectrogram_label_offset_seconds 
> **"The time between the beginning of the consolidated spectrogram and this subsample."** - used for cropping the 10 min spectrogram subset

In [None]:
# Crop the 10-minute spectrogram subsample that row `i` of train.csv refers to.
i = 2024
spctr_id = train.at[i, 'spectrogram_id']
# Offsets are in seconds; floor-dividing by 2 turns them into a row index
# (300 rows = 10 min, i.e. 0.5 rows per second).
offset_idx = int(train.at[i, 'spectrogram_label_offset_seconds']//2)

spctr1 = pd.read_parquet(
    path=f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{spctr_id}.parquet'
).iloc[offset_idx:offset_idx+300]

print(f"offset_sec = {train['spectrogram_label_offset_seconds'][i]}")
spctr1

### label_id
> **"An ID for this set of labels."** - 106800 unique values == rows of train

### patient_id
> **"An ID for the patient who donated the data."** - 1950 unique values, so several EEGs and spectrograms are recorded per patient

In [None]:
# Distinct EEGs, spectrograms and labels per patient, overlaid as histograms.
gb = train[['eeg_id', 'spectrogram_id', 'patient_id', 'label_id']].groupby('patient_id').nunique()

# (column, histogram range, alpha, legend label) for each overlaid histogram
hist_specs = (
    ('eeg_id', (0, 60), 0.7, "eeg"),
    ('spectrogram_id', (0, 60), 0.35, "spectrogram"),
    ('label_id', (0, 100), 0.25, "label"),
)
for column, value_range, opacity, legend_label in hist_specs:
    per_patient = gb[column].sort_values().reset_index(drop=True)
    plt.hist(per_patient, 30, range=value_range, alpha=opacity, label=legend_label)

plt.legend()
plt.title('total labels per patient')
plt.xlabel('experiment (subsamples)')
plt.ylabel("patient")
plt.show()

In [None]:
# For each patient count the distinct expert_consensus classes, then report
# how many patients have 1, 2, ... distinct classes.
gb = train.groupby('patient_id')[['expert_consensus']].nunique()
distinct_class_counts = gb['expert_consensus'].unique()
distinct_class_counts.sort()  # ascending, in place
for i in distinct_class_counts:
    print(f'Number of consensus obtained in a patient:{i} | {len(gb[gb["expert_consensus"]==i])} people')

### expert_consensus
> **"The consensus annotator label. Provided for convenience only."**

In [None]:
# Labels and patients per expert_consensus class, as paired bars on twin y-axes.
labels_per_class = train.groupby('expert_consensus')['label_id'].nunique()
patients_per_class = (
    train.groupby('expert_consensus')['patient_id'].nunique()
    .reindex(labels_per_class.index)  # guarantee both series share one class order
)

# Take the tick labels from the groupby index: groupby sorts the classes,
# whereas Series.unique() returns them in order of appearance, which would
# put the wrong name under each bar.
class_names = labels_per_class.index
tick_pos = np.arange(len(class_names)) + 1

fig, ax_labels = plt.subplots()
ax_labels.set_title('Number of samples in each consensus')
ax_labels.set_xticks(tick_pos)
ax_labels.set_xticklabels(class_names)
label_bars = ax_labels.bar(tick_pos + 0.2, labels_per_class.to_numpy(), width=0.4,
                           alpha=0.5, color='C0', label='labels')
ax_labels.set_ylabel('labels')

ax_patients = ax_labels.twinx()
patient_bars = ax_patients.bar(tick_pos - 0.2, patients_per_class.to_numpy(), width=0.4,
                               alpha=0.5, color='C1', label='patients')
ax_patients.set_ylabel('patients')

# One combined legend on the top-most axes, so both series are listed together.
ax_patients.legend(handles=[label_bars, patient_bars])
plt.show()

# EEG and Spectrogram

### Subsamples

In [None]:
# Collect the parquet file paths of both training folders and report how many there are.
train_eegs_paths, train_spectrograms_paths = (
    glob.glob(pattern)
    for pattern in (
        '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/*.parquet',
        '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/*.parquet',
    )
)

print(f'EEG subsamples: {len(train_eegs_paths)} | spectrogram subsamples: {len(train_spectrograms_paths)}')

### seizure / lpd / gpd / lrda / grda / other_vote
> **"The count of annotator votes for a given brain activity class."** Explanation by Bard:
#### EEG
> - **lpd: lateralized periodic discharges** - Recurring every 0.5-3 seconds, focused on one side of the brain with sharp waves or spikes.
<br>
> - **gpd: generalized periodic discharges**  - Firing on both sides of the brain every 0.5-3 seconds. Unlike their "lateralized" cousins, they don't favor one side over the other.
<br>
> - **lrda: lateralized rhythmic delta activity** - A slow, rhythmic drumbeat focused on one side, repeating every 0.25-1 seconds.
<br>
> - **grda: generalized rhythmic delta activity** - Likewise, rhythmic drumbeats every 0.25-1 seconds, but echoing throughout the brain.
<br>
> - **seizure** - Focal or generalized. More complex, organized, and often faster rhythmic activity compared to the isolated bursts of GPDs or the slow, repetitive waves of GRDA.

#### Spectrogram

> - **lpd: lateralized periodic discharges** - Isolated sharp spikes at regular intervals (0.5-3 seconds) within a specific band (usually higher than GPDs) appear like intermittent lightning flashes.
<br>
> - **gpd: generalized periodic discharges**  - Repetitive bursts of higher frequency spikes (0.5-3 seconds) across a broader band paint a picture of scattered rainstorms across the brain.
<br>
> - **lrda: lateralized rhythmic delta activity** - A sustained, rhythmic band of slow delta waves (1-4 Hz) dominates one side of the spectrum, like a droning bass line focused on one speaker.
<br>
> - **grda: generalized rhythmic delta activity** - A widespread, rhythmic band of slow delta waves (1-4 Hz) fills the entire spectrum, resembling a thick fog enveloping the whole soundscape.
<br>
> - **seizure** - Focal or generalized. More complex, organized, and often faster rhythmic activity compared to the isolated bursts of GPDs or the slow, repetitive waves of GRDA.

We would still like to see specific examples. Let's plot some clear-cut recordings, choosing for each category a sample with a high vote count in that category.

### Seizure

In [None]:
# Plot the EEG / spectrogram windows of the row with the most seizure votes.
EEG_HZ = 200  # EEG sampling rate in samples per second (per the competition data description)

seizure_sample = train.sort_values('seizure_vote',ascending=False).reset_index()

seizure0_eeg_id = seizure_sample['eeg_id'][0]
seizure0_spctr_id = seizure_sample['spectrogram_id'][0]
eeg0_offset_idx = int(seizure_sample['eeg_label_offset_seconds'][0]*EEG_HZ)  # seconds -> EEG row
spctr0_offset_idx = int(seizure_sample['spectrogram_label_offset_seconds'][0]//2)  # one spectrogram row per 2 s

seizure0_eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{seizure0_eeg_id}.parquet')
seizure0_eeg = seizure0_eeg.iloc[eeg0_offset_idx:eeg0_offset_idx+50*EEG_HZ]  # 50 s window = 10000 rows

seizure0_spctr = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{seizure0_spctr_id}.parquet')
seizure0_spctr = seizure0_spctr.iloc[spctr0_offset_idx:spctr0_offset_idx+300]  # 300 rows = 10 min (0.5 rows/s)

show_eeg_spctr(seizure0_eeg, seizure0_spctr, title='Seizure')

### LPD (Lateralized Periodic Discharges)

In [None]:
# Plot the EEG / spectrogram windows of the row with the most LPD votes.
EEG_HZ = 200  # EEG sampling rate in samples per second (per the competition data description)

lpd_sample = train.sort_values('lpd_vote',ascending=False).reset_index()

lpd0_eeg_id = lpd_sample['eeg_id'][0]
lpd0_spctr_id = lpd_sample['spectrogram_id'][0]
eeg0_offset_idx = int(lpd_sample['eeg_label_offset_seconds'][0]*EEG_HZ)  # seconds -> EEG row
spctr0_offset_idx = int(lpd_sample['spectrogram_label_offset_seconds'][0]//2)  # one spectrogram row per 2 s

lpd0_eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{lpd0_eeg_id}.parquet')
lpd0_eeg = lpd0_eeg.iloc[eeg0_offset_idx:eeg0_offset_idx+50*EEG_HZ]  # 50 s window = 10000 rows

lpd0_spctr = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{lpd0_spctr_id}.parquet')
lpd0_spctr = lpd0_spctr.iloc[spctr0_offset_idx:spctr0_offset_idx+300]  # 300 rows = 10 min (0.5 rows/s)

show_eeg_spctr(lpd0_eeg, lpd0_spctr, title='LPD')

### GPD (generalized periodic discharges)

In [None]:
# Plot the EEG / spectrogram windows of the row with the most GPD votes.
EEG_HZ = 200  # EEG sampling rate in samples per second (per the competition data description)

gpd_sample = train.sort_values('gpd_vote',ascending=False).reset_index()

gpd0_eeg_id = gpd_sample['eeg_id'][0]
gpd0_spctr_id = gpd_sample['spectrogram_id'][0]
eeg0_offset_idx = int(gpd_sample['eeg_label_offset_seconds'][0]*EEG_HZ)  # seconds -> EEG row
spctr0_offset_idx = int(gpd_sample['spectrogram_label_offset_seconds'][0]//2)  # one spectrogram row per 2 s

gpd0_eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{gpd0_eeg_id}.parquet')
gpd0_eeg = gpd0_eeg.iloc[eeg0_offset_idx:eeg0_offset_idx+50*EEG_HZ]  # 50 s window = 10000 rows

gpd0_spctr = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{gpd0_spctr_id}.parquet')
gpd0_spctr = gpd0_spctr.iloc[spctr0_offset_idx:spctr0_offset_idx+300]  # 300 rows = 10 min (0.5 rows/s)

show_eeg_spctr(gpd0_eeg, gpd0_spctr, title='GPD')

### LRDA (lateralized rhythmic delta activity)

In [None]:
# Plot the EEG / spectrogram windows of a high-LRDA-vote row
# (position 1 of the descending sort, i.e. the second row, is used here).
EEG_HZ = 200  # EEG sampling rate in samples per second (per the competition data description)

lrda_sample = train.sort_values('lrda_vote',ascending=False).reset_index()

lrda0_eeg_id = lrda_sample['eeg_id'][1]
lrda0_spctr_id = lrda_sample['spectrogram_id'][1]
eeg0_offset_idx = int(lrda_sample['eeg_label_offset_seconds'][1]*EEG_HZ)  # seconds -> EEG row
spctr0_offset_idx = int(lrda_sample['spectrogram_label_offset_seconds'][1]//2)  # one spectrogram row per 2 s

lrda0_eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{lrda0_eeg_id}.parquet')
lrda0_eeg = lrda0_eeg.iloc[eeg0_offset_idx:eeg0_offset_idx+50*EEG_HZ]  # 50 s window = 10000 rows

lrda0_spctr = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{lrda0_spctr_id}.parquet')
lrda0_spctr = lrda0_spctr.iloc[spctr0_offset_idx:spctr0_offset_idx+300]  # 300 rows = 10 min (0.5 rows/s)

show_eeg_spctr(lrda0_eeg, lrda0_spctr, title='LRDA')

### GRDA (generalized rhythmic delta activity)

In [None]:
# Plot the EEG / spectrogram windows of a high-GRDA-vote row
# (position 1 of the descending sort, i.e. the second row, is used here).
EEG_HZ = 200  # EEG sampling rate in samples per second (per the competition data description)

grda_sample = train.sort_values('grda_vote',ascending=False).reset_index()

grda0_eeg_id = grda_sample['eeg_id'][1]
grda0_spctr_id = grda_sample['spectrogram_id'][1]
eeg0_offset_idx = int(grda_sample['eeg_label_offset_seconds'][1]*EEG_HZ)  # seconds -> EEG row
spctr0_offset_idx = int(grda_sample['spectrogram_label_offset_seconds'][1]//2)  # one spectrogram row per 2 s

grda0_eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{grda0_eeg_id}.parquet')
grda0_eeg = grda0_eeg.iloc[eeg0_offset_idx:eeg0_offset_idx+50*EEG_HZ]  # 50 s window = 10000 rows

grda0_spctr = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{grda0_spctr_id}.parquet')
grda0_spctr = grda0_spctr.iloc[spctr0_offset_idx:spctr0_offset_idx+300]  # 300 rows = 10 min (0.5 rows/s)

show_eeg_spctr(grda0_eeg, grda0_spctr, title='GRDA')

### Other - Normal?

In [None]:
# Plot the EEG / spectrogram windows of the row with the most "other" votes.
EEG_HZ = 200  # EEG sampling rate in samples per second (per the competition data description)

other_sample = train.sort_values('other_vote',ascending=False).reset_index()

other0_eeg_id = other_sample['eeg_id'][0]
other0_spctr_id = other_sample['spectrogram_id'][0]
eeg0_offset_idx = int(other_sample['eeg_label_offset_seconds'][0]*EEG_HZ)  # seconds -> EEG row
spctr0_offset_idx = int(other_sample['spectrogram_label_offset_seconds'][0]//2)  # one spectrogram row per 2 s

other0_eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{other0_eeg_id}.parquet')
other0_eeg = other0_eeg.iloc[eeg0_offset_idx:eeg0_offset_idx+50*EEG_HZ]  # 50 s window = 10000 rows

other0_spctr = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/{other0_spctr_id}.parquet')
other0_spctr = other0_spctr.iloc[spctr0_offset_idx:spctr0_offset_idx+300]  # 300 rows = 10 min (0.5 rows/s)

show_eeg_spctr(other0_eeg, other0_spctr, title='Other')

- In LRDA and GRDA, spikes occur in a constant rhythm over a short period of time, while LPD and GPD are slower rhythms.
- The seizure shows more dynamic wave behavior.
- Spectrogram intensity is consistently low in GPD and LPD.

### To Be Continued

- Baseline: Mean and std may work well??


I will continue to update this notebook for my EDA. If you have any suggestions, please comment!