# Introduction

In this competition, EEGs and spectrograms are the two main sources of data. Why start with EEG? Because the spectrograms are derived from the EEGs. We will dig into the EEGs first, then look into the spectrograms in the next notebook.

### What is EEG?

EEG is the observation of the summed electrical activity of a population of neurons in the brain, as seen from a given direction. For the recording, electrodes are arranged according to a rule called the "International 10-20 system", except that here A1 and A2 are removed and an EKG (electrocardiogram) lead is added.
The electrode arrangement in the International 10-20 system is as follows:

![electrogram](https://upload.wikimedia.org/wikipedia/commons/7/70/21_electrodes_of_International_10-20_system_for_EEG.svg)
(by https://en.wikipedia.org/wiki/10%E2%80%9320_system_(EEG))

Based on the above, the positional relationship of the electrodes can be expressed in a polar coordinate system. In the following plots, each electrode's color encodes its position: the hue represents θ and the brightness (the HSV value) represents r, so that neighboring electrodes get similar colors.

In [None]:
# Electrode positions as [r, θ, flag], with r and θ in [0, 1]:
#   r    - radial distance from the centre electrode (Cz)
#   θ    - angle as a fraction of a full turn; the layout plot puts θ = 0 at the
#          top of the figure and increasing θ toward the left side
#   flag - 1 only for the EKG lead, 0 for every scalp electrode
electrodes_map = {
    # centre
    "Cz": [0, 0, 0],
    # middle ring (r = 0.5)
    "Fz": [0.5, 0, 0],
    "F3": [0.5, 0.125, 0],
    "C3": [0.5, 0.25, 0],
    "P3": [0.5, 0.375, 0],
    "Pz": [0.5, 0.5, 0],
    "P4": [0.5, 0.625, 0],
    "C4": [0.5, 0.75, 0],
    "F4": [0.5, 0.875, 0],
    # outer ring (r = 1.0)
    "Fp1": [1.0, 0.05, 0],
    "F7": [1.0, 0.15, 0],
    "T3": [1.0, 0.25, 0],
    "T5": [1.0, 0.35, 0],
    "O1": [1.0, 0.45, 0],
    "O2": [1.0, 0.55, 0],
    "T6": [1.0, 0.65, 0],
    "T4": [1.0, 0.75, 0],
    "F8": [1.0, 0.85, 0],
    "Fp2": [1.0, 0.95, 0],
    # not a scalp position; a better annotation for EKG might be possible
    "EKG": [0, 0, 1],
}

In [None]:
import numpy as np
import pandas as pd
import os
import glob
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import matplotlib.colors as mcolors

#gradient coloring
def collor(col_name):
    """Return the RGBA color used to draw the electrode column ``col_name``.

    The color is derived from the electrode's entry in ``electrodes_map``:
    θ becomes the HSV hue and r becomes the HSV value (brightness), with
    saturation fixed at 1.

    Args:
        col_name: Electrode name; must be ``"EKG"`` or a key of
            ``electrodes_map`` (a ``KeyError`` is raised otherwise).

    Returns:
        ``(0, 0, 0, 0.1)`` for ``"EKG"`` (faint black, shown as gray);
        otherwise a length-4 array ``[r, g, b, alpha]``.
    """
    if col_name == "EKG":
        return (0, 0, 0, 0.1)

    radius, hue, _ = electrodes_map[col_name]
    is_mid_ring = radius == 0.5

    # Mid-ring electrodes are drawn with value 0.4 instead of 0.5.
    value = 0.4 if is_mid_ring else radius
    rgb = mcolors.hsv_to_rgb((hue, 1, value))

    # Alpha is chosen from the electrode's ring, not from the adjusted
    # brightness. The previous code re-tested the already overwritten value,
    # so the 0.35 branch for the mid ring could never be taken.
    alpha = 0.35 if is_mid_ring else 0.2
    return np.append(rgb, alpha)

In [None]:
# Draw the electrode layout: one colored, labelled dot per scalp electrode,
# using the same colors that the EEG traces are plotted with.
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(8, 8), tight_layout=True)
for loc, vec in electrodes_map.items():
    if loc == "EKG":
        continue  # EKG has no scalp position
    # Polar -> Cartesian: θ = 0 points up, increasing θ moves toward negative x.
    angle = vec[1] * 2 * np.pi
    y = vec[0] * np.cos(angle)
    x = -vec[0] * np.sin(angle)
    ax1.scatter(x, y, color=collor(loc), label=loc, s=1000)
    ax1.annotate(loc, xy=(x - 0.04, y - 0.02))

# Outline circle. It is drawn once: the previous loop iterated over every
# angle without using it and re-plotted the identical full circle 10000 times.
# The line width is raised so that a single pass stays clearly visible.
theta = np.linspace(0, 2 * np.pi, num=10000)
ax1.plot(1.3 * np.sin(theta), 1.3 * np.cos(theta), color='k', lw=1)

ax1.set(title='Location and Color', ylabel='Back and Forth', xlabel='Left and Right')
ax1.grid()
plt.show()

# Prep

In [None]:
# Load the training metadata table. Later cells read its eeg_id, patient_id,
# eeg_label_offset_seconds and *_vote columns.
train = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')

In [None]:
# Plot function
def show_eeg(df_eeg,  title='', ylim=(-250,250), vote=False, filtering=False, sample_period=0.01):
    """Plot every column of an EEG frame against time in a new figure.

    Args:
        df_eeg: DataFrame with one column per electrode. Each column name is
            passed to ``collor`` to choose the trace color.
        title: Title of the axes.
        ylim: ``(low, high)`` limits for the y-axis; pass a falsy value to let
            matplotlib autoscale.
        vote: Series/mapping holding the six ``*_vote`` entries; they are
            printed after the figure is shown. Leave it as a bool (default
            ``False``) to print nothing.
        filtering: Amplitude threshold. When truthy, a column is drawn only
            if its maximum is >= the threshold or its minimum is <= the
            negated threshold.
        sample_period: Seconds between two consecutive rows, used to build
            the time axis. The default keeps the previously hard-coded 0.01.
            NOTE(review): confirm against the dataset's documented sampling
            rate; the time axis is only right if rows are 10 ms apart.
    """
    # Sets the global font size; it is left at 12 after the call.
    plt.rcParams["font.size"] = 12

    # One time stamp per row. Scaling an integer range guarantees exactly
    # df_eeg.shape[0] values; a float-step arange can come out one element
    # longer or shorter and then no longer match the column length.
    t_eeg = np.arange(df_eeg.shape[0]) * sample_period

    # Make fig and ax
    fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 8), tight_layout=True)

    # EEG plotting: one trace per column
    for col in df_eeg:
        y = df_eeg[col]
        # Skip traces that stay strictly inside (-filtering, filtering).
        if filtering and y.max() < filtering and y.min() > -filtering:
            continue
        ax1.plot(t_eeg, y, color=collor(col), label=col)
    ax1.set(title=title, xlabel='Time (s)', ylabel='intensity')
    if ylim:
        ax1.set_ylim(ylim)
    # Put the legend outside the axes, to the right.
    ax1.legend(loc='center left', bbox_to_anchor=(1, .5), ncols=1)

    plt.show()
    if not isinstance(vote, bool):
        print(f"seizure_vote: {vote['seizure_vote']} | lpd_vote:  {vote['lpd_vote']} | gpd_vote:   {vote['gpd_vote']}\nlrda_vote:    {vote['lrda_vote']} | grda_vote: {vote['grda_vote']} | other_vote: {vote['other_vote']}")

# Plot

### Random

In [None]:
# Plot three randomly drawn training rows (random seeds 2, 1, 0), letting
# matplotlib pick the y-range.
vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
for i in reversed(range(3)):
    sample = train.sample(n=1, random_state=i).reset_index()
    eeg_id = sample.loc[0, 'eeg_id']
    # Label offset (seconds) converted to a row position at 100 rows per second.
    offset_idx = int(sample.loc[0, 'eeg_label_offset_seconds'] * 100)
    eeg_path = f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet'
    eeg = pd.read_parquet(eeg_path).iloc[offset_idx:offset_idx + 5000]
    vote = sample.loc[0, vote_columns]

    show_eeg(eeg, title='Random', vote=vote, ylim=False)

# Conditions
>Plot a few EEGs for each to get a sense of the characteristics in each condition.

## Seizure

In [None]:
# Plot the rows with the most seizure votes, one per patient, for the first
# three distinct patients in that ordering.
sample = train.sort_values('seizure_vote', ascending=False).reset_index()
# drop_duplicates keeps each patient's first (highest-ranked) row; head(3)
# simply yields fewer rows if there are fewer than three patients.
top_rows = sample.drop_duplicates('patient_id').head(3)
for i, pat_id in top_rows['patient_id'].items():
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]
    vote = sample[['seizure_vote' ,'lpd_vote' ,'gpd_vote', 'lrda_vote', 'grda_vote' ,'other_vote']].iloc[i]

    # Title now names the condition and patient like the other sections
    # (it was a copy-pasted 'Random').
    show_eeg(eeg, title=f'Seizure patient: {pat_id}', vote=vote)

## LPD

In [None]:
# Plot the rows with the most LPD votes, one per patient, for the first
# three distinct patients in that ordering.
sample = train.sort_values('lpd_vote', ascending=False).reset_index()
# drop_duplicates keeps each patient's first (highest-ranked) row; head(3)
# simply yields fewer rows if there are fewer than three patients.
top_rows = sample.drop_duplicates('patient_id').head(3)
for i, pat_id in top_rows['patient_id'].items():
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]
    vote = sample[['seizure_vote' ,'lpd_vote' ,'gpd_vote', 'lrda_vote', 'grda_vote' ,'other_vote']].iloc[i]

    show_eeg(eeg, title=f'LPD patient: {pat_id}', vote=vote)

## GPD

In [None]:
# Plot the rows with the most GPD votes, one per patient, for the first
# three distinct patients in that ordering.
sample = train.sort_values('gpd_vote', ascending=False).reset_index()
# drop_duplicates keeps each patient's first (highest-ranked) row; head(3)
# simply yields fewer rows if there are fewer than three patients.
top_rows = sample.drop_duplicates('patient_id').head(3)
for i, pat_id in top_rows['patient_id'].items():
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]
    vote = sample[['seizure_vote' ,'lpd_vote' ,'gpd_vote', 'lrda_vote', 'grda_vote' ,'other_vote']].iloc[i]

    show_eeg(eeg, title=f'GPD patient: {pat_id}', vote=vote)

## LRDA

In [None]:
# Plot the rows with the most LRDA votes, one per patient, for the first
# three distinct patients in that ordering.
sample = train.sort_values('lrda_vote', ascending=False).reset_index()
# drop_duplicates keeps each patient's first (highest-ranked) row; head(3)
# simply yields fewer rows if there are fewer than three patients.
top_rows = sample.drop_duplicates('patient_id').head(3)
for i, pat_id in top_rows['patient_id'].items():
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]
    vote = sample[['seizure_vote' ,'lpd_vote' ,'gpd_vote', 'lrda_vote', 'grda_vote' ,'other_vote']].iloc[i]

    show_eeg(eeg, title=f'LRDA patient: {pat_id}', vote=vote)

## GRDA

In [None]:
# Plot the rows with the most GRDA votes, one per patient, for the first
# three distinct patients in that ordering.
sample = train.sort_values('grda_vote', ascending=False).reset_index()
# drop_duplicates keeps each patient's first (highest-ranked) row; head(3)
# simply yields fewer rows if there are fewer than three patients.
top_rows = sample.drop_duplicates('patient_id').head(3)
for i, pat_id in top_rows['patient_id'].items():
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]
    vote = sample[['seizure_vote' ,'lpd_vote' ,'gpd_vote', 'lrda_vote', 'grda_vote' ,'other_vote']].iloc[i]

    show_eeg(eeg, title=f'GRDA patient: {pat_id}', vote=vote)

# Filtering
> Let's restrict the EEG traces shown in each plot to electrodes whose peak signal amplitude exceeds a given threshold, and see what trends appear.

In [None]:
def show_eeg_sub(ax1, df_eeg,  title='', ylim=(-250,250), vote=False, filtering=False, sample_period=0.01):
    """Plot every column of an EEG frame against time on an existing axes.

    Unlike a stand-alone plot, this neither creates a figure nor calls
    ``plt.show()``; the caller owns the figure.

    Args:
        ax1: Matplotlib axes to draw on.
        df_eeg: DataFrame with one column per electrode. Each column name is
            passed to ``collor`` to choose the trace color.
        title: Title of the axes.
        ylim: ``(low, high)`` limits for the y-axis; pass a falsy value to let
            matplotlib autoscale.
        vote: Series/mapping holding the six ``*_vote`` entries, which are
            printed. Leave it as a bool (default ``False``) to print nothing.
        filtering: Amplitude threshold. When truthy, a column is drawn only
            if its maximum is >= the threshold or its minimum is <= the
            negated threshold.
        sample_period: Seconds between two consecutive rows, used to build
            the time axis. The default keeps the previously hard-coded 0.01.
            NOTE(review): confirm against the dataset's documented sampling
            rate; the time axis is only right if rows are 10 ms apart.
    """
    # Sets the global font size; it is left at 12 after the call.
    plt.rcParams["font.size"] = 12

    # One time stamp per row. Scaling an integer range guarantees exactly
    # df_eeg.shape[0] values; a float-step arange can come out one element
    # longer or shorter and then no longer match the column length.
    t_eeg = np.arange(df_eeg.shape[0]) * sample_period

    # EEG plotting: one trace per column
    for col in df_eeg:
        y = df_eeg[col]
        # Skip traces that stay strictly inside (-filtering, filtering).
        if filtering and y.max() < filtering and y.min() > -filtering:
            continue
        ax1.plot(t_eeg, y, color=collor(col), label=col)
    ax1.set(title=title, xlabel='Time (s)', ylabel='intensity')
    if ylim:
        ax1.set_ylim(ylim)
    # Put the legend outside the axes, to the right.
    ax1.legend(loc='center left', bbox_to_anchor=(1, .5), ncols=1)
    if not isinstance(vote, bool):
        print(f"seizure_vote: {vote['seizure_vote']} | lpd_vote:  {vote['lpd_vote']} | gpd_vote:   {vote['gpd_vote']}\nlrda_vote:    {vote['lrda_vote']} | grda_vote: {vote['grda_vote']} | other_vote: {vote['other_vote']}")

## LPD vs GPD

In [None]:
# 2x2 comparison: two LPD examples (top row) against two GPD examples (bottom row).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2,ncols=2, figsize=(15, 10),tight_layout=True)

# One entry per panel:
# (vote column to rank by, row position in that ranking, label, amplitude threshold, y-limits)
panels = [
    ('lpd_vote', 6, 'LPD', 80, (-200, 200)),
    ('lpd_vote', 12, 'LPD', 160, (-200, 200)),
    ('gpd_vote', 2, 'GPD', 170, (-200, 200)),
    # Limits were written as (400, -400), which flips the y-axis of this panel only.
    ('gpd_vote', 6, 'GPD', 400, (-400, 400)),
]

ranked = {}  # vote column -> train sorted by that column, so each sort runs once
for ax, (vote_col, i, label, threshold, ylim) in zip((ax1, ax2, ax3, ax4), panels):
    if vote_col not in ranked:
        ranked[vote_col] = train.sort_values(vote_col, ascending=False).reset_index()
    sample = ranked[vote_col]
    pat_id = sample['patient_id'][i]
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]

    show_eeg_sub(ax, eeg, title=f'{label} patient: {pat_id}', filtering=threshold, ylim=ylim)
plt.show()

Compared to LPD, GPD appears to overlap many signals in a spike. This seems to be the case given that GPD affects a wider range of brain waves.
The spectrograms were measured in four separate locations (LL, LR, RL, and RR), but it seems that features that account for resonance in a wider range are needed.

## LRDA vs GRDA

In [None]:
# 2x2 comparison: two LRDA examples (top row) against two GRDA examples (bottom row).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2,ncols=2, figsize=(15, 10),tight_layout=True)

# One entry per panel:
# (vote column to rank by, row position in that ranking, label, amplitude threshold)
panels = [
    ('lrda_vote', 0, 'LRDA', 210),
    ('lrda_vote', 58, 'LRDA', 109),
    ('grda_vote', 1, 'GRDA', 210),
    ('grda_vote', 3, 'GRDA', 200),
]

ranked = {}  # vote column -> train sorted by that column, so each sort runs once
for ax, (vote_col, i, label, threshold) in zip((ax1, ax2, ax3, ax4), panels):
    if vote_col not in ranked:
        ranked[vote_col] = train.sort_values(vote_col, ascending=False).reset_index()
    sample = ranked[vote_col]
    pat_id = sample['patient_id'][i]
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]

    show_eeg_sub(ax, eeg, title=f'{label} patient: {pat_id}', filtering=threshold, ylim=(-200,200))
plt.show()

It appears difficult to distinguish between LRDA and GRDA, but there still appears to be more overlap of the spike areas in GRDA. Also, in GRDA, the behavior of EKG, represented in gray in the figure, might be important

## LPD vs LRDA

In [None]:
# 2x2 comparison: two LPD examples (top row) against two LRDA examples (bottom row).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2,ncols=2, figsize=(15, 10),tight_layout=True)

# One entry per panel:
# (vote column to rank by, row position in that ranking, label, amplitude threshold)
panels = [
    ('lpd_vote', 6, 'LPD', 80),
    ('lpd_vote', 12, 'LPD', 160),
    ('lrda_vote', 0, 'LRDA', 210),
    ('lrda_vote', 58, 'LRDA', 109),
]

ranked = {}  # vote column -> train sorted by that column, so each sort runs once
for ax, (vote_col, i, label, threshold) in zip((ax1, ax2, ax3, ax4), panels):
    if vote_col not in ranked:
        ranked[vote_col] = train.sort_values(vote_col, ascending=False).reset_index()
    sample = ranked[vote_col]
    pat_id = sample['patient_id'][i]
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]

    show_eeg_sub(ax, eeg, title=f'{label} patient: {pat_id}', filtering=threshold, ylim=(-200,200))
plt.show()

## GPD vs GRDA

In [None]:
# 2x2 comparison: two GPD examples (top row) against two GRDA examples (bottom row).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2,ncols=2, figsize=(15, 10),tight_layout=True)

# One entry per panel:
# (vote column to rank by, row position in that ranking, label, amplitude threshold, y-limits)
panels = [
    ('gpd_vote', 2, 'GPD', 170, (-200, 200)),
    # Limits were written as (400, -400), which flips the y-axis of this panel only.
    ('gpd_vote', 6, 'GPD', 400, (-400, 400)),
    ('grda_vote', 1, 'GRDA', 210, (-200, 200)),
    ('grda_vote', 3, 'GRDA', 200, (-200, 200)),
]

ranked = {}  # vote column -> train sorted by that column, so each sort runs once
for ax, (vote_col, i, label, threshold, ylim) in zip((ax1, ax2, ax3, ax4), panels):
    if vote_col not in ranked:
        ranked[vote_col] = train.sort_values(vote_col, ascending=False).reset_index()
    sample = ranked[vote_col]
    pat_id = sample['patient_id'][i]
    eeg_id = sample['eeg_id'][i]
    # NOTE(review): offset*100 and the 5000-row window assume 100 rows per
    # second; confirm against the dataset's documented sampling rate.
    offset_idx = int(sample['eeg_label_offset_seconds'][i]*100)
    eeg = pd.read_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    eeg = eeg[offset_idx:offset_idx+5000]

    show_eeg_sub(ax, eeg, title=f'{label} patient: {pat_id}', filtering=threshold, ylim=ylim)
plt.show()

As can be seen in [my previous notebook](https://www.kaggle.com/code/shunsukekikuchi/fast-eda), periodic discharges (LPD and GPD) vs rhythmic delta activity (LRDA, GRDA and seizures) will be distinguished by frequency (spectrograms). Though it's a guess...

# To be Continued

- EDA on Spectrograms, probably more important than EEGs...?
- Baseline: GBDT with some feature discovery might be a good start

I will continue to update this notebook for my EDA. If you have any suggestions, please leave a comment!