## 📋 Table of Contents
* [File Overview](#files)
* [Training File](#train)
* [EEG File Example](#ex_EEG)
* [Spectrogram File Example](#ex_spec)
* [Test and Submission File](#sub)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# configs: remove the column display limit so wide frames are shown in full
pd.options.display.max_columns = None

# aesthetics: shared plot colors
default_color_1, default_color_2, default_color_3 = 'darkblue', 'darkgreen', 'darkred'

<a id='files'></a>
# File Overview

In [None]:
# shell escape: long-format listing of the top-level input data directory
!ls -l '../input/hms-harmful-brain-activity-classification'

In [None]:
# shell escape: long-format listing of the example_figures sub-directory
!ls -l '../input/hms-harmful-brain-activity-classification/example_figures'

In [None]:
# shell escape: long-format listing of the train_spectrograms sub-directory
# (a spectrogram parquet file from this directory is loaded further below)
!ls -l '../input/hms-harmful-brain-activity-classification/train_spectrograms'

In [None]:
# shell escape: long-format listing of the train_eegs sub-directory
# (an EEG parquet file from this directory is loaded further below)
!ls -l '../input/hms-harmful-brain-activity-classification/train_eegs'

<a id='train'></a>
# Training File

In [None]:
# training data - read the CSV into a frame and show its first rows
train_csv_path = '../input/hms-harmful-brain-activity-classification/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# names of the six vote columns in the training data
features_vote = [
    'seizure_vote',
    'lpd_vote',
    'gpd_vote',
    'lrda_vote',
    'grda_vote',
    'other_vote',
]

In [None]:
# one row per EEG: average every vote column over the rows sharing an eeg_id
votes_per_eeg = df_train.groupby('eeg_id')[features_vote].mean()

# row-wise total of the averaged votes
votes_total = votes_per_eeg.sum(axis=1)

# normalize votes (=> 100% distribution for each row/EEG) in one row-wise division
df_train_unique_votes = votes_per_eeg.div(votes_total, axis=0)

# keep the pre-normalization total as the trailing 'vote_sum' column
df_train_unique_votes['vote_sum'] = votes_total

df_train_unique_votes.head(10)

<a id='ex_EEG'></a>
# EEG File Example

In [None]:
# read one example EEG recording from the train_eegs directory
eeg_example_path = '../input/hms-harmful-brain-activity-classification/train_eegs/1000913311.parquet'
df_eeg = pd.read_parquet(eeg_example_path)

In [None]:
# preview: display the first rows of the example EEG frame
df_eeg.head()

In [None]:
# show structure: print the frame summary (index, columns, dtypes)
df_eeg.info()

In [None]:
# basic stats: descriptive statistics per numeric column
df_eeg.describe()

In [None]:
# columns of the EEG frame that are plotted and correlated below
features_eeg = [
    'Fp1', 'F3', 'C3', 'P3',
    'F7', 'T3', 'T5', 'O1',
    'Fz', 'Cz', 'Pz',
    'Fp2', 'F4', 'C4', 'P4',
    'F8', 'T4', 'T6', 'O2',
    'EKG',
]

In [None]:
# time series of every EEG feature, one figure per column
for channel in features_eeg:
    fig, ax = plt.subplots(figsize=(12, 3))
    ax.plot(df_eeg[channel], color=default_color_1)
    ax.set_title(channel)
    ax.grid()
    plt.show()

In [None]:
# value distribution of every EEG feature, one 100-bin histogram per column
for channel in features_eeg:
    fig, ax = plt.subplots(figsize=(12, 3))
    ax.hist(df_eeg[channel], bins=100, color=default_color_1)
    ax.set_title(channel)
    ax.grid()
    plt.show()

### Correlation:

In [None]:
# pairwise Pearson correlation of the EEG features, drawn as an annotated heatmap
cor_eeg = df_eeg[features_eeg].corr(method='pearson')

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    cor_eeg,
    ax=ax,
    annot=True, fmt='.2f',                # print each coefficient with two decimals
    linecolor='black', linewidths=.5,     # thin black cell borders
    cmap='RdYlGn', vmin=-1, vmax=+1,      # color scale pinned to the full [-1, +1] range
)
ax.set_title('Correlation - EEG example')
plt.show()

<a id='ex_spec'></a>
# Spectrogram File Example

In [None]:
# read one example spectrogram from the train_spectrograms directory
spec_example_path = '../input/hms-harmful-brain-activity-classification/train_spectrograms/1000646093.parquet'
df_spec = pd.read_parquet(spec_example_path)

In [None]:
# preview: display the first rows of the example spectrogram frame
df_spec.head()

In [None]:
# show structure: verbose=True requests the full per-column summary and
# show_counts=True requests the non-null counts for each column
df_spec.info(verbose=True, show_counts=True)

In [None]:
# basic stats: descriptive statistics per numeric column
df_spec.describe()

In [None]:
# export to file: write the example spectrogram as CSV into the working directory
# (no index argument is passed, so pandas' default of writing the index applies)
df_spec.to_csv('spec_example.csv')

### Some plots:

In [None]:
# plotting function given a specific frequency
def plot_spec(i_frequency, df=None):
    """Scatter the LL, RL, RP and LP columns of one frequency against time.

    Parameters
    ----------
    i_frequency : str or number
        Frequency suffix of the columns to plot, e.g. '3.91' or 3.91. The
        value is converted with ``str()`` and appended to each prefix
        ('LL_', 'RL_', 'RP_', 'LP_') to build the column names.
    df : pandas.DataFrame, optional
        Frame holding a ``time`` column plus the four frequency columns.
        When omitted, the notebook-level ``df_spec`` is used.

    Raises
    ------
    KeyError
        If ``time`` or any of the four frequency columns is missing. The
        check runs before any figure is created, so a bad frequency does
        not leave an empty or half-drawn figure behind.
    """
    if df is None:
        df = df_spec  # fall back to the notebook-level example frame

    # accept numeric frequencies too: str(3.91) -> '3.91'
    freq_label = str(i_frequency)

    # plotting order determines which default color each series gets
    prefixes = ['LL', 'RL', 'RP', 'LP']
    columns = [prefix + '_' + freq_label for prefix in prefixes]

    missing = [col for col in ['time'] + columns if col not in df.columns]
    if missing:
        raise KeyError(f'columns not found for frequency {freq_label!r}: {missing}')

    plt.figure(figsize=(10,3))
    for prefix, col in zip(prefixes, columns):
        plt.scatter(df['time'], df[col], s=5, label=prefix)
    plt.legend(loc='upper right')
    plt.title('Frequency=' + freq_label)
    plt.xlabel('time')
    plt.grid()
    plt.show()

In [None]:
# draw the per-frequency scatter for a small selection of frequency suffixes
# (kept as strings because they are concatenated into column names)
frequencies = ['0.59', '3.91', '6.45', '10.16', '19.92']
for freq_label in frequencies:
    plot_spec(freq_label)

In [None]:
# frequency values used as suffixes of the spectrogram column names.
# NOTE: each value is turned into a column name via str(f) in the next cell, so the
# literals must stay exactly as written (e.g. 4.1, not 4.10) to match the columns.
# NOTE(review): the unit is presumably Hz - confirm against the data description.
freqs = [0.59, 0.78, 0.98, 1.17, 1.37, 1.56, 1.76, 1.95, 2.15, 2.34, 2.54, 2.73, 2.93, 3.13, 3.32, 
         3.52, 3.71, 3.91, 4.1, 4.3, 4.49, 4.69, 4.88, 5.08, 5.27, 5.47, 5.66, 5.86, 6.05, 6.25,
         6.45, 6.64, 6.84, 7.03, 7.23, 7.42, 7.62, 7.81, 8.01, 8.2, 8.4, 8.59, 8.79, 8.98, 9.18,
         9.38, 9.57, 9.77, 9.96, 10.16, 10.35, 10.55, 10.74, 10.94, 11.13, 11.33, 11.52, 11.72,
         11.91, 12.11, 12.3, 12.5, 12.7, 12.89, 13.09, 13.28, 13.48, 13.67, 13.87, 14.06, 14.26,
         14.45, 14.65, 14.84, 15.04, 15.23, 15.43, 15.63, 15.82, 16.02, 16.21, 16.41, 16.6, 16.8,
         16.99, 17.19, 17.38, 17.58, 17.77, 17.97, 18.16, 18.36, 18.55, 18.75, 18.95, 19.14, 19.34,
         19.53, 19.73, 19.92]

In [None]:
# names of the 'LL_<frequency>' columns, one per entry of freqs
cols_LL = [f'LL_{freq}' for freq in freqs]

In [None]:
# basic stats: descriptive statistics restricted to the LL columns
df_spec[cols_LL].describe()

In [None]:
# every LL column drawn as its own line in a single chart
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df_spec[cols_LL])
ax.set_title('LL Measurements - All Frequencies')
ax.grid()
plt.show()

In [None]:
# plot all frequencies in one chart - log scale
# log10 is only defined for positive values: mask zeros/negatives as NaN first, so
# they show up as gaps in the lines instead of producing -inf/NaN together with
# NumPy RuntimeWarnings. Positive values are passed through unchanged.
spec_LL = df_spec[cols_LL]
spec_LL_log = np.log10(spec_LL.where(spec_LL > 0))

plt.figure(figsize=(12,5))
plt.plot(spec_LL_log)
plt.title('LL Measurements - All Frequencies / log10 scale')
plt.grid()
plt.show()

<a id='sub'></a>
# Test and Submission File

In [None]:
# read the (dummy) test metadata and display it
test_csv_path = '../input/hms-harmful-brain-activity-classification/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test

In [None]:
# read the sample submission, which serves as the template that is filled in below
sample_submission_path = '../input/hms-harmful-brain-activity-classification/sample_submission.csv'
df_sub = pd.read_csv(sample_submission_path)
df_sub

#### Baseline submission

As a first baseline, simply predict the (grouped) mean vote distribution for every row. See also this notebook: https://www.kaggle.com/code/seshurajup/eda-train-csv

In [None]:
# simply use means for a first baseline: every submission row receives the mean of
# the normalized per-EEG vote distributions
mean_values = df_train_unique_votes[features_vote].mean()

# Fill the vote columns through [] indexing, driven by features_vote, instead of
# six copy-pasted attribute assignments. Attribute-style assignment only sets a
# Python attribute (not a column) when the name is missing from the frame, and it
# duplicates the column list that features_vote already holds.
for vote_col in features_vote:
    df_sub[vote_col] = mean_values[vote_col]

df_sub

In [None]:
# save submission file: write df_sub to 'submission.csv' in the working directory,
# without the index column (index=False)
df_sub.to_csv('submission.csv', index=False)