# 🧠 Overview
Here we're going to take an initial look at what data we're working with.

---
# ⚙️ Setup
Let's get our python environment ready to go, then look at what files we've been given.

In [None]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from tqdm import tqdm
from itertools import cycle
import random
from random import choice
import os
import sys
from collections import Counter

import tensorflow as tf
import tensorflow_io as tfio

from typing import Optional

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.stats import entropy

# %run kullback-leibler-divergence/metric.py

In [None]:
# Plot styling shared by every figure in this notebook
sns.set_style("whitegrid")
# Matplotlib's default colour cycle as a list, so plots can pick colours by index
_prop_cycle = plt.rcParams["axes.prop_cycle"]
color_pal = _prop_cycle.by_key()["color"]

In [None]:
# View files

def list_files_and_folders_with_info(input_dir):
    """Print the contents of every sub-directory found directly under ``input_dir``.

    For each sub-directory a header line is printed, followed by one line per
    entry: files are listed with their size in bytes, folders by name only.
    Entries are listed in name order so the output is stable between runs.

    Parameters
    ----------
    input_dir : str
        Directory whose immediate sub-directories are inspected.

    Returns
    -------
    None
        Everything is reported via ``print``. A missing or unreadable directory
        is reported with a message instead of raising.
    """
    try:
        for comp_dir in sorted(os.listdir(input_dir)):
            comp_path = '/'.join([input_dir, comp_dir])
            # Plain files directly under input_dir cannot be scanned; skip them
            if not os.path.isdir(comp_path):
                continue
            print(f"Competition Directory: {comp_path}")
            print("Contains:")
            with os.scandir(comp_path) as entries:
                for entry in sorted(entries, key=lambda item: item.name):
                    if entry.is_file():
                        print(f"- (File) {entry.name}, Size: {entry.stat().st_size} bytes")
                    elif entry.is_dir():
                        print(f"- (Folder) {entry.name}")

    except FileNotFoundError as exc:
        # Report the path that actually failed (may be a sub-directory)
        print(f"The specified directory '{exc.filename or input_dir}' does not exist.")
    except PermissionError as exc:
        print(f"Permission error accessing directory '{exc.filename or input_dir}'.")

# Directory whose sub-directories are listed below
input_dir = '/kaggle/input'

list_files_and_folders_with_info(input_dir)

In [None]:
# Load Data
comp_path = '/kaggle/input/hms-harmful-brain-activity-classification'
train_df = pd.read_csv(f'{comp_path}/train.csv')
test_df = pd.read_csv(f'{comp_path}/test.csv')
ss_df = pd.read_csv(f'{comp_path}/sample_submission.csv')
# Human-readable name -> frame, used when printing excerpts of each table
df_dict = {
    "Training data": train_df,
    "Testing data": test_df,
    "Sample submission": ss_df,
}

train_spect_dir = '/'.join([comp_path, 'train_spectrograms'])
train_eeg_dir = '/'.join([comp_path, 'train_eegs'])
test_eeg_dir = '/'.join([comp_path, 'test_eegs'])
test_spect_dir = '/'.join([comp_path, 'test_spectrograms'])


def _sorted_entry_paths(directory):
    """Return the paths of all entries in ``directory``, sorted by path.

    ``os.scandir`` yields entries in arbitrary order; sorting makes
    "take the first file" style sampling reproducible between runs.
    """
    with os.scandir(directory) as entries:
        return sorted(entry.path for entry in entries)


train_eeg_path_list = _sorted_entry_paths(train_eeg_dir)
train_spect_path_list = _sorted_entry_paths(train_spect_dir)
test_eeg_path_list = _sorted_entry_paths(test_eeg_dir)
test_spect_path_list = _sorted_entry_paths(test_spect_dir)

In [None]:
# Report how many entries each data directory contains
file_groups = {
    "Training EEG": train_eeg_path_list,
    "Training Spectrogram": train_spect_path_list,
    "Testing EEG": test_eeg_path_list,
    "Testing Spectrogram": test_spect_path_list,
}
for group_name, group_paths in file_groups.items():
    print(f"{group_name} files: {len(group_paths)}")

---
# 🛠️ Define Tools

Define some useful functions for exploring this data.

In [None]:
# Functions for plotting features
def plot_numerical_feature(data):
    """Summarise a numerical feature: print basic statistics, then draw a
    histogram and a horizontal box plot side by side.

    Parameters
    ----------
    data : sequence of numbers (e.g. a pandas Series)
        Values of the feature to describe.
    """
    summary = (
        f"Max: {max(data)}, Min: {min(data)}, "
        f"Mean: {np.mean(data):.2f}, Median: {np.median(data)}"
    )
    print(summary)

    primary_color = color_pal[0]
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(20, 5))
    sns.histplot(data, ax=hist_ax, color=primary_color)
    sns.boxplot(data, ax=box_ax, color=primary_color, orient='h')

    plt.tight_layout()
    plt.show()
    return None

def plot_categorical_feature(data):
    """Plot the class distribution of a categorical feature.

    With more than 50 distinct values nothing is plotted; the five most common
    values are printed instead. Otherwise a bar chart of the counts is drawn
    next to a donut chart in which classes below 5% of the total are merged
    into a single 'Other' slice.

    Parameters
    ----------
    data : pandas Series, list or array
        Values of the feature. Non-Series input is wrapped in a Series so that
        callers may pass e.g. an exploded list.
    """
    if not isinstance(data, pd.Series):
        data = pd.Series(data)

    # nunique also works for object data mixing strings and NaN, where np.unique cannot sort
    n_categories = data.nunique(dropna=False)
    if n_categories > 50:
        print(f"{n_categories} categorical features. Examples:")
        for item, count in Counter(data).most_common(5):
            print(f"{item} - {count} counts.")
        return

    # Column Chart
    if n_categories > 10:
        # Vertical plot layout
        fig, axes = plt.subplots(2, 1, figsize=(20, 10), height_ratios=(1/3, 2/3))
    else:
        # Horizontal plot layout
        fig, axes = plt.subplots(1, 2, figsize=(20, 5), width_ratios=(1/3, 2/3))
    s = data.value_counts().rename_axis('class').rename('count')
    sns.barplot(x=s.index, y=s.values, order=s.index, ax=axes[0])
    axes[0].tick_params(axis='x', rotation=90)

    # Donut chart
    threshold_percent = 5
    threshold_count = s.sum() * threshold_percent / 100
    is_small = s < threshold_count
    pie_data = s[~is_small].astype(float)
    if is_small.any():
        # Add to an existing 'Other' class rather than overwriting its real count
        pie_data.loc['Other'] = pie_data.get('Other', 0.0) + s[is_small].sum()

    axes[1].pie(pie_data, labels=pie_data.index, autopct='%1.1f%%', startangle=90, pctdistance=0.85, wedgeprops=dict(width=0.4))
    # White disc over the centre turns the pie into a donut
    centre_circle = plt.Circle((0, 0), 0.70, fc='white')
    axes[1].add_artist(centre_circle)

    plt.tight_layout()
    plt.show()
    return

def plot_feature(data):
    """Dispatch a single feature to the numerical or categorical plotter.

    Parameters
    ----------
    data : pandas Series (or anything convertible to one)
        Values of the feature.

    Notes
    -----
    * Numeric (non-boolean) dtypes are plotted as numerical features; the
      decision is based on the dtype, so negative values are handled and
      digit-only strings are not mistaken for numbers.
    * Columns whose elements are lists are exploded and plotted as categories.
    * Everything else is plotted as a categorical feature.
    * Empty input is reported and skipped.
    """
    series = data if isinstance(data, pd.Series) else pd.Series(data)
    if series.empty:
        print("No data to plot.")
        return

    # Positional access: the index need not contain the label 0
    first_value = series.iloc[0]
    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    if is_numeric:
        plot_numerical_feature(series)
    elif isinstance(first_value, list):
        # Pass a Series (not a list): the categorical plotter calls value_counts()
        plot_categorical_feature(series.explode().reset_index(drop=True))
    else:
        plot_categorical_feature(series)
    return

def plot_features(df):
    """Plot every column of ``df`` in turn using ``plot_feature``.

    A separator line and the column name are printed before each plot.

    Parameters
    ----------
    df : pandas DataFrame
        Frame whose columns are plotted one by one.
    """
    for feature in df.columns:
        print('-'*20)
        print(f"Feature: {feature}")
        plot_feature(df[feature])
    return

# Functions for plotting features as a function of a target
def plot_numerical_feature_v_numerical_target(data, target):
    """Plot a numerical feature and its relationship to a numerical target.

    Draws three panels: a histogram of ``data``, a horizontal box plot of
    ``data``, and a scatter plot of ``target`` against ``data`` overlaid with a
    cubic least-squares fit. The coefficient of determination of that fit is
    shown in the scatter plot title.

    Parameters
    ----------
    data : sequence of numbers
        Feature values (x axis of the scatter plot).
    target : sequence of numbers
        Target values, same length as ``data``.
    """
    fig, axes = plt.subplots(1, 3, figsize=(20, 5))

    sns.histplot(data, ax=axes[0], color=color_pal[0])
    axes[0].set_title('Distribution Plot')

    sns.boxplot(data, ax=axes[1], color=color_pal[0], orient='h')
    axes[1].set_title('Box Plot')

    sns.scatterplot(x=data, y=target, ax=axes[2], color=color_pal[0])

    # Fit only on pairs where both values are finite; NaN/inf would break polyfit
    x_values = np.asarray(data, dtype=float)
    y_values = np.asarray(target, dtype=float)
    finite = np.isfinite(x_values) & np.isfinite(y_values)
    x_values, y_values = x_values[finite], y_values[finite]

    degree = 3
    if len(x_values) > degree:
        fitmodel = np.poly1d(np.polyfit(x_values, y_values, deg=degree))
        xfitted = np.linspace(x_values.min(), x_values.max(), 1000)
        # Draw on the scatter axes explicitly rather than on the implicit current axes
        axes[2].plot(xfitted, fitmodel(xfitted), color=color_pal[1])

        # R2 = 1 - SS_res / SS_tot, with SS_tot measured around the target mean
        ss_res = np.sum(np.square(y_values - fitmodel(x_values)))
        ss_tot = np.sum(np.square(y_values - y_values.mean()))
        r2 = 1 - ss_res / ss_tot if ss_tot > 0 else float('nan')
        axes[2].set_title(f'Scatter Plot (R2 = {r2:.3f})')
    else:
        # Too few points for a cubic fit
        axes[2].set_title('Scatter Plot')

    plt.tight_layout()
    plt.show()
    return
    
def plot_categorical_feature_v_numerical_target(data, target):
    """Plot a categorical feature against a numerical target.

    With more than 50 distinct values only the five most common values are
    printed. Otherwise a bar chart of the class counts is drawn alongside
    horizontal box plots of ``target`` for each class.

    Parameters
    ----------
    data : pandas Series
        Categorical feature values.
    target : pandas Series
        Numerical target values aligned with ``data``.
    """
    n_categories = len(np.unique(data))
    if n_categories > 50:
        print(f"{n_categories} categorical features. Examples:")
        for item, count in Counter(data).most_common(5):
            print(f"{item} - {count} counts.")
        return

    # Many classes: stack the panels vertically; few classes: place them side by side
    if n_categories > 10:
        layout = dict(nrows=2, ncols=1, figsize=(20, 10), height_ratios=(1/3, 2/3))
    else:
        layout = dict(nrows=1, ncols=2, figsize=(20, 5), width_ratios=(1/3, 2/3))
    fig, axes = plt.subplots(**layout)
    counts_ax, box_ax = axes[0], axes[1]

    # Class counts, most frequent first; the same order is reused for the box plots
    s = data.value_counts().rename_axis('class').rename('count')
    sns.barplot(x=s.index, y=s.values, order=s.index, ax=counts_ax)
    sns.boxplot(x=target, y=data, ax=box_ax, order=s.index, orient='h')

    plt.tight_layout()
    plt.show()
    return None

def plot_feature_v_numerical_target(data, target):
    """Dispatch a feature to the matching "feature vs numerical target" plotter.

    Parameters
    ----------
    data : pandas Series (or anything convertible to one)
        Feature values.
    target : pandas Series
        Numerical target values aligned with ``data``.

    Notes
    -----
    Numeric (non-boolean) dtypes go to
    ``plot_numerical_feature_v_numerical_target``; everything else goes to
    ``plot_categorical_feature_v_numerical_target``. Empty input is reported
    and skipped.
    """
    series = data if isinstance(data, pd.Series) else pd.Series(data)
    if series.empty:
        print("No data to plot.")
        return

    is_numeric = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )
    # The single-feature plotters take one argument; the *_v_numerical_target ones take two
    if is_numeric:
        plot_numerical_feature_v_numerical_target(series, target)
    else:
        plot_categorical_feature_v_numerical_target(series, target)
    return

def plot_features_v_numerical_target(df, target_name):
    """Plot every column of ``df`` except the target against the target column.

    Parameters
    ----------
    df : pandas DataFrame
        Frame containing the features and the target.
    target_name : str
        Name of the numerical target column in ``df``; it is not plotted
        against itself.
    """
    for feature in df.columns:
        if feature == target_name:
            continue
        print(f"Feature: {feature}")
        # plot_feature only accepts the feature; the two-argument dispatcher is needed here
        plot_feature_v_numerical_target(df[feature], df[target_name])
    return



Scoring functions taken from https://www.kaggle.com/code/metric/kullback-leibler-divergence

In [None]:
def kl_divergence(solution: pd.DataFrame, submission: pd.DataFrame, epsilon: float, micro_average: bool, sample_weights: Optional[pd.Series]):
    """Kullback-Leibler divergence between target and predicted distributions.

    Parameters
    ----------
    solution : pd.DataFrame
        Target probabilities, one column per class. Not modified.
    submission : pd.DataFrame
        Predicted probabilities with (at least) the same columns. Not modified.
    epsilon : float
        Predictions are clipped to ``[epsilon, 1 - epsilon]`` before taking logs.
    micro_average : bool
        If True, average the per-row sums (optionally weighted by
        ``sample_weights``); otherwise average the per-column means.
    sample_weights : pd.Series or None
        Row weights, only used when ``micro_average`` is True.

    Returns
    -------
    float
        The averaged divergence.
    """
    # astype()/copy() return new frames, so the caller's data is never overwritten
    losses = solution.astype(float)
    predictions = submission.copy()

    for col in losses.columns:
        # Clip both the min and max following Kaggle conventions for related metrics like log loss
        # Clipping the max avoids cases where the loss would be infinite or undefined, clipping the min
        # prevents users from playing games with the 20th decimal place of predictions.
        predictions[col] = np.clip(predictions[col], epsilon, 1 - epsilon)

        y_nonzero_indices = losses[col] != 0
        losses.loc[y_nonzero_indices, col] = losses.loc[y_nonzero_indices, col] * np.log(losses.loc[y_nonzero_indices, col] / predictions.loc[y_nonzero_indices, col])
        # Set the loss equal to zero where y_true equals zero following the scipy convention:
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.rel_entr.html#scipy.special.rel_entr
        losses.loc[~y_nonzero_indices, col] = 0

    if micro_average:
        return np.average(losses.sum(axis=1), weights=sample_weights)
    else:
        return np.average(losses.mean())
    
class ParticipantVisibleError(Exception):
    """Raised by ``score`` for problems the submitter can fix (e.g. a missing column)."""


def score(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        row_id_column_name: str,
        epsilon: float=10**-15,
        micro_average: bool=True,
        sample_weights_column_name: Optional[str]=None
    ) -> float:
    ''' The Kullback–Leibler divergence.
    The KL divergence is technically undefined/infinite where the target equals zero.

    This implementation always assigns those cases a score of zero; effectively removing them from consideration.
    The predictions in each row must add to one so any probability assigned to a case where y == 0 reduces
    another prediction where y > 0, so crucially there is an important indirect effect.

    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence

    solution: pd.DataFrame. Not modified.
    submission: pd.DataFrame. Not modified.
    epsilon: KL divergence is undefined for p=0 or p=1. Submission probabilities are clipped to max(eps, min(1 - eps, p)).
    row_id_column_name: str. Column dropped from both frames before scoring.
    micro_average: bool. Row-wise average if True, column-wise average if False.
    sample_weights_column_name: optional name of a column in `solution` holding row weights.

    Raises ParticipantVisibleError if the weights column is missing, if weights are combined with
    `micro_average=False`, or if the submission lacks a solution column.

    Examples
    --------
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> score(pd.DataFrame({'id': range(4), 'ham': [0, 1, 1, 0], 'spam': [1, 0, 0, 1]}), pd.DataFrame({'id': range(4), 'ham': [.1, .9, .8, .35], 'spam': [.9, .1, .2, .65]}), row_id_column_name=row_id_column_name)
    0.216161...
    >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> submission = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> score(solution, submission, 'id')
    0.0
    >>> solution = pd.DataFrame({'id': range(3), 'ham': [0, 0.5, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.9, 0, 0]})
    >>> submission = pd.DataFrame({'id': range(3), 'ham': [0.2, 0.3, 0.5], 'spam': [0.1, 0.5, 0.5], 'other': [0.7, 0.2, 0]})
    >>> score(solution, submission, 'id')
    0.160531...
    '''
    # drop() returns new frames, so the caller's data keeps its id column
    # and the cell calling score() can be re-run without rebuilding its inputs
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    sample_weights = None
    if sample_weights_column_name:
        if sample_weights_column_name not in solution.columns:
            raise ParticipantVisibleError(f'{sample_weights_column_name} not found in solution columns')
        sample_weights = solution.pop(sample_weights_column_name)

    if sample_weights_column_name and not micro_average:
        raise ParticipantVisibleError('Sample weights are only valid if `micro_average` is `True`')

    for col in solution.columns:
        if col not in submission.columns:
            raise ParticipantVisibleError(f'Missing submission column {col}')

    return kl_divergence(solution, submission, epsilon=epsilon, micro_average=micro_average, sample_weights=sample_weights)

---
# 👀 Quick look at the data
Now let's take a look at the data that we're working with. What we're looking for is:

* What data do I need to reformat/adapt before I even start looking at it?
* What does the data mean?
* Is there any data missing?

In [None]:
# Print out an excerpt from each of the dataframes we've been given
for frame_name, frame in df_dict.items():
    n_rows, n_cols = frame.shape[0], frame.shape[1]
    print(f"- {frame_name} -\nShape: {frame.shape} - {n_rows} Rows x {n_cols} Columns\nFeatures: {frame.columns.values}\n")
    print(frame.head(10))
    print('─'*70)

---
# Feature Engineering

In [None]:

# Columns holding the per-class vote counts
vote_list = [
    'seizure_vote',
    'lpd_vote',
    'gpd_vote',
    'lrda_vote',
    'grda_vote',
    'other_vote',
]
# Row-wise sum of the vote columns
train_df['total_vote'] = train_df[vote_list].sum(axis=1)
# Number of rows in train_df that share this row's patient_id
samples_per_patient = train_df['patient_id'].value_counts()
train_df['patient_samples'] = train_df['patient_id'].map(samples_per_patient)


---
# Exploratory Data Analysis (EDA)

## Examine Train.csv


In [None]:
# Plot the distribution of every column currently in train_df
plot_features(train_df)

In [None]:
# Count the distinct values of each identifier column in train_df
id_columns = {
    "Patient": 'patient_id',
    "Label": 'label_id',
    "EEG": 'eeg_id',
    "Spectrogram": 'spectrogram_id',
}
for id_name, id_column in id_columns.items():
    n_unique = len(train_df[id_column].unique())
    print(f"Unique {id_name} IDs: {n_unique}")

In [None]:
# Rank the expert_consensus values from most to least frequent
top_items = Counter(train_df['expert_consensus']).most_common()
for rank, (item, count) in enumerate(top_items, start=1):
    print(f"{rank:2} {item:10} - {count} counts.")

In [None]:
# Number of distinct expert_consensus values seen for each patient.
# One groupby pass replaces re-filtering the whole frame once per patient;
# sort=False keeps patients in order of first appearance.
unique_consensus_count_list = (
    train_df.groupby('patient_id', sort=False)['expert_consensus']
    .nunique(dropna=False)
    .tolist()
)
plt.figure()
# Bin edges at x.5 centre each bar on an integer count from 1 to 6
plt.hist(unique_consensus_count_list, bins=np.arange(0.5,7.5,1))
plt.xlabel("Number of different consensus across all samples")
plt.ylabel("Number of patients")
plt.title("Diversity in consensus for each patient")
plt.show()

## Examine EEG Data
Let's look at some of the EEG data.

In [None]:
# Load the first file in the training EEG path list as a sample
sample_eeg = pd.read_parquet(train_eeg_path_list[0])
# Print the column summary, then show summary statistics as the cell output
sample_eeg.info()
sample_eeg.describe()

In [None]:
# One panel per column of the sample EEG, laid out three per row.
# The row count follows the number of columns instead of assuming at most 21.
eeg_columns = sample_eeg.columns.tolist()
n_plot_cols = 3
n_plot_rows = max(1, int(np.ceil(len(eeg_columns) / n_plot_cols)))
fig, axes = plt.subplots(
    nrows=n_plot_rows,
    ncols=n_plot_cols,
    figsize=(20, 30 * n_plot_rows / 7),  # same panel size as the original 7-row, 20x30 figure
    squeeze=False,
)
axes = axes.flatten()
for ax, column in zip(axes, eeg_columns):
    ax.plot(sample_eeg[column])
    ax.set_title(column)
# Hide the panels left over when the column count is not a multiple of three
for unused_ax in axes[len(eeg_columns):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

## Examine Spectrogram Data


In [None]:
# Load the first file in the training spectrogram path list as a sample
sample_spect = pd.read_parquet(train_spect_path_list[0])
sample_spect.info()
# Only a cell's last expression is rendered, so display each table explicitly
display(sample_spect.describe())
# Show a short excerpt rather than the whole frame
display(sample_spect.head())

In [None]:
# Split the sample spectrogram's columns by name prefix (LL, RL, RP, LP)
spect_prefixes = ("LL", "RL", "RP", "LP")
split_spect = {
    prefix: sample_spect.filter(regex=f'^{prefix}', axis=1)
    for prefix in spect_prefixes
}

In [None]:
def plot_spectrograms(spectrogram_df):
    """Plot the LL, RL, RP and LP column groups of a spectrogram as log-scaled images.

    Parameters
    ----------
    spectrogram_df : pandas DataFrame
        Frame whose columns are named ``<prefix>_<label>`` with prefix LL, RL,
        RP or LP. Rows are drawn along the x axis, columns along the y axis.
        (The axes are labelled "Time" and "Frequency (Hz)"; that the row index
        is time and the label a frequency is assumed, TODO confirm.)
    """
    split_spect = {
        prefix: spectrogram_df.filter(regex=f'^{prefix}', axis=1)
        for prefix in ("LL", "RL", "RP", "LP")
    }

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
    label_interval = 5  # label every 5th column so the tick labels stay readable
    for ax, (split_name, split_df) in zip(axes.flatten(), split_spect.items()):
        # Mask non-positive values to NaN before the log: log(0) = -inf would wreck the colour scale
        log_values = np.log(split_df.where(split_df > 0))
        img = ax.imshow(log_values.T, cmap='viridis', aspect='auto', origin='lower')
        cbar = fig.colorbar(img, ax=ax)
        cbar.set_label('Log(Value)')
        ax.set_title(split_name)
        ax.set_ylabel("Frequency (Hz)")
        ax.set_xlabel("Time")

        # Drop the three-character prefix (e.g. "LL_") to leave the label part of the column name
        frequencies = [column_name[3:] for column_name in split_df.columns]
        ax.set_yticks(np.arange(0, len(split_df.columns), label_interval))
        ax.set_yticklabels(frequencies[::label_interval])
    plt.tight_layout()
    plt.show()
    
# Plot the LL/RL/RP/LP panels for the sample spectrogram loaded earlier
plot_spectrograms(sample_spect)

---
# Analysis
* Some code borrowed from https://www.kaggle.com/code/datajl/fit-tensorflow-on-parquet-dataset-via-tfio 

In [None]:
# output_parquet_features_dataset = tfio.IODataset.from_parquet(train_eeg_path_list[0])
# output_parquet_features_dataset.element_spec

In [None]:
# print(type(output_parquet_features_dataset))
# for data in output_parquet_features_dataset.take(2):
#     print(data)

In [None]:
eeg_samples_per_second = 200
eeg_seconds_per_subsample = 50
max_training_samples = 5000  # cap on the number of train_df rows turned into windows
subsample_length = eeg_samples_per_second * eeg_seconds_per_subsample

# Slicing (rather than range(5000)) cannot run past the end of a shorter frame
training_rows = train_df.iloc[:max_training_samples]
vote_matrix = training_rows[vote_list].to_numpy(dtype=float)

X = []
y = []
loaded_eeg_id = None
eeg_data = None
row_iter = zip(training_rows['eeg_id'], training_rows['eeg_label_offset_seconds'], vote_matrix)
for eeg_id, eeg_offset_seconds, votes in tqdm(row_iter, total=len(training_rows)):
    # Re-read the parquet file only when the eeg_id changes between consecutive rows
    if eeg_id != loaded_eeg_id:
        eeg_data = pd.read_parquet('/'.join([train_eeg_dir, str(eeg_id)])+ '.parquet')
        loaded_eeg_id = eeg_id
    start_ind = int(eeg_samples_per_second*eeg_offset_seconds)
    # NOTE(review): a window running past the end of the recording is shorter than
    # subsample_length and would make X ragged; confirm the data rules this out
    subsample_eeg_data = eeg_data[start_ind:start_ind + subsample_length]
    # Target: each vote count divided by the row's total votes
    y.append((votes / votes.sum()).tolist())
    # Feature: the EKG column only, with missing samples replaced by 0
    X.append(np.nan_to_num(subsample_eeg_data['EKG'], nan=0))

---
# K Nearest Neighbour attempt


In [None]:
# X holds one EKG window per sample and y the matching vote-share vectors
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Initialize NearestNeighbors
n_neighbors = 3
nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
nn_model.fit(X_train)

# Find the nearest neighbors
distances, indices = nn_model.kneighbors(X_test)

# Prediction = mean vote-share vector of the neighbours; tied to n_neighbors and
# vote_list instead of hard-coded 3 and 6.
# Result shape: (n_test, n_neighbors, n_classes) -> mean over the neighbour axis
neighbour_mean_votes = np.asarray(y_train, dtype=float)[indices].mean(axis=1)

x_results_df = pd.DataFrame(neighbour_mean_votes, columns=vote_list)
x_results_df.insert(0, 'id', range(len(x_results_df)))
y_test_df = pd.DataFrame(np.asarray(y_test, dtype=float), columns=vote_list)
y_test_df.insert(0, 'id', range(len(y_test_df)))

# KL divergence between the true and predicted vote shares (displayed as cell output)
score(y_test_df, x_results_df, 'id')

---
# 📮 Final Submission
For the moment, let's generate a default solution to submit.



In [None]:
# Refit the nearest-neighbour model on every training window
n_neighbors = 3
nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean')
nn_model.fit(X)

subsample_length = eeg_samples_per_second * eeg_seconds_per_subsample
test_eeg_data_list = []
for eeg_id in test_df['eeg_id']:
    eeg_data = pd.read_parquet('/'.join([test_eeg_dir, str(eeg_id)])+ '.parquet')
    # The offset is fixed to 0 here, so the window starts at the first sample
    eeg_offset_seconds = 0
    start_ind = int(eeg_samples_per_second*eeg_offset_seconds)
    subsample_eeg_data = eeg_data[start_ind:start_ind + subsample_length]
    test_eeg_data_list.append(np.nan_to_num(subsample_eeg_data['EKG'], nan=0))
distances, indices = nn_model.kneighbors(test_eeg_data_list)

# Prediction = mean vote-share vector of the neighbours; tied to n_neighbors and
# vote_list instead of hard-coded 3 and 6
neighbour_mean_votes = np.asarray(y, dtype=float)[indices].mean(axis=1)

# NOTE(review): rows are keyed by position ('id'), not by eeg_id, and this frame is
# replaced by the baseline in the following cell, so it is never written out
solution_df = pd.DataFrame(neighbour_mean_votes, columns=vote_list)
solution_df.insert(0, 'id', range(len(solution_df)))

print(solution_df)

In [None]:
# Baseline: give every test EEG the training-set share of each expert_consensus class.
non_other_votes = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote']

# NOTE(review): expert_consensus presumably holds labels such as 'Seizure' or 'LPD'
# rather than the vote column names, so looking up 'seizure_vote' directly always
# gave 0; confirm against train.csv. Labels are normalised (lower case, any
# '_vote' suffix removed) so either spelling matches.
consensus_share = (
    train_df['expert_consensus']
    .astype(str)
    .str.lower()
    .str.replace('_vote', '', regex=False)
    .value_counts()
    / len(train_df)
)

# Computed once; the priors are identical for every test row
class_priors = {
    vote: consensus_share.get(vote.replace('_vote', ''), 0)
    for vote in non_other_votes
}
# 'other' takes whatever probability mass is left so that each row sums to one
class_priors['other_vote'] = 1 - sum(class_priors.values())

solution_list = [{'eeg_id': eeg_id, **class_priors} for eeg_id in test_df['eeg_id']]
solution_df = pd.DataFrame(solution_list)

In [None]:
# Build the submission frame from solution_list and write it to submission.csv
solution_df = pd.DataFrame(solution_list)
solution_df.to_csv("submission.csv", index=False)
# Print the submission and the sample submission so their formats can be compared
print(solution_df)
print(ss_df)