# Purpose
The main purpose of this notebook is to explore and display shifts in votes and switches in evaluators across the subsections of each spectrogram.
The most relevant finding is that many spectrograms with hundreds of subsections have no shifts in votes at all.

The additional sections mostly build on this central plot; there is also an animation of the votes over time.
The findings are discussed in more detail [here](https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/477893).

# Setup

## Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append('/kaggle/input/kaggle-kl-div')
from kaggle_kl_div import score
import numpy as np

import matplotlib.animation as animation
from IPython.display import HTML

## Load train.csv

In [None]:
# Read the competition's training labels.
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')

# Strip the '_vote' suffix so the label columns get shorter names.
df = df.rename(columns=lambda name: name.replace('_vote', ''))
df.head()

## Spectograms by n_subsections

In [None]:
# Per-spectrogram non-null counts of every column.
grouped_df = df.groupby('spectrogram_id').count()

# Number of subsections per spectrogram, largest first.
sorted_samples = grouped_df['spectrogram_sub_id'].sort_values(ascending=False)
sorted_samples.head(3)

# EDA
## Spectogram Subsections
Here we show the spectrograms with the highest number of subsections (5 by default; controlled by `start` and `show` in the next cell).
What is displayed are 4 measures in one plot:

1. the cumulative number of shifts in votes across the spectrogram's subsections.
2. the cumulative number of switches in the number of experts across the spectrogram's subsections (this can be at most as high as the number of shifts, since every switch is also a shift).
3. the current number of experts for this subsection.
4. the KL divergence between the first subsection and the current one.

=> You can find many takeaways, but a big one is that some spectrograms have hundreds of shifts and switches while others have none, even though they have 800+ subsections.


In [None]:
start = 0 # Rank of the first spectrogram to plot (0 = the one with the most subsections)
show = 5 # Number of consecutive spectrograms to plot, beginning at `start`

In [None]:
# Positions of the six vote columns inside a train.csv row, in the order
# seizure, lpd, gpd, lrda, grda, other.
VOTE_POSITIONS = slice(9, 15)
# Column names of the single-row frames handed to `score`.
SCORE_VOTE_COLUMNS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


def _vote_frame(norm_votes):
    """Wrap one normalized six-class vote vector in a single-row DataFrame for `score`.

    Args:
        norm_votes: the six vote fractions, ordered like SCORE_VOTE_COLUMNS.

    Returns:
        A one-row DataFrame with an 'id' column (always 0) followed by the six vote columns.
    """
    data = {'id': [0]}
    for column, fraction in zip(SCORE_VOTE_COLUMNS, norm_votes):
        data[column] = [fraction]
    return pd.DataFrame(data)


for spectogram in range(start, start + show):
    spectrogram_id = sorted_samples.index[spectogram]
    # Select the vote columns once instead of slicing every mixed-type row.
    vote_rows = df[df.spectrogram_id == spectrogram_id].iloc[:, VOTE_POSITIONS]

    shift_list = []   # cumulative number of vote changes, one entry per subsection
    switch_list = []  # cumulative number of changes in the total vote count
    KL_list = []      # KL divergence of each subsection vs. the first one
    nvote_list = []   # total number of votes per subsection

    # Counters start at 0 and the first subsection is not counted: it has no
    # predecessor to differ from, so an unchanging spectrogram reports 0 shifts.
    counter_pred_shift, counter_expert_switch = 0, 0
    df_base_vote = None
    prev_votes = None
    current_KL = None

    for _, votes in vote_rows.iterrows():
        n_votes = votes.sum()
        nvote_list.append(n_votes)

        # Normalize votes so they sum up to 1, required for KL divergence
        norm_votes = votes / n_votes

        is_first = prev_votes is None
        if is_first:
            df_base_vote = _vote_frame(norm_votes)
        changed = (not is_first) and (not votes.equals(prev_votes))
        if changed:
            counter_pred_shift += 1
            if n_votes != prev_votes.sum():
                counter_expert_switch += 1

        if is_first or changed:
            # The KL value only needs recomputing when the votes moved.
            # The base frame is copied on every call, as in the original code
            # (presumably `score` modifies its inputs -- TODO confirm).
            current_KL = score(df_base_vote.copy(), _vote_frame(norm_votes), 'id', epsilon=1e-2)

        KL_list.append(current_KL)
        shift_list.append(counter_pred_shift)
        switch_list.append(counter_expert_switch)
        prev_votes = votes

    # Plot the shifts, switches and KL divergence
    x_positions = range(len(shift_list))
    fig, ax1 = plt.subplots()

    ax1.plot(x_positions, shift_list, label='Sum Shifts')
    ax1.plot(x_positions, switch_list, label='Sum Switches', color='orange')
    ax1.plot(x_positions, nvote_list, label='Num Experts', color='red', alpha=0.5)
    ax1.set_xlabel('Spectrogram Index')
    ax1.set_ylabel('Count')

    # KL divergence lives on its own y-axis because its scale differs from the counts.
    ax2 = ax1.twinx()
    ax2.bar(x_positions, KL_list, label='KL vs start', color='green', alpha=0.5)
    ax2.set_ylabel('KL Divergence')

    fig.legend()
    ax1.set_title('Spectrogram ID: ' + str(spectrogram_id))
    plt.show()
    plt.close(fig)  # release the figure; one is created per loop iteration

## Animation
Displays an animation of the vote fractions over time for one spectrogram.

You might want to slow the animation down (increase `interval` in the `FuncAnimation` call below).

Rendering also takes a few minutes for the spectrograms with the most subsections.

In [None]:
def update(frame):
    """Redraw the bar chart of vote fractions for row number `frame` of `spec`.

    Reads the notebook-level `spec` (rows of one spectrogram) and draws on the
    notebook-level `ax`; both must exist before the animation runs.

    Args:
        frame: zero-based row position within `spec`.
    """
    ax.clear()
    row_votes = spec.iloc[frame].iloc[9:15]  # same positional vote slice used throughout the notebook
    fractions = row_votes / row_votes.sum()
    fractions.plot(kind='bar', ax=ax)
    ax.set(xlabel='Vote', ylabel='Vote Fraction', title='Votes', ylim=(0, 1))

In [None]:
# Rank of the spectrogram to animate (0 = the one with the most subsections).
x_highest_spectogram = 0

# Not read by the animation itself; kept so any cell expecting these names still finds them.
counter_pred_shift = 0
counter_rater_switch = 0

animated_id = sorted_samples.index[x_highest_spectogram]
spec = df[df.spectrogram_id == animated_id]

fig, ax = plt.subplots()
# One frame per subsection row; `interval` is the delay between frames.
ani = animation.FuncAnimation(fig, update, frames=len(spec), interval=1, repeat=False)

HTML(ani.to_jshtml())

## Spectogram Average
Shows the average KL divergence between the first subsection and subsequent ones for the top 80 spectrograms (up to subsection 100).

In [None]:
N_TOP_SPECTROGRAMS = 80  # how many of the most-subsectioned spectrograms to average over
MAX_SUBSECTIONS = 100    # only the first this-many subsections of each spectrogram are used


def _kl_vs_first_subsection(vote_rows):
    """Return, per row, the KL divergence of its normalized votes from the first row's.

    Args:
        vote_rows: DataFrame holding the six vote columns of one spectrogram,
            one row per subsection, in file order.

    Returns:
        A list with one `score(...)` result per row of `vote_rows`.
    """
    def as_frame(norm_votes):
        # Single-row frame in the layout handed to `score`.
        return pd.DataFrame({
            'id': [0],
            'seizure_vote': [norm_votes.iloc[0]],
            'lpd_vote': [norm_votes.iloc[1]],
            'gpd_vote': [norm_votes.iloc[2]],
            'lrda_vote': [norm_votes.iloc[3]],
            'grda_vote': [norm_votes.iloc[4]],
            'other_vote': [norm_votes.iloc[5]],
        })

    kl_values = []
    df_base_vote = None
    prev_votes = None
    current_KL = None
    for _, votes in vote_rows.iterrows():
        # Normalize votes so they sum up to 1, required for KL divergence
        norm_votes = votes / votes.sum()
        if prev_votes is None:
            df_base_vote = as_frame(norm_votes)
        if prev_votes is None or not votes.equals(prev_votes):
            # Recompute only when the votes changed; otherwise the previous value still holds.
            # The base frame is copied on every call, as in the original code
            # (presumably `score` modifies its inputs -- TODO confirm).
            current_KL = score(df_base_vote.copy(), as_frame(norm_votes), 'id', epsilon=1e-2)
        kl_values.append(current_KL)
        prev_votes = votes
    return kl_values


KL_list_all_spect = []
# Slicing the index (rather than range(80)) stays in bounds when fewer spectrograms exist.
for spectrogram_id in sorted_samples.index[:N_TOP_SPECTROGRAMS]:
    # Rows beyond MAX_SUBSECTIONS never reach the plot, so they are not scored at all.
    vote_rows = df[df.spectrogram_id == spectrogram_id].iloc[:MAX_SUBSECTIONS, 9:15]
    KL_list_all_spect.append(_kl_vs_first_subsection(vote_rows))

# Spectrograms with fewer than MAX_SUBSECTIONS rows yield shorter lists; pad them with NaN
# so the array is rectangular and each index averages only the spectrograms that reach it.
longest = max((len(kl_values) for kl_values in KL_list_all_spect), default=0)
KL_array = np.full((len(KL_list_all_spect), longest), np.nan)
for row_idx, kl_values in enumerate(KL_list_all_spect):
    KL_array[row_idx, :len(kl_values)] = kl_values

kl_fig, kl_ax = plt.subplots()
kl_ax.plot(list(np.nanmean(KL_array, axis=0)))
kl_ax.set_xlabel('Index')
kl_ax.set_ylabel('Mean KL')
kl_ax.set_title('Mean KL vs Index')
plt.show()

## Display Particular Spectogram

In [None]:
def display_all(df):
    """Display `df` with the pandas row and column display limits temporarily raised to 1000.

    Args:
        df: the object to pass to `display`.
    """
    widened = pd.option_context("display.max_rows", 1000, "display.max_columns", 1000)
    with widened:
        display(df)

In [None]:
# Show every row of one hand-picked spectrogram.
display_all(df.loc[df['spectrogram_id'] == 1266022743])

## Offset Between Subsections

In [None]:
# NOTE(review): these combined keys are only collision-free while the sub-ids stay below 100;
# the notebook text mentions spectrograms with 800+ subsections -- confirm before relying on them.
df['eeg_combined'] = df.eeg_id * 100 + df.eeg_sub_id
df['spec_combined'] = df.spectrogram_id * 100 + df.spectrogram_sub_id

value_counts = df.spectrogram_label_offset_seconds.value_counts()

# Half-open bins [lower, upper) over the offset in seconds; the final bin is open-ended.
range_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 1000, float('inf')]
range_bounds = list(zip(range_edges[:-1], range_edges[1:]))

# Labels and counts come from the same bounds, so they cannot drift apart.
range_labels = [
    f'{lower}+' if upper == float('inf') else f'{lower}-{upper}'
    for lower, upper in range_bounds
]
offsets = value_counts.index
range_counts = [
    value_counts[(offsets >= lower) & (offsets < upper)].sum()
    for lower, upper in range_bounds
]

# Create a new dataframe with the desired ranges
ranges_df = pd.DataFrame({'Range': range_labels, 'Count': range_counts})

# Plot the new dataframe
ranges_df.plot(x='Range', y='Count', kind='bar')

plt.xticks(range(len(ranges_df)), ranges_df['Range'], rotation=45)
plt.ylabel('Count')
plt.title('Value Counts')
plt.show()