In [None]:
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Apply seaborn's "whitegrid" style to the figures drawn below.
sns.set(style="whitegrid")

# Load the competition's train.csv metadata into a DataFrame.
train_csv_path = "/kaggle/input/hms-harmful-brain-activity-classification/train.csv"
train = pd.read_csv(train_csv_path)

# Preview the first rows of the table.
train.head()

**train.csv** Metadata for the train set. The expert annotators reviewed 50 second long EEG samples plus matched spectrograms covering a 10 minute window centered at the same time and labeled the central 10 seconds. Many of these samples overlapped and have been consolidated. train.csv provides the metadata that allows you to extract the original subsets that the raters annotated.

**eeg_id** - A unique identifier for the entire EEG recording.

**eeg_sub_id** - An ID for the specific 50 second long subsample this row's labels apply to.

**eeg_label_offset_seconds** - The time between the beginning of the consolidated EEG and this subsample.

**spectrogram_id** - A unique identifier for the entire EEG recording.

**spectrogram_sub_id** - An ID for the specific 10 minute subsample this row's labels apply to.

**spectrogram_label_offset_seconds** - The time between the beginning of the consolidated spectrogram and this subsample.

**label_id** - An ID for this set of labels.

**patient_id** - An ID for the patient who donated the data.

**expert_consensus** - The consensus annotator label. Provided for convenience only.

**[seizure/lpd/gpd/lrda/grda/other]_vote** - The count of annotator votes for a given brain activity class. The full names of the activity classes are as follows: lpd: lateralized periodic discharges, gpd: generalized periodic discharges, lrda: lateralized rhythmic delta activity, and grda: generalized rhythmic delta activity. Detailed explanations of these patterns are available here.


## Brain activity notebook series

### [EEGS 10–20 system](https://www.kaggle.com/code/seshurajup/eegs-10-20-system)
A better understanding of the EEG 10–20 system
### [Missing Eeg_ids Train.csv vs train_eegs [Resolved]](https://www.kaggle.com/code/seshurajup/missing-eeg-ids-in-train-csv-vs-train-eegs-parquet)
Extra training EEGs [Resolved], as we can ignore them
### [EDA train.csv](https://www.kaggle.com/code/seshurajup/eda-train-csv)
Detailed analysis of the train.csv
### [Eegs Pairing Analysis & Features](https://www.kaggle.com/code/seshurajup/eegs-pairing-analysis-features)
Pairing features analysis and build features
### [Eegs Target Analysis - Correct way to merge target](https://www.kaggle.com/code/seshurajup/eegs-target-analysis-correct-way-to-merge-target)
How to choose the target votes for training
### [Eegs Train Split (CV)](https://www.kaggle.com/seshurajup/eegs-train-splits-cv)
generate better train split without patient_id overlap

#### **Upvote my work if it is useful**

In [None]:
# Number of rows and columns in the metadata table.
rows = train.shape[0]
columns = train.shape[1]
(rows, columns)

In [None]:
# DataFrame.info() writes its report to a stream and returns None, so binding
# its return value leaves `train_info` empty. Capture the report in a buffer
# so that `train_info` holds the actual text, then print it as before.
info_buffer = io.StringIO()
train.info(buf=info_buffer)
train_info = info_buffer.getvalue()
print(train_info, end="")

In [None]:
# Summary statistics of the table from DataFrame.describe() with default arguments.
train.describe()

In [None]:
# Pick out the object/category-typed columns and summarise only those.
categorical_frame = train.select_dtypes(include=['object', 'category'])
categorical_columns = categorical_frame.columns
categorical_summary = train.loc[:, categorical_columns].describe()
categorical_summary

In [None]:
# Series.unique() already yields distinct values in order of first appearance;
# routing them through set() was redundant and made the displayed order vary
# between kernel restarts (string hashing is randomised per process).
train['expert_consensus'].unique().tolist()

In [None]:
# Bar chart of how many rows carry each expert_consensus label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='expert_consensus', ax=ax)
ax.set_title('Distribution of Expert Consensus')
ax.set_xlabel('Expert Consensus')
ax.set_ylabel('Count')
# Tilt the class names on the x axis by 45 degrees.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Histogram of the patient_id values over 30 bins (no density curve).
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='patient_id', bins=30, kde=False, ax=ax)
ax.set_title('Distribution of Patient ID')
ax.set_xlabel('Patient ID')
ax.set_ylabel('Count')
plt.show()

In [None]:
# The six vote-count columns used as targets throughout the notebook.
targets = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

# Six panels fit a 2x3 grid exactly; the previous 2x4 layout left two empty
# slots and squeezed every histogram into a quarter of the figure width.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, column in zip(axes.flat, targets):
    sns.histplot(train[column], kde=False, bins=30, ax=ax)
    ax.set_title(column)
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the vote columns, drawn as an annotated heatmap.
correlation_targets = train[targets].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_targets, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix of Vote Columns')
plt.show()

# One violin plot per vote column, split by the expert_consensus label,
# laid out on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(12, 10))
for ax, column in zip(axes.ravel(), targets):
    sns.violinplot(data=train, x='expert_consensus', y=column, ax=ax)
    ax.set_title(f'Distribution of {column} by Expert Consensus')

fig.tight_layout()
plt.show()

In [None]:
# Scatter/histogram matrix over every pair of vote columns.
vote_columns = train.loc[:, targets]
sns.pairplot(vote_columns)
# pairplot leaves its figure current, so the title is attached to that figure;
# y=1.02 places it just above the grid.
plt.gcf().suptitle('Pairwise Relationships of Target Votes', y=1.02)
plt.show()

In [None]:
# Offset column -> human-readable label used in the plot title and x axis.
offset_labels = {
    'eeg_label_offset_seconds': 'EEG Label Offset Seconds',
    'spectrogram_label_offset_seconds': 'Spectrogram Label Offset Seconds',
}

# Summary statistics for both offset columns (shown at the end of the cell).
offset_stats = train[list(offset_labels)].describe()

# One histogram with a density curve per offset column.
for offset_column, offset_label in offset_labels.items():
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(data=train, x=offset_column, bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {offset_label}')
    ax.set_xlabel(offset_label)
    ax.set_ylabel('Count')
    plt.show()

offset_stats

In [None]:
# Number of distinct eeg_id values (missing values, if any, count as one value,
# matching len(unique())).
total_eegs = train['eeg_id'].nunique(dropna=False)
total_eegs

In [None]:
# Distinct EEG label offsets in ascending order.
distinct_eeg_offsets = train['eeg_label_offset_seconds'].unique()
all_eeg_label_offset_seconds = sorted(distinct_eeg_offsets)

# Show how many distinct offsets exist plus the five smallest and five largest.
(
    len(all_eeg_label_offset_seconds),
    str(all_eeg_label_offset_seconds[:5]),
    str(all_eeg_label_offset_seconds[-5:]),
)

In [None]:
# Distinct spectrogram label offsets in ascending order.
distinct_spectrogram_offsets = train['spectrogram_label_offset_seconds'].unique()
all_spectrogram_label_offset_seconds = sorted(distinct_spectrogram_offsets)

# Show how many distinct offsets exist plus the five smallest and five largest.
(
    len(all_spectrogram_label_offset_seconds),
    str(all_spectrogram_label_offset_seconds[:5]),
    str(all_spectrogram_label_offset_seconds[-5:]),
)

In [None]:
# Number of distinct sub-ids recorded under each parent id.
eeg_sub_id_count_per_eeg_id = train.groupby('eeg_id')['eeg_sub_id'].nunique()
spectrogram_sub_id_count_per_spectrogram_id = (
    train.groupby('spectrogram_id')['spectrogram_sub_id'].nunique()
)

# Draw the same histogram (50 bins plus density curve) for both count series.
sub_id_counts = (
    ('EEG', eeg_sub_id_count_per_eeg_id),
    ('Spectrogram', spectrogram_sub_id_count_per_spectrogram_id),
)
for kind_label, counts in sub_id_counts:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(counts, bins=50, kde=True, ax=ax)
    ax.set_title(f'{kind_label} Sub-ID Count per {kind_label} ID')
    ax.set_xlabel(f'Count of {kind_label} Sub-ID per {kind_label} ID')
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Total votes of each type, summed over all rows sharing an expert_consensus label.
vote_counts_by_consensus = train.groupby('expert_consensus')[targets].sum()

# DataFrame.plot opens a figure of its own unless it is given an Axes, which
# previously left the 12x8 figure empty and drew the bars at the default size.
# Creating the Axes first and passing it in makes the figsize actually apply.
fig, ax = plt.subplots(figsize=(12, 8))
vote_counts_by_consensus.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Overall Vote Counts by Expert Consensus')
ax.set_xlabel('Expert Consensus')
ax.set_ylabel('Total Votes')
plt.xticks(rotation=45)
ax.legend(title='Vote Types')
plt.show()

In [None]:
# Sum the votes at each EEG offset, then accumulate them in offset order.
votes_per_eeg_offset = train.groupby('eeg_label_offset_seconds')[targets].sum()
cumulative_votes = votes_per_eeg_offset.cumsum().reset_index()

# One line per vote type against the offset.
fig, ax = plt.subplots(figsize=(12, 8))
eeg_offsets = cumulative_votes['eeg_label_offset_seconds']
for column in targets:
    ax.plot(eeg_offsets, cumulative_votes[column], label=column)

ax.set_title('Vote Counts Over EEG Label Offset Seconds')
ax.set_xlabel('EEG Label Offset Seconds')
ax.set_ylabel('Total Votes')
ax.legend()
plt.show()

In [None]:
# Sum the votes at each spectrogram offset, then accumulate them in offset order.
cumulative_votes = train.groupby('spectrogram_label_offset_seconds')[targets].sum().cumsum().reset_index()

# One line per vote type against the spectrogram offset.
plt.figure(figsize=(12, 8))
for column in targets:
    plt.plot(cumulative_votes['spectrogram_label_offset_seconds'], cumulative_votes[column], label=column)

plt.title('Vote Counts Over Spectrogram Offset Seconds')
# The x axis holds spectrogram offsets; the label was previously copy-pasted
# from the EEG plot and named the wrong column.
plt.xlabel('Spectrogram Label Offset Seconds')
plt.ylabel('Total Votes')
plt.legend()
plt.show()

In [None]:
# Display the cumulative vote table; when the notebook is run top to bottom this
# is the spectrogram-offset version assigned in the preceding cell.
cumulative_votes

In [None]:
# Order rows by recording and sub-id, then take the row-to-row change in
# offset inside each eeg_id (the first row of every group has no predecessor
# and becomes NaN).
sorted_data = train.sort_values(by=['eeg_id', 'eeg_sub_id']).assign(
    offset_difference=lambda frame: frame.groupby('eeg_id')['eeg_label_offset_seconds'].diff()
)

# Keep only the defined differences.
offset_differences = sorted_data['offset_difference'].dropna()

# Summary statistics of the differences (computed here, not displayed by this cell).
offset_difference_stats = offset_differences.describe()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(offset_differences, bins=30, kde=True, ax=ax)
ax.set_title('Offset Differences within EEG IDs')
ax.set_xlabel('Offset Difference (Seconds)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Order rows by spectrogram and sub-id, then take the row-to-row change in
# offset inside each spectrogram_id (the first row of every group has no
# predecessor and becomes NaN).
sorted_data = train.sort_values(by=['spectrogram_id', 'spectrogram_sub_id']).assign(
    offset_difference=lambda frame: frame.groupby('spectrogram_id')['spectrogram_label_offset_seconds'].diff()
)

# Keep only the defined differences.
offset_differences = sorted_data['offset_difference'].dropna()

# Summary statistics of the differences (computed here, not displayed by this cell).
offset_difference_stats = offset_differences.describe()

fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(offset_differences, bins=30, kde=True, ax=ax)
ax.set_title('Offset Differences within Spectrogram IDs')
ax.set_xlabel('Offset Difference (Seconds)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Draw up to 20 *distinct* patients. Sampling the raw patient_id column picks
# rows, so the same patient could be drawn more than once and patients with
# many rows were over-represented.
unique_patients = train['patient_id'].drop_duplicates()
sample_patients = unique_patients.sample(
    n=min(20, len(unique_patients)), random_state=1
).values
sample_data = train[train['patient_id'].isin(sample_patients)]

# One box plot per vote type, with one box per sampled patient.
for vote_type in targets:
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.boxplot(x='patient_id', y=vote_type, data=sample_data, ax=ax)
    ax.set_title(f'Distribution of {vote_type} for Selected Patients')
    ax.set_xlabel('Patient ID')
    ax.set_ylabel(f'{vote_type} Count')
    plt.show()
    # Release the figure so repeated plots do not accumulate in memory.
    plt.close(fig)

In [None]:
# One vote-correlation heatmap per sampled patient. The figure that used to be
# opened before the loop was never drawn on, so it is gone; pd.unique keeps a
# patient from being plotted twice in case sample_patients holds repeated ids.
for patient_id in pd.unique(sample_patients):
    patient_data = train[train['patient_id'] == patient_id]
    correlation_matrix = patient_data[targets].corr()

    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title(f'Correlation of Votes for Patient ID {patient_id}')
    plt.show()
    # Release the figure so repeated plots do not accumulate in memory.
    plt.close(fig)

In [None]:
# Per-patient vote totals for each vote type.
votes_by_patient = train.groupby('patient_id')[targets].sum()

# Turn each patient's totals into shares of that patient's overall vote count,
# then average the shares across patients.
total_votes_per_pat = votes_by_patient.sum(axis=1)
normalized_votes = votes_by_patient.div(total_votes_per_pat, axis=0)
mean_vote_ratio = normalized_votes.mean()
print(mean_vote_ratio)

## **It's @cdeotte's idea** - https://www.kaggle.com/competitions/hms-harmful-brain-activity-classification/discussion/467021

In [None]:
# Round the ratios to 6 decimals first, then push whatever is missing from 1
# onto other_vote. Previously the gap was measured on rounded values but added
# to the unrounded ratio, so neither the rounded nor the raw values summed to 1.
# Rounding up front also makes re-running this cell leave the result unchanged
# (the gap is then ~0).
mean_vote_ratio = mean_vote_ratio.round(6)
gap = 1 - mean_vote_ratio.sum()
print(gap)
mean_vote_ratio['other_vote'] += gap

In [None]:
# Sanity check: the ratios, each rounded to 5 decimals, added together.
sum(round(ratio, 5) for ratio in mean_vote_ratio.tolist())

In [None]:
# Display the per-class mean vote ratios as they stand after the cells above.
mean_vote_ratio

In [None]:
# Start from the sample submission and fill every target column with the same
# constant: that class's mean vote ratio.
sample_submission_path = "/kaggle/input/hms-harmful-brain-activity-classification/sample_submission.csv"
sub = pd.read_csv(sample_submission_path)
sub = sub.assign(**{target: mean_vote_ratio[target] for target in targets})
sub

In [None]:
# Save the submission frame as CSV without the index column.
submission_path = "/kaggle/working/submission.csv"
sub.to_csv(submission_path, index=False)