In [None]:

import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# IPython shell escape: show the contents of the competition input directory.
! ls ../input/hms-harmful-brain-activity-classification


In [None]:
# Base directory of the competition data.
root = "../input/hms-harmful-brain-activity-classification"

# Label table; leaving `train` as the last expression renders it as the cell output.
train = pd.read_csv(root + "/train.csv")
train


In [None]:
# Distinct values of the `eeg_id` column, de-duplicated through a set.
all_eeg_ids = list(set(train['eeg_id'].tolist()))
len(all_eeg_ids)


In [None]:
# Fraction of non-NaN samples inside each label's 50-second EEG window.
#   quality[k]  -> score for the k-th row of `train` (by position); -1 = not computed
#   total_nans  -> missing samples summed over every window
#   total       -> samples expected over every window
sample_rate = 200                 # rows per second, as implied by the `* 200` offset scaling
window_rows = 50 * sample_rate    # 50-second window

quality = [-1] * len(train)
total_nans = 0
total = 0

# Group the label rows by recording once instead of re-filtering `train` with a
# boolean mask for every eeg_id. `.indices` maps each id to the integer
# *positions* of its rows, so writes into `quality` no longer rely on `train`
# carrying a default RangeIndex.
offset_seconds = train['eeg_label_offset_seconds'].to_numpy()
rows_by_eeg = train.groupby('eeg_id').indices

for eeg_id, row_positions in tqdm(rows_by_eeg.items(), total=len(rows_by_eeg)):
    # One parquet read per recording; every label window of that recording is sliced from it.
    data = pd.read_parquet(f"{root}/train_eegs/{eeg_id}.parquet").to_numpy()
    expected = window_rows * data.shape[1]
    for pos in row_positions:
        start_offset = int(offset_seconds[pos] * sample_rate)
        selected_eeg = data[start_offset : start_offset + window_rows, :]
        valid = int(np.count_nonzero(~np.isnan(selected_eeg)))
        # NumPy slicing silently truncates a window that runs past the end of
        # the recording. Measuring against the *expected* size counts those
        # absent samples as missing (instead of scoring a short window as
        # clean) and avoids dividing by zero when the slice is empty.
        quality[pos] = valid / expected if expected else 0.0
        total_nans += expected - valid
        total += expected


In [None]:
# NaN share, in percent, accumulated in `total_nans` / `total` by the preceding EEG loop.
nan_percent = total_nans / total * 100
("EEG's total Nan% for 50secs", nan_percent)

In [None]:
# Fraction of non-NaN samples inside the central 10 seconds of each label's
# 50-second EEG window (skip 20 s = 4000 rows, then take 10 s = 2000 rows).
# Re-initialises and overwrites `quality`, `total_nans` and `total`.
#   quality[k]  -> score for the k-th row of `train` (by position); -1 = not computed
sample_rate = 200                 # rows per second, as implied by the `* 200` offset scaling
skip_rows = 20 * sample_rate      # 4000 rows skipped before the window starts
window_rows = 10 * sample_rate    # 2000-row (10-second) window

quality = [-1] * len(train)
total_nans = 0
total = 0

# Group the label rows by recording once instead of re-filtering `train` with a
# boolean mask for every eeg_id. `.indices` maps each id to the integer
# *positions* of its rows, so writes into `quality` no longer rely on `train`
# carrying a default RangeIndex.
offset_seconds = train['eeg_label_offset_seconds'].to_numpy()
rows_by_eeg = train.groupby('eeg_id').indices

for eeg_id, row_positions in tqdm(rows_by_eeg.items(), total=len(rows_by_eeg)):
    # One parquet read per recording; every label window of that recording is sliced from it.
    data = pd.read_parquet(f"{root}/train_eegs/{eeg_id}.parquet").to_numpy()
    expected = window_rows * data.shape[1]
    for pos in row_positions:
        start_offset = int(offset_seconds[pos] * sample_rate) + skip_rows
        selected_eeg = data[start_offset : start_offset + window_rows, :]
        valid = int(np.count_nonzero(~np.isnan(selected_eeg)))
        # NumPy slicing silently truncates a window that runs past the end of
        # the recording. Measuring against the *expected* size counts those
        # absent samples as missing (instead of scoring a short window as
        # clean) and avoids dividing by zero when the slice is empty.
        quality[pos] = valid / expected if expected else 0.0
        total_nans += expected - valid
        total += expected

In [None]:
# NaN share, in percent, accumulated in `total_nans` / `total` by the preceding EEG loop.
nan_percent = total_nans / total * 100
("EEG's total Nan% for 10secs", nan_percent)

In [None]:
# Store the per-row EEG scores on the label table. `quality` holds whatever the
# most recently executed EEG loop produced (the central 10-second window when
# the cells run top to bottom).
train['egg_quality'] = pd.Series(quality, index=train.index)

In [None]:
# Histogram of the per-row EEG quality score stored in `train['egg_quality']`.
# The axis label and title previously read "Egg"; they now say "EEG", matching
# the labels used by the EEG-vs-spectrogram scatter plot further down.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(train['egg_quality'], bins=20, color='skyblue')
ax.set_xlabel('EEG Quality')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of EEG Quality')
plt.show()

In [None]:
# Summary statistics of the EEG quality column.
train.loc[:, 'egg_quality'].describe()

In [None]:
# Distinct values of the `spectrogram_id` column, de-duplicated through a set.
all_spectrogram_ids = list(set(train['spectrogram_id'].tolist()))
len(all_spectrogram_ids)

In [None]:
# Fraction of non-NaN cells inside each label's 300-row spectrogram window
# (reported as 600 seconds, i.e. one row per 2 seconds as implied by the `/2`
# offset scaling).
#   sepc_quality[k] -> score for the k-th row of `train` (by position); -1 = not computed
#   total_nans      -> missing cells summed over every window
#   total           -> cells expected over every window
seconds_per_row = 2
window_rows = 300

sepc_quality = [-1] * len(train)
total_nans = 0
total = 0

# Group the label rows by spectrogram once instead of re-filtering `train` with
# a boolean mask for every id. `.indices` maps each id to the integer
# *positions* of its rows, so writes into `sepc_quality` no longer rely on
# `train` carrying a default RangeIndex.
offset_seconds = train['spectrogram_label_offset_seconds'].to_numpy()
rows_by_spectrogram = train.groupby('spectrogram_id').indices

for spectrogram_id, row_positions in tqdm(rows_by_spectrogram.items(), total=len(rows_by_spectrogram)):
    spec = pd.read_parquet(f"{root}/train_spectrograms/{spectrogram_id}.parquet")
    # NOTE(review): the competition's spectrogram files are understood to carry
    # a `time` column of row timestamps alongside the frequency bins - confirm
    # against the dataset schema. It is not signal, so it is dropped when
    # present to keep it from diluting the NaN ratio; files without such a
    # column are left untouched.
    data = spec.drop(columns='time', errors='ignore').to_numpy()
    expected = window_rows * data.shape[1]
    for pos in row_positions:
        start_offset = int(offset_seconds[pos] / seconds_per_row)
        selected_spec = data[start_offset : start_offset + window_rows, :]
        valid = int(np.count_nonzero(~np.isnan(selected_spec)))
        # NumPy slicing silently truncates a window that runs past the end of
        # the spectrogram. Measuring against the *expected* size counts those
        # absent cells as missing (instead of scoring a short window as clean)
        # and avoids dividing by zero when the slice is empty.
        sepc_quality[pos] = valid / expected if expected else 0.0
        total_nans += expected - valid
        total += expected

In [None]:
# NaN share, in percent, accumulated in `total_nans` / `total` by the preceding spectrogram loop.
nan_percent = total_nans / total * 100
("Spectrogram's total Nan% for 600secs", nan_percent)


In [None]:
# Fraction of non-NaN cells inside a 6-row slice of each label's spectrogram
# window, starting 147 rows after the label offset (rows 147-152 of the 300-row
# window, i.e. centred on its midpoint; reported as 12 seconds).
# Re-initialises and overwrites `sepc_quality`, `total_nans` and `total`.
#   sepc_quality[k] -> score for the k-th row of `train` (by position); -1 = not computed
seconds_per_row = 2    # implied by the `/2` offset scaling
skip_rows = 147
window_rows = 6

sepc_quality = [-1] * len(train)
total_nans = 0
total = 0

# Group the label rows by spectrogram once instead of re-filtering `train` with
# a boolean mask for every id. `.indices` maps each id to the integer
# *positions* of its rows, so writes into `sepc_quality` no longer rely on
# `train` carrying a default RangeIndex.
offset_seconds = train['spectrogram_label_offset_seconds'].to_numpy()
rows_by_spectrogram = train.groupby('spectrogram_id').indices

for spectrogram_id, row_positions in tqdm(rows_by_spectrogram.items(), total=len(rows_by_spectrogram)):
    spec = pd.read_parquet(f"{root}/train_spectrograms/{spectrogram_id}.parquet")
    # NOTE(review): the competition's spectrogram files are understood to carry
    # a `time` column of row timestamps alongside the frequency bins - confirm
    # against the dataset schema. It is not signal, so it is dropped when
    # present to keep it from diluting the NaN ratio; files without such a
    # column are left untouched.
    data = spec.drop(columns='time', errors='ignore').to_numpy()
    expected = window_rows * data.shape[1]
    for pos in row_positions:
        start_offset = int(offset_seconds[pos] / seconds_per_row) + skip_rows
        selected_spec = data[start_offset : start_offset + window_rows, :]
        valid = int(np.count_nonzero(~np.isnan(selected_spec)))
        # NumPy slicing silently truncates a window that runs past the end of
        # the spectrogram. Measuring against the *expected* size counts those
        # absent cells as missing (instead of scoring a short window as clean)
        # and avoids dividing by zero when the slice is empty.
        sepc_quality[pos] = valid / expected if expected else 0.0
        total_nans += expected - valid
        total += expected

In [None]:
# NaN share, in percent, accumulated in `total_nans` / `total` by the preceding spectrogram loop.
nan_percent = total_nans / total * 100
("Spectrogram's total Nan% for 12secs", nan_percent)

In [None]:
# Store the per-row spectrogram scores on the label table, then summarise them.
# `sepc_quality` holds whatever the most recently executed spectrogram loop produced.
train['spec_quality'] = pd.Series(sepc_quality, index=train.index)
train.loc[:, 'spec_quality'].describe()

In [None]:
# Histogram of the per-row spectrogram quality score, drawn through the
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(train['spec_quality'], bins=20, color='skyblue')
ax.set_xlabel('Spec Quality')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Spec Quality')
plt.show()

In [None]:
# Label rows whose spectrogram quality score is anything other than exactly 1.
train.loc[train['spec_quality'].ne(1)]

In [None]:
# Label rows whose EEG quality score is anything other than exactly 1.
train.loc[train['egg_quality'].ne(1)]

In [None]:
# Combine the EEG and spectrogram scores per row: best, worst and mean.
# Column-wise operations replace the row-by-row `apply(axis=1)` lambdas; the
# arithmetic is unchanged, so the resulting values are the same.
quality_pair = train[['egg_quality', 'spec_quality']]
train['max_quality'] = quality_pair.max(axis=1)
train['min_quality'] = quality_pair.min(axis=1)
# Same `(eeg + spec) / 2` expression as before, evaluated on whole columns.
train['avg_quality'] = (train['egg_quality'] + train['spec_quality']) / 2

In [None]:
# Summary statistics of the per-row maximum of the two quality scores.
train.loc[:, 'max_quality'].describe()

In [None]:
# One boxplot per combined score (max / min / avg), side by side. Each title
# also reports how many rows score anything other than exactly 1, as a count
# and as a percentage of all rows.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

n_rows = len(train)
for ax, prefix in zip(axes, ('Max', 'Min', 'Avg')):
    values = train[f'{prefix.lower()}_quality']
    n_imperfect = len(train[values != 1])
    ax.boxplot(values)
    ax.set_xlabel(f'{prefix} Quality')
    ax.set_ylabel('Value')
    ax.set_title(f'{prefix} Quality - {n_imperfect} - {n_imperfect/n_rows*100:.2f}%')
plt.show()


In [None]:
# Scatter of EEG quality (x) against spectrogram quality (y), one point per label row.
fig, ax = plt.subplots()
ax.scatter(train['egg_quality'], train['spec_quality'])
ax.set_xlabel('EEG Quality')
ax.set_ylabel('Spectrogram Quality')
ax.set_title('EEG Quality vs Spectrogram Quality')
plt.show()

In [None]:
# Label rows where at least one of the two quality scores is not exactly 1.
train.loc[train['min_quality'].ne(1)]

In [None]:
# Keep only the rows whose worst quality score is exactly 1, renumbered from 0.
is_clean = train['min_quality'].eq(1)
final_train = train.loc[is_clean].reset_index(drop=True)
final_train.shape


In [None]:
# Write the filtered label table to disk without the index column.
output_path = "/kaggle/working/train_clean.csv"
final_train.to_csv(output_path, index=False)