In [None]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from scipy.signal import spectrogram
import matplotlib as mpl
from matplotlib import cm
import matplotlib.patches as patches
import matplotlib.pyplot as plt
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [None]:
# Load the competition metadata tables (train / test) plus one sample EEG
# recording and one sample spectrogram for inspection.
df =pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
test=pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/test.csv')
eeg = pd.read_parquet('/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/1484166292.parquet')
# NOTE(review): this rebinds the name `spectrogram`, hiding the
# `scipy.signal.spectrogram` function imported in the first cell.
spectrogram = pd.read_parquet('/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/1000086677.parquet')

In [None]:
# Display the training metadata table read from train.csv.
df

In [None]:
# Count how many training rows carry each expert-consensus label.
expert_consensus = (
    df["expert_consensus"]
    .value_counts()
    .reset_index()
)
expert_consensus.columns = ["consensus", "frequency"]

# One bar per label; errwidth=0 keeps error-bar lines from being drawn.
plt.figure(figsize=(20, 10))
figure = sns.barplot(x="consensus", y="frequency", data=expert_consensus, errwidth=0)
figure.set_title('Expert Consensus - Distribution', weight="bold", size=20)
figure.set_xlabel("Consensus", size=18, weight="bold")
figure.set_ylabel("Count", size=18, weight="bold")

# Write the count on top of every bar.
for bar_group in figure.containers:
    figure.bar_label(bar_group)




# **Create New Features**
We will create a new feature named `pattern`, based on the experts' votes.
* **idealized** = High levels of agreement
* **proto** = 50% of the experts give the label 'other_vote' and 50% give one of the remaining five labels
* **edge** = Cases where experts are approximately split between 2 of the 5 named patterns
* **unidentified** = Cases that match none of the categories above

In [None]:
# The vote columns are taken to be the last six columns of the raw table.
target_col = df.columns[-6:].tolist()

# Add up the votes of all rows that share the same EEG, spectrogram and patient.
votes_by_recording = df.groupby(["eeg_id", "spectrogram_id", "patient_id"])[target_col]
train_df = votes_by_recording.sum().reset_index()

def get_pattern(df, vote_cols=None, max_threshold=0.75, equal_threshold=0.4):
    """Label every row of ``df`` by how strongly the expert votes agree.

    Categories are evaluated in this order (first match wins):

    * ``'idealized'``    - the top label holds at least ``max_threshold`` of the votes.
    * ``'proto'``        - ``other_vote`` holds at least ``equal_threshold`` of the
      votes and so does the top label.
    * ``'edge'``         - nobody voted ``other_vote`` and the top label holds at
      least ``equal_threshold`` of the votes.
    * ``'unidentified'`` - everything else, including rows with zero total votes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the vote columns, one of which is ``other_vote``.
    vote_cols : list of str, optional
        Columns holding the vote counts. Defaults to every column of ``df``
        whose name ends in ``_vote``, so the function no longer depends on a
        module-level variable or on column position.
    max_threshold, equal_threshold : float, optional
        Vote-share cut-offs described above.

    Returns
    -------
    list of str
        One category per row, in row order.
    """
    if vote_cols is None:
        vote_cols = [col for col in df.columns if str(col).endswith('_vote')]

    votes = df[vote_cols]
    total_vote = votes.sum(axis=1)
    # 0 / 0 yields NaN; every comparison with NaN is False, so such rows fall
    # through to 'unidentified'.
    perc = votes.max(axis=1) / total_vote
    other_share = df['other_vote'] / total_vote

    # np.select picks the first condition that holds for each row.
    conditions = [
        (perc >= max_threshold).to_numpy(),
        ((other_share >= equal_threshold) & (perc >= equal_threshold)).to_numpy(),
        ((df['other_vote'] == 0) & (perc >= equal_threshold)).to_numpy(),
    ]
    labels = ['idealized', 'proto', 'edge']
    return np.select(conditions, labels, default='unidentified').tolist()
# Store the agreement category of every aggregated row in a new `pattern`
# column (this modifies train_df in place).
train_df['pattern'] = get_pattern(train_df)


In [None]:
# Count how many aggregated rows fall into each agreement pattern.
pattern = (
    train_df["pattern"]
    .value_counts()
    .reset_index()
)
pattern.columns = ["pattern", "frequency"]

# One bar per pattern; errwidth=0 keeps error-bar lines from being drawn.
plt.figure(figsize=(20, 10))
figure = sns.barplot(x="pattern", y="frequency", data=pattern, errwidth=0)
figure.set_title('Pattern - Distribution', weight="bold", size=20)
figure.set_xlabel("pattern", size=18, weight="bold")
figure.set_ylabel("Count", size=18, weight="bold")

# Write the count on top of every bar.
for bar_group in figure.containers:
    figure.bar_label(bar_group)

In [None]:
# Display the training metadata table again.
# NOTE(review): `df` is unchanged since it was loaded; the new `pattern`
# column lives on `train_df` - confirm which frame was meant to be shown here.
df

# Relationship between the patient Id ,eeg_id and spectrogram_id

Number of unique patient_id values = 1950

For each patient_id, there is more than one eeg_id and more than one spectrogram_id.

As an example, take patient_id == 30631:

In [None]:
# Every label row recorded for one example patient.
pat_id = df.loc[df['patient_id'] == 30631]
print('the length :', len(pat_id))

# Number of distinct EEG recordings and spectrograms for that patient.
eeg_count = pat_id['eeg_id'].nunique()
spec_count = pat_id['spectrogram_id'].nunique()
print('eeg_count:', eeg_count)
print('spec_count:', spec_count)

# How many spectrograms does one particular EEG id point to?
eeg_id = pat_id.loc[pat_id['eeg_id'] == 1270973624]
unique_spec_id_wrt_eegid = eeg_id['spectrogram_id'].nunique()
print('total counts of spectogram_id for a unique eeg_id:', unique_spec_id_wrt_eegid)

# And the reverse: how many EEG ids share one particular spectrogram?
spec_id = pat_id.loc[pat_id['spectrogram_id'] == 764146759]
unique_eeg_id_wrt_specid = spec_id['eeg_id'].nunique()
print('total counts of eeg_id for a unique spectogram_id:', unique_eeg_id_wrt_specid)


The code above leads to the following conclusions:
* One spectrogram (like the case above - 764146759) can be part of multiple EEG recordings, all from the same patient_id.
* One EEG id (like the case above - 1270973624) has a single spectrogram id, from the same patient_id.

# Spectrogram

In [None]:
# Read one training spectrogram by id and display it.
# NOTE(review): this is the same file already read into `spectrogram` in the
# data-loading cell.
spec_id = '1000086677'
spec_path = '/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'
spectrogram = pd.read_parquet(spec_path + spec_id+'.parquet')
spectrogram

# **Visualize the Spectrogram on Pattern = 'idealized'**

In [None]:


def spec_dict(idealized_df, n=1):
    """Pick, for every vote column, the spectrograms with the most votes.

    Parameters
    ----------
    idealized_df : pandas.DataFrame
        Frame holding a ``spectrogram_id`` column and the six ``*_vote``
        columns. Despite the parameter name, any pattern subset can be passed.
    n : int, optional
        How many top rows to keep per vote column (default 1, the previous
        hard-coded value).

    Returns
    -------
    dict
        Maps each vote column name to a NumPy array with the
        ``spectrogram_id`` values of the ``n`` rows that have the highest
        count in that column. The array is empty when the frame has no rows.
    """
    vote_cols = (
        "seizure_vote",
        "lpd_vote",
        "gpd_vote",
        "lrda_vote",
        "grda_vote",
        "other_vote",
    )

    # The result lives in its own local name so the function name is not
    # shadowed inside its own body.
    top_specs = {}
    for key in vote_cols:
        col_idx = idealized_df[key].sort_values(ascending=False).head(n).index
        top_specs[key] = idealized_df.loc[col_idx, "spectrogram_id"].values

    return top_specs

    

def idealized_visualization(vote, spec_id, pattern,
                            spec_path='/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/'):
    """Plot the log-scaled spectrogram stored under ``spec_id``.

    Parameters
    ----------
    vote : str
        Vote column the spectrogram was selected for; used in the title only.
    spec_id : str or int
        Spectrogram id, i.e. the parquet file name without its extension.
    pattern : str
        Agreement pattern name; used in the title only.
    spec_path : str, optional
        Directory (with trailing slash) holding the spectrogram parquet files.
        Defaults to the location that was previously hard-coded.
    """
    # str() lets callers pass the numeric id straight from the dataframe.
    spec_data = pd.read_parquet(spec_path + str(spec_id) + '.parquet')

    # Keep only the frequency columns: plotting the time axis as if it were a
    # frequency bin adds a spurious row to the image.
    # NOTE(review): assumes the non-frequency column is named 'time' (the
    # feature-name cell skips columns[0] for the same reason) - confirm.
    spec_values = spec_data.drop(columns='time', errors='ignore')

    fig, axes = plt.subplots(1, 1, figsize=(6, 3))

    # Transpose so that time runs along x and the frequency columns along y.
    axes.imshow(np.log(spec_values.T))
    axes.set_title(f'{pattern} pattern {vote} id {spec_id}', size=10)
    axes.set_xlabel('Time', size=10)
    axes.set_ylabel('(Hz)', size=10)
    axes.tick_params(axis='both', which='both', labelsize=10)

    plt.show()
    

idealized_df = train_df[train_df['pattern']=='idealized']

# Store the result under its own name: assigning it back to `spec_dict` would
# replace the helper function with a dict and make it uncallable afterwards.
idealized_specs = spec_dict(idealized_df)

for vote, spec_ids in idealized_specs.items():
    # Nothing to plot when the subset has no rows.
    if len(spec_ids) == 0:
        continue
    idealized_visualization(vote, str(spec_ids[0]), 'idealized')
    



# Spectrogram Visualization on pattern = Edge

In [None]:

edge_df = train_df[train_df['pattern']=='edge']
N = 1

# For every vote column, remember the spectrogram(s) with the most votes.
# The result gets its own name so the `spec_dict` helper function is not
# overwritten by a dict.
edge_specs = {}
for key in ("seizure_vote", "lpd_vote", "gpd_vote",
            "lrda_vote", "grda_vote", "other_vote"):
    col_idx = edge_df[key].sort_values(ascending=False).head(N).index
    edge_specs[key] = edge_df.loc[col_idx, "spectrogram_id"].values

for vote, spec_ids in edge_specs.items():
    # Nothing to plot when no row carries this pattern.
    if len(spec_ids) == 0:
        continue
    idealized_visualization(vote, str(spec_ids[0]), 'edge')

# Spectrogram visualization pattern ='proto'

In [None]:

proto_df = train_df[train_df['pattern']=='proto']
N = 1

# For every vote column, remember the spectrogram(s) with the most votes.
# The result gets its own name so the `spec_dict` helper function is not
# overwritten by a dict.
proto_specs = {}
for key in ("seizure_vote", "lpd_vote", "gpd_vote",
            "lrda_vote", "grda_vote", "other_vote"):
    col_idx = proto_df[key].sort_values(ascending=False).head(N).index
    proto_specs[key] = proto_df.loc[col_idx, "spectrogram_id"].values

for vote, spec_ids in proto_specs.items():
    # Nothing to plot when no row carries this pattern.
    if len(spec_ids) == 0:
        continue
    idealized_visualization(vote, str(spec_ids[0]), 'proto')

# Spectrogram Visualization Pattern = 'unidentified'

In [None]:

unidentified_df = train_df[train_df['pattern']=='unidentified']
N = 1

# For every vote column, remember the spectrogram(s) with the most votes.
# The result gets its own name so the `spec_dict` helper function is not
# overwritten by a dict.
unidentified_specs = {}
for key in ("seizure_vote", "lpd_vote", "gpd_vote",
            "lrda_vote", "grda_vote", "other_vote"):
    col_idx = unidentified_df[key].sort_values(ascending=False).head(N).index
    unidentified_specs[key] = unidentified_df.loc[col_idx, "spectrogram_id"].values

for vote, spec_ids in unidentified_specs.items():
    # Nothing to plot when no row carries this pattern.
    if len(spec_ids) == 0:
        continue
    idealized_visualization(vote, str(spec_ids[0]), 'unidentified')

In [None]:
# Load the pre-built spectrogram collection; `.item()` unwraps the loaded
# array into the Python object it stores (used as a dict keyed by spectrogram
# id further down).
# NOTE(review): allow_pickle=True unpickles the file, which can run arbitrary
# code - only load this from a dataset you trust.
spect_data = np.load("/kaggle/input/brain-spectrograms/specs.npy", allow_pickle=True).item()


In [None]:
# Re-read one spectrogram file only to recover its column labels.
spectrogram = pd.read_parquet('/kaggle/input/hms-harmful-brain-activity-classification/train_spectrograms/1000086677.parquet')
# Every column except the first is treated as a feature name.
# NOTE(review): presumably the first column is the time axis - confirm.
spect_feature_name =spectrogram.columns[1:]

In [None]:
fe_data = {}

# Output column labels, feature-major, in the same order as before:
# <feature>_mean, <feature>_min, <feature>_max, <feature>_std, next feature, ...
n_features = len(spect_feature_name)
stat_columns = [
    f"{feature}_{stat}"
    for feature in spect_feature_name
    for stat in ("mean", "min", "max", "std")
]

for spect_id, data in spect_data.items():
    # NOTE(review): column k of `data` is assumed to correspond to
    # spect_feature_name[k] - confirm the arrays do not carry a leading
    # time column.
    if data.shape[1] < n_features:
        raise IndexError(
            f"spectrogram {spect_id} has {data.shape[1]} columns, "
            f"expected at least {n_features}"
        )
    feature_block = data[:, :n_features]

    # One reduction per statistic over all columns at once, instead of four
    # Python-level calls per column. Stacking on axis 1 and flattening gives
    # the feature-major order of `stat_columns`.
    stats = np.stack(
        [
            feature_block.mean(axis=0),
            feature_block.min(axis=0),
            feature_block.max(axis=0),
            feature_block.std(axis=0),
        ],
        axis=1,
    ).ravel()
    fe_data[spect_id] = dict(zip(stat_columns, stats))

# convert to df: one row per spectrogram, with its id in the `index` column
fe_data_df = pd.DataFrame.from_dict(fe_data, orient='index').reset_index()

# append target labels
# target_df = train_group\
#             .groupby("spectrogram_id")[train_group.filter(regex='_vote$').columns]\
#             .sum().reset_index()


In [None]:
# One consensus label per spectrogram: the first one found in `df`.
consensus_by_spec = df.groupby("spectrogram_id")["expert_consensus"].first()
target_df = consensus_by_spec.reset_index()

# encoding from string to numbers (codes follow order of first appearance)
label_codes = pd.factorize(target_df['expert_consensus'])[0]
target_df['expert_consensus'] = label_codes

# Attach the encoded label to the per-spectrogram statistics; the spectrogram
# id is held in `index` on the left and `spectrogram_id` on the right.
final_df = fe_data_df.merge(
    target_df,
    left_on="index",
    right_on="spectrogram_id",
)

final_df

In [None]:
from sklearn.model_selection import train_test_split


# Column roles by position: the first column (`index`) and the last two
# columns are left out of the features; the last column is the target.
TARGET_COL = final_df.columns[-1]
FEATURE_COLS = final_df.columns[1:-2]

# Reproducible 80/20 split into training and validation rows.
dtrain, dvalid = train_test_split(
    final_df,
    train_size=0.8,
    random_state=42,
)



# This notebook is still in progress