Imports


In [24]:


import torch

import mne




from sklearn.preprocessing import label_binarize

mne.set_log_level('ERROR')

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
import pandas as pd

import numpy as np
from scipy.fftpack import fft, ifft
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from scipy.stats import kurtosis
#NOTE USE LABEL ENCODER BEFORE RUNNING THE FULL VERSION 

Dataset and preprocessing


In [25]:
#this is where we actually extract the features 
def get_variability_measures(eeg_data):
    variability_features=[]
    std_value=np.std(eeg_data,axis=1)
    iqr_value = np.subtract(*np.percentile(eeg_data, [75, 25], axis=1))  # IQR of all channels
    variability_features.extend((np.mean(std_value),np.mean(iqr_value)))
    return variability_features

def get_distribution_features(eeg_data):
    distribution_features=[]
    skewness_value = skew(eeg_data, axis=1)
    kurtosis_value = kurtosis(eeg_data, axis=1) 
    distribution_features.extend((np.mean(skewness_value),np.mean(kurtosis_value)))
    return distribution_features
  
def zero_crossings(signal):
    # If signal crosses zero line, we'll have a change in sign of adjacent values
    return ((signal[:-1] * signal[1:]) < 0).sum()

def frequency_content_features(eeg_data):
    zero_crossings_values = np.apply_along_axis(zero_crossings, 1, eeg_data)
    return zero_crossings_values


    
def get_power_bands(new_raw):
    
    power_features=[]
    

# Define frequency bands
    bands = {
        'delta': (0.5, 4),
        'theta': (4, 8),
        'alpha': (8, 13),
        'beta': (13, 30),
        'gamma': (30, None)  # Assuming Gamma is 30+ Hz
    }

# Dictionary to hold power values for each band
    band_power = {}

    for band, (l_freq, h_freq) in bands.items():
        # Filter the data for each frequency band
        band_data = new_raw.copy().filter(l_freq=l_freq, h_freq=h_freq, picks='eeg', verbose=False)

        # Compute the PSD for the filtered data using the Welch method
        
        spectrum= band_data.compute_psd(method='welch', fmin=l_freq, fmax=100, picks='eeg', verbose=False)
        psd=spectrum.get_data()
        # Integrate the PSD over the frequencies of interest to get the absolute power for each band
        band_power[band] = psd.mean(axis=1).sum()



    
    power_features.extend(band_power.values())
    
    epsilon = 1e-6

    # Calculate power ratios using values from the dictionary
    delta_theta_ratio = band_power['delta'] / (band_power['theta'] + epsilon)
    delta_alpha_ratio = band_power['delta'] / (band_power['alpha'] + epsilon)
    theta_beta_ratio = band_power['theta'] / (band_power['beta'] + epsilon)
    alpha_gamma_ratio = band_power['alpha'] / (band_power['gamma'] + epsilon)
    beta_gamma_ratio = band_power['beta'] / (band_power['gamma'] + epsilon)

    # Append the calculated ratios 
    power_features.extend([delta_theta_ratio, delta_alpha_ratio, theta_beta_ratio, alpha_gamma_ratio, beta_gamma_ratio])

    # Calculate power sums using values from the dictionary
    delta_theta_sum = band_power['delta'] + band_power['theta']
    alpha_beta_sum = band_power['alpha'] + band_power['beta']
    theta_gamma_sum = band_power['theta'] + band_power['gamma']

    # Append the calculated sums 
    power_features.extend([delta_theta_sum, alpha_beta_sum, theta_gamma_sum])
    
    return power_features
    

    

    

In [26]:
def extract_features(surrogate_segment,segment_info):
    
    features=[]
    eeg_data=np.real(surrogate_segment)
    new_raw=mne.io.RawArray(eeg_data,segment_info)
    
    features.append(get_variability_measures(eeg_data))
    print('no features should be 2 here ',len(features))
    features.append(get_distribution_features(eeg_data))
    print('no features should be 4 here ',len(features))
    features.append(frequency_content_features(eeg_data))
    print('no features should be 5 here ',len(features))
    features.extend((1,2,3,4,5,6))
    print('no features should be 11 here ',len(features))
    features.append(get_power_bands(new_raw))
    print("no features should be 24 here",len(features))
    features.append(0)
    print("no features should be 25 hre" ,len(features))
    return features

def create_raw_from_parquet(parquet_file):
    df=pd.read_parquet(parquet_file)
    data=df.to_numpy().T
    info=mne.create_info(ch_names=list(df.columns),sfreq=200,ch_types='eeg')
    raw=mne.io.RawArray(data,info)
    return raw

def get_duration(raw):
    num_samples=len(raw.times)
    sampling_freq=raw.info['sfreq']
    duration=np.floor(num_samples/sampling_freq)
    
    return duration

In [27]:



# Initialize an empty DataFrame for the new training data
columns = [
    'eeg_id', 
    'eeg_sub_id', 
    'patient_id', 
    'Standard Deviation (STD)', 
    'Inter-Quartile Range (IQR)', 
    'Skewness', 
    'Kurtosis', 
    'Number of Zero Crossings', 
    'Hjorth Mobility', 
    'Hjorth Complexity', 
    'Higuchi Fractal Dimension', 
    'Shannon Entropy', 
    'Spectral Entropy', 
    'Binned Entropy', 
    'Delta Power', 
    'Theta Power', 
    'Alpha Power', 
    'Beta Power', 
    'Gamma Power', 
    'Delta/Theta Ratio', 
    'Delta/Alpha Ratio', 
    'Theta/Beta Ratio', 
    'Alpha/Gamma Ratio', 
    'Beta/Gamma Ratio', 
    'Delta+Theta Power', 
    'Alpha+Beta Power', 
    'Theta+Gamma Power', 
    'Total Power',
    'expert_consensus'
]

new_train_df = pd.DataFrame(columns=columns)
print(new_train_df.shape[1])

# Load the original training CSV
train_csv = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')
count=0
# Loop through each row in the train CSV
for index, row in train_csv.iterrows():
    eeg_id = row['eeg_id']
    sub_id = row['eeg_sub_id']
    patient_id = row['patient_id']
    seizure_label = row['expert_consensus']
   
    # Load the corresponding parquet file as an mne Raw object
    raw = create_raw_from_parquet(f'/kaggle/input/hms-harmful-brain-activity-classification/train_eegs/{eeg_id}.parquet')
    
    sub_id_start_time=sub_id
    tmin = sub_id_start_time * 50
    tmax = tmin + 50  # This ensures a 50-second window

    # Adjust tmax to not exceed the recording
    tmax = min(tmax, raw.times[-1])
    
    # Additionally, ensure tmin does not exceed the adjusted tmax or the recording
    tmin = min(tmin, tmax - 0.001)
    segment = raw.crop(tmin, tmax)  # Adjust 'tmin' and 'tmax' as necessary
    segment_info=raw.info
    # Generate FT Surrogate for the segment
    ft_segment = fft(segment.get_data())
    random_phases = np.exp(2j * np.pi * np.random.rand(*ft_segment.shape))
    surrogate_data = np.abs(ft_segment) * random_phases
    surrogate_segment = ifft(surrogate_data)
    
    # Extract features from the surrogate segment
    features = extract_features(surrogate_segment,segment_info)
    print(len(features))
    # Append to the new DataFrame
    new_row = [eeg_id, sub_id, patient_id] + features + [seizure_label]
    print(len(new_row))
    new_train_df.loc[len(new_train_df)] = new_row
    count+=1
    print(count)
    if count==1:
        break
# Save the new DataFrame to CSV
#new_train_df.to_csv('path/to/new_train_data.csv', index=False)


29
no features should be 2 here  1
no features should be 4 here  2
no features should be 5 here  3
no features should be 11 here  9
no features should be 24 here 10
no features should be 25 hre 11
11
15


ValueError: cannot set a row with mismatched columns

In [23]:
print(new_train_df)

Empty DataFrame
Columns: [eeg_id, eeg_sub_id, patient_id, Standard Deviation (STD), Inter-Quartile Range (IQR), Skewness, Kurtosis, Number of Zero Crossings, Hjorth Mobility, Hjorth Complexity, Higuchi Fractal Dimension, Shannon Entropy, Spectral Entropy, Binned Entropy, Delta Power, Theta Power, Alpha Power, Beta Power, Gamma Power, Delta/Theta Ratio, Delta/Alpha Ratio, Theta/Beta Ratio, Alpha/Gamma Ratio, Beta/Gamma Ratio, Delta+Theta Power, Alpha+Beta Power, Theta+Gamma Power, Total Power, expert_consensus]
Index: []

[0 rows x 29 columns]


In [None]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/kaggle/input/hms-harmful-brain-activity-classification/train.csv')

# Calculate the percentage of each label
label_percentage = df['expert_consensus'].value_counts(normalize=True) * 100

print(label_percentage)


Model Architure
