In [None]:
#Importing relevant libraries
import pandas as pd
import numpy as np
from scipy import signal
from scipy.signal import butter, filtfilt, iirnotch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy import stats
import sklearn
import os
import mne
from scipy.fft import fft, ifft

## 1. Read .csv file📁

In [None]:
# Location of the competition's training metadata table.
csv = '/kaggle/input/hms-harmful-brain-activity-classification/train.csv'
# Load the metadata; later cells read `eeg_id`, `eeg_label_offset_seconds`
# and `expert_consensus` from this frame.
df = pd.read_csv(csv)
# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


## 2. Signal Processing 📶

In [None]:
def butterfilt(data, lowcut, highcut, fs, order=4, padlen = None):
    """Zero-phase Butterworth band-pass filter applied along axis 0.

    Parameters
    ----------
    data : array-like
        Signal to filter; filtering runs along the first axis.
    lowcut, highcut : float
        Pass-band edges, in the same units as ``fs``.
    fs : float
        Sampling frequency of ``data``.
    order : int, optional
        Order of the Butterworth design passed to ``butter``.
    padlen : int or None, optional
        Number of samples used to extend the signal at each edge before
        filtering. ``None`` keeps ``filtfilt``'s default padding.

    Returns
    -------
    numpy.ndarray
        The band-pass filtered signal, same shape as ``data``.
    """
    # `butter` expects band edges normalised by the Nyquist frequency.
    nyquist = 0.5 * fs
    low = lowcut / nyquist
    high = highcut / nyquist
    b, a = butter(order, [low, high], btype='band')
    # Forward `padlen`: previously the argument was accepted but ignored,
    # so callers could not control the edge padding.
    bandpass_filtered_data = filtfilt(b, a, data, axis=0, padlen=padlen)
    return bandpass_filtered_data

def notchfilt(data, fs, notch_freq=50, Q=10):
    """Remove a narrow frequency band with a zero-phase IIR notch filter.

    The notch is designed with ``iirnotch`` at ``notch_freq`` (quality
    factor ``Q``, sampling frequency ``fs``) and applied forwards and
    backwards along axis 0 of ``data`` with ``filtfilt``.

    Returns the filtered signal as a NumPy array.
    """
    numerator, denominator = iirnotch(w0=notch_freq, Q=Q, fs=fs)
    return filtfilt(numerator, denominator, data, axis=0)

In [None]:
def apply_filters(signal_data):
    """Run the notebook's two-stage filter chain on a signal.

    The signal is first passed through ``notchfilt`` with its default
    notch settings, then through ``butterfilt`` with a 0.1-50 pass band.
    Both stages are told the signal is sampled at 200.

    Returns the output of the band-pass stage.
    """
    sampling_rate = 200
    # Stage 1: notch; stage 2: band-pass on the notched result.
    after_notch = notchfilt(signal_data, fs=sampling_rate)
    return butterfilt(after_notch, lowcut=0.1, highcut=50, fs=sampling_rate, padlen=None)

## 3. Feature Extraction ⚗️
Statistics used as a proxy for the features of the signal:
1. Mean: Central tendency of the signal; this may be higher in seizures and GPD due to higher overall activity, whereas it might be slightly lower in GRDA and LRDA due to the presence of slow delta activity. Might also reflect asymmetry in LPD and LRDA.
2. Standard deviation: Variability of the signal around the mean; this may be higher in seizures due to drastic changes in amplitude, as well as in LRDA, LPD and GPD.
3. Peak-to-peak amplitude: difference between the highest and lowest points in the data. Likely to be higher in seizures due to more drastic changes in amplitude.
4. Variance: Measure of spread, higher in seizures due to more drastic and frequent changes, as well as in GPD, LRDA and LPD.
5. The minimum and maximum values of the signal: assessment of amplitude.
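6. Root mean square (RMS) of the signal: the square root of the mean of the squared samples, a measure of overall signal magnitude.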
7. Skew: Asymmetry of the distribution, positive in seizures due to asymmetry compared to baseline activity, but also seen in LRDA and LPD as these are lateralised.
8. Kurtosis: Peakedness/flatness of the signal, may be higher in seizure due to more peaks.

In [None]:
def extract_features(signal_data):
    """Summarise a signal with ten per-column statistics.

    Every statistic is computed along axis 0, so each entry of the result
    holds one value per column of ``signal_data``.

    Returns
    -------
    list
        Ten entries, in this order: mean, standard deviation,
        peak-to-peak range, variance, minimum, maximum, root mean square,
        sum of absolute first differences, skewness, kurtosis.
    """
    # Root mean square: square, average over rows, then take the root.
    root_mean_square = np.sqrt(np.mean(signal_data ** 2, axis=0))
    # Total absolute change between consecutive rows.
    summed_abs_steps = np.sum(np.abs(np.diff(signal_data, axis=0)), axis=0)

    return [
        np.mean(signal_data, axis=0),
        np.std(signal_data, axis=0),
        np.ptp(signal_data, axis=0),
        np.var(signal_data, axis=0),
        np.min(signal_data, axis=0),
        np.max(signal_data, axis=0),
        root_mean_square,
        summed_abs_steps,
        stats.skew(signal_data, axis=0),
        stats.kurtosis(signal_data, axis=0),
    ]

## 4. Preparing Data and Labels 👩‍🍳

In [None]:
# Step 4: Prepare Data and Labels
# Build one feature block and one label per row of `df`.

# Kept for backward compatibility with any cell that looks labels up by
# recording id. It is NOT used below: an eeg_id can appear on many rows, so
# this mapping keeps only the last row's label for each recording.
filename_to_label = dict(zip(df['eeg_id'], df['expert_consensus']))

parquet_folder = '/kaggle/input/hms-harmful-brain-activity-classification/train_eegs'
parquet_files = os.listdir(parquet_folder)

# Sampling rate matches the `fs=200` that apply_filters assumes.
EEG_SAMPLING_RATE_HZ = 200
# Length of the labelled window that starts at `eeg_label_offset_seconds`.
EEG_WINDOW_SECONDS = 50
window_rows = EEG_WINDOW_SECONDS * EEG_SAMPLING_RATE_HZ

features = []
labels = []

total_files = len(parquet_files)
processed_files = 0

# Consecutive rows often share an eeg_id; remember the last file read so the
# same recording is not loaded from disk again for every sub-window.
cached_eeg_id = None
parquet_data = None

for _, row in df.iterrows():
    parquet_file = row['eeg_id']
    if parquet_file != cached_eeg_id:
        # Read Parquet data
        parquet_data = pd.read_parquet(os.path.join(parquet_folder, f"{parquet_file}.parquet"))
        cached_eeg_id = parquet_file

    # The offset is given in seconds, so convert it to a row position
    # before slicing (the raw seconds value was previously used as a row index).
    start_row = int(round(row['eeg_label_offset_seconds'] * EEG_SAMPLING_RATE_HZ))
    signal_data = parquet_data.iloc[start_row:start_row + window_rows]

    # Extract features from the FILTERED window, as the submission cell does
    # for the test recordings (previously the raw window was used here).
    filtered_signal_data = apply_filters(signal_data)
    signal_features = extract_features(filtered_signal_data)
    features.append(signal_features)
    # Use this row's own label rather than a per-recording lookup.
    labels.append(row['expert_consensus'])

    processed_files += 1

X = np.array(features)
y = np.array(labels)

print('Processing Complete')

## 5. Train Classifier 📈

In [None]:
# Hold out 20% of the rows for validation.
# NOTE(review): this is a row-level split, and `df` holds several rows per
# eeg_id / patient_id, so windows from one recording can land on both sides
# of the split. The validation score may be optimistic - consider a
# group-aware split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Flatten each sample's feature block into a single feature vector.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

# Labels must stay 1-D: reshaping them to a column vector made scikit-learn
# emit a DataConversionWarning (`column_or_1d`) on fit.
y_train = y_train.ravel()
y_test = y_test.ravel()

# The variable keeps its historical `rf_` name because later cells use it,
# but the model is a histogram-based gradient boosting classifier.
rf_classifier = sklearn.ensemble.HistGradientBoostingClassifier(max_iter=100, random_state=42)
rf_classifier.fit(X_train, y_train)


  y = column_or_1d(y, warn=True)


In [None]:
# Model accuracy (validation): score the fitted classifier on the held-out
# split (X_test, y_test) and print the result.
accuracy = rf_classifier.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.8146535580524344


## 6. Submission  📋

In [None]:
parquet_folder_test = '/kaggle/input/hms-harmful-brain-activity-classification/test_eegs'
parquet_files_test = os.listdir(parquet_folder_test)

test_csv = '/kaggle/input/hms-harmful-brain-activity-classification/test.csv'
df_test = pd.read_csv(test_csv)

# Step 1: Build one feature block per test row, using the same
# filter -> feature pipeline as the training data.
test_features = []
for _, row in df_test.iterrows():
    parquet_file_test = row['eeg_id']
    # Read Parquet data
    parquet_data_test = pd.read_parquet(os.path.join(parquet_folder_test, f"{parquet_file_test}.parquet"))
    filtered_signal_test = apply_filters(parquet_data_test)
    signal_features_test = extract_features(filtered_signal_test)
    test_features.append(signal_features_test)

X_submission = np.array(test_features)
X_submission = X_submission.reshape(len(X_submission), -1)

print('Processing Complete')


# Step 2: Predict class probabilities for EVERY test row
# (previously only the first row's probabilities were kept).
prediction_probabilities = rf_classifier.predict_proba(X_submission)

# Step 3: Name each probability column after the class it belongs to.
# predict_proba orders its columns like `rf_classifier.classes_`, which is
# not the submission's column order, so positional assignment mislabels votes.
vote_columns = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']
class_columns = [f"{str(label).lower()}_vote" for label in rf_classifier.classes_]

unexpected_columns = sorted(set(class_columns) - set(vote_columns))
if unexpected_columns:
    raise ValueError(f"Classifier has classes with no submission column: {unexpected_columns}")

# Step 4: Assemble the submission in one go. A class the model never saw in
# training gets probability 0. Building the frame directly also avoids the
# FutureWarning raised by concatenating onto an empty DataFrame.
predictions_df = (
    pd.DataFrame(prediction_probabilities, columns=class_columns)
    .reindex(columns=vote_columns, fill_value=0.0)
)
predictions_df.insert(0, 'eeg_id', df_test['eeg_id'].to_numpy())

# Step 5: Save predictions to a CSV file
predictions_df.to_csv('submission.csv', index=False)

Processing Complete


  predictions_df = pd.concat([predictions_df, row], ignore_index=True)


In [None]:
# NOTE(review): this import sits in a late cell; the notebook's other imports
# are all in the first cell.
from IPython.display import FileLink

# Create a download link for the file; as the cell's last expression, the
# FileLink object is what the cell displays.
FileLink('submission.csv')