In [4]:
import os
import mne
import pandas as pd
import numpy as np
import janitor
import warnings
from tqdm.notebook import tqdm
import dask.dataframe as dd

import torch
from torch.utils.data import Dataset

# Silence RuntimeWarning globally for the rest of the notebook.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Root folder of the raw recordings; the preprocessing cell lists it and treats
# every entry as one subject directory (the entry name becomes `user_id`).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_path = '/home/owner/Documents/DEV/BrainLabyrinth/data/Ymaze_exp'
# Output folder; the preprocessing cell writes one `<user_id>.parquet` file here.
final_dataset_path = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset'

In [3]:
# from sklearn.decomposition import PCA as sklearnPCA
import scipy.signal as signal

# Preprocess every BrainVision recording below `data_path` and store, per
# subject directory, the EEG samples that fall into a window around each marker
# as `<final_dataset_path>/<user_id>.parquet`.
#
# Per recording: resample to 500 Hz -> 0.1 Hz high-pass -> 50 Hz notch ->
# linear detrend -> join samples with the [-3 s, +1 s] window of every marker.

# Make sure the output folder exists before the first `to_parquet` call.
os.makedirs(final_dataset_path, exist_ok=True)

for user_id in tqdm(os.listdir(data_path)):
    person_dir = os.path.join(data_path, user_id)
    if not os.path.isdir(person_dir):
        # A stray file next to the subject folders would make os.listdir() raise.
        continue

    for file in os.listdir(person_dir):
        if not file.endswith(".vhdr"):
            continue
        file_path = os.path.join(person_dir, file)

        # Resolve the marker file *before* the expensive filtering. Without it
        # there is nothing to epoch on; previously the code fell through and
        # either raised NameError or silently reused the `annotations` of the
        # previously processed recording.
        vmrk_file = file.replace('.vhdr', '.vmrk')
        vmrk_file_path = os.path.join(person_dir, vmrk_file)
        if not os.path.exists(vmrk_file_path):
            warnings.warn(f"No marker file found for {file_path}; skipping this recording.")
            continue

        # Read the EEG data
        raw = mne.io.read_raw_brainvision(file_path, preload=True, ignore_marker_types=True, verbose=False)

        # 1. Resample to 500 Hz
        raw.resample(500, npad="auto")

        # 2. High-pass at 0.1 Hz to remove slow drifts
        raw.filter(l_freq=0.1, h_freq=None)

        # 3. Notch filter at 50 Hz to remove power-line noise
        raw.notch_filter(freqs=50, picks='all')

        # 4. Detrend every channel along the time axis (axis=1).
        # NOTE(review): this writes to the private `raw._data` buffer.
        raw._data = signal.detrend(raw._data, axis=1)

        # 5. Optional PCA for dimensionality reduction (currently disabled)
        # n_components = 20
        # pca = sklearnPCA(n_components=n_components)
        # data_pca = pca.fit_transform(raw.get_data())
        # raw._data = data_pca

        # Read the annotations (markers) from the .vmrk file
        annotations = mne.annotations.read_annotations(vmrk_file_path)

        # Convert raw data to a DataFrame (one row per sample, one column per channel)
        time_series = raw.to_data_frame()

        # One row per marker with the time window to extract around it:
        # 3 s before the marker onset up to 1 s after it.
        marker_times = annotations.onset  # In seconds
        marker_labels = annotations.description
        markers_df = pd.DataFrame({
            'event_id': np.arange(len(marker_labels)),
            'start': marker_times - 3.0,
            'end': marker_times + 1.0,
            'marker': marker_labels
        })

        # Drop the marker types that are not used as events.
        markers_df = markers_df[
            ~markers_df.marker.isin([
                'Marker/Impedance', 'New Segment/', 'Stimulus/2'
            ])
        ]

        # Round sample times to 3 decimals so the window comparison is stable.
        time_series['time'] = time_series['time'].round(3)

        # Range join: keep every sample with start <= time <= end for each marker.
        merged_df = janitor.conditional_join(
            markers_df,
            time_series,
            ('start', 'time', '<='),
            ('end', 'time', '>='),
            how='left',
            df_columns=['event_id', 'marker']
        )

        # NOTE(review): the file name depends only on `user_id`, so a subject
        # folder holding several .vhdr files keeps only the last one processed.
        merged_path = os.path.join(final_dataset_path, f"{user_id}.parquet")
        merged_df.to_parquet(merged_path)


  0%|          | 0/53 [00:00<?, ?it/s]

Filtering raw data in 1 contiguous segment
Setting up high-pass filter at 0.1 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal highpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.10
- Lower transition bandwidth: 0.10 Hz (-6 dB cutoff frequency: 0.05 Hz)
- Filter length: 16501 samples (33.002 s)

Filtering raw data in 1 contiguous segment
Setting up band-stop filter from 49 - 51 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandstop filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 49.38
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 49.12 Hz)
- Upper passband edge: 50.62 Hz
- Upper transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 50.88 Hz)
- Filter length: 3301 sa

In [7]:
# Folder that holds the per-subject Parquet files to be merged
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset (copy)'

# Full path of every *.parquet entry in that folder
parquet_files = []
for entry in os.listdir(parquet_directory):
    if entry.endswith('.parquet'):
        parquet_files.append(os.path.join(parquet_directory, entry))


def _read_with_file_scoped_event_ids(parquet_path):
    """Open one Parquet file lazily and suffix its event ids with the file stem.

    The per-file `event_id` values are turned into strings of the form
    `<event_id>_<file name without .parquet>` so that ids coming from
    different files cannot collide after concatenation.
    """
    frame = dd.read_parquet(parquet_path)
    stem = os.path.basename(parquet_path).replace('.parquet', '')
    frame['event_id'] = frame['event_id'].astype(str) + '_' + stem
    return frame


# One lazy Dask DataFrame per input file, then a single concatenated frame
dask_dfs = [_read_with_file_scoped_event_ids(path) for path in parquet_files]
combined_df = dd.concat(dask_dfs, ignore_index=True)

# Replace the string ids by consecutive integers (position in the unique list)
unique_event_ids = combined_df['event_id'].unique().compute()
event_id_mapping = dict(zip(unique_event_ids, range(len(unique_event_ids))))
combined_df['event_id'] = combined_df['event_id'].map(event_id_mapping, meta=('event_id', 'int64'))

# Quick sanity check of the first rows
print(combined_df.head())

# Persist the merged dataset
combined_df.to_parquet('/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet')


   event_id      marker    time        Fp1        Fpz        Fp2         F7  \
0         0  Stimulus/P  23.620  18.929390  27.035186  26.474021 -37.195889   
1         0  Stimulus/P  23.622  21.587595  26.084012  25.872122 -34.220734   
2         0  Stimulus/P  23.624  22.985251  22.778513  20.055913 -32.408453   
3         0  Stimulus/P  23.626  26.445074  26.938968  21.746364 -29.977234   
4         0  Stimulus/P  23.628  27.858514  33.056260  30.219951 -27.094996   

         F3         Fz         F4  ...        PO3        PO4       PO6  \
0  8.102022  26.115237  13.509483  ... -13.759602  -6.662856 -7.093113   
1  2.335635  25.532525  17.738738  ... -15.645522  -9.955678 -9.481412   
2  1.843952  23.887538  14.971009  ...  -7.170127 -12.647842 -9.841793   
3  6.261537  25.611078  17.984585  ... -13.886536 -11.800133 -9.656715   
4  9.726444  30.116939  19.419637  ... -27.232310 -10.963815 -8.354059   

         FT7       FT8        TP7        TP8        PO7        PO8         Oz  


In [7]:
import dask.dataframe as dd

# Location of the combined Parquet dataset
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet'

# Open it lazily as a Dask DataFrame
dask_df = dd.read_parquet(parquet_directory)

# The row count is a lazy value that has to be computed explicitly;
# the column count is available directly from the column index.
rows = dask_df.shape[0]
cols = len(dask_df.columns)
rows_computed = rows.compute()

# Report the shape
print(f"Shape of the final dataset: ({rows_computed}, {cols})")

# Show the first rows as a sanity check
print(dask_df.head())


Shape of the final dataset: (14632831, 67)
   event_id      marker    time        Fp1        Fpz        Fp2         F7  \
0         0  Stimulus/P  23.620  18.929390  27.035186  26.474021 -37.195889   
1         0  Stimulus/P  23.622  21.587595  26.084012  25.872122 -34.220734   
2         0  Stimulus/P  23.624  22.985251  22.778513  20.055913 -32.408453   
3         0  Stimulus/P  23.626  26.445074  26.938968  21.746364 -29.977234   
4         0  Stimulus/P  23.628  27.858514  33.056260  30.219951 -27.094996   

         F3         Fz         F4  ...        PO3        PO4       PO6  \
0  8.102022  26.115237  13.509483  ... -13.759602  -6.662856 -7.093113   
1  2.335635  25.532525  17.738738  ... -15.645522  -9.955678 -9.481412   
2  1.843952  23.887538  14.971009  ...  -7.170127 -12.647842 -9.841793   
3  6.261537  25.611078  17.984585  ... -13.886536 -11.800133 -9.656715   
4  9.726444  30.116939  19.419637  ... -27.232310 -10.963815 -8.354059   

         FT7       FT8        TP7    

In [8]:
import dask.dataframe as dd
import torch
from torch.utils.data import Dataset

# Define the directory containing the Parquet files
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet'

class EventDataset(Dataset):
    """Map-style dataset that yields one event (marker window) per item.

    The Parquet data at `path` is opened lazily with Dask. Each item is looked
    up by filtering the whole frame on its `event_id` and computing the result,
    so every access triggers a Dask computation.
    """

    def __init__(self, path):
        """Open the Parquet dataset at `path` and collect its unique event ids."""
        self.__dataframe = dd.read_parquet(path)
        # Keep the ids as a plain NumPy array so that `idx` is always treated as
        # a *position*. A pandas Series would be indexed by label, which raises
        # KeyError (not IndexError) past the end and rejects negative indices.
        self.__event_ids = self.__dataframe['event_id'].unique().compute().to_numpy()
        self.__len = len(self.__event_ids)

    def __len__(self):
        """Return the number of distinct events."""
        return self.__len

    def __getitem__(self, idx):
        """Return `(marker, event_tensor)` for the event at position `idx`.

        Returns:
            marker: 0-d float32 tensor, 1.0 when the event's marker is
                'Stimulus/P' and 0.0 otherwise.
            event_tensor: float32 tensor with one row per sample of the event
                and one column per remaining column after dropping
                `event_id`, `time` and `marker`.

        Raises:
            IndexError: if `idx` is outside the range of known events.
        """
        event_id = self.__event_ids[idx]
        event_rows = self.__dataframe[self.__dataframe['event_id'] == event_id].compute()

        # Everything except the bookkeeping columns is a feature column.
        event_tensor = torch.tensor(event_rows.drop(columns=['event_id', 'time', 'marker']).values, dtype=torch.float32)
        # All rows of an event share one marker, so the first row decides the label.
        marker = torch.tensor(event_rows.marker.values[0] == 'Stimulus/P', dtype=torch.float32)

        return marker, event_tensor

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torch.nn.utils.rnn import pad_sequence

import torch.nn.functional as F

def collate_fn(batch, max_length=2000):
    """Collate `(marker, tensor)` pairs into fixed-length padded batch tensors.

    Every sequence is zero-padded at the end, or truncated, to exactly
    `max_length` time steps so that the batch can be stacked.

    Args:
        batch: non-empty sequence of `(marker, tensor)` pairs, where `tensor`
            is 2-D `(time, features)` and all tensors share the same number of
            features (taken from the first one).
        max_length: number of time steps each sequence is padded or truncated
            to. Defaults to 2000, the previously hard-coded value.

    Returns:
        `(markers, padded_tensors)`: a float32 tensor with one label per item
        and a float32 tensor of shape `(batch, max_length, features)`.

    Raises:
        ValueError: if `batch` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_fn received an empty batch")

    markers, tensors = zip(*batch)
    num_features = tensors[0].size(1)

    # Pre-allocate zeros; positions that are not overwritten stay as padding.
    padded_tensors = torch.zeros((len(tensors), max_length, num_features), dtype=torch.float32)
    for i, tensor in enumerate(tensors):
        # One slice covers both cases: short sequences are copied entirely,
        # long ones are cut after `max_length` steps.
        keep = min(tensor.size(0), max_length)
        padded_tensors[i, :keep, :] = tensor[:keep]

    # Stack the labels directly instead of converting tensors to Python scalars.
    markers = torch.stack([torch.as_tensor(m, dtype=torch.float32) for m in markers])

    return markers, padded_tensors


class RNNClassifier(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Input is expected batch-first, i.e. `(batch, time, input_size)`; the
    output has shape `(batch, output_size)` and is not passed through any
    activation.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Run the LSTM from a zero initial state and project its final step."""
        # Hidden and cell state both start at zero, created on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_state = (
            torch.zeros(state_shape, device=x.device),
            torch.zeros(state_shape, device=x.device),
        )

        sequence_out, _ = self.lstm(x, initial_state)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Hyperparameters
input_size = 64  # Feature columns per sample (columns left after dropping event_id/time/marker)
hidden_size = 128  # Number of features in the hidden state
num_layers = 1  # Number of recurrent layers
output_size = 1  # Single logit for binary classification
learning_rate = 0.001
num_epochs = 10
batch_size = 4

# Check if a GPU is available and set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, loss function, and optimizer.
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
model = RNNClassifier(input_size, hidden_size, num_layers, output_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Create a DataLoader for the dataset
dataset = EventDataset(parquet_directory)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
print(len(dataloader))

# Training loop
for epoch in tqdm(range(num_epochs), desc="epochs"):
    model.train()  # make sure we are in training mode even after an eval() call
    running_loss = 0.0

    # Wrap the DataLoader itself (not `enumerate(...)`) so tqdm knows the
    # number of batches and can draw a real progress bar.
    for i, (marker, features) in enumerate(tqdm(dataloader, desc=f"epoch {epoch + 1}", leave=False)):
        features = features.to(device)
        # Labels come as (batch,); the model emits (batch, 1).
        marker = marker.unsqueeze(-1).to(device)

        outputs = model(features)
        loss = criterion(outputs, marker)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # The last mini-batch loss alone is very noisy at this batch size, so the
    # mean over the epoch is reported next to it.
    print(
        f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(dataloader)}], '
        f'Loss: {loss.item():.4f}, Mean epoch loss: {running_loss / (i + 1):.4f}'
    )

print("Training complete.")
torch.save(model.state_dict(), "model.torch")


158


  0%|          | 0/10 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Epoch [1/10], Step [100/158], Loss: 0.1666


0it [00:00, ?it/s]

Epoch [2/10], Step [100/158], Loss: 0.4376


0it [00:00, ?it/s]

Epoch [3/10], Step [100/158], Loss: 0.7368


0it [00:00, ?it/s]

Epoch [4/10], Step [100/158], Loss: 0.4407


0it [00:00, ?it/s]

Epoch [5/10], Step [100/158], Loss: 0.1013


0it [00:00, ?it/s]

Epoch [6/10], Step [100/158], Loss: 0.0745


0it [00:00, ?it/s]

Epoch [7/10], Step [100/158], Loss: 0.1555


0it [00:00, ?it/s]

Epoch [8/10], Step [100/158], Loss: 0.0586


0it [00:00, ?it/s]

Epoch [9/10], Step [100/158], Loss: 0.0541


0it [00:00, ?it/s]

Epoch [10/10], Step [100/158], Loss: 0.0116
Training complete.


In [27]:
import torch
from torch.utils.data import DataLoader, Subset
import random
from sklearn.metrics import roc_auc_score

# Score the trained model with ROC AUC on a random subset of `dataset`.
#
# NOTE(review): `dataset` is the same object the training DataLoader was built
# from, so this subset consists of events the model was trained on. The
# resulting AUC is therefore not a held-out estimate.

# Fraction of the dataset to sample
sample_percentage = 0.5

# Number of events to draw and their (random, non-repeating) indices
num_samples = int(len(dataset) * sample_percentage)
indices = random.sample(range(len(dataset)), num_samples)

# Subset view over the sampled indices
sampled_dataset = Subset(dataset, indices)

# The indices are already random and AUC does not depend on order,
# so there is no need to shuffle again.
batch_size = 4
sampled_dataloader = DataLoader(sampled_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

model.eval()
all_markers = []
all_outputs = []

with torch.no_grad():
    for markers, features in sampled_dataloader:
        features = features.to(device)

        # Drop only the trailing output dimension. A bare squeeze() would also
        # remove the batch dimension for a final batch of size 1 and make
        # `extend` fail on a 0-d array.
        outputs = model(features).squeeze(-1)

        # Both are 1-D of length `batch`, so the lists hold one scalar per event.
        all_markers.extend(markers.cpu().numpy())
        all_outputs.extend(outputs.cpu().numpy())

# Calculate ROC AUC (raw logits are fine: AUC only depends on the ranking)
roc_auc = roc_auc_score(all_markers, all_outputs)
print(f'ROC AUC: {roc_auc:.4f}') 


ROC AUC: 0.9981


In [33]:
# Display the per-event model outputs gathered by the evaluation cell; these are
# the raw values of the final linear layer (no sigmoid is applied by the model).
all_outputs

[np.float32(-3.201051),
 np.float32(1.4520878),
 np.float32(-1.3312658),
 np.float32(-5.50101),
 np.float32(0.53134525),
 np.float32(-2.098539),
 np.float32(-3.6563442),
 np.float32(-3.5946827),
 np.float32(-4.993655),
 np.float32(-5.7273407),
 np.float32(-3.5125065),
 np.float32(-0.9484016),
 np.float32(-3.8781166),
 np.float32(-2.9709537),
 np.float32(-7.981956),
 np.float32(-3.7589788),
 np.float32(-3.377091),
 np.float32(-3.4795809),
 np.float32(-3.3115153),
 np.float32(-4.5321345),
 np.float32(-4.558713),
 np.float32(-1.6348418),
 np.float32(2.6722407),
 np.float32(-3.974273),
 np.float32(3.5575738),
 np.float32(-4.78675),
 np.float32(-3.1726108),
 np.float32(-8.263599),
 np.float32(-1.5308461),
 np.float32(1.5028058),
 np.float32(-3.968042),
 np.float32(-6.1864343),
 np.float32(-3.0929122),
 np.float32(-4.1842923),
 np.float32(-5.591712),
 np.float32(-5.6195846),
 np.float32(-2.1992478),
 np.float32(-5.516196),
 np.float32(-4.168899),
 np.float32(0.50249624),
 np.float32(-4.25164