In [7]:
import os
import mne
import pandas as pd
import numpy as np
import janitor
import warnings
from tqdm.notebook import tqdm
import dask.dataframe as dd

import torch
from torch.utils.data import Dataset

# Keep RuntimeWarning messages out of the cell outputs.
warnings.filterwarnings(action="ignore", category=RuntimeWarning)

# Input: one sub-directory per participant is expected under data_path.
data_path = '/home/owner/Documents/DEV/BrainLabyrinth/data/Ymaze_exp'
# Output: one Parquet file per participant is written under final_dataset_path.
final_dataset_path = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset'

In [8]:
# from sklearn.decomposition import PCA as sklearnPCA
import scipy.signal as signal

# Preprocess every BrainVision recording found under data_path and store, per
# participant, the EEG samples that fall inside a window around each marker
# (3 s before to 1 s after the marker onset).

# Make sure the output directory exists before the first to_parquet() call.
os.makedirs(final_dataset_path, exist_ok=True)

for user_id in tqdm(os.listdir(data_path)):
    person_dir = os.path.join(data_path, user_id)
    if not os.path.isdir(person_dir):
        # A stray file next to the participant folders would make the
        # os.listdir() call below raise.
        continue

    for file in os.listdir(person_dir):
        if not file.endswith(".vhdr"):
            continue
        file_path = os.path.join(person_dir, file)

        # Locate the marker file *before* doing any expensive work. Without it
        # there are no annotations for this recording; carrying on would either
        # raise NameError (first recording) or silently reuse the markers of
        # the previously processed recording.
        vmrk_file = os.path.splitext(file)[0] + '.vmrk'
        vmrk_file_path = os.path.join(person_dir, vmrk_file)
        if not os.path.exists(vmrk_file_path):
            warnings.warn(f"Skipping {file_path}: marker file {vmrk_file} not found")
            continue

        # Read the EEG data
        raw = mne.io.read_raw_brainvision(file_path, preload=True, ignore_marker_types=True, verbose=False)

        # 1. Resample to 500 Hz
        raw.resample(500, npad="auto")

        # 2. High-pass filter at 0.1 Hz (low-frequency noise removal)
        raw.filter(l_freq=0.1, h_freq=None)

        # 3. Notch filter at 50 Hz (power line noise)
        raw.notch_filter(freqs=50, picks='all')

        # 4. Detrend every channel along the time axis (axis=1).
        # NOTE(review): this writes to the private Raw._data attribute;
        # consider switching to a public MNE API such as Raw.apply_function.
        raw._data = signal.detrend(raw._data, axis=1)

        # Read the annotations (markers) from the .vmrk file
        annotations = mne.annotations.read_annotations(vmrk_file_path)

        # Convert raw data to DataFrame
        time_series = raw.to_data_frame()

        # Extract markers (annotations)
        marker_times = annotations.onset  # In seconds
        marker_labels = annotations.description  # The marker labels

        # One row per marker with the time window to cut out around it
        markers_df = pd.DataFrame({
            'event_id': np.arange(len(marker_labels)),
            'start': marker_times - 3.0,
            'end': marker_times + 1.0,
            'marker': marker_labels
        })

        # Drop the markers that are not used downstream
        markers_df = markers_df[
            ~markers_df.marker.isin([
                'Marker/Impedance', 'New Segment/', 'Stimulus/2'
            ])
        ]

        # Round times to 3 decimal places for matching
        time_series['time'] = time_series['time'].round(3)

        # Range join: attach to every marker all samples with start <= time <= end
        merged_df = janitor.conditional_join(
            markers_df,
            time_series,
            ('start', 'time', '<='),
            ('end', 'time', '>='),
            how='left',
            df_columns=['event_id', 'marker']
        )

        # NOTE(review): the file name only depends on user_id, so a participant
        # folder holding several .vhdr files keeps only the last one written.
        merged_path = os.path.join(final_dataset_path, f"{user_id}.parquet")
        merged_df.to_parquet(merged_path)


  0%|          | 0/53 [00:00<?, ?it/s]

Filtering raw data in 1 contiguous segment
Setting up high-pass filter at 0.1 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal highpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.10
- Lower transition bandwidth: 0.10 Hz (-6 dB cutoff frequency: 0.05 Hz)
- Filter length: 16501 samples (33.002 s)



[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.1s


Filtering raw data in 1 contiguous segment
Setting up band-stop filter from 49 - 51 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandstop filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 49.38
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 49.12 Hz)
- Upper passband edge: 50.62 Hz
- Upper transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 50.88 Hz)
- Filter length: 3301 samples (6.602 s)



KeyboardInterrupt: 

In [7]:
# Merge the per-participant Parquet files into one dataset with globally
# unique, integer event ids.

# Define the directory containing the Parquet files
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset (copy)'

# List all Parquet files in the directory. os.listdir() returns entries in
# arbitrary order, so sort them to keep the concatenation order stable across
# runs and machines.
parquet_files = sorted(
    os.path.join(parquet_directory, f)
    for f in os.listdir(parquet_directory)
    if f.endswith('.parquet')
)
if not parquet_files:
    # Fail early with a clear message instead of an opaque error from dd.concat
    raise FileNotFoundError(f"No .parquet files found in {parquet_directory!r}")

# Initialize an empty list to hold the Dask DataFrames
dask_dfs = []

# Process each Parquet file
for file in parquet_files:
    # Read the Parquet file into a Dask DataFrame
    df = dd.read_parquet(file)

    # File name without its extension (only the trailing suffix is stripped)
    file_name = os.path.splitext(os.path.basename(file))[0]

    # event_id restarts at 0 in every file; suffix it with the file name so
    # ids from different files cannot collide
    df['event_id'] = df['event_id'].astype(str) + '_' + file_name

    # Append the DataFrame to the list
    dask_dfs.append(df)

# Concatenate all DataFrames into a single Dask DataFrame
combined_df = dd.concat(dask_dfs, ignore_index=True)

# Map every distinct string id to a consecutive integer
unique_event_ids = combined_df['event_id'].unique().compute()
event_id_mapping = {event_id: idx for idx, event_id in enumerate(unique_event_ids)}

# Renumber the event_id column using the mapping dictionary
combined_df['event_id'] = combined_df['event_id'].map(event_id_mapping, meta=('event_id', 'int64'))

# Show the first few rows to verify the data
print(combined_df.head())

# Save the combined Dask DataFrame to a new Parquet dataset
combined_df.to_parquet('/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet')


   event_id      marker    time        Fp1        Fpz        Fp2         F7  \
0         0  Stimulus/P  23.620  18.929390  27.035186  26.474021 -37.195889   
1         0  Stimulus/P  23.622  21.587595  26.084012  25.872122 -34.220734   
2         0  Stimulus/P  23.624  22.985251  22.778513  20.055913 -32.408453   
3         0  Stimulus/P  23.626  26.445074  26.938968  21.746364 -29.977234   
4         0  Stimulus/P  23.628  27.858514  33.056260  30.219951 -27.094996   

         F3         Fz         F4  ...        PO3        PO4       PO6  \
0  8.102022  26.115237  13.509483  ... -13.759602  -6.662856 -7.093113   
1  2.335635  25.532525  17.738738  ... -15.645522  -9.955678 -9.481412   
2  1.843952  23.887538  14.971009  ...  -7.170127 -12.647842 -9.841793   
3  6.261537  25.611078  17.984585  ... -13.886536 -11.800133 -9.656715   
4  9.726444  30.116939  19.419637  ... -27.232310 -10.963815 -8.354059   

         FT7       FT8        TP7        TP8        PO7        PO8         Oz  


In [None]:
import dask.dataframe as dd

# Path of the combined Parquet dataset to inspect
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet'

# Open the dataset as a Dask DataFrame
dask_df = dd.read_parquet(parquet_directory)

# The row count has to be computed explicitly; the column count is used as is
rows, cols = dask_df.shape
rows_computed = rows.compute()

# Report the dataset dimensions
print(f"Shape of the final dataset: ({rows_computed}, {cols})")

# Show the first few rows as a sanity check
print(dask_df.head())


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Shape of the final dataset: (1263656, 67)
   event_id      marker    time        Fp1        Fpz        Fp2         F7  \
0         0  Stimulus/P  23.620  18.929390  27.035186  26.474021 -37.195889   
1         0  Stimulus/P  23.622  21.587595  26.084012  25.872122 -34.220734   
2         0  Stimulus/P  23.624  22.985251  22.778513  20.055913 -32.408453   
3         0  Stimulus/P  23.626  26.445074  26.938968  21.746364 -29.977234   
4         0  Stimulus/P  23.628  27.858514  33.056260  30.219951 -27.094996   

         F3         Fz         F4  ...        PO3        PO4       PO6  \
0  8.102022  26.115237  13.509483  ... -13.759602  -6.662856 -7.093113   
1  2.335635  25.532525  17.738738  ... -15.645522  -9.955678 -9.481412   
2  1.843952  23.887538  14.971009  ...  -7.170127 -12.647842 -9.841793   
3  6.261537  25.611078  17.984585  ... -13.886536 -11.800133 -9.656715   
4  9.726444  30.116939  19.419637  ... -27.232310 -10.963815 -8.354059   

         FT7       FT8        TP7     