In [1]:
import os
import mne
import pandas as pd
import numpy as np
import janitor
import warnings
from tqdm.notebook import tqdm

# Silence every RuntimeWarning for the rest of the notebook session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Root folder of the raw recordings; the preprocessing cell iterates over one sub-folder per entry here.
data_path = '/home/owner/Documents/DEV/BrainLabyrinth/data/Ymaze_exp'
# Output folder that receives one `<user_id>.parquet` file from the preprocessing cell.
final_dataset_path = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset'

In [None]:
# from sklearn.decomposition import PCA as sklearnPCA
import scipy.signal as signal
from mne.preprocessing import ICA


# Preprocess every BrainVision recording found under `data_path` and write one
# Parquet file per participant folder into `final_dataset_path`.
#
# For each `.vhdr` file: resample -> band-pass -> notch -> ICA (EOG removal) ->
# linear detrend -> cut a [-3 s, +1 s] window around every marker -> save.
for user_id in tqdm(os.listdir(data_path)):
    person_dir = os.path.join(data_path, user_id)
    # Skip stray files that sit next to the participant folders; os.listdir on
    # a non-directory would otherwise abort the whole run.
    if not os.path.isdir(person_dir):
        continue

    for file in os.listdir(person_dir):
        file_path = os.path.join(person_dir, file)

        if not file.endswith(".vhdr"):
            continue

        # The markers are read from the sibling .vmrk file. Without it there is
        # nothing to epoch on, so skip the recording up front instead of running
        # the expensive preprocessing and then failing (or, worse, silently
        # reusing the `annotations` left over from the previous recording).
        vmrk_file = file.replace('.vhdr', '.vmrk')
        vmrk_file_path = os.path.join(person_dir, vmrk_file)
        if not os.path.exists(vmrk_file_path):
            warnings.warn(f"No marker file found for {file_path}; skipping this recording.")
            continue

        # Read the EEG data
        raw = mne.io.read_raw_brainvision(file_path, preload=True, ignore_marker_types=True, verbose=False)

        # 1. Resample to 500 Hz
        raw.resample(500, npad="auto")

        # 2. Band-pass filter 0.1-30 Hz (removes slow drifts and high-frequency noise)
        raw.filter(l_freq=0.1, h_freq=30)

        # 3. Notch filter at 50 Hz (power line noise)
        # NOTE(review): after the 30 Hz low-pass above this is probably redundant - confirm.
        raw.notch_filter(freqs=50, picks='all')

        # 4. ICA for blink removal
        # NOTE(review): picks='eeg' only leaves the 'EOG' channel out of the fit if
        # the reader typed it as EOG - verify the channel type (see the `eog`
        # argument of read_raw_brainvision).
        ica = ICA(n_components=20, method='fastica', max_iter=2000, random_state=97)
        ica.fit(raw, picks='eeg')

        # Flag the components that correlate with the EOG channel and remove them
        eog_indices, eog_scores = ica.find_bads_eog(raw, ch_name='EOG', threshold=2.0)
        ica.exclude = eog_indices
        raw = ica.apply(raw)
        raw.drop_channels(['EOG'])  # Auxiliary channel is no longer needed after cleaning

        # 5. Linear detrend along the time axis (axis=1).
        # This writes to the private `_data` buffer of the preloaded Raw object.
        raw._data = signal.detrend(raw._data, axis=1)

        # Read the annotations (markers) from the .vmrk file
        annotations = mne.annotations.read_annotations(vmrk_file_path)

        # Convert raw data to DataFrame
        time_series = raw.to_data_frame()

        # One row per marker, with the time window to extract around it
        marker_times = annotations.onset  # In seconds
        marker_labels = annotations.description  # The marker labels
        markers_df = pd.DataFrame({
            'event_id': np.arange(len(marker_labels)),
            'start': marker_times - 3.0,
            'end': marker_times + 1.0,
            'marker': marker_labels
        })

        # Drop technical markers that do not correspond to task events
        markers_df = markers_df[
            ~markers_df.marker.isin(['Marker/Impedance', 'New Segment/'])
        ]

        # Round times to 3 decimal places so they compare cleanly with the window bounds
        time_series['time'] = time_series['time'].round(3)

        # Attach to every marker all samples with start <= time <= end
        merged_df = janitor.conditional_join(
            markers_df,
            time_series,
            ('start', 'time', '<='),
            ('end', 'time', '>='),
            how='left',
            df_columns=['event_id', 'marker']
        )

        # NOTE(review): the output name depends on user_id only, so a folder with
        # several .vhdr files keeps just the last one - confirm this is intended.
        merged_path = os.path.join(final_dataset_path, f"{user_id}.parquet")
        merged_df.to_parquet(merged_path)


In [3]:
import os
# File names present in the reference copy are the only ones allowed to stay.
ok = os.listdir('/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset copy')

# Delete every file in the working folder that has no counterpart in the reference copy.
cleaned_dir = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset'
for file in os.listdir(cleaned_dir):
    if file in ok:
        continue
    os.remove(os.path.join(cleaned_dir, file))

In [None]:
# Folder holding the per-participant Parquet files
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/final_dataset'

# Full path of every *.parquet entry in that folder (in os.listdir order)
parquet_files = [
    os.path.join(parquet_directory, entry)
    for entry in os.listdir(parquet_directory)
    if entry.endswith('.parquet')
]


def restore_stimulus_direction(pdf, event_col='event_id', marker_col='marker', time_col='time'):
    """
    Rewrite turn markers as 'Left'/'Right' and attach the previous row's turn label.

    Processing steps:
      1) Sort by ``time_col`` and remove all 'Stimulus/s16' rows.
      2) Per event, take the first 'Stimulus/1' / 'Stimulus/2' row as the base
         direction (+1 for 'Stimulus/1', -1 for 'Stimulus/2').
      3) Multiply the base direction by -1 for every 'Stimulus/A' row seen so far
         in the event (cumulative count, current row included).
      4) Relabel rows whose marker is Stimulus/1, /2, /A or /P as 'Left'
         (direction == 1) or 'Right' (otherwise). Rows of events that have no
         base direction keep their original marker.
      5) ``prev_turn`` = marker of the immediately preceding row of the same
         event if that row is 'Left'/'Right', else NaN.
      6) Drop rows whose ``prev_turn`` is NaN. This removes the first row of
         every event, any row that directly follows a non-turn row, and every
         row of events without a base direction.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Must contain ``event_col``, ``marker_col`` and ``time_col``. Not modified.
    event_col, marker_col, time_col : str
        Column names of the event id, the marker label and the timestamp.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``time_col`` with the original columns (markers
        rewritten) plus ``prev_turn``; the original index labels are kept.
    """
    # Sort, filter, and only then copy: copying *after* the boolean filter gives
    # us a frame we own, so the assignments below never hit a filtered slice
    # (which triggers SettingWithCopyWarning on pandas without copy-on-write).
    pdf = pdf.sort_values(time_col)
    pdf = pdf[pdf[marker_col] != 'Stimulus/s16'].copy()

    event_keys = pdf[event_col]

    # Base direction of each event: its first Stimulus/1 or Stimulus/2 row
    is_start_marker = pdf[marker_col].isin(['Stimulus/1', 'Stimulus/2'])
    first_turn_dir = (
        pdf.loc[is_start_marker]
           .groupby(event_col)[marker_col]
           .first()
           .map({'Stimulus/1': 1, 'Stimulus/2': -1})
    )
    base_dir = event_keys.map(first_turn_dir)  # NaN for events without a 1/2 row

    # Every Stimulus/A row flips the direction from that row onwards
    flip = (pdf[marker_col] == 'Stimulus/A').astype('int64')
    flip_cum = flip.groupby(event_keys).cumsum()
    final_dir = base_dir * ((-1) ** flip_cum)

    # Relabel turn rows; rows without a resolvable direction are left as they are
    turn_markers = ['Stimulus/1', 'Stimulus/2', 'Stimulus/A', 'Stimulus/P']
    is_turn = pdf[marker_col].isin(turn_markers) & final_dir.notna()
    pdf.loc[is_turn, marker_col] = np.where(final_dir[is_turn] == 1, 'Left', 'Right')

    # Turn label per row (NaN for non-turn rows), then shift by one row within the event
    turn_dir = pd.Series(
        np.where(pdf[marker_col].isin(['Left', 'Right']), pdf[marker_col], np.nan),
        index=pdf.index,
    )
    pdf['prev_turn'] = turn_dir.groupby(event_keys).shift(1)

    # In-place is safe here because `pdf` is our private copy; it also keeps the
    # returned frame free of the "is a copy" flag for callers that add columns.
    pdf.dropna(subset=['prev_turn'], inplace=True)

    return pdf


# Collect one processed frame per participant file.
# NOTE: despite the historical name, these are plain pandas DataFrames, not Dask.
dask_dfs = []

for file in tqdm(parquet_files):
    # Load one participant's windows and convert the markers to Left/Right
    df = pd.read_parquet(file)
    df = restore_stimulus_direction(df)

    # File name without its extension. splitext strips only the trailing
    # '.parquet'; str.replace would also remove it from the middle of a name and
    # could make two different files produce the same event-id suffix.
    file_name = os.path.splitext(os.path.basename(file))[0]

    # Event ids restart at 0 in every file, so qualify them with the file name
    df['event_id'] = df['event_id'].astype(str) + '_' + file_name

    dask_dfs.append(df)

# Fail with a clear message instead of pandas' generic "No objects to concatenate"
if not dask_dfs:
    raise ValueError(f"No .parquet files found in {parquet_directory}")

# Stack all participants into a single pandas DataFrame
combined_df = pd.concat(dask_dfs, ignore_index=True)

# Renumber the qualified event ids as consecutive integers in order of first appearance
unique_event_ids = combined_df['event_id'].unique()
event_id_mapping = {event_id: idx for idx, event_id in enumerate(unique_event_ids)}
combined_df['event_id'] = combined_df['event_id'].map(event_id_mapping).astype('int64')

# Save the combined DataFrame to a single Parquet file
combined_df.to_parquet('/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet')


  0%|          | 0/21 [00:00<?, ?it/s]

In [None]:
# Location of the combined dataset (a single Parquet file, despite the variable name)
parquet_directory = '/home/owner/Documents/DEV/BrainLabyrinth/data/combined.parquet'

# Load it into a pandas DataFrame
df = pd.read_parquet(parquet_directory)

# Dimensions of the loaded frame
rows_computed, cols = df.shape
rows = rows_computed

# Report the shape
print(f"Shape of the final dataset: ({rows_computed}, {cols})")

# Show the first rows and the column index as a sanity check
print(df.head())
df.columns


Shape of the final dataset: (79960, 67)
   event_id marker   time        Fp1        Fpz       Fp2        F7        F3  \
0         0   Left  9.440  18.279552  17.167071  1.736013  2.509046  3.320807   
1         0   Left  9.442  18.187657  17.195618  1.650717  2.619466  3.746728   
2         0   Left  9.444  17.988738  17.064795  1.536693  2.780392  4.108368   
3         0   Left  9.446  17.697904  16.795464  1.420282  2.984275  4.421709   
4         0   Left  9.448  17.339646  16.426039  1.325767  3.213150  4.703400   

          Fz        F4  ...        PO4       PO6       FT7       FT8  \
0  11.627260  6.565130  ... -10.643972  4.269458 -6.793052 -2.421540   
1  12.211854  7.602871  ... -11.330150  2.269126 -7.381371 -1.549136   
2  12.665638  8.477080  ... -12.088001  0.015087 -7.783466 -0.885222   
3  12.995026  9.112953  ... -12.876470 -2.320403 -7.966721 -0.441055   
4  13.219588  9.470200  ... -13.659191 -4.564826 -7.926686 -0.196698   

         TP7       TP8        PO7       

Index(['event_id', 'marker', 'time', 'Fp1', 'Fpz', 'Fp2', 'F7', 'F3', 'Fz',
       'F4', 'F8', 'FC5', 'FC1', 'FC2', 'FC6', 'M1', 'T7', 'C3', 'Cz', 'C4',
       'T8', 'M2', 'CP5', 'CP1', 'CP2', 'CP6', 'P7', 'P3', 'Pz', 'P4', 'P8',
       'POz', 'O1', 'O2', 'AF7', 'AF3', 'AF4', 'AF8', 'F5', 'F1', 'F2', 'F6',
       'FC3', 'FCz', 'FC4', 'C5', 'C1', 'C2', 'C6', 'CP3', 'CP4', 'P5', 'P1',
       'P2', 'P6', 'PO5', 'PO3', 'PO4', 'PO6', 'FT7', 'FT8', 'TP7', 'TP8',
       'PO7', 'PO8', 'Oz', 'prev_turn'],
      dtype='object')