In [1]:
# Setup
import xarray as xr
import mne
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mne_bids
from pathlib import Path
from autoreject import AutoReject
from autoreject import get_rejection_threshold

## cut noisy parts
def cut_noisy(raw, task, language):
    """
    Crop a recording to the clean time window of its task.

    First step of preprocessing in this pipeline: it is important to get rid
    of the noisy parts of the baseline and experience segments. The windows
    used here were informed by the audio files of the experiment.

    Parameters
    ----------
    raw : mne.io.Raw
        EEG raw data. It is cropped through ``raw.crop`` and the same object
        is returned, so the caller's object is modified.
    task : str
        One of 'baseline1', 'baseline2', 'experience1' ... 'experience4'.
    language : str
        Language of the experiment, 'eng' or 'hun'.

    Returns
    -------
    mne.io.Raw
        The ``raw`` object that was passed in, after cropping.

    Raises
    ------
    ValueError
        If ``task`` or ``language`` is not one of the supported values.
        (``ValueError`` is a subclass of ``Exception``, so callers that
        catch ``Exception`` keep working.)
    """
    # Local imports keep this helper self-contained when it is copied
    # between notebooks.
    import re
    from collections import namedtuple

    # validate task and language name *before* touching the data, so that a
    # wrong argument yields a clear message instead of an IndexError from the
    # interval lookup below
    if not re.fullmatch('(baseline[12])|(experience[1-4])', task):
        raise ValueError('Invalid task!')
    if language not in ('eng', 'hun'):
        raise ValueError(
            "Invalid language! Expected 'eng' or 'hun', got {!r}".format(language)
        )

    tasklang = task + language

    # helper named tuple to make the code more readable
    Interval = namedtuple('Interval', ['tmin', 'tmax'])

    # Pattern over "<task><language>" -> window handed to raw.crop.
    # Every window is 300 units long; only the start offset differs.
    cut_intervals = {
        '(experience[1-4](eng|hun))|baseline1hun': Interval(tmin=20, tmax=320),
        'baseline1eng': Interval(tmin=25, tmax=325),
        'baseline2(eng|hun)': Interval(tmin=30, tmax=330),
    }

    # After the validation above exactly one pattern matches.
    interval = next(
        window
        for pattern, window in cut_intervals.items()
        if re.fullmatch(pattern, tasklang)
    )

    raw.crop(tmin=interval.tmin, tmax=interval.tmax)

    return raw

In [None]:
# Scratch check: load the behavioral_id -> bids_id mapping and display it.
# (Variant tried earlier: ids_map.loc[:2132614].drop_duplicates())
a = pd.read_excel(
    'docs/ids_map.xlsx',
    header=1,
    index_col='behavioral_id',
)['bids_id']
# Cast the index labels to int64
a.index = a.index.astype('int64')
a

In [23]:
# Load the long-format behavioral data
data = pd.read_excel('docs/data_with_psds.xlsx', header=1, index_col='Unnamed: 0')

# For every row where hypnosis was introduced as real hypnosis
# (description_type == 'hypnosis' and trial_type is True), record its
# 1-based position inside its consecutive block of four rows.
truehypnosis_ind = [
    row_pos % 4 + 1
    for row_pos, (description, trial) in enumerate(
        zip(data['description_type'], data['trial_type'])
    )
    if description == 'hypnosis' and trial == True
]

# One entry per unique index label; keep only the first 50
# (removes two participants with a software crash)
truehypnosis_series = pd.Series(
    truehypnosis_ind,
    index=data.index.drop_duplicates(),
).iloc[:50]

# Load the ids map (should match the behavioral ids with the bids ids) and
# remove participants with a software crash: keep labels up to 2132614,
# then drop repeated bids ids
ids_map = (
    pd.read_excel('docs/ids_map.xlsx', header=1, index_col='behavioral_id')
    .bids_id
    .loc[:2132614]
    .drop_duplicates()
)
# TODO: the original note here was left unfinished ("change the ...")

In [None]:

## create dataset from data of two tasks of two persons
bids_root = Path('data/Main-study')
subjects = ['01', '02']
tasks = ['baseline1', 'experience1']
allSubjectsTasks = {}

# Expected shape of one cropped recording. These do not depend on the task,
# so they are defined once instead of on every loop iteration.
# NOTE(review): 30001 presumably corresponds to the 300-long crop window plus
# the inclusive end sample at the recording's sampling rate - confirm, and
# update if the commented-out resample step below is ever enabled.
n_channels = 61
timepoints = 30001

for task in tasks:

    # Collect the per-subject arrays in a list and join them once at the end.
    # Growing an array with np.append copies all previous data on every call
    # and needed an uninitialised placeholder row that had to be deleted.
    subjectArrays = []

    # Open data of a specific task for all subjects one by one and reshape them
    for sub in subjects:
        bids_path = mne_bids.BIDSPath(subject=sub, session='01', task=task, root=bids_root)
        raw = mne_bids.read_raw_bids(bids_path, verbose=False)
        # Cut noisy parts only for experience and baseline task
        if task[:-1] in ['baseline', 'experience']:
            # NOTE(review): the language is hard-coded to 'hun' for every
            # subject; cut_noisy uses a different window for 'baseline1' in
            # 'eng' - confirm all subjects here are Hungarian.
            raw = cut_noisy(raw, task, 'hun')

        # raw.resample(256)
        # Get eeg data as np.array and add subject dimension
        # TODO check if getting data this way can change data's precision
        oneSubject = np.expand_dims(raw.get_data(), 0)

        # Fail early with a readable message instead of numpy's generic
        # dimension-mismatch error at concatenation time.
        if oneSubject.shape != (1, n_channels, timepoints):
            raise ValueError(
                'Unexpected data shape {} for subject {} task {}; expected {}'.format(
                    oneSubject.shape, sub, task, (1, n_channels, timepoints)
                )
            )

        subjectArrays.append(oneSubject)

    # Join along the subject axis -> (n_subjects, n_channels, timepoints).
    # With no subjects, keep the empty (0, n_channels, timepoints) result.
    if subjectArrays:
        allSubjects = np.concatenate(subjectArrays, axis=0)
    else:
        allSubjects = np.empty((0, n_channels, timepoints))

    # mega data
    allSubjectsTasks[task] = allSubjects

## open dataset with mne

In [4]:
# Build an xarray Dataset from the per-task arrays collected in the dict
baseline = allSubjectsTasks['baseline1']
experience = allSubjectsTasks['experience1']

dataset = xr.Dataset(
    data_vars={
        "baseline": (["subject", "electrodes", "time"], baseline),
        "experience": (["subject", "electrodes", "time"], experience),
    }
)

# Save with zlib compression level 9 on every variable, then check the file
# size (if it is ok, all data will be used)
comp = {"zlib": True, "complevel": 9}
encoding = dict.fromkeys(dataset.data_vars, comp)
dataset.to_netcdf('data/dataset.nc', engine="h5netcdf", encoding=encoding)