# Appendix A: Data Cleaning

To simplify the final project output, data cleaning is done in this notebook, which exports the final data used for analysis as a `*.csv` file.

This notebook iterates through all available subjects and sessions (some subjects have multiple EEG scans/sessions) and captures every onset epoch (the only event we care about), defined as the window from 10 seconds before onset to 5 seconds after it. Then, for each epoch, the difference from time 0 s is taken with a vector-wise operation, and the results are combined into one large dataframe. A 1/0 indicator column is appended to the dataframe to mark whether each row corresponds to an expert (1) or a nonexpert (0). The final dataframe is then exported as a CSV file for use in the analysis.

In [1]:
import mne
import pandas as pd

### Define the indices of the available subjects and sessions

In [2]:
# Subject IDs are zero-padded to three digits: 001-012 are nonexperts, 013-024 are experts.
nonexperts = [f'{subject_number:03d}' for subject_number in range(1, 13)]
experts = [f'{subject_number:03d}' for subject_number in range(13, 25)]
# Session IDs are zero-padded to two digits; a subject may have only some of them.
sessions = [f'{session_number:02d}' for session_number in range(1, 4)]

### A function to load BDF files for EEG measures

In [3]:
def load_bdf_file(raw_fname, tmin=-10, tmax=5):
    """Load one BDF recording and return the epochs around each question onset.

    Parameters
    ----------
    raw_fname : str
        Path to the ``.bdf`` file to read.
    tmin : float, optional
        Start of each epoch relative to the onset event, passed to
        ``mne.Epochs``. Defaults to -10.
    tmax : float, optional
        End of each epoch relative to the onset event, passed to
        ``mne.Epochs``. Defaults to 5.

    Returns
    -------
    The subset of ``mne.Epochs`` labelled
    'First question onset (most important marker)' (event code 128).
    """
    from os.path import abspath

    raw = mne.io.read_raw_bdf(raw_fname, preload=True)

    # Drop the auxiliary channels so only the A*/B* channels and 'Status' remain
    # (presumably these are the non-EEG sensors -- confirm against the recording setup).
    raw.drop_channels(['EXG1', 'EXG2', 'EXG3', 'EXG4', 'EXG5', 'EXG6', 'EXG7', 'EXG8',
                       'GSR1', 'GSR2', 'Erg1', 'Erg2', 'Resp', 'Plet', 'Temp'])

    # Apply the montage once, from the layout file shipped next to the notebooks.
    # The earlier name-based set_montage('biosemi64alpha') call was immediately
    # overwritten by this one, so it has been removed.
    # NOTE(review): read_montage is not available in newer MNE releases
    # (read_custom_montage is the replacement) -- confirm against the pinned MNE version.
    montage = mne.channels.read_montage(abspath("../biosemi64alpha.txt"))
    raw.set_montage(montage)

    events = mne.find_events(raw, stim_channel='Status')

    # We really only care about the onset, so all other events are ignored.
    onset_label = 'First question onset (most important marker)'
    event_dict = {onset_label: 128}

    epochs = mne.Epochs(raw, events, event_id=event_dict, tmin=tmin, tmax=tmax, preload=True)
    # Kept from the original pipeline; with a single condition the captured logs
    # show that no epochs are dropped by this call.
    epochs.equalize_event_counts([onset_label])

    return epochs[onset_label]

### A function to transform epochs into a difference dataframe

In [4]:
def transform_dataframe(onset_epochs):
    """Convert onset epochs into one dataframe of differences from time 0.

    Parameters
    ----------
    onset_epochs
        Object whose ``to_data_frame()`` returns a frame indexed by
        (condition, epoch, time) that contains a 'Status' column, as produced
        by ``load_bdf_file``.

    Returns
    -------
    pandas.DataFrame
        For every epoch, each row minus that epoch's row at time 0, stacked in
        epoch order. The index holds the time values (repeated once per epoch)
        and the columns are the input columns without 'Status'.

    Raises
    ------
    KeyError
        If an epoch has no row at time 0.
    """
    onset_label = 'First question onset (most important marker)'

    # Keep only the onset condition (dropping that index level) and the signal columns.
    df = (
        onset_epochs.to_data_frame()
        .xs(onset_label, level=0)
        .drop(columns='Status')
    )
    epoch_ids = df.index.get_level_values(0).unique()

    # Take the vector-wise difference of every epoch relative to its time-0 row.
    # Frames are collected in a list and concatenated once: DataFrame.append in a
    # loop is quadratic and no longer exists in pandas >= 2.0.
    diff_frames = []
    for epoch_id in epoch_ids:
        epoch_df = df.loc[epoch_id]
        diff_frames.append(epoch_df - epoch_df.loc[0])

    if not diff_frames:
        # No epochs: return an empty frame that still carries the expected columns.
        return pd.DataFrame(columns=df.columns)

    return pd.concat(diff_frames)

### Iterate through all BDF files available and create final data

In [5]:
# Suppress warnings raised while loading and epoching the recordings
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

# Column names of the combined dataframe: A1..A32 followed by B1..B32.
# The combined data neglects individuals as well as continuous time.
channels = [f'{bank}{number}' for bank in ('A', 'B') for number in range(1, 33)]


def collect_group_data(subjects, expert_flag):
    """Load, transform and stack every available recording for one group.

    Parameters
    ----------
    subjects : list of str
        Zero-padded subject IDs to look for.
    expert_flag : int
        Value written to the 'expert' column (1 for experts, 0 for nonexperts).

    Returns
    -------
    pandas.DataFrame
        The transformed epochs of all existing subject/session files, with an
        added 'expert' column. An empty frame with the channel columns is
        returned when no file exists.
    """
    frames = []
    for subj in subjects:
        for ses in sessions:
            raw_fname = f'../rawdata/bidsexport/sub-{subj}/ses-{ses}/eeg/sub-{subj}_ses-{ses}_task-meditation_eeg.bdf'

            # Not every subject has every session; skip recordings that are absent.
            if Path(raw_fname).exists():
                frames.append(transform_dataframe(load_bdf_file(raw_fname)))

    # Concatenate once instead of growing a frame with DataFrame.append in the loop
    # (quadratic, and no longer available in pandas >= 2.0).
    if frames:
        group_data = pd.concat(frames)
    else:
        group_data = pd.DataFrame(columns=channels)

    group_data['expert'] = expert_flag
    return group_data


nonexpert_data = collect_group_data(nonexperts, 0)
expert_data = collect_group_data(experts, 1)

final_data = pd.concat([nonexpert_data, expert_data])
final_data

Extracting EDF parameters from /Users/yuyang.zhong/eeg/rawdata/bidsexport/sub-001/ses-01/eeg/sub-001_ses-01_task-meditation_eeg.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 696575  =      0.000 ...  2720.996 secs...
Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orphaned offset at the beginning of the file.
87 events found
Event IDs: [  2   4 128]
28 matching events found
Applying baseline correction (mode: mean)
Not setting metadata
0 projection items activated
Loading data for 28 events and 3841 original time points ...
0 bad epochs dropped
Dropped 0 epochs
Converting "time" to "<class 'numpy.int64'>"...
Extracting EDF parameters from /Users/yuyang.zhong/eeg/rawdata/bidsexport/sub-001/ses-02/eeg/sub-001_ses-02_task-meditation_eeg.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 695551  =      0.000 .

Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 462335  =      0.000 ...  1805.996 secs...
Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orphaned offset at the beginning of the file.
54 events found
Event IDs: [  2   4   8 128 254]
18 matching events found
Applying baseline correction (mode: mean)
Not setting metadata
0 projection items activated
Loading data for 18 events and 3841 original time points ...
0 bad epochs dropped
Dropped 0 epochs
Converting "time" to "<class 'numpy.int64'>"...
Extracting EDF parameters from /Users/yuyang.zhong/eeg/rawdata/bidsexport/sub-007/ses-01/eeg/sub-007_ses-01_task-meditation_eeg.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 694783  =      0.000 ...  2713.996 secs...
Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orph

Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orphaned offset at the beginning of the file.
70 events found
Event IDs: [  2   4   8 128]
25 matching events found
Applying baseline correction (mode: mean)
Not setting metadata
0 projection items activated
Loading data for 25 events and 3841 original time points ...
0 bad epochs dropped
Dropped 0 epochs
Converting "time" to "<class 'numpy.int64'>"...
Extracting EDF parameters from /Users/yuyang.zhong/eeg/rawdata/bidsexport/sub-014/ses-01/eeg/sub-014_ses-01_task-meditation_eeg.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 697599  =      0.000 ...  2724.996 secs...
Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orphaned offset at the beginning of the file.
61 events found
Event IDs: [  2   4   8 128]
24 matching events found
Applying basel

Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orphaned offset at the beginning of the file.
140 events found
Event IDs: [  2   4   8 128 254]
37 matching events found
Applying baseline correction (mode: mean)
Not setting metadata
0 projection items activated
Loading data for 37 events and 3841 original time points ...
0 bad epochs dropped
Dropped 0 epochs
Converting "time" to "<class 'numpy.int64'>"...
Extracting EDF parameters from /Users/yuyang.zhong/eeg/rawdata/bidsexport/sub-022/ses-01/eeg/sub-022_ses-01_task-meditation_eeg.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 371711  =      0.000 ...  1451.996 secs...
Trigger channel has a non-zero initial value of 65536 (consider using initial_event=True to detect this event)
Removing orphaned offset at the beginning of the file.
38 events found
Event IDs: [  2   4   8 128]
15 matching events found
Applying 

channel,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,B24,B25,B26,B27,B28,B29,B30,B31,B32,expert
-10000,96.843571,255.093279,198.843383,156.562211,24.624954,23.906206,208.718364,32.374940,-53.718651,98.999817,...,70.843619,66.187378,53.156152,34.093687,-68.374874,64.562381,5.906239,-23.968706,-5.937489,0
-9996,98.656068,256.218277,203.499624,160.124704,21.187461,24.312455,208.405865,33.812438,-52.968652,101.656062,...,72.718616,66.874876,55.468648,37.281181,-64.937380,72.281116,7.281237,-22.562458,-5.343740,0
-9992,111.718544,267.093256,214.655853,171.030934,30.999943,34.437436,216.187101,41.343674,-45.031167,108.937299,...,79.187354,72.718616,62.999884,46.249915,-55.906147,84.374844,13.624975,-16.749969,-5.312490,0
-9988,114.624788,267.718255,216.562100,173.437180,43.562420,38.968678,215.468352,42.437422,-42.718671,110.093547,...,81.187350,74.843612,66.374877,51.281155,-51.124906,88.968586,17.468718,-14.374973,-3.812493,0
-9984,112.281043,263.874512,212.499607,169.062188,42.281172,32.062441,211.499609,40.437425,-46.593664,105.937304,...,81.062350,75.156111,68.031124,54.312400,-49.281159,89.999834,21.843710,-11.968728,4.281242,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4984,-79.999852,78.749854,-120.999776,-67.749875,-70.874869,-86.624840,-3.843743,73.499864,-68.531123,-12.156228,...,-49.624908,-38.781178,-54.406149,-13.937474,-42.562421,24.499955,190.812147,13.406225,-76.093609,1
4988,-83.624845,74.593612,-124.843519,-71.749867,-73.218615,-87.124839,-7.249987,70.062371,-70.718619,-13.937474,...,-50.656156,-41.781173,-58.906141,-16.874969,-45.531166,17.406218,183.155912,8.249985,-83.093596,1
4992,-87.906088,69.781121,-130.187259,-75.624860,-76.624858,-90.374833,-12.593727,66.781127,-71.343618,-18.406216,...,-50.343657,-44.156168,-60.781138,-20.593712,-50.312407,12.031228,175.843425,4.499992,-88.374837,1
4996,-93.999826,63.687382,-134.437252,-81.906099,-83.124846,-96.374822,-18.312466,62.499885,-75.406111,-27.062450,...,-54.937398,-47.218663,-65.218629,-28.499947,-61.281137,3.656243,168.843438,-1.374997,-95.968573,1


### Exporting to CSV

In [6]:
# Write the combined expert/nonexpert dataframe to disk for the analysis notebooks.
# to_csv keeps the index by default, so it is written as the first CSV column.
final_data.to_csv('../final_data.csv')