In [1]:
import os
import numpy as np
import pandas as pd
from scipy import interpolate
from scipy.signal import resample

In [None]:
SAMPLE_RATE = 128  # fs
SAMPLE_LEN = 128   # T

In [2]:
# root dir
root = 'TDBrain\derivatives'
# participants file path
ptc_path = os.path.join(root, 'TDBRAIN_participants_V2.tsv')
ptc = pd.read_csv(ptc_path, sep='\t')
ptc

Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,Consent,sessSeason,sessTime,Responder,Remitter,...,BDI_post,rTMS PROTOCOL,ADHD_pre_Hyp_leading,ADHD_pre_Att_leading,ADHD_post_Att_leading,ADHD_post_Hyp_leading,NF Protocol,YBOCS_pre,YBOCS_post,Unnamed: 110
0,sub-19681349,REPLICATION,REPLICATION,REPLICATION,REPLICATION,YES,REPLICATION,REPLICATION,REPLICATION,REPLICATION,...,REPLICATION,2,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,
1,sub-19681385,REPLICATION,REPLICATION,REPLICATION,REPLICATION,YES,REPLICATION,REPLICATION,REPLICATION,REPLICATION,...,REPLICATION,1,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,
2,sub-19684666,REPLICATION,REPLICATION,REPLICATION,REPLICATION,YES,REPLICATION,REPLICATION,REPLICATION,REPLICATION,...,REPLICATION,1,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,
3,sub-19686324,REPLICATION,REPLICATION,REPLICATION,REPLICATION,YES,REPLICATION,REPLICATION,REPLICATION,REPLICATION,...,REPLICATION,2,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,
4,sub-19687321,REPLICATION,REPLICATION,REPLICATION,REPLICATION,YES,REPLICATION,REPLICATION,REPLICATION,REPLICATION,...,REPLICATION,3,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,REPLICATION,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,sub-88078837,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,
1346,sub-88079017,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,
1347,,,,,,,,,,,...,,,,,,,,,,
1348,,,,,,,,,,,...,,,,,,,,,,


## Labels

In [3]:
# delete invalid participants and the participants with replication indication(nan, replication)
# As described in their paper, the replication data is used for validation and test for the model. 
# You can contact the corresponding author of the TDBrain paper to verify the accuracy of the model about the replication data.
ptc = ptc[~(ptc['participants_ID'].isna())]
ptc = ptc[~(ptc['indication'].isna())]
ptc = ptc[~(ptc['indication']=='REPLICATION')]
# capitalize the indication uniformly
ptc['indication'] = ptc['indication'].str.upper()
# remove invalid space placeholder of disease label
ptc['indication'] = ptc['indication'].str.strip()
ptc

Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,Consent,sessSeason,sessTime,Responder,Remitter,...,BDI_post,rTMS PROTOCOL,ADHD_pre_Hyp_leading,ADHD_pre_Att_leading,ADHD_post_Att_leading,ADHD_post_Hyp_leading,NF Protocol,YBOCS_pre,YBOCS_post,Unnamed: 110
120,sub-87963457,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,afternoon,,,...,,,,,,,,,,
121,sub-87963593,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,morning,,,...,,,,,,,,,,
122,sub-87963725,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
123,sub-87963769,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
124,sub-87964717,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,sub-88078657,DISCOVERY,INSOMNIA,INSOMNIA,,YES,fall,,,,...,,,,,,,,,,
1343,sub-88078749,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,
1344,sub-88078793,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,
1345,sub-88078837,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,


In [4]:
# Get duplicate values of participants_id
l = ptc.drop_duplicates(['participants_ID'],keep=False)['participants_ID'].to_list()
dup_ptc = ptc[~(ptc['participants_ID'].isin(l))]
dup_ptc

Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,Consent,sessSeason,sessTime,Responder,Remitter,...,BDI_post,rTMS PROTOCOL,ADHD_pre_Hyp_leading,ADHD_pre_Att_leading,ADHD_post_Att_leading,ADHD_post_Hyp_leading,NF Protocol,YBOCS_pre,YBOCS_post,Unnamed: 110
124,sub-87964717,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
125,sub-87964717,DISCOVERY,SMC,UNKNOWN,,YES,summer,,,,...,,,,,,,,,,
131,sub-87966293,DISCOVERY,SMC,UNKNOWN,,YES,fall,afternoon,,,...,,,,,,,,,,
132,sub-87966293,DISCOVERY,SMC,UNKNOWN,,YES,winter,afternoon,,,...,,,,,,,,,,
133,sub-87966293,DISCOVERY,SMC,UNKNOWN,,YES,spring,morning,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
958,sub-88047289,DISCOVERY,DEPERSONALIZATION,UNKNOWN,,YES,summer,,,,...,,,,,,,,,,
962,sub-88047649,DISCOVERY,OCD,OCD,OCD,YES,summer,,1,1,...,6,7,,,,,,23,12,
963,sub-88047649,DISCOVERY,OCD,OCD,OCD,YES,summer,,1,1,...,6,7,,,,,,23,12,
972,sub-88048193,DISCOVERY,OCD,OCD,OCD,YES,summer,,1,0,...,0,7,,,,,,22,13,


In [5]:
# Test if the same participant has different indications for different sessions
for i in range(len(dup_ptc)):
    temp = dup_ptc[dup_ptc['participants_ID']==dup_ptc['participants_ID'].iloc[i]].drop_duplicates(['indication'])
    if len(temp)!=1:
        print('The different sessions in the same participant may have different indications!')

In [6]:
# Drop duplicate participants
ptc = ptc.drop_duplicates(['participants_ID'])
ptc

Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,Consent,sessSeason,sessTime,Responder,Remitter,...,BDI_post,rTMS PROTOCOL,ADHD_pre_Hyp_leading,ADHD_pre_Att_leading,ADHD_post_Att_leading,ADHD_post_Hyp_leading,NF Protocol,YBOCS_pre,YBOCS_post,Unnamed: 110
120,sub-87963457,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,afternoon,,,...,,,,,,,,,,
121,sub-87963593,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,morning,,,...,,,,,,,,,,
122,sub-87963725,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
123,sub-87963769,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
124,sub-87964717,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,sub-88078657,DISCOVERY,INSOMNIA,INSOMNIA,,YES,fall,,,,...,,,,,,,,,,
1343,sub-88078749,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,
1344,sub-88078793,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,
1345,sub-88078837,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,


In [7]:
# EEG data preprocessing

# all disease mapping
all_dise_dict = {}
ls_ind = list(ptc['indication'].unique())
for i in range(len(ls_ind)):
    all_dise_dict[ls_ind[i]] = i
print(all_dise_dict)
    
ptc['indication_id'] = ptc['indication'].map(all_dise_dict)
ptc

{'BURNOUT': 0, 'SMC': 1, 'HEALTHY': 2, 'DYSLEXIA': 3, 'CHRONIC PAIN': 4, 'MDD': 5, 'ADHD': 6, 'ADHD/ASPERGER': 7, 'PDD NOS/DYSLEXIA': 8, 'PDD NOS': 9, 'WHIPLASH': 10, 'ANXIETY': 11, 'ADHD/DYSLEXIA': 12, 'ASD': 13, 'TINNITUS': 14, 'OCD': 15, 'PANIC': 16, 'MDD/ANXIETY': 17, 'MIGRAINE': 18, 'PDD NOS/ANXIETY': 19, 'PARKINSON': 20, 'BIPOLAR': 21, 'MDD/BIPOLAR': 22, 'DYSPRAXIA': 23, 'TINNITUS/MDD': 24, 'ADHD/ASD/ANXIETY': 25, 'MDD/ADHD': 26, 'ADHD/PDD NOS': 27, 'ASPERGER': 28, 'ADHD/EPILEPSY': 29, 'MDD/PAIN': 30, 'PDD NOS/GTS': 31, 'PDD NOS/ADHD': 32, 'PDD NOS/ASD': 33, 'TBI': 34, 'ADHD/ANXIETY': 35, 'ADHD/DYSLEXIA/DYSCALCULIA': 36, 'ADHD/MDD': 37, 'MDD/PANIC': 38, 'DEPERSONALIZATION': 39, 'MDD/TRAUMA': 40, 'PTSD/ADHD': 41, 'OCD/DPS': 42, 'MDD/OCD': 43, 'MDD/TUMOR': 44, 'ADHD/GTS': 45, 'OCD/MDD': 46, 'CONVERSION DX': 47, 'ASD/ASPERGER': 48, 'MDD/ADHD/LYME': 49, 'ADHD/OCD': 50, 'MSA-C': 51, 'OCD/ASD': 52, 'STROKE/PAIN': 53, 'STROKE': 54, 'MDD/OCD/ADHD': 55, 'EPILEPSY/OCD': 56, 'INSOMNIA': 57,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptc['indication_id'] = ptc['indication'].map(all_dise_dict)


Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,Consent,sessSeason,sessTime,Responder,Remitter,...,rTMS PROTOCOL,ADHD_pre_Hyp_leading,ADHD_pre_Att_leading,ADHD_post_Att_leading,ADHD_post_Hyp_leading,NF Protocol,YBOCS_pre,YBOCS_post,Unnamed: 110,indication_id
120,sub-87963457,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,afternoon,,,...,,,,,,,,,,0
121,sub-87963593,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,morning,,,...,,,,,,,,,,0
122,sub-87963725,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,1
123,sub-87963769,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,1
124,sub-87964717,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,sub-88078657,DISCOVERY,INSOMNIA,INSOMNIA,,YES,fall,,,,...,,,,,,,,,,57
1343,sub-88078749,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,57
1344,sub-88078793,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,57
1345,sub-88078837,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,,57


In [8]:
"""# filter subjects with PARKINSON and healthy indication
# 25 PARKINSON & 47 healthy 
ptc = ptc[((ptc['indication_id']==20)|(ptc['indication_id']==2))]
ptc['is_parkinson'] = 0
ptc['is_parkinson'][ptc['indication_id']==20] = 1
ptc"""

"# filter subjects with PARKINSON and healthy indication\n# 25 PARKINSON & 47 healthy \nptc = ptc[((ptc['indication_id']==20)|(ptc['indication_id']==2))]\nptc['is_parkinson'] = 0\nptc['is_parkinson'][ptc['indication_id']==20] = 1\nptc"

In [9]:
# label.npy
# initialization column 'subject_id'
ptc['subject_id'] = np.arange(1, ptc.shape[0] + 1)
ptc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ptc['subject_id'] = np.arange(1, ptc.shape[0] + 1)


Unnamed: 0,participants_ID,DISC/REP,indication,formal_status,Dataset,Consent,sessSeason,sessTime,Responder,Remitter,...,ADHD_pre_Hyp_leading,ADHD_pre_Att_leading,ADHD_post_Att_leading,ADHD_post_Hyp_leading,NF Protocol,YBOCS_pre,YBOCS_post,Unnamed: 110,indication_id,subject_id
120,sub-87963457,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,afternoon,,,...,,,,,,,,,0,1
121,sub-87963593,DISCOVERY,BURNOUT,BURNOUT,,YES,spring,morning,,,...,,,,,,,,,0,2
122,sub-87963725,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,1,3
123,sub-87963769,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,1,4
124,sub-87964717,DISCOVERY,SMC,UNKNOWN,,YES,summer,afternoon,,,...,,,,,,,,,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,sub-88078657,DISCOVERY,INSOMNIA,INSOMNIA,,YES,fall,,,,...,,,,,,,,,57,907
1343,sub-88078749,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,57,908
1344,sub-88078793,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,57,909
1345,sub-88078837,DISCOVERY,INSOMNIA,INSOMNIA,,YES,winter,,,,...,,,,,,,,,57,910


In [10]:

df_label = ptc[['indication_id', 'subject_id']]
label_path = 'Processed/TDBrain-19/Label'
if not os.path.exists(label_path):
    os.makedirs(label_path)
    
np.save(label_path + '/label.npy', df_label.values)

In [11]:
np.load('Processed/TDBrain-19/Label/label.npy')

array([[  0,   1],
       [  0,   2],
       [  1,   3],
       ...,
       [ 57, 909],
       [ 57, 910],
       [ 57, 911]], dtype=int64)

## Features

In [12]:
# resample the time series data from original_fs to target_fs
def resample_time_series(data, original_fs, target_fs):
    T, C = data.shape
    new_length = int(T * target_fs / original_fs)
    
    resampled_data = np.zeros((new_length, C))
    for i in range(C):
        resampled_data[:, i] = resample(data[:, i], new_length)
        
    return resampled_data

# split the EEG data into segments of length segment_length, dropping the last segment if it is shorter than segment_length
def split_eeg_segments(data, segment_length=128):
    T, C = data.shape
    num_segments = T // segment_length
    reshaped_data = data[:num_segments * segment_length].reshape(num_segments, segment_length, C)

    return reshaped_data

In [13]:
# 911 valid subjects
# Unified sampling rate: 500Hz
# Unified channels (26 EEG electrodes, 7 additional electrodes). 
# We only use 19 EEG electrodes in 10-20 systems: ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'T3', 'C3', 'Cz', 'C4', 'T4', 'T5', 'P3', 'Pz', 'P4', 'T6', 'O1', 'O2']. 
# T3 and T4 is not in these channels, we use T7 and T8 instead, the two closest electrodes to T3 and T4.
# Initial data downloaded from TDBrain database: All the EEG signals are preprocessed and saved as CSV files
# Data structure: subject-session-task(EC/EO)

feature_path = 'Processed/TDBrain-19/Feature'
if not os.path.exists(feature_path):
    os.makedirs(feature_path)

sub_id = 1
ls_ptc = ptc['participants_ID'].to_list()
for sub in os.listdir(root):
    print(sub)
    print("Effective subject number: ", sub_id)
    # select valid subjects
    if sub in ls_ptc:
        sub_path = os.path.join(root, sub) 
        li_sub = []  # a list to store the EEG data of EO/EC tasks in all sessions of one subject
        for ses in os.listdir(sub_path):
            print(ses)
            print('-----------')
            ses_path = os.path.join(sub_path, ses, 'eeg')
            for task in os.listdir(ses_path):
                if 'EC' in task:
                    print("Eye closed")
                else:
                    print("Eye open")
                task_path = os.path.join(ses_path, task)
                # T7 and T8 is the same position as T3 and T4 
                # P7 and P8 is the same position as T5 and T6
                columns_to_read = ['Fp1', 'Fp2', 'F7', 'F3', 'Fz', 'F4', 'F8', 'T7', 'C3', 'Cz', 'C4', 'T8', 'P7', 'P3', 'Pz', 'P4', 'P8', 'O1', 'O2']
                data = pd.read_csv(task_path, usecols=columns_to_read).to_numpy()
                print("Raw data shape ", data.shape)
                data = resample_time_series(data, 500, SAMPLE_RATE)
                task_array = split_eeg_segments(data, SAMPLE_LEN)
                print("Downsampling and segmented data shape ", task_array.shape)
                li_sub.append(task_array)
                print('--------------------------------------------')
            
        feature_array = np.concatenate(li_sub, axis=0)
        print("Subject array shape ", feature_array.shape)
        np.save(feature_path + '/feature_{:03d}.npy'.format(sub_id), feature_array)
        sub_id += 1
        print('--------------------------------------------------------------------------\n')
    else:
        print(" ", sub, " is not in the list of valid subjects.")
        print('--------------------------------------------------------------------------\n')
        continue

sub-19681349
  sub-19681349  is not in the list of valid subjects.
--------------------------------------------------------------------------

sub-19681385
  sub-19681385  is not in the list of valid subjects.
--------------------------------------------------------------------------

sub-19684666
  sub-19684666  is not in the list of valid subjects.
--------------------------------------------------------------------------

sub-19686324
  sub-19686324  is not in the list of valid subjects.
--------------------------------------------------------------------------

sub-19687321
  sub-19687321  is not in the list of valid subjects.
--------------------------------------------------------------------------

sub-19687396
  sub-19687396  is not in the list of valid subjects.
--------------------------------------------------------------------------

sub-19690494
  sub-19690494  is not in the list of valid subjects.
--------------------------------------------------------------------------


In [14]:
# Test the saved npy file
# example
np.load('Processed/TDBrain-19/Feature/feature_016.npy').shape

(240, 128, 19)