In [1]:
import os
import mne
import numpy as np
import pandas as pd
from scipy import interpolate
import matplotlib.pyplot as plt
from scipy.signal import resample
import warnings
warnings.filterwarnings("ignore")

In [None]:
SAMPLE_RATE = 128  # fs
SAMPLE_LEN = 128   # T

In [2]:
# root dir
root = 'Depression/'
# participants file path
participants_path = os.path.join(root, 'participants.tsv')
participants = pd.read_csv(participants_path, sep='\t')
participants

Unnamed: 0,participant_id,Original_ID,sex,age,BDI,STAI,SCID,SCID_notes,HamD
0,sub-001,507,1.0,19.0,0.0,23.0,No Interview,,
1,sub-002,508,1.0,18.0,4.0,47.0,No Interview,,
2,sub-003,509,1.0,18.0,7.0,44.0,No Interview,,
3,sub-004,510,1.0,19.0,1.0,27.0,No Interview,,
4,sub-005,511,2.0,22.0,1.0,23.0,No Interview,,
...,...,...,...,...,...,...,...,...,...
117,sub-118,624,1.0,20.0,23.0,60.0,Current MDD,,21.0
118,sub-119,625,1.0,19.0,16.0,60.0,Past MDD,subsyndromal current,4.0
119,sub-120,626,1.0,18.0,14.0,41.0,Current MDD,,10.0
120,sub-121,627,2.0,19.0,30.0,47.0,Past MDD,,3.0


In [3]:
# available BDI
participants[participants['BDI'].isna()]

Unnamed: 0,participant_id,Original_ID,sex,age,BDI,STAI,SCID,SCID_notes,HamD
37,sub-038,544,,,,,No Interview,INVALID PARTICIPANT,


In [4]:
participants[participants['participant_id']=='sub-038']['BDI'] = -1
participants

Unnamed: 0,participant_id,Original_ID,sex,age,BDI,STAI,SCID,SCID_notes,HamD
0,sub-001,507,1.0,19.0,0.0,23.0,No Interview,,
1,sub-002,508,1.0,18.0,4.0,47.0,No Interview,,
2,sub-003,509,1.0,18.0,7.0,44.0,No Interview,,
3,sub-004,510,1.0,19.0,1.0,27.0,No Interview,,
4,sub-005,511,2.0,22.0,1.0,23.0,No Interview,,
...,...,...,...,...,...,...,...,...,...
117,sub-118,624,1.0,20.0,23.0,60.0,Current MDD,,21.0
118,sub-119,625,1.0,19.0,16.0,60.0,Past MDD,subsyndromal current,4.0
119,sub-120,626,1.0,18.0,14.0,41.0,Current MDD,,10.0
120,sub-121,627,2.0,19.0,30.0,47.0,Past MDD,,3.0


## Labels

In [5]:
labels = np.empty(shape=(participants.shape[0],2), dtype='int32')
labels.shape

(122, 2)

In [6]:
for i, participant in enumerate(participants.values):
    sub_id = int(participant[0].split('-')[1])
    if participant[0] != 'sub-038':
        label = int(participant[4])
    else:
        label = -1   # invalid BDI score
    labels[i,0] = label
    labels[i,1] = sub_id

In [7]:
label_path = 'Processed/Depression/Label'
if not os.path.exists(label_path):
    os.makedirs(label_path)
np.save(label_path + '/label.npy', labels)

In [8]:
np.load('Processed/Depression/Label/label.npy')

array([[  0,   1],
       [  4,   2],
       [  7,   3],
       [  1,   4],
       [  1,   5],
       [  1,   6],
       [  0,   7],
       [  5,   8],
       [  5,   9],
       [  0,  10],
       [  0,  11],
       [  1,  12],
       [  6,  13],
       [  3,  14],
       [  2,  15],
       [  0,  16],
       [  1,  17],
       [  3,  18],
       [  2,  19],
       [  0,  20],
       [  1,  21],
       [  1,  22],
       [  0,  23],
       [  1,  24],
       [  1,  25],
       [  1,  26],
       [  2,  27],
       [  5,  28],
       [  2,  29],
       [  2,  30],
       [  2,  31],
       [  0,  32],
       [  0,  33],
       [  1,  34],
       [  0,  35],
       [  0,  36],
       [  1,  37],
       [ -1,  38],
       [  0,  39],
       [  5,  40],
       [  2,  41],
       [  3,  42],
       [  2,  43],
       [  1,  44],
       [  2,  45],
       [  2,  46],
       [  4,  47],
       [  1,  48],
       [  1,  49],
       [  2,  50],
       [  3,  51],
       [ 29,  52],
       [ 25,

## Features

In [9]:
# Test for bad channels, sampling freq and shape
bad_channel_list, sampling_freq_list, data_shape_list = [], [], []
for sub in os.listdir(root):
    if 'sub-' in sub:
        sub_path = os.path.join(root, sub, 'eeg/')
        # print(sub_path)
        for file in os.listdir(sub_path):
            if '.set' in file:
                file_path = os.path.join(sub_path, file)
                raw = mne.io.read_raw_eeglab(file_path, preload=True)
                # print(raw.get_montage())
                # get bad channels
                bad_channel = raw.info['bads']
                bad_channel_list.append(bad_channel)
                # get sampling frequency
                sampling_freq = raw.info['sfreq']
                sampling_freq_list.append(sampling_freq)
                # get eeg data
                data = raw.get_data()
                data_shape = data.shape
                data_shape_list.append(data_shape)

Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-001\eeg\sub-001_task-Rest_run-01_eeg.fdt
Reading 0 ... 250733  =      0.000 ...   501.466 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-001\eeg\sub-001_task-Rest_run-02_eeg.fdt
Reading 0 ... 184092  =      0.000 ...   368.184 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-002\eeg\sub-002_task-Rest_run-01_eeg.fdt
Reading 0 ... 244933  =      0.000 ...   489.866 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-002\eeg\sub-002_task-Rest_run-02_eeg.fdt
Reading 0 ... 183032  =      0.000 ...   366.064 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-003\eeg\sub-003_task-Rest_run-01_eeg.fdt
Reading 0 ... 253453  =      0.000 ...   506.906 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Dep

In [10]:
# 0 bad channels
print(bad_channel_list)
# 500 Hz for all runs
print(sampling_freq_list)
# same number of channels & different timestamps
print(data_shape_list)    # channel number is inconsistent

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
[500.0, 500.0, 500.0, 500.0

In [11]:
# channel number not consistent, take the common channels
common_channels = []
for sub in os.listdir(root):
    if 'sub-' in sub:
        sub_path = os.path.join(root, sub, 'eeg/')
        # print(sub_path)
        for file in os.listdir(sub_path):
            if '.set' in file:
                file_path = os.path.join(sub_path, file)
                raw = mne.io.read_raw_eeglab(file_path, preload=True)
                current_channels = set(raw.info['ch_names'])
                if not common_channels:
                    common_channels = current_channels
                else:
                    common_channels &= current_channels
common_channels = list(common_channels)
print(common_channels)
print("Common channels number: ", len(common_channels))

Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-001\eeg\sub-001_task-Rest_run-01_eeg.fdt
Reading 0 ... 250733  =      0.000 ...   501.466 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-001\eeg\sub-001_task-Rest_run-02_eeg.fdt
Reading 0 ... 184092  =      0.000 ...   368.184 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-002\eeg\sub-002_task-Rest_run-01_eeg.fdt
Reading 0 ... 244933  =      0.000 ...   489.866 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-002\eeg\sub-002_task-Rest_run-02_eeg.fdt
Reading 0 ... 183032  =      0.000 ...   366.064 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-003\eeg\sub-003_task-Rest_run-01_eeg.fdt
Reading 0 ... 253453  =      0.000 ...   506.906 secs...
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Dep

In [12]:
# resample the time series data from original_fs to target_fs
def resample_time_series(data, original_fs, target_fs):
    T, C = data.shape
    new_length = int(T * target_fs / original_fs)

    resampled_data = np.zeros((new_length, C))
    for i in range(C):
        resampled_data[:, i] = resample(data[:, i], new_length)

    return resampled_data

# split the EEG data into segments of length segment_length, dropping the last segment if it is shorter than segment_length
def split_eeg_segments(data, segment_length=128):
    T, C = data.shape
    num_segments = T // segment_length
    reshaped_data = data[:num_segments * segment_length].reshape(num_segments, segment_length, C)

    return reshaped_data

In [13]:
feature_path = 'Processed/Depression/Feature'
if not os.path.exists(feature_path):
    os.makedirs(feature_path)

sub_id = 1
for sub in os.listdir(root):
    if 'sub-' in sub:
        li_sub = []
        sub_path = os.path.join(root, sub, 'eeg/')
        print(sub_path)
        for file in os.listdir(sub_path):
            if '.set' in file:
                file_path = os.path.join(sub_path, file)
                raw = mne.io.read_raw_eeglab(file_path, preload=True)
                raw.pick(common_channels)
                data = raw.get_data().T
                print("Raw data shape ", data.shape)
                data = resample_time_series(data, 500, SAMPLE_RATE)
                feature_array = split_eeg_segments(data, SAMPLE_LEN)
                print("Downsampling and segmented data shape ", feature_array.shape)
                np.save(feature_path + '/feature_{:03d}.npy'.format(sub_id), feature_array)
        sub_id += 1
    print("-------------------------------------\n")

-------------------------------------

-------------------------------------

-------------------------------------

-------------------------------------

-------------------------------------

-------------------------------------

-------------------------------------

Depression/sub-001\eeg/
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-001\eeg\sub-001_task-Rest_run-01_eeg.fdt
Reading 0 ... 250733  =      0.000 ...   501.466 secs...
Raw data shape  (250734, 66)
Downsampling and segmented data shape  (501, 128, 66)
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-001\eeg\sub-001_task-Rest_run-02_eeg.fdt
Reading 0 ... 184092  =      0.000 ...   368.184 secs...
Raw data shape  (184093, 66)
Downsampling and segmented data shape  (368, 128, 66)
-------------------------------------

Depression/sub-002\eeg/
Reading C:\Users\24700\PycharmProjects\DataPreprocessingYihe\Depression\Depression\sub-002\eeg\sub-002_t

In [14]:
# Test the saved npy file
# example

path = 'Processed/Depression/Feature/'

for file in os.listdir(path):
    sub_path = os.path.join(path, file)
    print(np.load(sub_path).shape)

(368, 128, 66)
(366, 128, 66)
(405, 128, 66)
(183, 128, 66)
(224, 128, 66)
(168, 128, 66)
(159, 128, 66)
(163, 128, 66)
(196, 128, 66)
(499, 128, 66)
(171, 128, 66)
(166, 128, 66)
(177, 128, 66)
(163, 128, 66)
(169, 128, 66)
(166, 128, 66)
(171, 128, 66)
(216, 128, 66)
(172, 128, 66)
(168, 128, 66)
(165, 128, 66)
(164, 128, 66)
(175, 128, 66)
(165, 128, 66)
(168, 128, 66)
(165, 128, 66)
(167, 128, 66)
(165, 128, 66)
(164, 128, 66)
(154, 128, 66)
(163, 128, 66)
(172, 128, 66)
(175, 128, 66)
(168, 128, 66)
(168, 128, 66)
(166, 128, 66)
(163, 128, 66)
(231, 128, 66)
(166, 128, 66)
(165, 128, 66)
(164, 128, 66)
(169, 128, 66)
(165, 128, 66)
(195, 128, 66)
(167, 128, 66)
(192, 128, 66)
(143, 128, 66)
(237, 128, 66)
(169, 128, 66)
(269, 128, 66)
(199, 128, 66)
(160, 128, 66)
(171, 128, 66)
(220, 128, 66)
(177, 128, 66)
(168, 128, 66)
(173, 128, 66)
(216, 128, 66)
(220, 128, 66)
(169, 128, 66)
(219, 128, 66)
(223, 128, 66)
(211, 128, 66)
(216, 128, 66)
(168, 128, 66)
(237, 128, 66)
(176, 128,