In [1]:
import os
import mne
import numpy as np
import pandas as pd
from scipy import interpolate
import matplotlib.pyplot as plt
import scipy
from scipy.signal import resample
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Target sampling rate; the feature cell passes it to resample_data() as target_fs.
SAMPLE_RATE = 128  # fs
# Number of time points per saved segment; the feature cell reshapes the resampled
# data into segments of this length.
SAMPLE_LEN = 128   # T

In [2]:
# Dataset root directory (the trailing slash matters: later cells concatenate onto it)
root = 'REEG-SRM/'
# The participant table is a tab-separated file directly under the dataset root
participants_path = os.path.join(root, 'participants.tsv')
# read_table() uses a tab as its default separator
participants = pd.read_table(participants_path)
participants

Unnamed: 0,participant_id,age,sex,ravlt_1,ravlt_5,ravlt_tot,ravlt_imm,ravlt_del,ravlt_rec,ravlt_fp,...,tmt_2,tmt_3,tmt_4,cw_1,cw_2,cw_3,cw_4,vf_1,vf_2,vf_3
0,sub-001,29,f,8,14,64,13,15,15,1,...,20,23.0,57.0,33,24,82,79,45.0,44.0,15.0
1,sub-002,29,f,8,14,65,13,14,15,1,...,23,33.0,101.0,34,29,44,51,32.0,36.0,14.0
2,sub-003,62,f,6,13,48,11,9,12,0,...,45,43.0,75.0,27,28,93,61,,,
3,sub-004,20,f,8,14,62,13,13,15,0,...,18,17.0,49.0,28,20,43,45,56.0,54.0,23.0
4,sub-005,32,f,13,15,73,15,15,15,0,...,26,19.0,47.0,25,21,42,41,50.0,59.0,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,sub-107,18,f,8,15,65,15,15,15,0,...,19,24.0,39.0,21,19,41,44,42.0,54.0,12.0
107,sub-108,50,f,7,14,54,10,12,14,0,...,36,38.0,86.0,31,21,55,60,35.0,44.0,15.0
108,sub-109,19,f,8,15,61,14,15,15,0,...,20,38.0,128.0,40,20,90,84,30.0,38.0,13.0
109,sub-110,39,f,9,15,66,15,14,15,0,...,20,18.0,46.0,30,17,41,60,63.0,48.0,15.0


## Data sanity checks (sub-029 and sub-104 cannot be read)

In [22]:
# Sanity check over every run: collect bad channels, sampling frequency and data shape
derivative_path = root + 'derivatives/cleaned_epochs'
bad_channel_list, sampling_freq_list, data_shape_list = [], [], []
for subject_dir in os.listdir(derivative_path):
    # Only subject folders are inspected
    if 'sub-' not in subject_dir:
        continue
    sub_path = os.path.join(derivative_path, subject_dir)
    for session in os.listdir(sub_path):
        session_path = os.path.join(sub_path, session, 'eeg/')
        for file in os.listdir(session_path):
            # Only EEGLAB .set files are read
            if '.set' not in file:
                continue
            file_path = os.path.join(session_path, file)
            try:
                epochs = mne.io.read_epochs_eeglab(file_path)
                # Channels flagged as bad in the recording info
                bad_channel_list.append(epochs.info['bads'])
                # Sampling frequency of the recording
                sampling_freq_list.append(epochs.info['sfreq'])
                # Shape of the epoched EEG array
                data_shape_list.append(epochs.get_data().shape)
            except Exception as e:
                # Report the unreadable file and carry on with the next one
                print(f"Failed to load {file_path}: {e}")
# Subject 29 and 104 cannot be loaded

Extracting parameters from C:\Users\24700\PycharmProjects\DataPreprocessingYihe\REEG-SRM\REEG-SRM\derivatives\cleaned_epochs\sub-001\ses-t1\eeg\sub-001_ses-t1_task-resteyesc_desc-epochs_eeg.set...
Not setting metadata
53 matching events found
No baseline correction applied
0 projection items activated
Ready.
Extracting parameters from C:\Users\24700\PycharmProjects\DataPreprocessingYihe\REEG-SRM\REEG-SRM\derivatives\cleaned_epochs\sub-002\ses-t1\eeg\sub-002_ses-t1_task-resteyesc_desc-epochs_eeg.set...
Not setting metadata
55 matching events found
No baseline correction applied
0 projection items activated
Ready.
Extracting parameters from C:\Users\24700\PycharmProjects\DataPreprocessingYihe\REEG-SRM\REEG-SRM\derivatives\cleaned_epochs\sub-002\ses-t2\eeg\sub-002_ses-t2_task-resteyesc_desc-epochs_eeg.set...
Not setting metadata
55 matching events found
No baseline correction applied
0 projection items activated
Ready.
Extracting parameters from C:\Users\24700\PycharmProjects\DataPreproce

In [23]:
# Bad channels per run; the recorded output shows an empty list for every run
print(bad_channel_list)
# Sampling frequency per run; the visible part of the recorded output shows 1024.0 for every run (not 500 Hz)
print(sampling_freq_list)
# Shape of the array returned by get_data() for each run.
# NOTE(review): the earlier notes here contradicted each other ("same number of channels"
# vs "channel number is inconsistent") — confirm against the printed shapes.
print(data_shape_list)    # NOTE(review): check whether the channel dimension is the same for every run
# Number of runs that were loaded successfully
print(len(data_shape_list) )

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
[1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 1024.0, 10

## Labels

In [3]:
# Two subjects cannot be loaded, so they get no label row
n_labelled_subjects = len(participants) - 2
# One row per remaining subject, two int32 columns; values are filled in by the next cell
labels = np.empty((n_labelled_subjects, 2), dtype=np.int32)
labels.shape

(109, 2)

In [4]:
# Column 0: class label. According to the readme file, all subjects are healthy controls (0).
labels[:, 0] = 0
# Column 1: running subject ID counted from 1; it does not match the raw subject ID
# since there is no sub-29 and no sub-104.
labels[:, 1] = np.arange(1, len(labels) + 1)
# Leave the counter at the value an explicit counting loop would end with
sub_id = len(labels) + 1

In [5]:
# Directory that receives the label array
label_path = 'Processed/REEG-SRM/Label'
# exist_ok=True creates the directory when it is missing and does nothing otherwise,
# so there is no separate existence check and the cell can be re-run safely
os.makedirs(label_path, exist_ok=True)
np.save(os.path.join(label_path, 'label.npy'), labels)

In [6]:
# Read the saved label file back and display it to check what was written
np.load('Processed/REEG-SRM/Label/label.npy')

array([[  0,   1],
       [  0,   2],
       [  0,   3],
       [  0,   4],
       [  0,   5],
       [  0,   6],
       [  0,   7],
       [  0,   8],
       [  0,   9],
       [  0,  10],
       [  0,  11],
       [  0,  12],
       [  0,  13],
       [  0,  14],
       [  0,  15],
       [  0,  16],
       [  0,  17],
       [  0,  18],
       [  0,  19],
       [  0,  20],
       [  0,  21],
       [  0,  22],
       [  0,  23],
       [  0,  24],
       [  0,  25],
       [  0,  26],
       [  0,  27],
       [  0,  28],
       [  0,  29],
       [  0,  30],
       [  0,  31],
       [  0,  32],
       [  0,  33],
       [  0,  34],
       [  0,  35],
       [  0,  36],
       [  0,  37],
       [  0,  38],
       [  0,  39],
       [  0,  40],
       [  0,  41],
       [  0,  42],
       [  0,  43],
       [  0,  44],
       [  0,  45],
       [  0,  46],
       [  0,  47],
       [  0,  48],
       [  0,  49],
       [  0,  50],
       [  0,  51],
       [  0,  52],
       [  0,

## Features

In [7]:
def resample_data(array, original_fs, target_fs=256):
    """Resample a batch of multi-channel signals along the time axis.

    The resampling is FFT-based (``scipy.signal.resample``) and is applied to
    all samples and channels in one vectorised call instead of a Python loop
    over every (sample, channel) pair.

    Parameters
    ----------
    array : array-like of shape (N, T, C)
        N samples, T time points, C channels.
    original_fs : float
        Sampling rate of ``array``.
    target_fs : float, default 256
        Sampling rate of the returned data.

    Returns
    -------
    numpy.ndarray of shape (N, new_length, C), dtype float64
        ``new_length`` is ``round(T * target_fs / original_fs)``.

    Raises
    ------
    ValueError
        If ``array`` is not three-dimensional (raised by the shape unpacking).
    """
    array = np.asarray(array)
    N, T, C = array.shape
    # Calculate the new length of the time dimension after resampling
    new_length = int(np.round(T * (target_fs / original_fs)))

    # Nothing to resample: return an empty result of the right shape
    if array.size == 0:
        return np.zeros((N, new_length, C))

    # axis=1 is the time axis; every (sample, channel) series is resampled independently
    resampled_data = resample(array, new_length, axis=1)

    # Always hand back float64, whatever the input precision was
    return np.asarray(resampled_data, dtype=np.float64)

In [9]:
# Directory that receives one feature file per subject
feature_path = 'Processed/REEG-SRM/Feature'
os.makedirs(feature_path, exist_ok=True)

derivative_path = os.path.join(root, 'derivatives/cleaned_epochs')
# Subjects whose recordings cannot be loaded
unreadable_subjects = {"sub-029", "sub-104"}

sub_id = 1
# os.listdir() returns entries in arbitrary order. Sorting makes the running sub_id
# (and with it the pairing between feature files and label rows) reproducible.
for sub in sorted(os.listdir(derivative_path)):
    if 'sub-' not in sub or sub in unreadable_subjects:
        continue
    feature_list = []
    sub_path = os.path.join(derivative_path, sub)
    for session in sorted(os.listdir(sub_path)):
        session_path = os.path.join(sub_path, session, 'eeg/')
        for file in sorted(os.listdir(session_path)):
            # Only EEGLAB .set files are read
            if not file.endswith('.set'):
                continue
            file_path = os.path.join(session_path, file)
            epochs = mne.io.read_epochs_eeglab(file_path)
            freq = epochs.info['sfreq']
            # get_data() is (epochs, channels, times); move time to axis 1
            data = np.transpose(epochs.get_data(), (0, 2, 1))
            print("Raw data shape ", data.shape)
            data = resample_data(data, freq, SAMPLE_RATE)
            # Take the channel count from the data instead of assuming 64, and keep only
            # whole segments per epoch so that no segment spans two epochs
            n_channels = data.shape[2]
            usable_len = (data.shape[1] // SAMPLE_LEN) * SAMPLE_LEN
            session_feature_array = data[:, :usable_len, :].reshape((-1, SAMPLE_LEN, n_channels))
            print("Downsampling and segmented session data shape ", session_feature_array.shape)
            feature_list.append(session_feature_array)
    if not feature_list:
        # np.concatenate would fail with a less helpful message; name the subject instead
        raise ValueError(f"No .set recordings found for {sub}")
    feature_array = np.concatenate(feature_list, axis=0)
    print("Subject data shape ", feature_array.shape)
    np.save(feature_path + '/feature_{:03d}.npy'.format(sub_id), feature_array)
    sub_id += 1
    print("-------------------------------------\n")
# Subjects 29 and 104 cannot be loaded and are skipped

-------------------------------------

-------------------------------------

Extracting parameters from C:\Users\24700\PycharmProjects\DataPreprocessingYihe\REEG-SRM\REEG-SRM\derivatives\cleaned_epochs\sub-001\ses-t1\eeg\sub-001_ses-t1_task-resteyesc_desc-epochs_eeg.set...
Not setting metadata
53 matching events found
No baseline correction applied
0 projection items activated
Ready.
Raw data shape  (53, 4096, 64)
Downsampling and segmented session data shape  (212, 128, 64)
Subject data shape  (212, 128, 64)
-------------------------------------

Extracting parameters from C:\Users\24700\PycharmProjects\DataPreprocessingYihe\REEG-SRM\REEG-SRM\derivatives\cleaned_epochs\sub-002\ses-t1\eeg\sub-002_ses-t1_task-resteyesc_desc-epochs_eeg.set...
Not setting metadata
55 matching events found
No baseline correction applied
0 projection items activated
Ready.
Raw data shape  (55, 4096, 64)
Downsampling and segmented session data shape  (220, 128, 64)
Extracting parameters from C:\Users\24700\

In [10]:
# Check the saved feature files by printing the shape of the array stored in each one

path = 'Processed/REEG-SRM/Feature/'

for saved_name in os.listdir(path):
    saved_array = np.load(os.path.join(path, saved_name))
    print(saved_array.shape)

(212, 128, 64)
(440, 128, 64)
(308, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(432, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(212, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(440, 128, 64)
(440, 128, 64)
(440, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(412, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(440, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128, 64)
(220, 128, 64)
(220, 128, 64)
(220, 128, 64)
(440, 128,