In [15]:
import os
import numpy as np
import pandas as pd
import pyedflib
from pyedflib import highlevel
from scipy.signal import resample
from sklearn.utils import shuffle

In [None]:
SAMPLE_RATE = 128  # fs: target sampling rate in Hz; passed to eeg_data() as target_freq
SAMPLE_LEN = 128   # T: number of samples per segment; passed to eeg_data() as window_size

In [16]:
# Most sessions are sampled at 256 Hz; since the rates differ between sessions,
# both downsampling and upsampling are needed.
# Recordings with a 1 Hz sampling frequency must be removed; the 19 standard
# channels listed below do not show such abnormal frequencies.

# Dataset root and the folders holding the epilepsy / no-epilepsy EDF files
root_path = 'v2.0.0'
path_ep = root_path + '//epilepsy_edf'
path_noep = root_path + '//no_epilepsy_edf'

# Standard 19 electrodes (Fp1, Fp2, F7, F3, Fz, F4, F8, T3, C3, Cz, C4, T4, T5, P3, Pz, P4, T6, O1, O2),
# written the way the channel labels are matched in the EDF files: 'EEG <NAME>-REF'.
_STD_ELECTRODES = ('FP1', 'FP2', 'F7', 'F3', 'FZ', 'F4', 'F8', 'T3', 'C3', 'CZ',
                   'C4', 'T4', 'T5', 'P3', 'PZ', 'P4', 'T6', 'O1', 'O2')
li_std_ch = ['EEG {}-REF'.format(electrode) for electrode in _STD_ELECTRODES]

In [17]:
def std_sub(root, li_std_ch, min_num=2, max_num=40):
    """Collect subject folders that have enough recordings with all standard channels.

    The directory layout walked here is
    ``root/<group>/<subject>/<session>/<subset>/<recording>.edf``; only the
    first sub-folder (in sorted order) of each session is scanned.

    Parameters
    ----------
    root : str
        Dataset root. Every sub-directory of it is treated as a group of subjects.
    li_std_ch : list of str
        Channel labels a recording must contain to count as a standard trial.
    min_num, max_num : int
        Inclusive bounds on the number of standard trials a subject must have.

    Returns
    -------
    list of str
        Paths of the selected subject folders, in sorted traversal order.
    """
    li_std_sub = []
    # os.listdir() order depends on the filesystem; sorting keeps the subject
    # order (and any ids derived from it) reproducible across machines.
    for group in sorted(os.listdir(root)):
        group_path = os.path.join(root, group)
        if not os.path.isdir(group_path):
            continue
        for sub in sorted(os.listdir(group_path)):
            sub_path = os.path.join(group_path, sub)
            if not os.path.isdir(sub_path):
                continue  # stray file next to the subject folders
            num_std_tri = 0
            for ses in sorted(os.listdir(sub_path)):
                ses_path = os.path.join(sub_path, ses)
                if not os.path.isdir(ses_path):
                    continue
                subsets = [name for name in sorted(os.listdir(ses_path))
                           if os.path.isdir(os.path.join(ses_path, name))]
                if not subsets:
                    continue  # session without any subset folder
                ses_path_sp = os.path.join(ses_path, subsets[0])  # only one subset
                for tri in sorted(os.listdir(ses_path_sp)):
                    if not tri.lower().endswith('.edf'):
                        continue  # EdfReader can only open EDF recordings
                    tri_path = os.path.join(ses_path_sp, tri)
                    f = pyedflib.EdfReader(tri_path)
                    try:
                        channels = f.getSignalLabels()
                    finally:
                        # release the file handle even if reading the labels fails
                        f.close()
                    if all(ch in channels for ch in li_std_ch):
                        num_std_tri += 1
            if min_num <= num_std_tri <= max_num:
                li_std_sub.append(sub_path)

    return li_std_sub

# Keep every subject that has between 1 and 300 recordings containing all 19 standard channels
li_std_sub = std_sub(root=root_path, li_std_ch=li_std_ch, min_num=1, max_num=300)
print(len(li_std_sub))
li_std_sub

179


['v2.0.0\\epilepsy_edf\\aaaaaanr',
 'v2.0.0\\epilepsy_edf\\aaaaaawu',
 'v2.0.0\\epilepsy_edf\\aaaaabdn',
 'v2.0.0\\epilepsy_edf\\aaaaabhz',
 'v2.0.0\\epilepsy_edf\\aaaaabju',
 'v2.0.0\\epilepsy_edf\\aaaaacrz',
 'v2.0.0\\epilepsy_edf\\aaaaadkv',
 'v2.0.0\\epilepsy_edf\\aaaaaelp',
 'v2.0.0\\epilepsy_edf\\aaaaaeph',
 'v2.0.0\\epilepsy_edf\\aaaaaeqq',
 'v2.0.0\\epilepsy_edf\\aaaaaewf',
 'v2.0.0\\epilepsy_edf\\aaaaafif',
 'v2.0.0\\epilepsy_edf\\aaaaaflb',
 'v2.0.0\\epilepsy_edf\\aaaaagnh',
 'v2.0.0\\epilepsy_edf\\aaaaagxr',
 'v2.0.0\\epilepsy_edf\\aaaaaicb',
 'v2.0.0\\epilepsy_edf\\aaaaaifn',
 'v2.0.0\\epilepsy_edf\\aaaaaifp',
 'v2.0.0\\epilepsy_edf\\aaaaaimz',
 'v2.0.0\\epilepsy_edf\\aaaaaint',
 'v2.0.0\\epilepsy_edf\\aaaaaiud',
 'v2.0.0\\epilepsy_edf\\aaaaajat',
 'v2.0.0\\epilepsy_edf\\aaaaajbn',
 'v2.0.0\\epilepsy_edf\\aaaaajqo',
 'v2.0.0\\epilepsy_edf\\aaaaajud',
 'v2.0.0\\epilepsy_edf\\aaaaajus',
 'v2.0.0\\epilepsy_edf\\aaaaakbt',
 'v2.0.0\\epilepsy_edf\\aaaaakfe',
 'v2.0.0\\epilepsy_e

## Features

In [18]:
# Change the sampling rate of a multi-channel recording from original_fs to target_fs
def resample_time_series(data, original_fs, target_fs):
    """Resample a 2-D ``(T, C)`` array channel by channel.

    Every column is passed to ``scipy.signal.resample`` with a target length
    of ``int(T * target_fs / original_fs)`` samples. The result is a new
    float array of shape ``(new_length, C)``.
    """
    n_samples, n_channels = data.shape
    n_out = int(n_samples * target_fs / original_fs)

    out = np.zeros((n_out, n_channels))
    # Iterating over the transpose yields one channel (column) at a time.
    for ch_idx, channel in enumerate(data.T):
        out[:, ch_idx] = resample(channel, n_out)

    return out

# Cut the EEG data into consecutive, non-overlapping windows of segment_length samples;
# a trailing remainder shorter than segment_length is discarded.
def split_eeg_segments(data, segment_length=128):
    """Reshape a ``(T, C)`` array into ``(T // segment_length, segment_length, C)``."""
    total_samples, n_channels = data.shape
    n_windows = total_samples // segment_length
    usable_samples = n_windows * segment_length

    trimmed = data[:usable_samples]
    return trimmed.reshape((n_windows, segment_length, n_channels))
        

def eeg_data(std_edf_path, li_std_ch, target_freq=128, window_size=128):
    """Load one EDF recording and return it as fixed-length segments.

    The channels named in ``li_std_ch`` are read with
    ``pyedflib.highlevel.read_edf``, transposed, resampled to ``target_freq``
    when the sampling frequency of the first channel header differs from it,
    and finally cut into windows of ``window_size`` samples by
    ``split_eeg_segments``. The shapes before and after processing are printed.

    Assumes read_edf returns the signals as one (channels, samples) array,
    i.e. all requested channels share a sampling rate -- TODO confirm for
    recordings outside the standard 19-channel set.
    """
    raw_signals, headers, _ = highlevel.read_edf(std_edf_path, ch_names=li_std_ch)
    # The rate of the first requested channel is taken as the rate of the recording.
    source_freq = headers[0]['sample_frequency']
    samples = raw_signals.T
    print("Raw data shape ", samples.shape)

    needs_resampling = source_freq != target_freq
    if needs_resampling:
        samples = resample_time_series(samples, source_freq, target_freq)

    windows = split_eeg_segments(samples, window_size)
    print("Downsampling and segmented data shape ", windows.shape)
    return windows

In [19]:
# main: build one feature file per selected subject
feature_path = 'Processed/TUEP/Feature'
os.makedirs(feature_path, exist_ok=True)

MAX_SEGMENTS_PER_SUBJECT = 800  # cap per subject to avoid too strong subject features
RANDOM_SEED = 42                # fixed seed so a re-run keeps the same segments

# Subject ids start at 1 and follow the order of li_std_sub.
for sub_id, sub_path in enumerate(li_std_sub, start=1):
    print(sub_path)
    li_sub = []
    # Sorted traversal keeps the concatenation order (and therefore the seeded
    # selection below) independent of the filesystem's listing order.
    for ses in sorted(os.listdir(sub_path)):
        ses_path = os.path.join(sub_path, ses)
        if not os.path.isdir(ses_path):
            continue
        subsets = [name for name in sorted(os.listdir(ses_path))
                   if os.path.isdir(os.path.join(ses_path, name))]
        if not subsets:
            continue
        ses_path_sp = os.path.join(ses_path, subsets[0])  # only one subset
        for tri in sorted(os.listdir(ses_path_sp)):
            if not tri.lower().endswith('.edf'):
                continue  # EdfReader can only open EDF recordings
            tri_path = os.path.join(ses_path_sp, tri)
            f = pyedflib.EdfReader(tri_path)
            try:
                channels = f.getSignalLabels()
            finally:
                # release the file handle even if reading the labels fails
                f.close()
            if all(ch in channels for ch in li_std_ch):  # check if all 19 channels exist
                data = eeg_data(tri_path, li_std_ch, SAMPLE_RATE, SAMPLE_LEN)
                li_sub.append(data)
    feature_array = np.concatenate(li_sub, axis=0)
    # Randomly (but reproducibly) keep at most MAX_SEGMENTS_PER_SUBJECT segments.
    feature_array = shuffle(feature_array, random_state=RANDOM_SEED)[:MAX_SEGMENTS_PER_SUBJECT]
    print("Subject array shape: ", feature_array.shape)
    np.save(os.path.join(feature_path, 'feature_{:03d}.npy'.format(sub_id)), feature_array)
    print("-------------------------------------\n")

v2.0.0\epilepsy_edf\aaaaaanr
Raw data shape  (331250, 19)
Downsampling and segmented data shape  (1325, 128, 19)
Raw data shape  (317750, 19)
Downsampling and segmented data shape  (1271, 128, 19)
Raw data shape  (28672, 19)
Downsampling and segmented data shape  (112, 128, 19)
Raw data shape  (48128, 19)
Downsampling and segmented data shape  (188, 128, 19)
Raw data shape  (153856, 19)
Downsampling and segmented data shape  (601, 128, 19)
Raw data shape  (153856, 19)
Downsampling and segmented data shape  (601, 128, 19)
Raw data shape  (76800, 19)
Downsampling and segmented data shape  (300, 128, 19)
Raw data shape  (153856, 19)
Downsampling and segmented data shape  (601, 128, 19)
Raw data shape  (76800, 19)
Downsampling and segmented data shape  (300, 128, 19)
Raw data shape  (153856, 19)
Downsampling and segmented data shape  (601, 128, 19)
Raw data shape  (76800, 19)
Downsampling and segmented data shape  (300, 128, 19)
Raw data shape  (153856, 19)
Downsampling and segmented data 

## Labels

In [20]:
# label.npy: one row per subject, laid out as [is_epilepsy, subject_id]

# Subject ids start at 1 and follow the order of li_std_sub; the class is
# derived from the folder name in the subject path (0 = no epilepsy, 1 = epilepsy).
dict_label = {}
for sub_id, sub_path in enumerate(li_std_sub, start=1):
    dict_label[sub_id] = 0 if 'no_epilepsy' in sub_path else 1

df_label = pd.DataFrame(
    {'subject_id': list(dict_label.keys()), 'is_epilepsy': list(dict_label.values())},
    dtype='int64',
)

label_path = 'Processed/TUEP/Label'
os.makedirs(label_path, exist_ok=True)

# Select the columns in the saved order and copy explicitly. Swapping columns
# in place on `df_label.values` may write through to df_label (or fail on a
# read-only array) depending on whether pandas hands back a view or a copy.
labels = df_label[['is_epilepsy', 'subject_id']].to_numpy(copy=True)
np.save(os.path.join(label_path, 'label.npy'), labels)

In [21]:
# Read label.npy back from disk to inspect what was written
saved_label_file = 'Processed/TUEP/Label/label.npy'
np.load(saved_label_file)

array([[  1,   1],
       [  1,   2],
       [  1,   3],
       [  1,   4],
       [  1,   5],
       [  1,   6],
       [  1,   7],
       [  1,   8],
       [  1,   9],
       [  1,  10],
       [  1,  11],
       [  1,  12],
       [  1,  13],
       [  1,  14],
       [  1,  15],
       [  1,  16],
       [  1,  17],
       [  1,  18],
       [  1,  19],
       [  1,  20],
       [  1,  21],
       [  1,  22],
       [  1,  23],
       [  1,  24],
       [  1,  25],
       [  1,  26],
       [  1,  27],
       [  1,  28],
       [  1,  29],
       [  1,  30],
       [  1,  31],
       [  1,  32],
       [  1,  33],
       [  1,  34],
       [  1,  35],
       [  1,  36],
       [  1,  37],
       [  1,  38],
       [  1,  39],
       [  1,  40],
       [  1,  41],
       [  1,  42],
       [  1,  43],
       [  1,  44],
       [  1,  45],
       [  1,  46],
       [  1,  47],
       [  1,  48],
       [  1,  49],
       [  1,  50],
       [  1,  51],
       [  1,  52],
       [  1,

In [22]:
# Sanity check of the saved feature files:
# load every file in the feature folder and print the shape of its array

path = 'Processed/TUEP/Feature/'

saved_feature_files = os.listdir(path)
for file in saved_feature_files:
    sub_path = os.path.join(path, file)
    saved_features = np.load(sub_path)
    print(saved_features.shape)

(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128, 19)
(800, 128,