# Preprocessing Raw EEG Data for EEGNet

In [1]:
import glob
import os
import time

import numpy as np
import pandas as pd
from scipy import signal
from tqdm.notebook import tqdm

In [2]:
# Collect the per-session CSV files for the training and test sets.
# os.path.join builds the pattern with the platform's separator, so the
# lookup no longer depends on backslashes (which also formed the invalid
# string escapes "\T" and "\D"). glob returns matches in arbitrary order,
# so the lists are sorted to keep the file order reproducible between runs.
train_files = sorted(glob.glob(os.path.join('data', 'Train', 'Data*.csv')))
test_files = sorted(glob.glob(os.path.join('data', 'Test', 'Data*.csv')))
print(train_files[:5])

['data\\Train\\Data_S02_Sess01.csv', 'data\\Train\\Data_S02_Sess02.csv', 'data\\Train\\Data_S02_Sess03.csv', 'data\\Train\\Data_S02_Sess04.csv', 'data\\Train\\Data_S02_Sess05.csv']


In [3]:
# CONSTANTS

# Dataset description
training_subjects = 16  # number of training subjects
num_of_fb = 340  # feedback events per subject
num_of_cols = 59
eeg_cols = 6

# Sampling and epoch window
freq = 200  # sampling rate (samples per second)
epoch_time = 0.5  # proposed epoch length after the feedback, in seconds
pre_fb_time = 0.1
epoch = int(freq * epoch_time)  # epoch length expressed in samples
pre_fb = int(freq * pre_fb_time)  # samples kept before the feedback

# Baseline window, as sample offsets relative to the feedback
b_s = int(-0.4*freq)  # baseline start (-400 ms)
b_e = int(-0.3*freq)  # baseline end (-300 ms)

# Butterworth band-pass settings
order = 5  # filter order
low_pass = 1  # lower band edge
high_pass = 40  # upper band edge

# NOTE(review): 'P08' is spelled with a zero here; presumably it mirrors the
# header of the data files -- confirm before "correcting" it.
all_channels = [
    'Fp1', 'Fp2', 'AF7', 'AF3', 'AF4', 'AF8', 'F7', 'F5', 'F3', 'F1',
    'Fz', 'F2', 'F4', 'F6', 'F8', 'FT7', 'FC5', 'FC3', 'FC1', 'FCz',
    'FC2', 'FC4', 'FC6', 'FT8', 'T7', 'C5', 'C3', 'C1', 'Cz', 'C2',
    'C4', 'C6', 'T8', 'TP7', 'CP5', 'CP3', 'CP1', 'CPz', 'CP2', 'CP4',
    'CP6', 'TP8', 'P7', 'P5', 'P3', 'P1', 'Pz', 'P2', 'P4', 'P6', 'P8',
    'PO7', 'POz', 'P08', 'O1', 'O2',
]

# Channels that are actually used, and their positions inside all_channels.
channels = ['Fz', 'FCz', 'Cz', 'CPz', 'Pz', 'POz']
channels_indices = [all_channels.index(name) for name in channels]

# Resulting positions: [10, 19, 28, 37, 46, 52]
# print(channels_indices)

In [4]:
def butter_filter(order, low_pass, high_pass, fs, sig):
    """Apply a Butterworth band-pass filter to a signal.

    Parameters
    ----------
    order : int
        Order passed to ``scipy.signal.butter``.
    low_pass, high_pass : float
        Lower and upper band edges, in the same unit as ``fs``.
    fs : float
        Sampling rate; both edges are divided by ``fs / 2`` before the
        filter is designed.
    sig : array_like
        Samples to filter (``scipy.signal.sosfilt`` works along the last
        axis by default).

    Returns
    -------
    numpy.ndarray
        Output of a single forward pass of ``scipy.signal.sosfilt``.
    """
    nyquist = fs / 2.0
    # Band edges expressed as fractions of the Nyquist frequency.
    band_edges = [edge / nyquist for edge in (low_pass, high_pass)]
    sections = signal.butter(order, band_edges, btype='band', output='sos')
    filtered = signal.sosfilt(sections, sig)
    return filtered



def extract_data(files, e_s = None, baseline = True, bandpass = True):
    """Cut feedback-locked EEG epochs out of a list of session CSV files.

    For every row whose ``FeedBackEvent`` column equals 1, a window running
    from ``pre_fb`` samples before the event to ``epoch`` samples after it is
    taken from the columns named in ``channels``.

    Parameters
    ----------
    files : iterable of str
        Paths readable by ``pandas.read_csv``. Each file must contain a
        ``FeedBackEvent`` column and one column per name in ``channels``.
    e_s : optional
        Not used by this function; kept so existing calls stay valid.
    baseline : bool
        When true, subtract from each epoch the per-channel mean of the
        samples in ``[event + b_s, event + b_e)``.
    bandpass : bool
        When true, run every selected channel through ``butter_filter``
        (over the whole recording) before the epochs are cut.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1 + n_epochs, epoch + pre_fb, len(channels))``.
        Row 0 is an all-zero placeholder that carries no data; it is kept
        because callers in this notebook discard it with ``[1:]``.

    Raises
    ------
    ValueError
        If an epoch or baseline window would extend past either end of a
        recording (a negative start would otherwise wrap around and silently
        read samples from the end of the file).
    """
    start = time.time()
    n_samples = epoch + pre_fb

    # Epochs are gathered in a list and joined once at the end; stacking onto
    # a growing array inside the loop copies everything on every iteration.
    collected = [np.zeros((1, n_samples, len(channels)), float)]

    for f in tqdm(files):
        df = pd.read_csv(f)  # read each file
        # Positional row numbers of the feedback events.
        index_fb = np.flatnonzero(df['FeedBackEvent'].to_numpy() == 1)

        # Pick the channels by column NAME for both the filtered and the
        # unfiltered path, so the result does not depend on where the EEG
        # columns happen to sit inside the CSV.
        eeg = df[channels].to_numpy(dtype=float, copy=True)
        n_rows = eeg.shape[0]

        if bandpass:
            # Butterworth band-pass, one channel at a time.
            for col in range(eeg.shape[1]):
                eeg[:, col] = butter_filter(order, low_pass, high_pass, freq, eeg[:, col])

        for indx in index_fb:
            first, last = indx - pre_fb, indx + epoch
            if first < 0 or last > n_rows:
                raise ValueError(
                    'Epoch window [{}, {}) around feedback row {} falls outside '
                    'the {} rows of {}'.format(first, last, indx, n_rows, f))
            epoch_array = eeg[first:last]

            if baseline:
                base_first, base_last = indx + b_s, indx + b_e
                if base_first < 0 or base_last > n_rows:
                    raise ValueError(
                        'Baseline window [{}, {}) around feedback row {} falls '
                        'outside the {} rows of {}'.format(
                            base_first, base_last, indx, n_rows, f))
                # Per-channel mean over the baseline window, removed from
                # every sample of the epoch (this also makes a fresh array).
                epoch_array = epoch_array - eeg[base_first:base_last].mean(axis=0)

            collected.append(epoch_array[np.newaxis])

    temp = np.concatenate(collected, axis=0)

    now = time.time()
    print('Elapsed Time: ' + str(int(now-start)) + ' seconds')
    return temp

In [5]:
# train = extract_data(train_files)
# print(train.shape)
# test = extract_data(test_files)
# print(test.shape)

In [6]:
# The two commented-out calls show how the cache files were written: the
# output of extract_data with its first row dropped.
# np.save('tr1.npy',train[1:,:,:])
# np.save('te1.npy',test[1:,:,:])

# Reload the cached epoch arrays instead of re-running extract_data.
train = np.load(file='tr1.npy')
test = np.load(file='te1.npy')

NameError: name 'train' is not defined

In [None]:
# Rearrange the loaded epochs from (epochs, samples, channels) -- the layout
# extract_data produces -- into (epochs, channels, samples).
# NOTE(review): assumes tr1.npy / te1.npy were written from extract_data
# output; the shape check below rejects anything else (including a second
# run of this cell on already-rearranged arrays).
n_samples = epoch + pre_fb
for set_name, set_array in (('train', train), ('test', test)):
    if set_array.ndim != 3 or set_array.shape[1:] != (n_samples, eeg_cols):
        raise ValueError(
            '{} has shape {}, expected (n_epochs, {}, {})'.format(
                set_name, set_array.shape, n_samples, eeg_cols))

# Epoch counts come from the data instead of hard-coded totals.
train_shape = train.shape[0]
test_shape = test.shape[0]

# 2-D layout: one row per sample, one column per channel.
EEG_train = train.reshape(train_shape * n_samples, eeg_cols)
EEG_test = test.reshape(test_shape * n_samples, eeg_cols)

# Swapping the last two axes needs a transpose; np.reshape keeps the flat
# element order and would interleave channels and time points.
train = np.ascontiguousarray(np.transpose(train, (0, 2, 1)))
test = np.ascontiguousarray(np.transpose(test, (0, 2, 1)))

train_filtered = train
test_filtered = test

print(train_filtered.shape)
print(test_filtered.shape)


In [None]:
# Display the shape of the first entry of train_filtered as a quick check.
train_filtered[0].shape

In [None]:
# Write the preprocessed arrays into the data directory. os.path.join picks
# the platform's separator; the previous 'data\X_...' literals relied on the
# invalid string escape "\X" and only named a sub-directory on Windows.
np.save(os.path.join('data', 'X_train_bwbs.npy'), train_filtered)
np.save(os.path.join('data', 'X_test_bwbs.npy'), test_filtered)

# Generate Graphs

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Re-collect the session CSV files for the plots. The directory names are
# spelled 'Train' / 'Test' so the lookup also works on case-sensitive file
# systems, and the lists are sorted because glob returns matches in
# arbitrary order (train_files[:1] should always be the same file).
train_files = sorted(glob.glob(os.path.join('data', 'Train', 'Data*.csv')))
test_files = sorted(glob.glob(os.path.join('data', 'Test', 'Data*.csv')))
print(train_files[:5])

In [None]:
#Not baseline corrected no filter
# Time axis in seconds relative to the feedback, built from integer sample
# offsets so its length is exactly pre_fb + epoch (a float-step arange can
# gain or lose an element through rounding).
x = np.arange(-pre_fb, epoch) / freq
train = extract_data(train_files[:1], -0.4, baseline = False, bandpass = False)
train = train[1:,:,:]  # drop the placeholder first row returned by extract_data
n_traces = min(60, train.shape[0])  # plot at most 60 epochs, fewer if the file has fewer

for i, c in enumerate(channels):
    channel_epochs = train[:n_traces, :x.size, i]

    for y in channel_epochs:
        plt.plot(x, y)

    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title('{} Epoched Data (No Bandpass, No Baseline Correction)'.format(c))
    plt.axvline(x=0, marker = '|', linewidth = 2, label = 'Feedback at x = 0', color = 'black')
    plt.legend()
    plt.show()


In [None]:
#Bandpassed, baseline not corrected
# Time axis in seconds relative to the feedback, built from integer sample
# offsets so its length is exactly pre_fb + epoch (a float-step arange can
# gain or lose an element through rounding).
x = np.arange(-pre_fb, epoch) / freq
train = extract_data(train_files[:1], -0.4, baseline = False, bandpass = True)
train = train[1:,:,:]  # drop the placeholder first row returned by extract_data
n_traces = min(60, train.shape[0])  # plot at most 60 epochs, fewer if the file has fewer

for i, c in enumerate(channels):
    channel_epochs = train[:n_traces, :x.size, i]

    for y in channel_epochs:
        plt.plot(x, y)

    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title('{} Epoched Data (Bandpassed, No Baseline Correction)'.format(c))
    plt.axvline(x=0, marker = '|', linewidth = 2, label = 'Feedback at x = 0', color = 'black')
    plt.legend()
    plt.show()


In [None]:
#Bandpassed, and baseline corrected
# Time axis in seconds relative to the feedback, built from integer sample
# offsets so its length is exactly pre_fb + epoch (a float-step arange can
# gain or lose an element through rounding).
x = np.arange(-pre_fb, epoch) / freq
train = extract_data(train_files[:1], -0.4, baseline = True, bandpass = True)
train = train[1:,:,:]  # drop the placeholder first row returned by extract_data
n_traces = min(60, train.shape[0])  # plot at most 60 epochs, fewer if the file has fewer

for i, c in enumerate(channels):
    channel_epochs = train[:n_traces, :x.size, i]

    for y in channel_epochs:
        plt.plot(x, y)

    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.title('{} Epoched Data (Bandpassed and Baseline Corrected)'.format(c))
    plt.axvline(x=0, marker = '|', linewidth = 2, label = 'Feedback at x = 0', color = 'black')
    plt.legend()
    plt.show()
