# Dataset Prepper

### Notes
* Latest version is in EEG/Scripts
* train/test dataset only requires preictal and interictal segments. Ictal segment is not needed, because preictal means an ictal already occurred. Therefore DL algo only needs to classify between 1) preictal or 0) interictal. 

### References
* https://natmeg.se/mne_preprocessing/1-MNE_from_raw_to_epochs_evoked.html

In [8]:
# all imports

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import mne  
import tqdm

import scipy.signal as signal
import scipy.stats as stats
import scipy.io as sio

import os, copy, shutil, datetime, random
from pathlib import Path

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Loading

In [9]:
# pathing

path = "/home/SharedFiles/Data/HospitalData/EEG/SNUCH_VEM_EDF"
annot_dir = '/home/SharedFiles/Projects/EEG/Interim/annotated_raws'
seg_dir = '/home/SharedFiles/Projects/EEG/Interim/seg_raws'
seq_dir = '/home/SharedFiles/Projects/EEG/Interim/seq_raws'
arr_dir = '/home/SharedFiles/Projects/EEG/Inputs/seq_arr/'
output_dir = '/home/SharedFiles/Projects/EEG/Outputs/runs'
input_path = "/home/SharedFiles/Data/HospitalData/EEG/SNUCH_VEM_EDF"
annot_path = '/home/SharedFiles/Projects/EEG/Interim/annotated_raws'
xl_path = os.path.join(input_path, 'SNUCH 2020 labeling_edit_20220303.xlsx')

for _dir in (annot_dir, seq_dir, seg_dir):
    if not os.path.exists(_dir):
        os.makedirs(_dir)

In [15]:
# load edfs
print('loading edfs as raws...')
patients=[]
raw_paths = []
for it in os.listdir(input_path):
    if not it.startswith('.') and it.lower().endswith('.edf'): # filter-out ghosts
        patients.append(it.split('.')[0])
        raw_paths.append(os.path.join(input_path, it))

loading edfs as raws...


# Prepping DF

In [17]:
# fill empty date rows
for i in range(len(df.index)):
    pos = df.iat[i, 0]
    if not pd.isnull(pos):
        currDate = pos
    else:
        df.iat[i,0] = currDate

In [18]:
df = df[['Date', 'Time', 'Annotation']]
df['Datetime'] = df.apply(lambda r : datetime.datetime.combine(
    datetime.datetime.strptime(str(r['Date']),'%Y%m%d'),r['Time']),1)
df = df[['Datetime', 'Annotation']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Datetime'] = df.apply(lambda r : datetime.datetime.combine(


# Annotating

In [19]:
print('starting on annotations...')
for patient, raw_path in zip(patients, raw_paths): 
    raw = mne.io.read_raw_edf(raw_path, verbose=True)
    print(patient)
                         
    raw_data = raw.load_data()    
    orig_time = df.loc[patient]['Datetime'][0]
    start_time = df[df['Annotation'].str.contains(
        'Start of seizure')].loc[patient]['Datetime'].reset_index(drop=True)
    end_time = df[df['Annotation'].str.contains(
        'End of seizure')].loc[patient]['Datetime'].reset_index(drop=True)
        # leave as floats, because mne.Annotations only takes in array of floats
    
    onsets = np.round(np.array((start_time - orig_time).dt.total_seconds())) 
    durations = np.round(np.array((end_time - start_time).dt.total_seconds())) 
    
#     # TESTS
#     print('orig_time:', orig_time)
#     print('start_time:', start_time)
#     print('end_time:', end_time)
#     print('onsets:', onsets)
#     print('durations:', durations)
    
    # filter out negatives
    for i, (onset, duration) in enumerate(zip(onsets, durations)):
        if onset < 0 or duration < 0:
            print('negative warning (onset, duration):', onset, duration)
            onsets = np.delete(onsets, i)
            durations = np.delete(durations, i)
    
    descriptions = ['ictal' for i in range(len(onsets))]
    print('length equivalency:', len(onsets)==len(durations)==len(descriptions)) ### TEST
    
    curr_annot = mne.Annotations(
                              onset=onsets,
                              duration=durations,
                              description=descriptions,
                                )
    print(curr_annot)
        
    # Set
    print('setting annotations...')
    raw_annot = raw_data.set_annotations(curr_annot)

    # Save
    fname = patient + '_annotated_raw.fif'
    fpath = os.path.join(annot_path, fname)
    raw_annot.save(fpath, overwrite=True)  # saved to annot_path #given at the top
    print("Saved.")
    
    # mem clear
    del raw, raw_data, raw_annot
    
    print()

starting on annotations...
Extracting EDF parameters from /home/SharedFiles/Data/HospitalData/EEG/SNUCH_VEM_EDF/SNUCH01.EDF...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
SNUCH01
Reading 0 ... 16788999  =      0.000 ... 83944.995 secs...
length equivalency: True
<Annotations | 2 segments: ictal (2)>
setting annotations...
Overwriting existing file.
Writing /home/SharedFiles/Projects/EEG/Interim/annotated_raws/SNUCH01_annotated_raw.fif
Closing /home/SharedFiles/Projects/EEG/Interim/annotated_raws/SNUCH01_annotated_raw.fif
[done]
Saved.

Extracting EDF parameters from /home/SharedFiles/Data/HospitalData/EEG/SNUCH_VEM_EDF/SNUCH02.EDF...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
SNUCH02
Reading 0 ... 14149999  =      0.000 ... 70749.995 secs...
length equivalency: True
<Annotations | 21 segments: ictal (21)>
setting annotations...
Overwriting existing file.
Writing /home/SharedFiles/Projects/EEG/Interim/annotat

# Sequence Slicer

Slices seq_len sequences

In [20]:
def slicer(raw, annots, patient, preictals_dir, interictals_dir):
    ## initializations
    # variables
    horizon_len = 5 *60#sec # pre-seizure horizon of 5 min
    seg_len = 10 *60#sec # segment of 10 min
    seq_len = seg_len*6 #sec # sequence of 1 hr
    buffer_len = 4*60 *60#sec # 4hr buffer for interictal (one side)
    
    # constants
    tot_time = np.round(raw.times[-1])
    
    # lists
    interictal_annots = [0.0]
    preictal_seqs, interictals, interictal_seqs = [], [], []
    
    # check
    print(annots)
    
    ## create preictal sequences and possible interictal points
    for annot in annots: 
        # PREICTAL SEQUENCES
        preictal_end = annot['onset'] - horizon_len # prior to seizure horizon (exclusive)
        preictal_start = preictal_end - seq_len 
        
        if preictal_start > 0: # if within bounds
            if len(preictal_seqs) == 0: 
                preictal_seqs.append([preictal_start, preictal_end])
            elif preictal_start > preictal_seqs[-1][1]: # avoid preictal overlap
                preictal_seqs.append([preictal_start, preictal_end])
                
                
        
        # INTERICTAL SEQUENCES
        '''
        ==Note==
        - Find smarter & lighter method for interictal filtering. Too heavy right now.
        '''
        # add buffer at left or right of seizure
        interictal_left = annot['onset'] - buffer_len # buffer_len before seizure
        interictal_right = annot['onset'] + annot['duration'] + buffer_len # buffer_len after seizure
        
        if interictal_left > 0:
            interictal_annots.append(interictal_left)
        if interictal_right < tot_time - seq_len:
            interictal_annots.append(interictal_right)

    
    # interictal partitioning
    interictal_annots = sorted(interictal_annots) # order from low to high
    interictal_annots.append(tot_time)
    for a in range(len(interictal_annots)-1):
        if interictal_annots[a+1] - interictal_annots[a] > seq_len: 
        # interictal is above seq_len
            no_overlap = True
            for preictal_seq in preictal_seqs: 
                #if preictal_seq[0] >= interictal_annots[a] and preictal_seq[0] <= interictal_annots[a+1]:  
                ## faster better method. double check logic. filter for preictal_seq[1] too?
                if (int(preictal_seq[0]), int(preictal_seq[1])+1) in range(int(interictal_annots[a]), int(interictal_annots[a+1])+1): #overkill, too heavy.
                    no_overlap = False # if overlapping with any preictal_starts 
           
            if no_overlap:
                interictals.append([interictal_annots[a], interictal_annots[a+1]])
                                                                              

    # TEST
    print('preictal_seqs:\t\t\t {}'.format(len(preictal_seqs)))
    
    if len(preictal_seqs)==0 or len(interictals)==0:
        print('preictal_seqs:', preictal_seqs)
        print('interictal_parts:', interictals)
    
    if len(interictals) <= 0:
        print('interictal_annots', interictal_annots)
        print('***')
        print('***WARNING: num interictals is 0. This patient will be skipped. Decrease buffer_len and rerun.***')
        print('***')
        ############## ADD: decrease buffer_len by seq_len til seq_len per itertation,  restart curr call from top ###########
        print()
        return
    
    # interictal sequences    
    for interictal in interictals:
        num_interictal_seqs = int((interictal[1] - interictal[0]) // seq_len)
        interictal_seq_start = interictal[0]
        for num in range(num_interictal_seqs): # see if this and line above can be combined into range([0],[1],step=seq_len)
            interictal_seq_end = interictal_seq_start + seq_len
            if interictal_seq_end <= interictal[1]:
                interictal_seqs.append([interictal_seq_start, interictal_seq_end]) #
            interictal_seq_start = interictal_seq_end + 1
            
            if interictal_seq_end > interictal[1]:
                print("***NOTE: Omitted last interictal pair's interictal_seq_end that was out of interictal range: {}~{}. ***".format(interictal_seq_start, interictal_seq_end))
    
    # TEST
    print('interictal_parts -> _seqs: {} -> {}'.format(len(interictals), len(interictal_seqs)))
    
    if len(interictal_seqs)==0:
        print('preictal_seqs:', preictal_seqs)
        print('interictal_annots:', interictal_annots)
        print('interictal_parts:', interictals)
        print('num_interictal_seqs:', num_interictal_seqs)
        print('interictal_seqs:', interictal_seqs)

#     ## Crop 'n Save sequences

#     print('Saving preictals...')
#     for seq_num, preictal_seq in enumerate(preictal_seqs):
#         preictal_crop = raw.copy().crop(tmin=preictal_seq[0], tmax=preictal_seq[1])
#         # Save
#         preictal_crop_fname =  os.path.join(preictals_dir, patient + '_preictal_' + str(seq_num+1).zfill(3) + '_raw.fif')
#         preictal_crop.save(preictal_crop_fname, overwrite=inp_overwrite)
#         print(preictal_crop_fname)
    
#     print('Saving interictals...')
#     for seq_num, interictal_seq in enumerate(interictal_seqs):
#         interictal_crop = raw.copy().crop(tmin=interictal_seq[0], tmax=interictal_seq[1])
#         # Save
#         interictal_crop_fname =  os.path.join(interictals_dir, patient + '_interictal_' + str(seq_num+1).zfill(3) + '_raw.fif')
#         interictal_crop.save(interictal_crop_fname, overwrite=inp_overwrite)
#         print(interictal_crop_fname)

    print()


In [23]:
def main():
    if os.path.exists(seq_dir):
        inp_overwrite = True if input("Overwrite pre-existing saves in '{}'? y/(N):" \
                                      .format(seq_dir)).lower() == 'y' else False
        if inp_overwrite:
            shutil.rmtree(seq_dir)
            os.makedirs(seq_dir)
    
    for it in os.listdir(annot_dir): # for every annotated fif
        if not it.startswith('._') and it.endswith('.fif'):  
        # if non-ghost fif file
            # create dirs
            patient = it.split('_')[0]
            patient_dir = os.path.join(seq_dir, patient)
            preictals_dir = os.path.join(patient_dir, 'preictals')
            interictals_dir = os.path.join(patient_dir, 'interictals')
            if not os.path.exists(preictals_dir):
                os.makedirs(preictals_dir)
            if not os.path.exists(interictals_dir):
                os.makedirs(interictals_dir)

            # load
            fif_pth = os.path.join(annot_dir, it)
            raw = mne.io.read_raw_fif(fif_pth)
            annots = mne.read_annotations(fif_pth)

            # slice
            slicer(raw, annots, patient, preictals_dir, interictals_dir)
            
    print('All process complete.\nTerminating program.')
main()

Overwrite pre-existing saves in '/home/SharedFiles/Projects/EEG/Interim/seq_raws'? y/(N): 


Opening raw data file /home/SharedFiles/Projects/EEG/Interim/annotated_raws/SNUCH01_annotated_raw.fif...
Isotrak not found
    Range : 0 ... 16788999 =      0.000 ... 83944.995 secs
Ready.
<Annotations | 2 segments: ictal (2)>
preictal_seqs:			 2
interictal_parts -> _seqs: 4 -> 22

Opening raw data file /home/SharedFiles/Projects/EEG/Interim/annotated_raws/SNUCH02_annotated_raw.fif...
Isotrak not found
    Range : 0 ... 14149999 =      0.000 ... 70749.995 secs
Ready.
<Annotations | 21 segments: ictal (21)>
preictal_seqs:			 4
interictal_parts -> _seqs: 4 -> 14

Opening raw data file /home/SharedFiles/Projects/EEG/Interim/annotated_raws/SNUCH03_annotated_raw.fif...
Isotrak not found
    Range : 0 ... 15757999 =      0.000 ... 78789.995 secs
Ready.
<Annotations | 5 segments: ictal (5)>
preictal_seqs:			 5
interictal_parts -> _seqs: 8 -> 17

Opening raw data file /home/SharedFiles/Projects/EEG/Interim/annotated_raws/SNUCH04_annotated_raw.fif...
Isotrak not found
    Range : 0 ... 16452999

In [22]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

# Segment Slicer

slices seg_len segments from every sequence

In [None]:
# retrieve all paths
patients, all_preictals, all_interictals = [], [], []
for patient in os.listdir(seq_dir):
    patient_pth = os.path.join(seq_dir, patient)
    if patient.startswith('SNUCH'):
        patients.append(patient)
        all_preictals.append([])
        all_interictals.append([])
        for ictalType in os.listdir(patient_pth):
            ictalType_pth = os.path.join(patient_pth, ictalType) 
            if os.path.isdir(ictalType_pth):
                if ictalType == 'preictals':
                    for preictal in os.listdir(ictalType_pth):
                        if not preictal.startswith('._'):
                            preictal_pth = os.path.join(ictalType_pth, preictal)
                            all_preictals[-1].append(preictal_pth)
                if ictalType == 'interictals':
                    for interictal in os.listdir(ictalType_pth):
                        if not interictal.startswith('._'):
                            interictal_pth = os.path.join(ictalType_pth, interictal)
                            all_interictals[-1].append(interictal_pth)

# Test                            
len(all_preictals) == len(all_interictals) == len(patients)

In [None]:
for p in range(5,len(patients)): # test
    print(patients[p])
    # create new seg_dir
    # create interictal preictal dir in seg_dir
    patient_dir = os.path.join(seg_dir, patient[p])
    preictals_dir = os.path.join(patient_dir, 'preictals')
    interictals_dir = os.path.join(patient_dir, 'interictals')
    
    for _dir in (patient_dir, preictals_dir, interictals_dir):
        if not os.path.exists(_dir):
            os.makedirs(_dir)
    
    numPreictals = len(all_preictals[p])
    numInterictals = len(all_interictals[p])
    
    print('num Preictals:', numPreictals)
    if numPreictals < numInterictals:    
        # make total number of preictal segs equivalent to total number of interictal segs
        totInterictalSegs = numInterictals * seg_cnt  
        singlePreictalSegs = totInterictalSegs // numPreictals
        totCropCnt = 0 # TEST
        print('num segs for one preictal:', singlePreictalSegs) #### TEST
        for preictal in all_preictals[p]:
            raw = mne.io.read_raw_fif(preictal, verbose=False)
            print(raw)
            totTime = int(raw.times[-1]) # sec
            startT, endT= 0, 0
            currCropCnt = 0 #### TEST
            #print('Cropping preictal segments...')
            if singlePreictalSegs > seg_cnt:
            # sliding window oversampling on preictals. 
                window_size = (totTime - seg_len) // singlePreictalSegs # same as totTime / singlePreictalSegs.startTs
                for seg in range(singlePreictalSegs):
                    endT = startT + seg_len 
                    if endT <= totTime: # double safety
                        crop = raw.copy().crop(tmin=float(startT), tmax=float(endT), include_tmax=False)
                        currCropCnt += 1 
                        # # Save
                        # fname = os.path.join(preictals_dir, patients[p] + '_preictal_' + str(seq_num+1).zfill(3) + '_raw.fif')
                        # crop.save(fname, overwrite=inp_overwrite)
                        # print(fname)
                    startT += window_size         
            else:
                num_segs = totTime // seg_len
                for seg in range(num_segs):
                    endT = startT + seg_len 
                    if endT <= totTime: # double safety
                        crop = raw.copy().crop(tmin=float(startT), tmax=float(endT), include_tmax=False)
                        currCropCnt += 1
                        # # Save
                        # fname = os.path.join(preictals_dir, patients[p] + '_preictal_' + str(seq_num+1).zfill(3) + '_raw.fif')
                        # crop.save(fname, overwrite=inp_overwrite)
                        # print(fname)
                    startT = endT  
            print('final endT:', endT) #### TEST
            print('currCropCnt expected == actual:', singlePreictalSegs==currCropCnt) #### TEST
            totCropCnt += currCropCnt #### TEST
                
            
        print('total number of expected preictal segments:', singlePreictalSegs * numPreictals)
        print('total number of actual preictal segments:', totCropCnt)
        print()
        
    print('num Interictals:', numInterictals)
    totCropCnt = 0 # TEST
    for interictal in all_interictals[p]:
        raw = mne.io.read_raw_fif(interictal, verbose=False)
        print(raw)
        totTime = int(raw.times[-1]) # sec
        num_segs = totTime // seg_len
        startT, endT= 0, 0          
        currCropCnt = 0 #### TEST
        #print('Cropping interictal segments...')
        for seg in range(num_segs):
            endT = startT + seg_len
            if endT <= totTime: # double safety
                crop = raw.copy().crop(tmin=float(startT), tmax=float(endT), include_tmax=False)
                currCropCnt += 1
                # # Save
                # fname = os.path.join(preictals_dir, patients[p] + '_preictal_' + str(seq_num+1).zfill(3) + '_raw.fif')
                # crop.save(fname, overwrite=inp_overwrite)
                # print(fname)
            startT = endT
        print('final endT:', endT) #### TEST
        print('currCropCnt expected == actual:', num_segs==currCropCnt) #### TEST
        totCropCnt += currCropCnt #### TEST

    print('total number of expected interictal segments:', seg_cnt * numInterictals)
    print('total number of actual interictal segments:', totCropCnt)
    print()
    print()
                
 

In [None]:
lst = [1,2,3,4,5,6,7,8]
def sliding_window(elements, window_size):
    if len(elements) <= window_size:
        return elements
    for i in range(len(elements) - window_size + 1):
        print(elements[i:i+window_size])
sliding_window(lst, window_size)

In [None]:
break

In [None]:
# Unnecessary.

In [None]:
eeg_channels = ch_names[:-6]

In [None]:
# plot first 3
for it in os.listdir(annot_dir)[:3]:
    fif_pth = os.path.join(annot_dir, it)
    raw = mne.io.read_raw_fif(fif_pth, verbose=False)
    picks = mne.pick_types(raw.info, eeg=True)
    raw.plot(order=picks, n_channels=21)

##### References
* https://braindecode.org/index.html

##### Notes
* https://machinelearningmastery.com/random-oversampling-and-undersampling-for-imbalanced-classification/