In [77]:
import numpy as np
import tensorflow as tf
import pandas as pd
import os
from glob import glob
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, RobustScaler


# NOTE:
# 1) Split data into train and test
# 2) Training data turns into a special input dataset of shape (size, frames=t_lookback*500//sample_reduction, features=32)
#       Because alot of the time there is 0's meaning no event, we only want to train where there is an event 
#       (note: this may have the effect of increasing false positives as we are focussing on when events occur)
#       We downsample and look back a fixed amount of time.
# 3) No downsample for test data. Look back a fixed amount of time and use a set number of frames spaced by the sample_reduction.

In [88]:
""" READING THE DATA """
DATA_PATH = 'C:\\Users\\Sean\\Desktop\\AI4E_A3' # Directory containing all the data
# DATA_PATH = 'C:\\Users\\seanb\\Desktop\\AI4E\\train'
DATA_FREQUENCY = 500 # The dataset is sampled at 500 Hz

# Sorter function to sort the files in the correct order
file_name_sorter = lambda string_list: sorted(string_list, key=lambda s: (int(re.search(r'subj(\d+)', s).group(1)), int(re.search(r'series(\d+)', s).group(1)) if re.search(r'series(\d+)', s) else 0))


# Take the first 6 series of each subject to be the training set
train_data_files = file_name_sorter(glob(os.path.join(DATA_PATH,'train', 'subj*_series[1-6]_data.csv')))
train_filtered_data_files = file_name_sorter(glob(os.path.join(DATA_PATH,'train_filtered', 'subj*_series[1-6]_*.csv')))
train_event_files = file_name_sorter(glob(os.path.join(DATA_PATH, 'train', 'subj*_series[1-6]_events.csv')))

# Take the 7th and 8th series to be the test set
test_data_files = file_name_sorter(glob(os.path.join(DATA_PATH, 'train', 'subj*_series[7-8]_data.csv')))
test_filtered_data_files = file_name_sorter(glob(os.path.join(DATA_PATH,'train_filtered', 'subj*_series[7-8]_*.csv')))
test_event_files = file_name_sorter(glob(os.path.join(DATA_PATH, 'train', 'subj*_series[7-8]_events.csv')))



# Remove the first column and convert to NUMPY array, each row is a sample and each column is a feature.

x_train_data_series = [pd.read_csv(file, index_col=None, header=0).iloc[:,1:].to_numpy(dtype=np.float16) for file in train_data_files] # List of the training time series' from each subj_series
# x_train_data_series = [pd.read_csv(file, index_col=None, header=0).iloc[:,1:].to_numpy(dtype=np.float16) for file in train_filtered_data_files] # FILTERED
y_train_data_series = [pd.read_csv(file, index_col=None, header=0).iloc[:,1:].to_numpy(dtype=np.float16) for file in train_event_files]

x_test_data_series = [pd.read_csv(file, index_col=None, header=0).iloc[:,1:].to_numpy(dtype=np.float16) for file in test_data_files]
# x_test_data_series = [pd.read_csv(file, index_col=None, header=0).iloc[:,1:].to_numpy(dtype=np.float16) for file in test_filtered_data_files] # FILTERED TODO: whether to use filtered or not
y_test_data_series = [pd.read_csv(file, index_col=None, header=0).iloc[:,1:].to_numpy(dtype=np.float16) for file in test_event_files]

In [89]:
""" SCALING """
scaler = RobustScaler()

scaler = scaler.fit(np.concatenate(x_train_data_series, axis=0))

x_train_data_series = [scaler.transform(series) for series in x_train_data_series]
x_test_data_series = [scaler.transform(series) for series in x_test_data_series]

In [90]:
""" CREATING TRAINING DATA FORMAT """
def generate_input_dataset(x_train_data_series, y_train_data_series, t_lookback=2, sample_reduction=50):
    """ Generate the input dataset for the LSTM model. This is done by looking back a fixed amount of time and downsampling the data."""

    # t_lookback specifies how many seconds to look back for LSTM
    # sample_reduction specifies how much to downsample the data by (e.g. for 10 means 500/10 = 50 Hz, 50 menans 500/50 = 10 Hz). This mimics a reduced sampling rate of EEG data by taking every nth reading

    
    n_frames = t_lookback * DATA_FREQUENCY # Number of data frames to look back NOT REDUCED
    x = [] # Train data
    y = [] # Train labels

    # Go through each series/events file separately
    for series_data, series_events in zip(x_train_data_series, y_train_data_series):
        
        for i in range(n_frames-1, series_data.shape[0]-1): # Go through all the possible predictable frames (frames that have hisotrical data avaiable)
            
            if np.sum(series_events[i]) > 0: # If there is an event at this frame

                x.append(series_data[i-n_frames+sample_reduction:i+sample_reduction:sample_reduction]) # Append the data
                y.append(series_events[i]) # Append the label for the event with the historicla data in mind
                

    return np.array(x), np.array(y)

x_train, y_train = generate_input_dataset(x_train_data_series, y_train_data_series)



In [100]:
""" SHUFFLE THE DATA """
shuffle_idx = np.random.permutation(x_train.shape[0])
x_train = x_train[shuffle_idx]
y_train = y_train[shuffle_idx]

In [101]:
""" BUILD THE MODEL """
model = tf.keras.models.Sequential(layers=[

    tf.keras.layers.Input((x_train.shape[1], x_train.shape[2]), name='input'), # Input should be frames*features (e.g. 500*32)

    # tf.compat.v1.keras.layers.CuDNNLSTM(units=128, name='LSTM1', return_sequences=True),
    # tf.keras.layers.Dropout(0.5, name='dropout1'),

    
    # tf.compat.v1.keras.layers.CuDNNLSTM(units=128, name='LSTM2', return_sequences=True),
    # tf.keras.layers.Dropout(0.5, name='dropout2'),

    
    tf.compat.v1.keras.layers.CuDNNLSTM(units=64, name='LSTM3'),
    tf.keras.layers.Dropout(0.5, name='dropout3'),

    tf.keras.layers.Dense(6, activation='sigmoid', name='dense') # Output layer, 6 independent possible events from 0-1

    ], name='LSTM_model'
)

loss=tf.keras.losses.binary_crossentropy # Binary cross-entropy treats each label prediction as a separate binary classification problem.
optimizer=tf.keras.optimizers.Adam()


model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy', tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall'), tf.keras.metrics.AUC(name='auc')]) # AUC - minmise false positives, maximise true positives

# Recall is low, meaning its not getting alot of the true positives. This may be expected since if we look at the training data, there are alot of 0's and only a few 1's. This means that the model will be biased towards predicting 0's. This is a problem since we want to predict the 1's. We can try to fix this by weighting the loss function to penalise false negatives more than false positives. This will make the model more likely to predict 1's.

model.summary()

Model: "LSTM_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 LSTM3 (CuDNNLSTM)           (None, 64)                25088     
                                                                 
 dropout3 (Dropout)          (None, 64)                0         
                                                                 
 dense (Dense)               (None, 6)                 390       
                                                                 
Total params: 25,478
Trainable params: 25,478
Non-trainable params: 0
_________________________________________________________________


In [104]:
history = model.fit(x_train, y_train, epochs=5, batch_size=1024, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
# model.evaluate(x_train, y_train)
# We want to evaluate the whole time series and potentially get a plot to compare the predicted events with the actual events (noting the different time scales)

# plt.figure(figsize=(15,10))
# plt.legend([1,2,3,4,5,6], loc='upper left')
# plt.plot(y_test_data_series[0])

# np.array([x_test_data_series[0][0:5], x_test_data_series[0][5:10]]).shape
# model.predict(np.array([x_test_data_series[0][0:5], x_test_data_series[0][5:10]]))


In [48]:
def prediction_test_time_series(x_test_series, y_test, t_lookback=2, sample_reduction=50):
    # Return the time series prediction for a test time series

    # Make sure t_lookback is the same as how it was trained
    
    

    n_frames = t_lookback * DATA_FREQUENCY  # Number of data frames to look back 
    # print(n_frames)

    print(x_test_series.shape[0])

    x_test = [] # make test be i * n_frames * n_features

    # Need to get the x_test into the form of (i, n_frames, n_features) where i represents which frame we are predicting for

    for i in range(n_frames, x_test_series.shape[0]):
        x_test.append(x_test_series[i-n_frames:i:sample_reduction])

    x_test = tf.convert_to_tensor(x_test, dtype=tf.float16)
    return x_test




# (x_test_data_series[0].shape)

# x_train[0].shape

tmp = prediction_test_time_series(x_test_data_series[0], y_test_data_series[0])
# TODO: Need to do a time series prediction for each of the test data series

# TODO: Need to actually train a model as well to see how it goes on the train data series


119561


In [52]:
print(tmp.shape)

print(x_test_data_series[0].shape)

(119061, 5, 32)
(119561, 32)
