# BREATHING WAVE
## DEEP LEARNING - LSTM
### 04 March 2023

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

df = pd.read_csv("breathing_waveform_data.csv").iloc[:, :-1] # get rid of last column ("notes")

X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

In [2]:
# Check if the data do not have any NULL 
print("X have a null? \t{}".format(X.isnull().values.any()))
print("Y have a null? \t{}".format(Y.isnull().values.any()))

X have a null? 	False
Y have a null? 	False


In [3]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,75,76,77,78,79,80,81,82,83,84
0,0.483309,0.459790,0.431024,0.376565,0.295734,0.193290,0.066060,-0.083445,-0.247221,-0.409374,...,0.332737,0.391514,0.452677,0.521407,0.595845,0.661691,0.702932,0.708613,0.682564,0.637765
1,-2.044518,-1.935588,-1.808629,-1.667919,-1.513497,-1.348760,-1.171044,-0.972509,-0.759554,-0.547793,...,0.325687,0.138731,-0.053860,-0.241691,-0.417603,-0.582320,-0.738485,-0.889731,-1.037066,-1.174654
2,-1.213535,-1.269056,-1.323306,-1.375251,-1.430062,-1.485479,-1.529200,-1.557172,-1.574662,-1.575457,...,0.902226,0.947940,0.996154,1.035743,1.049543,1.024204,0.954716,0.844505,0.702445,0.541555
3,-0.914806,-0.887726,-0.856065,-0.823527,-0.794551,-0.768074,-0.740895,-0.713364,-0.685445,-0.652020,...,-0.407344,-0.478218,-0.571465,-0.684115,-0.817078,-0.966231,-1.122537,-1.264759,-1.376908,-1.461059
4,-1.547469,-1.458818,-1.362120,-1.264829,-1.164948,-1.060064,-0.954496,-0.849448,-0.742812,-0.636614,...,0.322969,0.227050,0.130983,0.041438,-0.038034,-0.106152,-0.163048,-0.210926,-0.253102,-0.290270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26395,-0.152463,-0.164723,-0.165409,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,...,-0.345803,-0.336787,-0.306774,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372
26396,-0.164723,-0.165409,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,0.188025,...,-0.336787,-0.306774,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,0.089958
26397,-0.165409,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,0.188025,0.240939,...,-0.306774,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,0.089958,0.179209
26398,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,0.188025,0.240939,0.294399,...,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,0.089958,0.179209,0.264014


In [4]:
Y.value_counts()

normal        19734
quick          2667
hold           2133
deep           1066
deep_quick      800
Name: labels, dtype: int64

## Fix Random Seed for Reproducibility

In [5]:
# fix random seed for reproducibility
seed = 21
tf.random.set_seed(seed)

### Program Starting
# PART 1 : Data Preprocessing

## Importing Imbalanced Libraries

In [6]:
import imblearn
print(imblearn.__version__)

0.10.1


## Removing Class Overlapped (removing Tomek Links)
> **Tomek links** identify pairs of samples from different classes that are close to each other and potentially contribute to class overlap or ambiguity.
>
> **Conclusion** : Nothing is removed. Indicate that the data is good and there is no ambiguity

In [7]:
def tomek_links(X, Y):
    # define the undersampling method
    undersample = imblearn.under_sampling.TomekLinks()
    # transform the dataset
    return undersample.fit_resample(X, Y)

In [8]:
X, Y = tomek_links(X, Y)

In [9]:
Y.value_counts()

normal        19734
quick          2667
hold           2133
deep           1066
deep_quick      800
Name: labels, dtype: int64

## Undersampling

In [10]:
# NearMiss
def near_miss(X, Y, version, neighbors=3):
    # define the undersampling method
    undersample = imblearn.under_sampling.NearMiss(version=version, n_neighbors=3)
    # transform the dataset
    return undersample.fit_resample(X, Y)

# RandomUnderSample
labels = {
    "normal" : 800,
    "quick" : 800,
    "hold" : 800,
    "deep" : 800,
    "deep_quick" : 800
}

def rus(X, Y, strategy=labels):
    # define the undersampling method
    undersample = imblearn.under_sampling.RandomUnderSampler(sampling_strategy=strategy)
    # transform the dataset
    return undersample.fit_resample(X, Y)

## CNN (CondensedNearestNeighbour) error

In [11]:
X, Y = rus(X, Y)
Y.value_counts()

deep          800
deep_quick    800
hold          800
normal        800
quick         800
Name: labels, dtype: int64

## Augmented Data (UP & DOWN 0.01)
### Current Shape now  : 26400 x 3 = 79200

In [12]:
# combine X and Y first
df = pd.concat([X, Y], axis=1)

# data augmentation
up = df
down = df
up.iloc[:, :-1] += 0.01   # increase value by 0.01
down.iloc[:, :-1] -= 0.01 # decrease value by 0.01
df = pd.concat([df, up, down], ignore_index=True)

In [13]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,labels
0,-0.268313,-0.173284,-0.069770,0.047600,0.188498,0.347947,0.507962,0.658035,0.796925,0.922728,...,-1.354905,-1.393633,-1.390611,-1.348206,-1.296693,-1.250288,-1.212452,-1.186786,-1.179858,deep
1,-0.040052,-0.043108,-0.044297,-0.042844,-0.042370,-0.042329,-0.040341,-0.037264,-0.033991,-0.030304,...,-0.023365,-0.021366,-0.018063,-0.013189,-0.007278,-0.001877,0.002095,0.004923,0.007086,deep
2,0.230526,0.108867,-0.024490,-0.164305,-0.300901,-0.424737,-0.529279,-0.612703,-0.677537,-0.723174,...,-0.227252,-0.370599,-0.497504,-0.605621,-0.700073,-0.786514,-0.862373,-0.932655,-0.993687,deep
3,-1.024466,-1.187201,-1.347571,-1.492364,-1.615470,-1.723598,-1.823874,-1.915847,-1.995332,-2.060951,...,1.152974,1.162546,1.170266,1.172485,1.162577,1.143522,1.119604,1.089960,1.049195,deep
4,-0.428839,-0.332615,-0.248961,-0.176027,-0.112658,-0.059278,-0.004777,0.053416,0.101563,0.138137,...,-0.361313,-0.368767,-0.370436,-0.370003,-0.369832,-0.372933,-0.381992,-0.396091,-0.413242,deep
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.121465,0.146261,0.167992,0.169967,0.144068,0.107878,0.083989,0.088344,0.112372,0.124531,...,0.443562,0.315667,0.188570,0.068180,-0.036559,-0.113396,-0.166452,-0.216866,-0.276723,quick
11996,-0.160531,-0.229866,-0.293130,-0.346173,-0.385350,-0.411390,-0.423413,-0.417905,-0.393790,-0.353185,...,0.076072,0.092789,0.108863,0.121987,0.128518,0.130138,0.130045,0.132653,0.141146,quick
11997,-0.253933,-0.273959,-0.273578,-0.278892,-0.300258,-0.324619,-0.332424,-0.325497,-0.306345,-0.272196,...,0.020242,0.039997,0.042551,0.025041,-0.016089,-0.075329,-0.124937,-0.142297,-0.126571,quick
11998,0.077422,0.070160,0.052233,0.036485,0.034710,0.042757,0.044001,0.029344,-0.005060,-0.058217,...,-0.468333,-0.373065,-0.295179,-0.230432,-0.175512,-0.134452,-0.110760,-0.101800,-0.100119,quick


In [14]:
# separate again to X and Y
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

In [15]:
Y.value_counts()

deep          2400
deep_quick    2400
hold          2400
normal        2400
quick         2400
Name: labels, dtype: int64

## Hot Encoded The Label Data 

In [16]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# encode class values as integers [0,0,0,0,0,0,0,1,1,1,1,1,2,2,2,2]
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# convert integers to dummy variables (i.e. one hot encoded)
hot_y = np_utils.to_categorical(encoded_Y)

In [17]:
hot_y

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

## Scale The Training Data (STD)

In [18]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

## Reshaping The Training Data to 3-Dimensional Numpy Array
### STRUCTURE : (batch_size, timestep, feature)

In [19]:
feature = 5
X = np.reshape(X, (X.shape[0], int(85/feature), feature))
# (26400, 17, 5)
# 5 indicator will be used per sequence/timestep per sample/row

# PART 2 : Building The RNN

In [20]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout

## Creating Layer of RNN

In [None]:
# Configuration for Model Structure
from keras.optimizers import Adam
_optimizer = Adam()
_loss = "categorical_crossentropy"
_metric = ["accuracy"]

In [None]:
def create_model(dropout_rate=0.2, init_mode='glorot_uniform', init_recurrent='orthogonal', init_units=60):
    classifier = Sequential()

    # first layer
    classifier.add(LSTM(units=init_units, kernel_initializer=init_mode, recurrent_initializer=init_recurrent, return_sequences=True, input_shape=(17, 5)))
    classifier.add(Dropout(dropout_rate))    # Ignore xx% of the neuron (ex. 50 * 20% = 10 neuoron will be ignored) 

    # second layer
    classifier.add(LSTM(units=init_units, return_sequences=True))
    classifier.add(Dropout(dropout_rate))

    # third layer
    # classifier.add(LSTM(units=20, return_sequences=True))
    # classifier.add(Dropout(dropout_rate))

    # fourth layer
    classifier.add(LSTM(units=init_units))
    classifier.add(Dropout(dropout_rate))

    # last layer
    classifier.add(Dense(units=5, activation='softmax'))

    # Compile
    classifier.compile(optimizer=_optimizer, loss=_loss, metrics=_metric)
    
    return classifier

# PART 3 : Training Time

## Setting up the GridSearchCV

In [None]:
import multiprocessing

cpu_count = multiprocessing.cpu_count()

print(f"Number of CPU cores: {cpu_count}")

In [None]:
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasClassifier

model = KerasClassifier(model=create_model)

param_grid = {
    'epochs': [15, 20],
    'batch_size': [32, 64],
    'model__dropout_rate': [0.2, 0.3],
    'model__init_mode': ['glorot_uniform', 'he_uniform'],
    'model__init_recurrent': ['glorot_uniform', 'orthogonal'],
    'model__init_units': [17, 30, 60]
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, verbose=5, refit=True, n_jobs=cpu_count-2)

## Training

In [None]:
with tf.device('/device:CPU:0'):
    grid_result = grid.fit(X, hot_y)

## Summarize the Result

In [None]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

## Plot The Best Estimator, Param, and Score

In [None]:
print("Best Estimator")
print(grid_result.best_estimator_)
print("Best Param")
print(grid_result.best_params_)
print("Best Score")
print(grid_result.best_score_)