<a href="https://colab.research.google.com/github/yokahealthcare/Anasa-GAN/blob/master/%5BMain_v2%5D%20%5BAE%5D%20Breathing_Wave.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AE - AutoEncoder

## Project Structure

### PART 1 : Data Preprocessing


1.   Filter out rows that contain zero values (zeros in the first column are allowed)
2.   Separate the data(q) according to labels
3.   Separate q into data(X) and label(Y)
4.   Normalize the data
> X normalized using MinMaxScaler between 0 and 1
>
> Y normalized using one-hot encoding

### PART 2 : Neural Network
1.   NN Structure
2.   Optimizer : Adam(learning_rate=0.001)
3.   Loss      : MAE (Mean Absolute Error)

### PART 3 : Training
1.   Training
2.   Smoothing using Savitzky-Golay filter


## PART 1 : Data Preprocessing

### Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

### Download the dataset

In [None]:
# Load the breathing-waveform CSV from GitHub; the trailing "notes" column is discarded.
DATASET_URL = "https://raw.githubusercontent.com/yokahealthcare/Anasa-GAN/master/dataset/breathing_waveform_data.csv"
df = pd.read_csv(DATASET_URL).iloc[:, :-1]

### Filter the zero values
> This selects every row that contains a zero in any column except the first
>
> REASON : I think it is natural for the first column to be 0.0 (because the time (X) is still at 0 seconds)

In [None]:
# Rows holding at least one exact zero anywhere after the first column
zeros_val = df.loc[(df.iloc[:, 1:] == 0).any(axis=1)]

In [None]:
# Display the rows that were flagged for containing a zero
zeros_val

### Drop the rows that contain zero values

In [None]:
# Remove the flagged rows directly by their index labels. This avoids the
# element-wise comparison and NaN masking of the whole frame that
# `df[~df.isin(zeros_val)]` performs. The trailing dropna() is kept so rows
# that already contained missing values are still discarded, as before.
df = df.drop(index=zeros_val.index).dropna()

In [None]:
# Display the frame that remains after the zero-containing rows were removed
df

In [None]:
# Show how many rows remain for each value of the 'labels' column
df['labels'].value_counts()

### Separate the data according to their labels

In [None]:
# One frame per breathing label; each keeps only the rows whose 'labels' value matches.
normal_df = df.loc[df['labels'].eq("normal")]
quick_df = df.loc[df['labels'].eq("quick")]
hold_df = df.loc[df['labels'].eq("hold")]
deep_df = df.loc[df['labels'].eq("deep")]
deep_quick_df = df.loc[df['labels'].eq("deep_quick")]

### Separate the data (X) and the label (Y)

In [None]:
# Every column except the last is data (X); the last column is the label (Y).
# Only the "normal" rows are used from here on.
X, Y = normal_df.iloc[:, :-1], normal_df.iloc[:, -1]

### Normalize the data

In [None]:
# Data (X)
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max, then rescale X with them. From here on X is the
# array returned by the scaler rather than the original frame.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Label (Y)
from sklearn.preprocessing import LabelEncoder
# `keras.utils.np_utils` is not importable on current Keras releases;
# `to_categorical` is exposed directly by `keras.utils`.
from keras.utils import to_categorical

# Encode class values as integers, e.g. [0,0,0,0,0,0,0,1,1,1,1,1,2,2,2,2]
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)

# Convert the integers to dummy variables (i.e. one-hot encoded rows)
hot_y = to_categorical(encoded_Y)

## PART 2 : Set Up the Neural Network

### Importing Neural Network Libraries

In [None]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LeakyReLU
from keras.layers import Activation
from tensorflow.keras import activations
from tensorflow.keras.models import Model

### Neural Network : Recurrent Neural Network

In [None]:
feature = 5
# Fold each 85-value row into 17 timesteps of 5 values -> (samples, 17, 5),
# i.e. 5 indicators are used per sequence step of every sample/row.
X_3d = np.reshape(X, (len(X), 85 // feature, feature))
X_3d.shape

In [None]:
class AE(Model):
    """LSTM autoencoder for 85-point waves presented as (17, 5) sequences.

    The encoder compresses a (17, 5) sequence into a 16-value code squashed by
    a sigmoid. The decoder reads that code as an (8, 2) sequence and expands
    it into 85 sigmoid-activated outputs.

    Args:
        dropout_rate: Fraction of units dropped after every LSTM layer.
        init_mode: Kernel initializer of the first LSTM in each half.
        init_recurrent: Recurrent initializer of the first LSTM in each half.
        init_units: Number of units in every LSTM layer.
    """

    # Bottleneck layout: the encoder's outputs are reinterpreted as
    # LATENT_STEPS timesteps of LATENT_FEATURES values for the LSTM decoder.
    LATENT_STEPS = 8
    LATENT_FEATURES = 2

    def __init__(self, dropout_rate=0.2, init_mode='glorot_uniform', init_recurrent='orthogonal', init_units=60):
        super(AE, self).__init__()

        ### Encoder ###
        self.encoder = Sequential()

        # first layer
        self.encoder.add(LSTM(units=init_units, kernel_initializer=init_mode, recurrent_initializer=init_recurrent, return_sequences=True, input_shape=(17, 5)))
        self.encoder.add(Dropout(dropout_rate))    # Ignore xx% of the neurons (e.g. 50 * 20% = 10 neurons are ignored)

        # second layer
        self.encoder.add(LSTM(units=init_units, return_sequences=True))
        self.encoder.add(Dropout(dropout_rate))

        # final recurrent layer: returns only the last step, collapsing the sequence
        self.encoder.add(LSTM(units=init_units))
        self.encoder.add(Dropout(dropout_rate))

        # last layer: the bottleneck code (8 * 2 = 16 values)
        self.encoder.add(Dense(units=self.LATENT_STEPS * self.LATENT_FEATURES))
        self.encoder.add(Activation(activations.sigmoid))

        ### Decoder ###
        # The decoder used to live inside a string literal, so `self.decoder`
        # was never created and `call` could not run. This is the LSTM variant,
        # whose input shape matches the reshape performed in `call`.
        self.decoder = Sequential()

        # first layer
        self.decoder.add(LSTM(units=init_units, kernel_initializer=init_mode, recurrent_initializer=init_recurrent, return_sequences=True, input_shape=(self.LATENT_STEPS, self.LATENT_FEATURES)))
        self.decoder.add(Dropout(dropout_rate))

        # second layer
        self.decoder.add(LSTM(units=init_units, return_sequences=True))
        self.decoder.add(Dropout(dropout_rate))

        # final recurrent layer
        self.decoder.add(LSTM(units=init_units))
        self.decoder.add(Dropout(dropout_rate))

        # last layer: one output per point of the flat 85-value wave
        self.decoder.add(Dense(units=85))
        self.decoder.add(Activation(activations.sigmoid))

    def call(self, x):
        """Encode `x`, view the code as a short sequence, and decode it.

        Args:
            x: Batch of input sequences for the encoder.

        Returns:
            The decoder output for the batch.
        """
        encoded = self.encoder(x)

        # -1 lets TensorFlow infer the batch size; `None` is not a valid
        # dimension for tf.reshape.
        encoded = tf.reshape(encoded, (-1, self.LATENT_STEPS, self.LATENT_FEATURES))

        return self.decoder(encoded)

In [None]:
from keras.optimizers import Adam

# Instantiate the autoencoder with its default hyper-parameters and prepare it
# for training with an Adam optimizer and a mean-absolute-error loss.
autoencoder = AE()
autoencoder.compile(loss='mae', optimizer=Adam(learning_rate=0.001))

## PART 3 : Training

In [None]:
import multiprocessing

# Report how many CPU cores this machine exposes.
cpu_count = multiprocessing.cpu_count()
message = f"Number of CPU cores: {cpu_count}"
print(message)

In [None]:
# Fit the Model
# Training is pinned to the CPU. The network receives the 3-D sequences (X_3d)
# and is trained to reproduce the flat rows (X). `model` holds the value
# returned by fit(), whose `.history` is plotted afterwards.
fit_options = dict(epochs=10, batch_size=32, shuffle=True, verbose=1)
with tf.device('/device:CPU:0'):
    model = autoencoder.fit(X_3d, X, **fit_options)

### Plot the training loss graph

In [None]:
# Loss values recorded in the history object returned by fit()
training_loss = model.history["loss"]
plt.plot(training_loss, label="Training Loss")
plt.legend()

### Define a function to smooth the wave curve

In [None]:
# smoothing the wave of decoded_data
from scipy.signal import savgol_filter

def smooth_wave(wave, window_length=11, polyorder=2):
  """Smooth `wave` with a Savitzky-Golay filter.

  Args:
    wave: Array of samples, passed unchanged to `savgol_filter`.
    window_length: Number of samples in the filter window. The default is
      odd (11) because SciPy releases that require an odd window reject
      the previous value of 10.
    polyorder: Order of the polynomial fitted inside each window; must be
      smaller than `window_length`.

  Returns:
    The filtered array, with the same shape as `wave`.
  """
  return savgol_filter(wave, window_length, polyorder)

### Testing the model

In [None]:
# Compress every input sequence into its flat code vector
encoded_data = autoencoder.encoder(X_3d).numpy()

# The decoder is fed (8, 2) sequences, exactly as AE.call reshapes the code
# before decoding, so the flat code must be reshaped the same way here.
decoded_data = autoencoder.decoder(encoded_data.reshape(-1, 8, 2)).numpy()

# Apply the Savitzky-Golay filter
decoded_data = smooth_wave(decoded_data)

### Calculate the Mean Absolute Error (MAE) from all data

In [None]:
# Error between the smoothed reconstruction and the scaled input; `loss` is
# indexed per sample when the plots are titled later on.
loss = tf.keras.losses.mae(decoded_data, X)
mean_error_percent = np.mean(loss * 100)
print("Mean Average Error : {}".format(mean_error_percent))

### Plot the result

In [None]:
num_samples = 20
row = num_samples // 5

# Create figure and axis objects.
# squeeze=False keeps `ax` two-dimensional even when there is only one row,
# so the grid is traversed the same way for any num_samples >= 5.
fig, ax = plt.subplots(row, 5, figsize=(20, row * 3), squeeze=False)

timesteps = np.arange(X.shape[1])
# ax.flat walks the grid row by row, so panel `idx` shows sample `idx`.
for idx, axis in enumerate(ax.flat):
    original, reconstructed = X[idx], decoded_data[idx]

    # Plot each time series: input in blue, reconstruction in green, and the
    # gap between them shaded.
    axis.plot(original, 'b')
    axis.plot(reconstructed, 'g')
    axis.fill_between(timesteps, reconstructed, original, color='lightcoral')
    axis.set_title("Data {}; err : {:.2f}%".format(idx, loss[idx]*100))

# legend (attached to the current axes, i.e. the last panel created)
plt.legend(labels=["Input", "Reconstruction", "Error"], loc='center left', bbox_to_anchor=(1, 0.5))

# Customize the overall layout
plt.tight_layout()

# Show
plt.show()



## PART 4 : Generating New Data

In [None]:
# Set the number of data points to generate
num_samples = 10

# Draw uniform random values in [0, 1): one 85-value row per sample
noise_vectors = np.random.rand(num_samples, 85)

# Fold each row into the sequence layout the model takes as input
feature = 5
noise_vectors = noise_vectors.reshape(num_samples, 85 // feature, feature)
# (num_samples, 17, 5)
# 5 indicators are used per sequence step of every sample/row

# Generate new data by running the noise through the whole autoencoder.
# NOTE(review): predict() applies the encoder and then the decoder, so these
# vectors act as model inputs rather than as latent codes - confirm intent.
generated_data = autoencoder.predict(noise_vectors)

In [None]:
# Create figure and axis objects
row = num_samples // 5

# squeeze=False keeps `ax` two-dimensional even when there is only one row,
# so the grid is traversed the same way for any num_samples >= 5.
fig, ax = plt.subplots(row, 5, figsize=(20, row * 3), squeeze=False)

# ax.flat walks the grid row by row, so panel `idx` shows sample `idx`.
for idx, axis in enumerate(ax.flat):
    # Plot each time series
    axis.plot(generated_data[idx], 'b')
    axis.set_title("Data {}".format(idx))

# legend (attached to the current axes, i.e. the last panel created)
plt.legend(labels=["Generated"], loc='center left', bbox_to_anchor=(1, 0.5))

# title
plt.suptitle("Generated Data without Smoothing")

# Customize the overall layout
plt.tight_layout()

# Show
plt.show()

## Smoothing it

In [None]:
# Apply the Savitzky-Golay filter (via smooth_wave) to the generated waves,
# replacing the unsmoothed array in place of the same name
generated_data = smooth_wave(generated_data)

In [None]:
# Create figure and axis objects
row = num_samples // 5

# squeeze=False keeps `ax` two-dimensional even when there is only one row,
# so the grid is traversed the same way for any num_samples >= 5.
fig, ax = plt.subplots(row, 5, figsize=(20, row * 3), squeeze=False)

# ax.flat walks the grid row by row, so panel `idx` shows sample `idx`.
for idx, axis in enumerate(ax.flat):
    # Plot each time series
    axis.plot(generated_data[idx], 'b')
    axis.set_title("Data {}".format(idx))

# legend (attached to the current axes, i.e. the last panel created)
plt.legend(labels=["Generated"], loc='center left', bbox_to_anchor=(1, 0.5))

# title
plt.suptitle("Generated Data with Smoothing")

# Customize the overall layout
plt.tight_layout()

# Show
plt.show()