# GAN - BREATHING WAVE
## Generative Adversarial Networks
### 05 May 2023
***

In [1]:
!pip install sdv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## PART 1 : Data Preprocessing

### Importing Libraries

In [2]:
import pandas as pd
import numpy as np

### Import Dataset

In [3]:
# Fetch the breathing-waveform CSV and keep every column except the last one ("notes").
df = pd.read_csv(
    "https://raw.githubusercontent.com/yokahealthcare/Anasa-GAN/master/dataset/breathing_waveform_data.csv"
).iloc[:, :-1]

### Filter the zero values
> This selects every row that contains a zero in any column except the first one.
>
> REASON: I think it is natural for the first column to be 0.0, because the time (X) is still at 0 seconds.

In [4]:
# Collect every row that holds a 0 in at least one column other than the first.
zeros_val = df.loc[(df.iloc[:, 1:] == 0).any(axis=1)]

In [5]:
# Display the rows that were flagged as containing a zero.
zeros_val

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,labels
5473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5474,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5476,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
5477,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
6143,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
6144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
6145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


### Drop the rows that contain zero values

In [6]:
# Blank out (NaN) every cell belonging to a flagged row, then drop all rows that contain NaN.
df_formatted = df.where(~df.isin(zeros_val)).dropna()

In [7]:
# Display the frame that remains after the flagged rows were removed.
df_formatted

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,labels
0,0.483309,0.459790,0.431024,0.376565,0.295734,0.193290,0.066060,-0.083445,-0.247221,-0.409374,...,0.391514,0.452677,0.521407,0.595845,0.661691,0.702932,0.708613,0.682564,0.637765,deep
1,-2.044518,-1.935588,-1.808629,-1.667919,-1.513497,-1.348760,-1.171044,-0.972509,-0.759554,-0.547793,...,0.138731,-0.053860,-0.241691,-0.417603,-0.582320,-0.738485,-0.889731,-1.037066,-1.174654,deep
2,-1.213535,-1.269056,-1.323306,-1.375251,-1.430062,-1.485479,-1.529200,-1.557172,-1.574662,-1.575457,...,0.947940,0.996154,1.035743,1.049543,1.024204,0.954716,0.844505,0.702445,0.541555,deep
3,-0.914806,-0.887726,-0.856065,-0.823527,-0.794551,-0.768074,-0.740895,-0.713364,-0.685445,-0.652020,...,-0.478218,-0.571465,-0.684115,-0.817078,-0.966231,-1.122537,-1.264759,-1.376908,-1.461059,deep
4,-1.547469,-1.458818,-1.362120,-1.264829,-1.164948,-1.060064,-0.954496,-0.849448,-0.742812,-0.636614,...,0.227050,0.130983,0.041438,-0.038034,-0.106152,-0.163048,-0.210926,-0.253102,-0.290270,deep
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26395,-0.152463,-0.164723,-0.165409,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,...,-0.336787,-0.306774,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,quick
26396,-0.164723,-0.165409,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,0.188025,...,-0.306774,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,0.089958,quick
26397,-0.165409,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,0.188025,0.240939,...,-0.280607,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,0.089958,0.179209,quick
26398,-0.152623,-0.118115,-0.066218,-0.010253,0.041637,0.092217,0.140510,0.188025,0.240939,0.294399,...,-0.269843,-0.260062,-0.229981,-0.167654,-0.082300,0.004372,0.089958,0.179209,0.264014,quick


### Separate the X and Y

In [8]:
# Every column but the last goes to X; the last column goes to Y.
X, Y = df_formatted.iloc[:, :-1], df_formatted.iloc[:, -1]

### Undersampling

In [9]:
!pip install imblearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import imblearn
print(imblearn.__version__)

0.10.1


In [11]:
# NearMiss
def near_miss(X, Y, version, neighbors=3):
    """Undersample (X, Y) with imbalanced-learn's NearMiss.

    Parameters
    ----------
    X : features passed to ``fit_resample``.
    Y : target labels passed to ``fit_resample``.
    version : forwarded to ``NearMiss(version=...)``.
    neighbors : forwarded to ``NearMiss(n_neighbors=...)``; defaults to 3.

    Returns
    -------
    Whatever ``NearMiss.fit_resample(X, Y)`` returns (the resampled X and Y).
    """
    # define the undersampling method; honour the caller's `neighbors`
    # (it was previously ignored in favour of a hard-coded 3)
    undersample = imblearn.under_sampling.NearMiss(version=version, n_neighbors=neighbors)
    # transform the dataset
    return undersample.fit_resample(X, Y)

# RandomUnderSample
# Target number of rows to keep for each class (used as the default sampling strategy of rus()).
labels = dict.fromkeys(("normal", "quick", "hold", "deep", "deep_quick"), 800)

def rus(X, Y, strategy=labels, random_state=None):
    """Undersample (X, Y) with imbalanced-learn's RandomUnderSampler.

    Parameters
    ----------
    X : features passed to ``fit_resample``.
    Y : target labels passed to ``fit_resample``.
    strategy : forwarded as ``sampling_strategy``; defaults to the module-level
        ``labels`` dict.
    random_state : optional seed forwarded to the sampler so that the random
        selection can be reproduced. ``None`` (the default) keeps the previous
        unseeded behaviour.

    Returns
    -------
    Whatever ``RandomUnderSampler.fit_resample(X, Y)`` returns (the resampled X and Y).
    """
    # define the undersampling method
    undersample = imblearn.under_sampling.RandomUnderSampler(
        sampling_strategy=strategy,
        random_state=random_state,
    )
    # transform the dataset
    return undersample.fit_resample(X, Y)

## CNN (CondensedNearestNeighbour) error

In [12]:
# Undersample with rus()'s default strategy (the `labels` dict), overwriting X and Y,
# then show how many rows remain per class.
X, Y = rus(X, Y)
Y.value_counts()

deep          800
deep_quick    800
hold          800
normal        800
quick         800
Name: labels, dtype: int64

In [13]:
# Glue the resampled features and labels back together, side by side.
df_formatted = pd.concat((X, Y), axis="columns")

### Adding person_id

In [14]:
# library and function to create person ID
import uuid

def generate_id():
    """Return a newly generated random UUID (uuid4) as a string."""
    return str(uuid.uuid4())

In [15]:
# create a list of generated person id: exactly one unique id per row of df_formatted
length = df_formatted.shape[0]
temp_id = []
seen_ids = set()  # set membership is O(1); the list keeps generation order

# Loop until enough unique ids exist. (Incrementing `length` inside a
# `for i in range(length)` loop cannot add iterations, so a collision used to
# leave the list one entry short.)
while len(temp_id) < length:
    new_id = generate_id()
    if new_id not in seen_ids:
        seen_ids.add(new_id)
        temp_id.append(new_id)

In [16]:
# Turn the list of ids into a one-column DataFrame named 'person_id'
temp_id = pd.DataFrame(data=np.asarray(temp_id), columns=['person_id'])

In [17]:
# Give both frames a fresh 0..n-1 index so that they line up row by row
temp_id, df_formatted = (
    frame.reset_index(drop=True) for frame in (temp_id, df_formatted)
)

# person_id becomes the first column, followed by the columns of df_formatted
df = pd.concat((temp_id, df_formatted), axis="columns")

### Metadata Creation

In [18]:
# importing libraries
from sdv.metadata import SingleTableMetadata
# Create the metadata object that the following cells populate and adjust.
metadata = SingleTableMetadata()

In [19]:
# auto-detect the type of data in the columns dataset
# (`df` is the frame with the person_id column prepended)
metadata.detect_from_dataframe(data=df)

In [20]:
# update person_id type to 'ID' type
# (overrides whatever sdtype the auto-detection assigned to this column)
metadata.update_column(
    column_name='person_id',
    sdtype='id')

In [21]:
# ABANDONED - setting a primary key raised a runtime error

# set 'person_id' as primary key
# metadata.set_primary_key(column_name='person_id')

In [22]:
# set sequence key
# NOTE(review): person_id is a freshly generated UUID for every row, so each
# sequence presumably contains exactly one row — confirm this is the intended
# grouping for a sequential (PAR) model.
metadata.set_sequence_key(column_name='person_id')

## PART 2 : Generating Synthetic Data

### Importing DeepEcho (using PARSynthesizer) Library

In [23]:
from sdv.sequential import PARSynthesizer



### define PARSynthesizer

In [24]:
# Sanity check: show the type of `df` before it is handed to the synthesizer.
type(df)

pandas.core.frame.DataFrame

In [None]:
# Step 1: Create the synthesizer
synthesizer = PARSynthesizer(
    metadata, verbose=True)

# Step 2: Train the synthesizer
# NOTE: the recorded run needed roughly 11 minutes for the first of 128 epochs,
# so a full fit is a multi-hour job.
synthesizer.fit(df)

# Step 3: Generate synthetic data
# PARSynthesizer.sample() is parameterised by the number of sequences to
# generate (`num_sequences`); it has no `num_rows` argument.
synthetic_data = synthesizer.sample(num_sequences=100)

Epoch 1 | Loss 0.0008876821957528591:   1%|          | 1/128 [10:56<23:09:45, 656.58s/it]

### Evaluate Data

In [None]:
from table_evaluator import TableEvaluator

# Print the shapes of the real and the synthetic table side by side.
print(df.shape, synthetic_data.shape)
# person_id and labels are declared as the categorical columns.
table_evaluator = TableEvaluator(df, synthetic_data, cat_cols=['person_id','labels'])

# Render the evaluator's visual comparison of the two tables.
table_evaluator.visual_evaluation()

In [None]:
# `samples` was not assigned anywhere earlier in the notebook, so this cell raised
# a NameError on a fresh kernel. Derive it from the synthesizer output instead.
# NOTE(review): assumes the downstream cells expect the generated table without the
# person_id column (feature columns first, `labels` last) — confirm.
samples = synthetic_data.drop(columns=['person_id'])
samples

## PART 3 : Data Preprocessing

### Preprocess the generated data

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
# `keras.utils.np_utils` is gone from recent Keras releases; `to_categorical`
# is importable from `keras.utils` directly.
from keras.utils import to_categorical

# Module-level encoder: fitted inside data_preprocessing() and reused later to
# map class indices back to label names.
encoder = LabelEncoder()

def data_preprocessing(X, Y, feature=5):
    """Standardise and reshape the features and one-hot encode the labels.

    Parameters
    ----------
    X : 2-D table of shape (n_samples, n_columns).
    Y : 1-D collection of class labels, one per row of X.
    feature : number of values placed in each timestep (default 5). The number
        of columns of X must be a multiple of it.

    Returns
    -------
    (X, hot_y) where X is a NumPy array of shape
    (n_samples, n_columns // feature, feature) and hot_y is the one-hot encoded
    label matrix.

    Raises
    ------
    ValueError
        If the number of columns of X is not divisible by ``feature``.
    """
    ## DATA
    # NOTE(review): the scaler is fitted on the data passed in, not on the data the
    # pre-trained model was trained on — confirm this matches the training pipeline.
    sc = StandardScaler()
    X = sc.fit_transform(X)

    # reshaping the data to a 3-Dimensional Numpy Array; the timestep count is
    # derived from the actual column count instead of a hard-coded 85
    n_columns = X.shape[1]
    if n_columns % feature != 0:
        raise ValueError(
            "number of columns ({}) is not divisible by feature ({})".format(n_columns, feature)
        )
    X = np.reshape(X, (X.shape[0], n_columns // feature, feature))
    # e.g. 85 columns with feature=5 -> (n_samples, 17, 5):
    # 5 values are used per sequence/timestep per sample/row

    ## LABEL
    # encode class values as integers [0,0,0,0,0,0,0,1,1,1,1,1,2,2,2,2]
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)

    # convert integers to dummy variables (i.e. one hot encoded)
    hot_y = to_categorical(encoded_Y)

    return X, hot_y

In [None]:
# Every column but the last goes to X; the last column goes to Y.
X, Y = samples.iloc[:, :-1], samples.iloc[:, -1]

In [None]:
# Scale and reshape the features and one-hot encode the labels.
X_test, Y_test = data_preprocessing(X, Y)

## PART 4 : Predict the generated data using pre-trained model

### Import pre-trained model

In [None]:
import os
from tensorflow.keras.models import load_model

# Location of the pre-trained model. Set the ANASA_MODEL_PATH environment variable
# to point at the .h5 file on your machine; the original author's local path is
# kept as the fallback so existing setups keep working.
filename = os.environ.get(
    "ANASA_MODEL_PATH",
    "C:\\Users\\IoT-Lab\\Documents\\!Erwin Yonata\\Anasa\\MODELS\\[3-layer] - 3L1\\CV\\GridSearchCV\\best_param_model.h5",
)

# load model
loaded_model = load_model(filename)

### Make prediction using generated data

In [None]:
# Run the loaded model on the preprocessed features.
pred = loaded_model.predict(X_test)

## PART 5 : Evaluate

### Plot confusion matrix

In [None]:
# Reduce each row to the index of its largest entry (the class index).
y_true = Y_test.argmax(axis=1)
y_pred = pred.argmax(axis=1)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Define the confusion matrix
# The row/column order is made explicit: the sorted union of the class indices
# present in y_true and y_pred (the same order confusion_matrix uses by default).
class_ids = np.union1d(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred, labels=class_ids)

# Plot the confusion matrix
plt.imshow(conf_matrix, cmap=plt.cm.Greens)

# Add labels to the plot
# tick_marks was previously computed but never applied, leaving the axes unlabeled.
# Class names come from the fitted label encoder; an index the encoder does not
# know falls back to its number.
# NOTE(review): assumes the model's output order matches the encoder's class order — confirm.
tick_marks = np.arange(len(conf_matrix))
known_classes = getattr(encoder, "classes_", [])
tick_labels = [
    known_classes[k] if k < len(known_classes) else str(k)
    for k in class_ids
]
plt.xticks(tick_marks, tick_labels, rotation=45)
plt.yticks(tick_marks, tick_labels)
plt.xlabel('Predicted label')
plt.ylabel('True label')

# Add values to the plot
for i in range(len(conf_matrix)):
    for j in range(len(conf_matrix)):
        plt.text(j, i, conf_matrix[i, j], ha='center', va='center')

# Show the plot
plt.show()

### Create Data Frame for generated result vs prediction result

In [None]:
def revert_back(hot_y):
    """Map rows of one-hot (or per-class score) values back to label names.

    The index of the largest entry in each row is taken as the class index and
    translated with the module-level ``encoder``.
    """
    class_indices = np.argmax(hot_y, axis=1)
    return encoder.inverse_transform(class_indices)

In [None]:
# Label names for the model predictions and for the generated (true) labels.
rb_pred, rb_Y_test = (revert_back(encoded) for encoded in (pred, Y_test))

In [None]:
# Pair each generated label with the model's prediction for the same row.
# A flat column list is required: the previous nested list ([["generated", "prediction"]])
# created MultiIndex columns, so df_pred['generated'] was a DataFrame instead of a Series.
df_pred = pd.DataFrame({"generated": rb_Y_test, "prediction": rb_pred})

In [None]:
# How many rows carry each generated label.
df_pred['generated'].value_counts()

In [None]:
# How many rows the model assigned to each label.
df_pred['prediction'].value_counts()

### Evaluate the predicted label with the generated label

In [None]:
# NOTE(review): assumes the model was compiled with a single accuracy metric, so
# evaluate() returns [loss, accuracy] — confirm.
score = loaded_model.evaluate(X_test, Y_test)
print("Accuracy \t: {:.2f}".format(score[1]*100))
# The loss is not a percentage, so it is reported unscaled.
print("Loss \t\t: {:.4f}".format(score[0]))