## Generative Adversarial Network (GAN)

Generative modeling is an unsupervised learning task that involves automatically discovering and learning the regularities or patterns in input data. A GAN is a way of training such a model by framing the problem as a supervised learning problem with two sub-models: a generator and a discriminator.

The generator is trained to generate new data points, and the discriminator tries to classify them as real or generated (fake) data. If the discriminator identifies the fake data, the generator is penalized; otherwise, the discriminator is penalized.

Imports

In [None]:
import sklearn.cluster as cluster
from sklearn.preprocessing import PowerTransformer

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import io
import os

### load_data_from_files

This loads the data from a file whose name is passed as a parameter.
It converts the label column into simple category codes - 0 in our case, as we have split our data based on the emotion label.

It then applies a power transform so that the values are closer to a Gaussian distribution for training.

*We split our data into separate files, as we were not able to convert the label back from its Gaussian-transformed value. We still have to test how the generated data differs when all the data is passed together versus when it is split by emotion.*

In [None]:
def load_data_from_files(input_file_name):
  """Read a CSV file and return it as a power-transformed DataFrame.

  The "uri" column is dropped, the "label" column is replaced by its integer
  category codes, and every remaining column is passed through a
  standardized Yeo-Johnson power transform.

  Args:
    input_file_name: path of the CSV file to read.

  Returns:
    The transformed pandas DataFrame.
  """
  # Read the original data without the identifier column
  frame = pd.read_csv(input_file_name).drop(columns=["uri"])

  # Label -> integer category codes
  label_as_category = frame["label"].astype('category')
  frame["label"] = label_as_category.cat.codes

  # Power-transform every column (fit and apply in one step)
  transformer = PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
  all_columns = frame.columns
  frame[all_columns] = transformer.fit_transform(frame[all_columns])

  return frame

### GAN

The class accepts params that can be changed to improve our model

The class gets the training data and trains the model for the specified number of iterations (epochs)

We save the trained discriminator and generator to generate the data later

In [None]:
class GAN():
    """Generator/discriminator pair plus the combined model used to train them.

    The generator maps noise vectors of width ``noise_dim`` to records of
    width ``data_dim``; the discriminator scores records with a sigmoid
    output and is trained with binary cross-entropy.
    """

    def __init__(self, gan_args):
        """Build and compile the discriminator and the combined model.

        Args:
            gan_args: sequence ``[batch_size, lr, noise_dim, data_dim,
                layers_dim]``. ``lr`` is the Adam learning rate and
                ``layers_dim`` the base width handed to both model builders.
        """
        [self.batch_size, lr, self.noise_dim,
         self.data_dim, layers_dim] = gan_args

        self.generator = Generator(self.batch_size).\
            build_model(input_shape=(self.noise_dim,), dim=layers_dim, data_dim=self.data_dim)

        self.discriminator = Discriminator(self.batch_size).\
            build_model(input_shape=(self.data_dim,), dim=layers_dim)

        # Build and compile the discriminator.
        # Each compiled model gets its own Adam instance: the two models
        # update different variable sets, and some newer Keras releases
        # reject an optimizer that is reused for a different set of variables.
        self.discriminator.compile(loss='binary_crossentropy',
                                   optimizer=Adam(learning_rate=lr, beta_1=0.5),
                                   metrics=['accuracy'])

        # The generator takes noise as input and generates records
        z = Input(shape=(self.noise_dim,))
        record = self.generator(z)

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # The discriminator takes generated data as input and determines validity
        validity = self.discriminator(record)

        self.combined = Model(z, validity)
        self.combined.compile(loss='binary_crossentropy',
                              optimizer=Adam(learning_rate=lr, beta_1=0.5))

    def get_data_batch(self, train, batch_size, seed=0):
        """Return one batch of rows from ``train`` as a 2-D array.

        Consecutive ``seed`` values walk through a shuffled ordering of the
        rows in windows of ``batch_size``; the ordering is reshuffled each
        time the windows have covered the whole frame. A window that runs
        past the end wraps around to the start of the same ordering.

        Args:
            train: pandas DataFrame to sample from.
            batch_size: number of rows to return.
            seed: batch counter selecting the window (and the shuffle).

        Returns:
            ``numpy`` array of shape ``(batch_size, n_columns)``.

        Raises:
            ValueError: if ``train`` has no rows.
        """
        n_rows = len(train)
        if n_rows == 0:
            raise ValueError("cannot draw a batch from an empty training set")

        start_i = (batch_size * seed) % n_rows
        stop_i = start_i + batch_size
        shuffle_seed = (batch_size * seed) // n_rows

        # A private RandomState yields the same permutation as seeding the
        # global generator, without resetting the caller's random state.
        rng = np.random.RandomState(shuffle_seed)
        order = rng.choice(list(train.index), replace=False, size=n_rows)

        # Repeat the ordering often enough that the window always fits,
        # even when batch_size is larger than the frame.
        repeats = max(2, -(-stop_i // n_rows))
        order = np.tile(order, repeats)

        x = train.loc[list(order[start_i:stop_i])].values
        return np.reshape(x, (batch_size, -1))

    def train(self, data, train_arguments):
        """Run the adversarial training loop and save both models' weights.

        Args:
            data: pandas DataFrame of training records.
            train_arguments: sequence ``[cache_prefix, epochs,
                sample_interval]``. ``cache_prefix`` is used in the weight
                file names; ``sample_interval`` is accepted but not used.
        """
        [cache_prefix, epochs, sample_interval] = train_arguments

        valid = np.ones((self.batch_size, 1))
        fake = np.zeros((self.batch_size, 1))

        for epoch in range(epochs):
            #  Train Discriminator
            # Pass the epoch as the batch counter so every iteration sees a
            # different window of the data instead of the same first batch.
            batch_data = self.get_data_batch(data, self.batch_size, seed=epoch)
            noise = tf.random.normal((self.batch_size, self.noise_dim))

            # Generate a batch of new data
            gen_data = self.generator.predict(noise)

            # Train the discriminator
            d_loss_real = self.discriminator.train_on_batch(batch_data, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_data, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            #  Train Generator
            noise = tf.random.normal((self.batch_size, self.noise_dim))
            # Train the generator (to have the discriminator label samples as valid)
            g_loss = self.combined.train_on_batch(noise, valid)

            # Print the progress
            print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100 * d_loss[1], g_loss))

        # Save generator and discriminator weights
        os.makedirs('model', exist_ok=True)
        model_checkpoint_base_name = 'model/' + cache_prefix + '_{}_model_weights.h5'
        self.generator.save_weights(model_checkpoint_base_name.format('generator'))
        self.discriminator.save_weights(model_checkpoint_base_name.format('discriminator'))

    def save(self, path, name):
        """Save the generator weights to ``os.path.join(path, name)``.

        Raises:
            AssertionError: if ``path`` is not an existing directory.
        """
        # Explicit raise instead of `assert` so the check survives `python -O`.
        if not os.path.isdir(path):
            raise AssertionError("valid path needed from colab")
        model_path = os.path.join(path, name)
        self.generator.save_weights(model_path)  # Save the generator

    def load(self, path):
        """Load generator weights from ``path`` and return the generator.

        Args:
            path: weights location, e.g. the ``os.path.join(path, name)``
                value that ``save`` wrote to.

        Returns:
            The generator model with the loaded weights.

        Raises:
            AssertionError: if the directory containing ``path`` does not exist.
        """
        weights_dir = os.path.dirname(path) or '.'
        if not os.path.isdir(weights_dir):
            raise AssertionError("valid path needed from colab")
        # Load into the model built in __init__, so the combined model keeps
        # pointing at the same generator.
        self.generator.load_weights(path)
        return self.generator
    
class Generator():
    """Builder for the generator network."""

    def __init__(self, batch_size):
        # Batch size baked into the model's input layer
        self.batch_size = batch_size

    def build_model(self, input_shape, dim, data_dim):
        """Return a Keras model mapping ``input_shape`` inputs to ``data_dim`` outputs.

        Three ReLU layers of width ``dim``, ``2 * dim`` and ``4 * dim`` are
        followed by a dense output layer with no activation.
        """
        noise_in = Input(shape=input_shape, batch_size=self.batch_size)

        hidden = noise_in
        for width_factor in (1, 2, 4):
            hidden = Dense(dim * width_factor, activation='relu')(hidden)

        record_out = Dense(data_dim)(hidden)
        return Model(inputs=noise_in, outputs=record_out)

class Discriminator():
    """Builder for the discriminator network."""

    def __init__(self, batch_size):
        # Batch size baked into the model's input layer
        self.batch_size = batch_size

    def build_model(self, input_shape, dim):
        """Return a Keras model mapping ``input_shape`` inputs to one sigmoid unit.

        Two ReLU layers of width ``4 * dim`` and ``2 * dim``, each followed by
        10% dropout, feed a ReLU layer of width ``dim`` and the final
        single-unit sigmoid layer.
        """
        record_in = Input(shape=input_shape, batch_size=self.batch_size)

        hidden = record_in
        for width_factor in (4, 2):
            hidden = Dense(dim * width_factor, activation='relu')(hidden)
            hidden = Dropout(0.1)(hidden)

        hidden = Dense(dim, activation='relu')(hidden)
        validity = Dense(1, activation='sigmoid')(hidden)
        return Model(inputs=record_in, outputs=validity)

### train_gan_model

This method passes the arguments required by GAN. It runs the model and returns the synthesizer

We use 1000 epochs because, while testing, I noticed that the accuracy sometimes went above 95% when the number of iterations increased significantly.

In [None]:
def train_gan_model(train_data, emotion, noise_dim=32, dim=128, batch_size=64,
                    epochs=1000 + 1, learning_rate=5e-4, log_step=100):
  """Build a GAN for ``train_data``, train it, and return it.

  Args:
    train_data: pandas DataFrame of training records; its column count is
      used as the record width.
    emotion: label passed to ``GAN.train`` as the cache prefix.
    noise_dim: width of the generator's noise input.
    dim: base layer width handed to the GAN.
    batch_size: number of records per training batch.
    epochs: number of training iterations.
    learning_rate: learning rate handed to the GAN.
    log_step: passed to ``GAN.train`` as the sample interval.

  Returns:
    The trained ``GAN`` instance.
  """
  # GAN construction and training params, in the order the GAN class unpacks them
  gan_args = [batch_size, learning_rate, noise_dim, train_data.shape[1], dim]
  train_args = [emotion, epochs, log_step]

  synthesizer = GAN(gan_args)
  synthesizer.train(train_data, train_args)
  return synthesizer

### generate_synthetic_data

This generates the synthetic data for the specified emotion and given number of data points

In [None]:
def generate_synthetic_data(generator, emotion, data_points=500, noise_dim=32, columns=None):
  """Generate synthetic records for one emotion and write them to CSV.

  The generator weights are loaded from
  ``model/<emotion>_generator_model_weights.h5`` and the result is written
  to ``output/synthetic_<emotion>.csv``.

  Args:
    generator: model exposing ``load_weights`` and ``predict``.
    emotion: emotion name used in the weight and output file names.
    data_points: number of synthetic rows to generate.
    noise_dim: width of the noise vectors fed to the generator.
    columns: column names for the generated frame. When omitted, the
      columns of the module-level ``train_data`` are used.

  Returns:
    The generated pandas DataFrame.
  """
  if columns is None:
    # Backward-compatible fallback to the notebook-level training frame
    columns = train_data.columns

  # Fixed seed for reproducible noise; a private RandomState draws the same
  # values as seeding the global generator but leaves global state untouched.
  rng = np.random.RandomState(17)
  z = rng.normal(size=(data_points, noise_dim))

  # Generating Synthetic Data
  generator.load_weights('model/' + emotion + '_generator_model_weights.h5')

  synthetic_z = generator.predict(z)
  synthetic_data = pd.DataFrame(synthetic_z, columns=columns)

  os.makedirs('output', exist_ok=True)
  synthetic_data.to_csv('output/synthetic_' + emotion + '.csv')
  return synthetic_data


### Simulator

This is where we call the above methods for all our emotion data

In [None]:
# Create the directories used for model weights and synthetic data.
# os.makedirs(..., exist_ok=True) runs both inside and outside a notebook and
# does not complain when the directory is already present.
for directory in ('model', 'output'):
  os.makedirs(directory, exist_ok=True)

emotions = ['aggressive', 'calm', 'chill', 'dark', 'energetic', 'relaxing']

# Train one GAN per emotion file and write its synthetic data.
for emotion in emotions:
  # Keep the module-level name `train_data`: generate_synthetic_data may read
  # it to obtain the column names.
  train_data = load_data_from_files(emotion + '.csv')
  synthesizer = train_gan_model(train_data, emotion)
  generator = synthesizer.generator
  generator.summary()
  synthesizer.discriminator.summary()
  generate_synthetic_data(generator, emotion, data_points=600)

### Further Steps

I will test this by generating data for all the labels combined and then for each label separately, and analyse the difference in the correlation of the data.

The data currently generated by this was not very useful.
I have to analyse the difference it makes when the data is not converted to Gaussian, and come up with a better model.