<a href="https://colab.research.google.com/github/utkarshgr8/Adversarial-attack-on-image-captioning/blob/master/AudioGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio Generation using GAN

The following code demonstrates how to generate audio with a GAN based on [Continuous recurrent neural networks with adversarial training](https://arxiv.org/pdf/1611.09904.pdf). The code is written using the [Keras Sequential API](https://www.tensorflow.org/guide/keras) with a `tf.GradientTape` training loop.

### Setup

In [None]:
import tensorflow as tf

In [None]:
from __future__ import print_function, division
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inl
from PIL import Image
import pathlib
import csv
import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Input, Dense, Reshape, Flatten, Dropout, multiply
from keras.layers import BatchNormalization, Activation, Embedding, ZeroPadding2D, LSTM
from keras.layers.advanced_activations import LeakyReLU
from keras.layers.convolutional import UpSampling2D, Conv2D, Conv1D, MaxPooling1D
import sys, os
import glob
import uuid
import ntpath
from sys import getsizeof
from scipy.io.wavfile import read, write
import time
from IPython import display


In [None]:
# Display the installed TensorFlow version (echoed as the cell output).
tf.__version__

'2.4.0'

In [None]:
# Colab-only: mount the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load and prepare the dataset



The dataset used is the [drums dataset](http://deepyeti.ucsd.edu/cdonahue/wavegan/data/drums.tar.gz), which contains around 3000 audio samples of drum sounds.

We first process the audio files and convert them into spectrograms, which are then used to train the GAN.

In [None]:
import tarfile

# Unpack the drums archive. The context manager closes the archive even if
# extraction raises part-way through.
# NOTE(review): extractall() trusts the member paths inside the archive; only
# use it on archives from a trusted source.
with tarfile.open('/content/drums.tar.gz') as my_tar:
    my_tar.extractall('/content/afile')  # specify which folder to extract to

In [None]:
# Render every clip of every split as a spectrogram PNG.
# Images are written to img_data/<split>/ so that the loading cell, which reads
# img_data/train, finds them (previously every split was dumped flat into
# img_data/ and no per-split directory was ever created).
cmap = plt.get_cmap('inferno')
for g in ['train','test','valid']:
  split_dir = f'img_data/{g}'
  pathlib.Path(split_dir).mkdir(parents=True, exist_ok=True)
  for filename in os.listdir(f'/content/afile/drums/{g}'):
    songname=f'/content/afile/drums/{g}/{filename}'
    # Load at most the first 5 seconds, down-mixed to mono.
    y, sr = librosa.load(songname, mono=True, duration=5)
    plt.specgram(y, NFFT=2048, Fs=2, Fc=0, noverlap=128, cmap=cmap, sides='default', mode='default', scale='dB');
    plt.axis('off');
    # Drop the 3-character extension and any remaining dots from the name.
    plt.savefig(f'{split_dir}/{filename[:-3].replace(".", "")}.png')
    # Clear the shared figure so the next clip starts from an empty canvas.
    plt.clf()
  
  





<Figure size 432x288 with 0 Axes>

In [None]:
from PIL import Image
from numpy import asarray 
from numpy import array

# Read every training spectrogram PNG into one array.
train_images=[]
a=0  # unused in the cells shown; kept in case a later cell reads it
for img_path in os.listdir(f'/content/img_data/train'):
   Img_path=f'/content/img_data/train/{img_path}'
   # Close each file as soon as its pixels are copied out, so thousands of
   # handles are not left open until garbage collection.
   with Image.open(Img_path) as img:
      new_img=asarray(img)
   train_images.append(new_img)
train_images=array(train_images)
train_images.shape


(2350, 288, 432, 4)

In [None]:
# Let NumPy infer the leading dimension instead of hard-coding 9400, which is
# only correct when exactly 2350 four-channel images were loaded.
# NOTE(review): the loaded array is (N, 288, 432, 4) per the cell output above.
# A plain reshape to (-1, 432, 288) reinterprets the buffer without moving the
# channel axis, so each slice is not one image channel -- confirm this layout
# is what the training pipeline expects.
train_images = train_images.reshape(-1, 432, 288).astype('float32')
train_images = (train_images - 127.5) / 127.5 # Normalize the images to [-1, 1]

## Create the models

Both the generator and discriminator are defined using the [Keras Sequential API](https://www.tensorflow.org/guide/keras#sequential_model).

### The Generator

In [None]:
from keras.models import Model

frame_size=500
frame_shift=128
# Shape of one generator input/output: `frame_size` steps of 256 values each.
audio_shape=(frame_size, 256)

def make_generator_model():
    """Build the LSTM generator.

    The network maps a noise tensor of shape ``audio_shape`` to a tensor of
    the same shape in which every one of the ``frame_size`` rows is a
    probability distribution over 256 values.

    Returns:
        A Keras ``Model`` taking ``(batch, frame_size, 256)`` noise and
        returning ``(batch, frame_size, 256)`` per-step distributions.
    """
    model = Sequential()
    # `input_shape` is passed as the plain tuple Keras expects.
    model.add(LSTM(256, input_shape=audio_shape, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(256, return_sequences=True))
    model.add(Dropout(0.3))
    model.add(LSTM(256))
    model.add(Dense(256))
    model.add(Dropout(0.3))
    model.add(Dense(256*frame_size))
    # Reshape BEFORE the softmax so the softmax (which acts on the last axis)
    # normalises each step's 256 values separately. Applying it to the flat
    # 256*frame_size vector made all steps share a single unit of probability
    # mass, leaving every row far from a valid distribution.
    model.add(Reshape((frame_size, 256)))
    model.add(Activation('softmax'))

    model.summary()

    noise = Input(shape=audio_shape)

    sound = model(noise)

    return Model(noise, sound)

Use the (as yet untrained) generator to create an audio sample.

In [None]:
generator = make_generator_model()
print(generator.output_shape)
# Feed one batch of concrete Gaussian noise so the generator actually produces
# values; a symbolic `Input` placeholder only yields another placeholder.
noise = tf.random.normal([1, *audio_shape])
generated_image = generator(noise,training=False)
print(generated_image)
#plt.imshow(generated_image, cmap='gray')

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_15 (LSTM)               (None, 500, 256)          525312    
_________________________________________________________________
dropout_15 (Dropout)         (None, 500, 256)          0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 500, 256)          525312    
_________________________________________________________________
dropout_16 (Dropout)         (None, 500, 256)          0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 256)               525312    
_________________________________________________________________
dense_10 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_17 (Dropout)         (None, 256)              

### The Discriminator

The discriminator is a CNN-based classifier (a 1-D convolution followed by dense layers) that scores audio frames as real or fake.

In [None]:
def make_discriminator_model():
    """Build the 1-D convolutional discriminator.

    The model takes a tensor shaped ``audio_shape`` and returns raw,
    unbounded real/fake scores (logits): positive means "real", negative
    means "fake". No sigmoid is applied here because the shared
    ``cross_entropy`` loss is created with ``from_logits=True`` and applies
    the sigmoid itself; a sigmoid in both places would squash the scores
    twice and weaken the gradients.

    NOTE(review): nothing flattens or pools over time before the final
    ``Dense``, so the model emits one score per pooled time step rather than
    one per example -- confirm that is intended.

    Returns:
        A Keras ``Model`` mapping audio frames to validity logits.
    """
    model = Sequential()
    model.add(Conv1D(32, kernel_size=(2), padding="same", input_shape=audio_shape))
    model.add(MaxPooling1D(pool_size=(2)))
    model.add(Dropout(0.25))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(128))

    model.summary()

    audio = Input(shape=audio_shape)

    # Extract feature representation
    features = model(audio)

    # Raw validity score (logit); the loss applies the sigmoid.
    validity = Dense(1)(features)

    return Model(audio, validity)


Use the (as yet untrained) discriminator to classify the generated images as real or fake. The model will be trained to output positive values for real images, and negative values for fake images.

In [None]:
# Build the (untrained) discriminator and score the generator's sample with it.
discriminator = make_discriminator_model()
decision = discriminator(generated_image)
print(decision)

## Define the loss and optimizers

Define loss functions and optimizers for both models.


In [None]:
# Binary cross-entropy helper shared by the generator and discriminator losses.
# `from_logits=True` means the loss applies the sigmoid itself, so it expects
# raw (unbounded) scores rather than probabilities.
# NOTE(review): confirm the discriminator's final layer emits raw scores; a
# sigmoid output fed into a from_logits loss would be squashed twice.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

### Discriminator loss

This method quantifies how well the discriminator is able to distinguish real images from fakes. It compares the discriminator's predictions on real images to an array of 1s, and the discriminator's predictions on fake (generated) images to an array of 0s.

In [None]:
def discriminator_loss(real_output, fake_output):
    """Return the discriminator's total loss for one batch.

    Scores for real samples are compared against a target of all ones and
    scores for generated samples against all zeros; the two cross-entropy
    terms are added.
    """
    loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
    loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
    return loss_on_real + loss_on_fake

### Generator loss
The generator's loss quantifies how well it was able to trick the discriminator. Intuitively, if the generator is performing well, the discriminator will classify the fake images as real (or 1). Here, we will compare the discriminator's decisions on the generated images to an array of 1s.

In [None]:
def generator_loss(fake_output):
    """Return the generator's loss: cross-entropy of the discriminator's
    scores on generated samples against a target of all ones ("real")."""
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)

The discriminator and the generator optimizers are different since we will train two networks separately.

In [None]:
# One Adam optimizer per network, with identical hyper-parameters spelled out
# by keyword.
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)

### Save checkpoints
This notebook also demonstrates how to save and restore models, which can be helpful in case a long running training task is interrupted.

In [None]:
# Checkpoints are written as ./training_checkpoints/ckpt-<n>.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Track both networks together with their optimizer state so that training
# can be resumed after an interruption.
checkpoint = tf.train.Checkpoint(
    generator=generator,
    discriminator=discriminator,
    generator_optimizer=generator_optimizer,
    discriminator_optimizer=discriminator_optimizer,
)

## Define the training loop


In [None]:
EPOCHS = 50
noise_dim = 100
num_examples_to_generate = 16

# Reuse the same seed at every epoch so that successive outputs are directly
# comparable.
# The generator consumes noise shaped (batch, frame_size, 256), i.e.
# (batch, *audio_shape); a flat (batch, noise_dim) seed does not match the
# LSTM's declared input shape.
seed = tf.random.normal([num_examples_to_generate, *audio_shape])

The training loop begins with the generator receiving a random seed as input. That seed is used to produce an image. The discriminator is then used to classify real images (drawn from the training set) and fake images (produced by the generator). The loss is calculated for each of these models, and the gradients are used to update the generator and discriminator.

In [None]:
# Notice the use of `tf.function`
# This annotation causes the function to be "compiled".
@tf.function
def train_step(images):
    """Run one simultaneous update of the generator and the discriminator.

    Args:
        images: one batch of real training samples, shaped like the
            discriminator's input.
    """
    # Draw as many noise samples as there are real samples in this batch (the
    # last batch of an epoch may be smaller), shaped like the generator's
    # input, (batch, frame_size, 256).
    batch_size = tf.shape(images)[0]
    noise = tf.random.normal([batch_size, *audio_shape])

    # Separate tapes: each network's loss is differentiated only with respect
    # to that network's own variables.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
      generated_images = generator(noise, training=True)

      real_output = discriminator(images, training=True)
      fake_output = discriminator(generated_images, training=True)

      gen_loss = generator_loss(fake_output)
      disc_loss = discriminator_loss(real_output, fake_output)

    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

In [None]:
def train(dataset, epochs):
    """Train both networks for `epochs` passes over `dataset`.

    After each pass the fixed `seed` is rendered to an image file, a
    checkpoint is saved on every 15th epoch, and the epoch's wall-clock time
    is printed. One final rendering is produced once all epochs are done.
    """
    for epoch_index in range(epochs):
        epoch_number = epoch_index + 1
        start = time.time()

        for image_batch in dataset:
            train_step(image_batch)

        # Replace the previous epoch's output with a fresh rendering.
        display.clear_output(wait=True)
        generate_and_save_images(generator, epoch_number, seed)

        # Persist the models every 15 epochs.
        if epoch_number % 15 == 0:
            checkpoint.save(file_prefix=checkpoint_prefix)

        print ('Time for epoch {} is {} sec'.format(epoch_number, time.time()-start))

    # Final rendering after the last epoch.
    display.clear_output(wait=True)
    generate_and_save_images(generator, epochs, seed)

**Generate and save images**


In [None]:
def generate_and_save_images(model, epoch, test_input):
    """Render the model's output for `test_input` as a grid and save it.

    Args:
        model: callable model; invoked with ``training=False``.
        epoch: integer used in the output file name
            (``image_at_epoch_XXXX.png``).
        test_input: batch of inputs for `model`.

    Each prediction is drawn as a grayscale image. Predictions may be 2-D
    per example (such as the ``(frame_size, 256)`` generator output in this
    notebook) or 3-D per example with a trailing channel axis, in which case
    channel 0 is shown.
    """
    # `training=False` runs all layers in inference mode (e.g. dropout off).
    predictions = model(test_input, training=False)

    num_examples = predictions.shape[0]
    cols = 4
    # Keep the 4x4 layout for up to 16 examples; add rows beyond that instead
    # of failing on an out-of-range subplot index.
    rows = max(4, -(-num_examples // cols))

    fig = plt.figure(figsize=(4,4))

    for i in range(num_examples):
        sample = predictions[i]
        if len(sample.shape) == 3:
            # (height, width, channels): show the first channel only.
            sample = sample[:, :, 0]
        plt.subplot(rows, cols, i+1)
        plt.imshow(sample * 127.5 + 127.5, cmap='gray')
        plt.axis('off')

    plt.savefig('image_at_epoch_{:04d}.png'.format(epoch))
    plt.show()
    # Release the figure; this function runs once per epoch and would
    # otherwise accumulate open figures.
    plt.close(fig)

## Train the model
Call the `train()` method defined above to train the generator and discriminator simultaneously. Note, training GANs can be tricky. It's important that the generator and discriminator do not overpower each other (e.g., that they train at a similar rate).

At the beginning of the training, the generated images look like random noise. As training progresses, the generated samples will look increasingly real. After about 50 epochs, they start to resemble the dataset. This may take about one minute / epoch with the default settings on Colab.

In [None]:
# Run the training loop for EPOCHS epochs.
# NOTE(review): `train_dataset` is not defined in the cells shown here --
# confirm it is built before this cell runs.
train(train_dataset, EPOCHS)

Restore the latest checkpoint.

In [None]:
# Reload the tracked generator, discriminator and optimizer state from the
# most recent checkpoint found in `checkpoint_dir`.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

In [None]:
def _expand_to_pcm16(level):
    """Map a quantisation level in 0..255 to a 16-bit-scale amplitude.

    The level is first rescaled linearly to [-1, 1], then expanded with
    ``sign(x) * (1/256) * (257**|x| - 1)`` and scaled by ``2**15``. The result
    is a float in [-32768.0, 32768.0]; the caller clips it to the int16 range.
    """
    ampl_val_8 = ((level / 255.0) - 0.5) * 2.0
    return (np.sign(ampl_val_8) * (1/256.0) * ((1 + 256.0)**abs(ampl_val_8) - 1)) * 2**15


def get_audio_from_model(model, sr, duration, frame_size):
    """Sample `duration` seconds of audio from a trained generator.

    Args:
        model: object with a ``predict`` method that maps noise of shape
            ``(1, frame_size, 256)`` to an array of shape ``(1, steps, 256)``
            whose rows are (possibly unnormalised) weights over 256 levels.
        sr: sample rate in samples per second.
        duration: length of the clip in seconds (int or float).
        frame_size: number of time steps in the noise fed to `model`.

    Returns:
        1-D ``np.int16`` array holding ``int(sr * duration)`` samples.

    Raises:
        ValueError: if the model returns a frame with zero time steps (the
            sampling loop could never make progress).
    """
    print ('Generating audio...')
    print ('Sample rate: ' + str(sr))
    # int() so that a fractional duration still gives a valid buffer length.
    total_samples = int(sr * duration)
    new_audio = np.zeros(total_samples)
    levels = np.arange(256)
    curr_sample_idx = 0
    while curr_sample_idx < total_samples:
        pred_audio = model.predict(np.random.normal(0, 1, (1, frame_size, 256)))
        if pred_audio.shape[1] == 0:
            raise ValueError('model.predict returned a frame with no time steps')
        # Work in float64: float32 rows renormalised in place can miss
        # np.random.choice's "probabilities sum to 1" tolerance.
        frame = np.asarray(pred_audio[0], dtype=np.float64)
        for step_probs in frame:
            if curr_sample_idx >= total_samples:
                print('Exiting loop')
                break
            step_probs = step_probs.reshape(256)
            step_probs = step_probs / step_probs.sum()
            predicted_val = np.random.choice(levels, p=step_probs)

            # Write first, then advance, so sample 0 is filled too.
            new_audio[curr_sample_idx] = _expand_to_pcm16(predicted_val)
            curr_sample_idx += 1

        # Report progress once per predicted frame rather than once per sample.
        pc_str = str(round(100*curr_sample_idx/float(total_samples), 2))
        sys.stdout.write('Percent complete: ' + pc_str + '\r')
        sys.stdout.flush()

    print ('Audio generated.')
    # Level 255 expands to exactly +32768, one past the int16 maximum; clip so
    # the loudest sample does not wrap around to -32768.
    pcm_limits = np.iinfo(np.int16)
    return np.clip(new_audio, pcm_limits.min, pcm_limits.max).astype(np.int16)