<a href="https://colab.research.google.com/github/yootazi/VAE_for_Audio_v_1/blob/main/VAE_for_Audio_v_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# VAE for Audio v.1
# **Creating and Training a Variational Autoencoder with an Audio Dataset located in Google Drive**

---

 by Yalda Zamani, 2021
> Website: [www.yaldazamani.com](https://www.yaldazamani.com)

> Twitter: [@yootazi](https://twitter.com/yootazi)


> 

VAE for Audio is a Variational Autoencoder that synthesizes Mel-spectrograms which can be inverted into a raw audio waveform.
Currently it can be trained with any dataset of .wav audio at a 44.1 kHz sample rate and 16-bit depth.

> Credits:
* VAE neural network architecture coded following 'The Sound of AI' Youtube tutorial series by Valerio Velardo



+++

## 1. **Preprocessing component** 
As its first component, VAE for Audio pre-processes an audio dataset stored at a specific location by extracting features such as MFCCs (Mel-Frequency Cepstral Coefficients), spectrograms and Mel-spectrograms.

Create a folder in your Google Drive called **musicdata**. Create another folder within musicdata called **vae_for_audio**. Create two empty folders **spectrograms** and **audio** within vae_for_audio. 

Make sure the paths pointing to your folders look like these:

`/content/gdrive/MyDrive/musicdata/vae_for_audio/spectrograms` and
`/content/gdrive/MyDrive/musicdata/vae_for_audio/audio`

Move your audio files into the **audio** folder.














In [None]:
#@title **I. Connect to your Google Drive** { form-width: "50%" }



# Mount Google Drive inside the Colab filesystem at /content/gdrive/. Every
# dataset path used later in this notebook starts with /content/gdrive/MyDrive/.
from google.colab import drive
drive.mount('/content/gdrive/')
print("You are now connected to your Google drive.")






In [None]:
#@title **II.a Installing Python 3.7 (ignore if already installed)** { form-width: "50%" }


# Optional cell: install an alternative Python interpreter (see the cell title).
# Every command below is a shell magic (`!`), so it runs outside the kernel.

# Download the Anaconda3 2020.07 installer and run it in batch (-b) mode.
!wget -qO ac.sh https://repo.anaconda.com/archive/Anaconda3-2020.07-Linux-x86_64.sh 
!bash ./ac.sh -b

# Expose Colab's `google` package to the Anaconda interpreter by symlinking it
# into Anaconda's site-packages.
# NOTE(review): both paths assume a python3.7 directory layout - confirm that
# this matches the interpreter the installer above actually provides.
!ln -s /usr/local/lib/python3.7/dist-packages/google \
       /root/anaconda3/lib/python3.7/site-packages/google

# Start JupyterLab from the Anaconda install in the background.
# NOTE(review): the original comment here said "Python3 = 3.8" while the cell
# title and the symlink above refer to 3.7 - confirm which version is intended.
!nohup /root/anaconda3/bin/jupyter-lab --ip=0.0.0.0&

# Open an ngrok tunnel to port 8888 and print its public URL (click the link).
!pip install pyngrok -q
from pyngrok import ngrok
print(ngrok.connect(8888))

# Install Python 3.7 from the system package manager.
!apt-get install python3.7

# NOTE(review): this runs a `setup.py` from the current working directory, but
# no such file is created anywhere in this notebook - confirm it is needed.
!python3.7 setup.py



In [None]:
#@title **II.b Installing TensorFlow 2.3.1 / Importing Libraries** { form-width: "50%" }

# Install the pinned TensorFlow release and print the version that is imported.
!pip install tensorflow==2.3.1
import tensorflow as tf
print(tf.__version__)

# Install the remaining third-party packages: soundfile, torchaudio and perlin-numpy.
# NOTE(review): torchaudio is installed here but never imported in this notebook,
# and soundfile / perlin-numpy are not version-pinned - confirm this is intended.
import tensorflow as tf
!pip install soundfile                    #to save wav files
!pip install --no-deps torchaudio==0.5
!pip install git+https://github.com/pvigier/perlin-numpy #for generating perlin and fractal noise

# importing libraries
import os
import pickle
import os.path
from os import path

from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, ReLU, BatchNormalization, \
    Flatten, Dense, Reshape, Conv2DTranspose, Activation, Lambda
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
import numpy as np
import tensorflow as tf

# libraries for extracting features, plotting and analysing data
from glob import glob
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from numpy import asarray
from numpy.random import randn
from numpy.random import randint
from numpy import linspace
import soundfile as sf             
import time
import IPython
from perlin_numpy import (
    generate_fractal_noise_2d, generate_fractal_noise_3d,
    generate_perlin_noise_2d, generate_perlin_noise_3d
)

**--> Restart Runtime by connecting to hosted runtime (if a new version of python is installed)**

In [None]:
#@title **III. Create Spectrograms from the Audio Files Located in 'audio' Folder**

"""
1- load a file
2- pad the signal (if necessary)
3- extracting log spectrogram from signal
4- normalise spectrogram
5- save the normalised spectrogram
PreprocessingPipeline
"""
import os
import pickle

import librosa
import numpy as np


class Loader:
    """Reads an audio file from disk and returns its samples.

    The sample rate, maximum duration (in seconds) and mono flag supplied to
    the constructor are forwarded to ``librosa.load`` on every call.
    """

    def __init__(self, sample_rate, duration, mono):
        self.sample_rate = sample_rate
        self.duration = duration
        self.mono = mono

    def load(self, file_path):
        """Return the signal stored at ``file_path``."""
        # librosa.load yields a (signal, sample_rate) pair; the sample rate is
        # already known to this object, so only the signal is kept.
        samples, _ = librosa.load(file_path,
                                  sr=self.sample_rate,
                                  duration=self.duration,
                                  mono=self.mono)
        return samples


class Padder:
    """Pads an array at its start or end via ``np.pad``.

    ``mode`` is passed straight through to ``np.pad`` (default ``"constant"``).
    """

    def __init__(self, mode="constant"):
        self.mode = mode

    def left_pad(self, array, num_missing_items):
        """Prepend ``num_missing_items`` padded items; append none."""
        pad_width = (num_missing_items, 0)
        return np.pad(array, pad_width, mode=self.mode)

    def right_pad(self, array, num_missing_items):
        """Append ``num_missing_items`` padded items; prepend none."""
        pad_width = (0, num_missing_items)
        return np.pad(array, pad_width, mode=self.mode)


class LogSpectrogramExtractor:
    """Turns a time-series signal into a log-amplitude (dB) spectrogram.

    ``frame_size`` is used as the STFT ``n_fft`` and ``hop_length`` as the STFT
    hop length.
    """

    def __init__(self, frame_size, hop_length):
        self.frame_size = frame_size
        self.hop_length = hop_length

    def extract(self, signal):
        """Return the dB-scaled magnitude spectrogram of ``signal``."""
        complex_stft = librosa.stft(signal,
                                    n_fft=self.frame_size,
                                    hop_length=self.hop_length)
        # The STFT has 1 + frame_size / 2 rows; the last row is dropped so
        # that frame_size / 2 rows remain (e.g. 1024 -> 513 -> 512).
        trimmed_stft = complex_stft[:-1]
        magnitude = np.abs(trimmed_stft)
        return librosa.amplitude_to_db(magnitude)


class MinMaxNormaliser:
    """MinMaxNormaliser applies min max normalisation to an array.

    The array is first squashed into [0, 1] (its minimum maps to 0, its
    maximum to 1) and then rescaled into the target range
    [``min_val``, ``max_val``] given to the constructor.
    """

    def __init__(self, min_val, max_val):
        self.min = min_val
        self.max = max_val

    def normalise(self, array):
        """Return ``array`` rescaled into [self.min, self.max].

        A constant array (maximum equal to minimum) has no range to scale by;
        every element is then mapped to ``self.min`` instead of dividing by
        zero and filling the result with NaN.
        """
        array_min = array.min()
        value_range = array.max() - array_min
        shifted = array - array_min
        if value_range == 0:
            # Multiplying by 0.0 yields zeros with the same shape and the same
            # floating dtype the regular division would have produced.
            unit_array = shifted * 0.0
        else:
            unit_array = shifted / value_range                               # -> 0 / 1
        norm_array = unit_array * (self.max - self.min) + self.min           # move into target range
        return norm_array

    def denormalise(self, norm_array, original_min, original_max):
        """Invert ``normalise`` given the array's original minimum and maximum."""
        array = (norm_array - self.min) / (self.max - self.min)
        array = array * (original_max - original_min) + original_min
        return array


class Saver:
    """Writes extracted features and the min/max lookup table to disk.

    Features are stored as ``.npy`` files inside ``feature_save_dir``; the
    min/max dictionary is pickled to ``min_max_values.pkl`` inside
    ``min_max_values_save_dir``.
    """

    def __init__(self, feature_save_dir, min_max_values_save_dir):
        self.feature_save_dir = feature_save_dir
        self.min_max_values_save_dir = min_max_values_save_dir

    def save_feature(self, feature, file_path):
        """Save ``feature`` under a name derived from ``file_path``; return the path written."""
        destination = self._generate_save_path(file_path)
        np.save(destination, feature)
        return destination

    def save_min_max_values(self, min_max_values):
        """Pickle the min/max values, which are needed later to denormalise spectrograms."""
        destination = os.path.join(self.min_max_values_save_dir,
                                   "min_max_values.pkl")
        self._save(min_max_values, destination)

    @staticmethod
    def _save(data, save_path):
        """Pickle ``data`` to ``save_path``."""
        serialised = pickle.dumps(data)
        with open(save_path, "wb") as handle:
            handle.write(serialised)

    def _generate_save_path(self, file_path):
        """Return ``<feature_save_dir>/<original file name>.npy``."""
        base_name = os.path.basename(file_path)
        return os.path.join(self.feature_save_dir, base_name + ".npy")


class PreprocessingPipeline:
    """Runs every file found under a directory through the feature pipeline.

    For each file the pipeline:
        1- loads the signal
        2- right-pads it when it is shorter than expected
        3- extracts a feature (e.g. a log spectrogram) from the signal
        4- normalises the feature
        5- saves the normalised feature
    The pre-normalisation min and max of every feature are collected and
    saved once all files are done, so spectrograms can be denormalised later.

    The collaborators (``loader``, ``padder``, ``extractor``, ``normaliser``,
    ``saver``) start as ``None`` and must be assigned before ``process``.
    """

    def __init__(self):
        self._loader = None
        self._num_expected_samples = None
        self.padder = None
        self.extractor = None
        self.normaliser = None
        self.saver = None
        # maps saved-feature path -> {"min": ..., "max": ...}
        self.min_max_values = {}

    @property
    def loader(self):
        return self._loader

    @loader.setter
    def loader(self, loader):
        # Assigning a loader also fixes the signal length every file is padded to.
        self._loader = loader
        expected = loader.sample_rate * loader.duration
        self._num_expected_samples = int(expected)

    def process(self, audio_files_dir):
        """Process every file below ``audio_files_dir``, then save the min/max table."""
        for directory, _, file_names in os.walk(audio_files_dir):
            for file_name in file_names:
                audio_path = os.path.join(directory, file_name)
                self._process_file(audio_path)
                print(f"Processed file {audio_path}")
        self.saver.save_min_max_values(self.min_max_values)

    def _process_file(self, file_path):
        """Load, pad, extract, normalise and save a single file."""
        signal = self.loader.load(file_path)
        if self._is_padding_necessary(signal):
            signal = self._apply_padding(signal)
        feature = self.extractor.extract(signal)
        save_path = self.saver.save_feature(
            self.normaliser.normalise(feature), file_path)
        # The min/max recorded are those of the feature before normalisation.
        self._store_min_max_value(save_path, feature.min(), feature.max())

    def _is_padding_necessary(self, signal):
        """Return True when ``signal`` has fewer samples than expected."""
        return len(signal) < self._num_expected_samples

    def _apply_padding(self, signal):
        """Right-pad ``signal`` up to the expected number of samples."""
        shortfall = self._num_expected_samples - len(signal)
        return self.padder.right_pad(signal, shortfall)

    def _store_min_max_value(self, save_path, min_val, max_val):
        """Record the min/max pair under the path of the saved feature."""
        self.min_max_values[save_path] = {"min": min_val, "max": max_val}

# Driver for the preprocessing cell: configure the pipeline and run it over the
# audio folder on Google Drive. All names defined here are notebook globals.
if __name__ == "__main__":
    FRAME_SIZE = 512     # passed to LogSpectrogramExtractor as the STFT n_fft
    HOP_LENGTH = 256     # STFT hop length, in samples
    DURATION = 0.74  # in seconds
    # Passed to librosa.load as `sr`.
    # NOTE(review): the notebook introduction mentions 44.1 kHz datasets while
    # files are loaded with sr=22050 here - confirm this is intended.
    SAMPLE_RATE = 22050
    MONO = True

    # Output folder for the .npy spectrograms, folder for min_max_values.pkl,
    # and the input folder that is walked for audio files.
    SPECTROGRAMS_SAVE_DIR = '/content/gdrive/MyDrive/musicdata/vae_for_audio/spectrograms'
    MIN_MAX_VALUES_SAVE_DIR = '/content/gdrive/MyDrive/musicdata/vae_for_audio'
    FILES_DIR = '/content/gdrive/MyDrive/musicdata/vae_for_audio/audio/'

    # instantiate all objects
    loader = Loader(SAMPLE_RATE, DURATION, MONO)
    padder = Padder()
    log_spectrogram_extractor = LogSpectrogramExtractor(FRAME_SIZE, HOP_LENGTH)
    min_max_normaliser = MinMaxNormaliser(0, 1)      # features are scaled into [0, 1]
    saver = Saver(SPECTROGRAMS_SAVE_DIR, MIN_MAX_VALUES_SAVE_DIR)

    # wire the collaborators into the pipeline; assigning `loader` also sets
    # the expected number of samples per file (sample_rate * duration)
    preprocessing_pipeline = PreprocessingPipeline()
    preprocessing_pipeline.loader = loader
    preprocessing_pipeline.padder = padder
    preprocessing_pipeline.extractor = log_spectrogram_extractor
    preprocessing_pipeline.normaliser = min_max_normaliser
    preprocessing_pipeline.saver = saver

    # every file found below FILES_DIR is processed (no extension filtering)
    preprocessing_pipeline.process(FILES_DIR)


Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/0_george_16.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/6_george_29.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/6_lucas_21.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/1_nicolas_3.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/4_lucas_4.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/4_george_45.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/2_nicolas_17.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/8_nicolas_49.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/3_yweweler_1.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/2_yweweler_30.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio/audio/9_jackson_45.wav
Processed file /content/gdrive/MyDrive/musicdata/vae_for_audio

In [None]:
#@title ****IV. Create a Variational Autoencoder, with the Following Architecture:****

import os.path
from os import path

import tensorflow as tf
tf.compat.v1.disable_eager_execution()


class VAE:
    """
    VAE represents a Deep Convolutional variational autoencoder architecture
    with mirrored encoder and decoder components.

    The encoder is a stack of Conv2D + ReLU + BatchNormalization blocks
    followed by two Dense heads (``mu`` and ``log_variance``) and a sampling
    layer. The decoder mirrors it with Conv2DTranspose blocks and ends in a
    sigmoid. The whole network is built in the constructor.

    Note: the KL loss reads the symbolic ``self.mu`` / ``self.log_variance``
    tensors directly, which is why eager execution is disabled
    (``tf.compat.v1.disable_eager_execution()``) before this class is defined.
    """

    def __init__(self,
                 input_shape,  #shape of the input data
                 conv_filters, #convolutional network filters
                 conv_kernels, #convNet kernel size
                 conv_strides, #convNet strides
                 latent_space_dim):
        self.input_shape = input_shape # e.g. [28, 28, 1]: 28 x 28 pixels on 1 channel (greyscale)
        self.conv_filters = conv_filters # e.g. [2, 4, 8]: number of filters for each conv layer
        self.conv_kernels = conv_kernels # e.g. [1, 2, 3]: kernel size for each conv layer
        self.conv_strides = conv_strides # e.g. [1, 2, 2]: stride per layer; 2 halves the resolution
        self.latent_space_dim = latent_space_dim # e.g. 2: number of neurons in the bottleneck
        # weight of the reconstruction term relative to the KL term in the loss
        self.reconstruction_loss_weight = 1000000

        self.encoder = None
        self.decoder = None
        self.model = None

        # output tensors of the two bottleneck Dense heads, set by _add_bottleneck
        self.mu = None
        self.log_variance = None

        self._num_conv_layers = len(conv_filters)
        self._shape_before_bottleneck = None
        self._model_input = None

        self._build()

    def summary(self):
        """Print the Keras summaries of the encoder, decoder and full model."""
        self.encoder.summary()
        self.decoder.summary()
        self.model.summary()

    def compile(self, learning_rate=0.0001):
        """Compile the model with Adam and the combined loss.

        The reconstruction and KL terms are additionally reported as metrics.
        """
        optimizer = Adam(learning_rate=learning_rate)
        self.model.compile(optimizer=optimizer,
                           loss=self._calculate_combined_loss,
                           metrics=[self._calculate_reconstruction_loss,
                                    self._calculate_kl_loss]
        )

    def train(self, x_train, batch_size, num_epochs):
        """Fit the autoencoder on ``x_train`` (the input is also the target)."""
        self.model.fit(x_train,
                       x_train,
                       batch_size=batch_size,
                       epochs=num_epochs,
                       shuffle=True)

    def save(self, save_folder="/content/gdrive/MyDrive/ai_music_projects/VAE-for-Audio-v.1"):             # should be the working directory where the model is saved
        """Save constructor parameters (parameters.pkl) and weights (weights.h5)."""
        self._create_folder_if_it_doesnt_exist(save_folder)
        self._save_parameters(save_folder)
        self._save_weights(save_folder)

    def load_weights(self, weights_path):
        """Load model weights from ``weights_path``."""
        self.model.load_weights(weights_path)

    def reconstruct(self, images):
        """Encode then decode ``images``.

        Returns:
            (reconstructed_images, latent_representations)
        """
        latent_representations = self.encoder.predict(images)
        reconstructed_images = self.decoder.predict(latent_representations)
        return reconstructed_images, latent_representations

    @classmethod
    def load(cls, save_folder="/content/gdrive/MyDrive/ai_music_projects/VAE-for-Audio-v.1"):              # should be the working directory where the model is saved
        """Rebuild a model from a folder written by ``save``.

        The instance is created through ``cls`` so that subclasses calling
        ``load`` get an instance of their own type.
        """
        parameters_path = os.path.join(save_folder, "parameters.pkl")
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # parameter files that were produced by this notebook.
        with open(parameters_path, "rb") as f:
            parameters = pickle.load(f)
        autoencoder = cls(*parameters)
        weights_path = os.path.join(save_folder, "weights.h5")
        autoencoder.load_weights(weights_path)
        return autoencoder

    def _calculate_combined_loss(self, y_target, y_predicted):
        """Weighted reconstruction loss plus KL loss."""
        reconstruction_loss = self._calculate_reconstruction_loss(y_target, y_predicted)
        kl_loss = self._calculate_kl_loss(y_target, y_predicted)
        combined_loss = self.reconstruction_loss_weight * reconstruction_loss\
                                                         + kl_loss
        return combined_loss

    def _calculate_reconstruction_loss(self, y_target, y_predicted):
        """Mean squared error over axes 1-3, i.e. one value per sample."""
        error = y_target - y_predicted
        reconstruction_loss = K.mean(K.square(error), axis=[1, 2, 3])
        return reconstruction_loss

    def _calculate_kl_loss(self, y_target, y_predicted):
        """KL term computed from ``self.mu`` and ``self.log_variance``.

        The arguments are unused; they exist so the method matches the
        (y_target, y_predicted) signature Keras uses for losses and metrics.
        """
        kl_loss = -0.5 * K.sum(1 + self.log_variance - K.square(self.mu) -
                               K.exp(self.log_variance), axis=1)
        return kl_loss

    def _create_folder_if_it_doesnt_exist(self, folder):
        """Create ``folder`` (and missing parents); no error if it already exists."""
        # exist_ok avoids the check-then-create race of a separate os.path.exists test
        os.makedirs(folder, exist_ok=True)

    def _save_parameters(self, save_folder):
        """Pickle the constructor arguments, in constructor order, to parameters.pkl."""
        parameters = [
            self.input_shape,
            self.conv_filters,
            self.conv_kernels,
            self.conv_strides,
            self.latent_space_dim
        ]
        save_path = os.path.join(save_folder, "parameters.pkl")
        with open(save_path, "wb") as f:
            pickle.dump(parameters, f)

    def _save_weights(self, save_folder):
        """Write the model weights to weights.h5 inside ``save_folder``."""
        save_path = os.path.join(save_folder, "weights.h5")
        self.model.save_weights(save_path)

    def _build(self):
        """Build encoder, decoder and the end-to-end model (in that order).

        The encoder must come first: it records the shape before the
        bottleneck, which the decoder needs.
        """
        self._build_encoder()
        self._build_decoder()
        self._build_autoencoder()


# ********************************* Autoencoder ********************************


    def _build_autoencoder(self):
        """Chain encoder and decoder into ``self.model``."""
        model_input = self._model_input
        model_output = self.decoder(self.encoder(model_input))
        self.model = Model(model_input, model_output, name="autoencoder")


# ********************************* Decoder ************************************


    def _build_decoder(self):
        """Assemble the decoder: input -> dense -> reshape -> conv transposes -> output."""
        decoder_input = self._add_decoder_input()
        dense_layer = self._add_dense_layer(decoder_input)
        reshape_layer = self._add_reshape_layer(dense_layer)
        conv_transpose_layers = self._add_conv_transpose_layers(reshape_layer)
        decoder_output = self._add_decoder_output(conv_transpose_layers)
        self.decoder = Model(decoder_input, decoder_output, name="decoder")

    def _add_decoder_input(self):
        return Input(shape=self.latent_space_dim, name="decoder_input")

    def _add_dense_layer(self, decoder_input):
        """Dense layer with as many units as the flattened pre-bottleneck volume."""
        num_neurons = np.prod(self._shape_before_bottleneck) # [1, 2, 4] -> 8
        dense_layer = Dense(num_neurons, name="decoder_dense")(decoder_input)
        return dense_layer

    def _add_reshape_layer(self, dense_layer):
        """Reshape the dense output back to the pre-bottleneck shape."""
        return Reshape(self._shape_before_bottleneck)(dense_layer)

    def _add_conv_transpose_layers(self, x):
        """Add conv transpose blocks."""
        # loop through all the conv layers in reverse order and stop at the
        # first layer (index 0 is handled by _add_decoder_output)
        for layer_index in reversed(range(1, self._num_conv_layers)):
            x = self._add_conv_transpose_layer(layer_index, x)
        return x

    def _add_conv_transpose_layer(self, layer_index, x):
        """Add one Conv2DTranspose + ReLU + BatchNormalization block."""
        layer_num = self._num_conv_layers - layer_index
        conv_transpose_layer = Conv2DTranspose(
            filters=self.conv_filters[layer_index],
            kernel_size=self.conv_kernels[layer_index],
            strides=self.conv_strides[layer_index],
            padding="same",
            name=f"decoder_conv_transpose_layer_{layer_num}"
        )
        x = conv_transpose_layer(x)
        x = ReLU(name=f"decoder_relu_{layer_num}")(x)
        x = BatchNormalization(name=f"decoder_bn_{layer_num}")(x)
        return x

    def _add_decoder_output(self, x):
        """Final single-filter Conv2DTranspose followed by a sigmoid."""
        conv_transpose_layer = Conv2DTranspose(
            filters=1,
            kernel_size=self.conv_kernels[0],
            strides=self.conv_strides[0],
            padding="same",
            name=f"decoder_conv_transpose_layer_{self._num_conv_layers}"
        )
        x = conv_transpose_layer(x)
        output_layer = Activation("sigmoid", name="sigmoid_layer")(x)
        return output_layer


# ********************************* Encoder ************************************

    def _build_encoder(self):
        """Assemble the encoder: input -> conv blocks -> bottleneck."""
        encoder_input = self._add_encoder_input()
        conv_layers = self._add_conv_layers(encoder_input)
        bottleneck = self._add_bottleneck(conv_layers)
        self._model_input = encoder_input
        self.encoder = Model(encoder_input, bottleneck, name="encoder")

    def _add_encoder_input(self):
        return Input(shape=self.input_shape, name="encoder_input")

    def _add_conv_layers(self, encoder_input):
        """Create all convolutional blocks in encoder."""
        x = encoder_input
        for layer_index in range(self._num_conv_layers):
            x = self._add_conv_layer(layer_index, x)
        return x

    def _add_conv_layer(self, layer_index, x):
        """Add a convolutional block to a graph of layers, consisting of
        conv 2d + ReLU + batch normalization.
        """
        layer_number = layer_index + 1
        conv_layer = Conv2D(
            filters=self.conv_filters[layer_index],
            kernel_size=self.conv_kernels[layer_index],
            strides=self.conv_strides[layer_index],
            padding="same",
            name=f"encoder_conv_layer_{layer_number}"
        )
        x = conv_layer(x)
        x = ReLU(name=f"encoder_relu_{layer_number}")(x)
        x = BatchNormalization(name=f"encoder_bn_{layer_number}")(x)
        return x


# ********************************* Bottleneck *********************************


    def _add_bottleneck(self, x):
        """Flatten data and add bottleneck with Gaussian sampling (Dense
        layer).
        """
        # remember the conv output shape (without the batch axis) for the decoder
        self._shape_before_bottleneck = K.int_shape(x)[1:]
        x = Flatten()(x)
        self.mu = Dense(self.latent_space_dim, name="mu")(x)
        self.log_variance = Dense(self.latent_space_dim,
                                  name="log_variance")(x)

        def sample_point_from_normal_distribution(args):
            # z = mu + sigma * epsilon, with sigma = exp(log_variance / 2)
            mu, log_variance = args
            # use the tensor handed to the layer rather than reaching out to
            # self.mu, so the Lambda depends only on its own inputs
            epsilon = K.random_normal(shape=K.shape(mu), mean=0.,
                                      stddev=1.)
            sampled_point = mu + K.exp(log_variance / 2) * epsilon
            return sampled_point

        x = Lambda(sample_point_from_normal_distribution,
                   name="encoder_output")([self.mu, self.log_variance])
        return x

if __name__ == "__main__":
    # Build a small demo model (28 x 28 x 1 input, 2-D latent space). The
    # `autoencoder` global is what the summary cell below prints.
    autoencoder = VAE(
        input_shape=(28, 28, 1),
        conv_filters=(32, 64, 64, 64),
        conv_kernels=(3, 3, 3, 3),
        conv_strides=(1, 2, 2, 1),
        latent_space_dim=2
    )
    # Report success only after construction has actually finished.
    print("VAE successfully built")


In [None]:
#@title ****Summary of the Created Variational Autoencoder****

# Print the encoder, decoder and full-model summaries of the `autoencoder`
# global created by the previous cell (that cell must be run first).
autoencoder.summary()

## 2. **Training Component**
Train your model by choosing training parameters and running it:


In [None]:
# Training

import os
import numpy as np


#@title Hyperparameters 

# Values editable through the Colab form.
learning_rate = 0.0005 #@param {type:"raw"}
num_epochs_to_train =  40#@param {type:"integer"}
batch_size = 64 #@param {type:"integer"}
vector_dimension = 64 #@param {type:"integer"}

# NOTE(review): hop, sr, min_level_db and ref_level_db are not referenced by any
# code in this notebook. sr=44100 also differs from SAMPLE_RATE = 22050 in the
# preprocessing cell, and "window size = 4*hop" differs from FRAME_SIZE = 512 /
# HOP_LENGTH = 256 there - confirm whether these are leftovers.
hop=256               #hop size (window size = 4*hop)
sr=44100              #sampling rate
min_level_db=-100     #reference values to normalize data
ref_level_db=20

# Upper-case aliases used by the training driver at the bottom of this cell.
# NOTE(review): VECTOR_DIM is never passed to the model; train() hard-codes
# latent_space_dim=128 - confirm which value is intended.
LEARNING_RATE = learning_rate
BATCH_SIZE = batch_size
EPOCHS = num_epochs_to_train
VECTOR_DIM=vector_dimension

# NOTE(review): shape and spec_split are not referenced anywhere in this notebook.
shape=128           #length of time axis of split spectrograms
spec_split=1        

# Same Google Drive locations as in the preprocessing cell.
SPECTROGRAMS_SAVE_DIR = '/content/gdrive/MyDrive/musicdata/vae_for_audio/spectrograms'
MIN_MAX_VALUES_SAVE_DIR = '/content/gdrive/MyDrive/musicdata/vae_for_audio'
FILES_DIR = '/content/gdrive/MyDrive/musicdata/vae_for_audio/audio/'



def load_fsdd(SPECTROGRAMS_PATH):
    """Load every saved spectrogram found below ``SPECTROGRAMS_PATH``.

    Args:
        SPECTROGRAMS_PATH: directory that is walked recursively; every file
            found is loaded with ``np.load``.

    Returns:
        A numpy array stacking all spectrograms, with a trailing axis of
        size 1 added so each spectrogram is treated as a single-channel
        (greyscale) image, e.g. (3000, 256, 64, 1).
    """
    x_train = []
    # Walk the directory passed in by the caller (previously the argument was
    # ignored and the global SPECTROGRAMS_SAVE_DIR was walked instead).
    for root, _, file_names in os.walk(SPECTROGRAMS_PATH):
        for file_name in file_names:
            file_path = os.path.join(root, file_name)
            # NOTE(security): allow_pickle=True lets np.load unpickle object
            # arrays, which can execute arbitrary code; only point this at
            # files written by the preprocessing cell.
            spectrogram = np.load(file_path, allow_pickle=True)        # (n_bins, n_frames)
            x_train.append(spectrogram)
    x_train = np.array(x_train)
    x_train = x_train[..., np.newaxis]                                 # add the channel axis
    return x_train


def train(x_train, learning_rate, batch_size, epochs, latent_space_dim=128):
    """Build, compile and fit a VAE on ``x_train``.

    Args:
        x_train: training spectrograms; the model is built for inputs of
            shape (256, 64, 1).
        learning_rate: learning rate handed to ``VAE.compile``.
        batch_size: batch size handed to ``VAE.train``.
        epochs: number of epochs handed to ``VAE.train``.
        latent_space_dim: size of the latent space. Defaults to 128, the
            value that used to be hard-coded, so existing calls behave as
            before; pass e.g. the form's vector dimension to change it.

    Returns:
        The trained VAE instance.
    """
    autoencoder = VAE(
        input_shape=(256, 64, 1),
        conv_filters=(512, 256, 128, 64, 32),
        conv_kernels=(3, 3, 3, 3, 3),
        # the last layer uses stride (2, 1): it halves only the first spatial axis
        conv_strides=(2, 2, 2, 2, (2, 1)),
        latent_space_dim=latent_space_dim
    )
    autoencoder.summary()
    autoencoder.compile(learning_rate)
    autoencoder.train(x_train, batch_size, epochs)
    return autoencoder


# Training driver: load the saved spectrograms, train with the hyperparameters
# chosen in the form above, then save parameters and weights.
if __name__ == "__main__":
    x_train = load_fsdd(SPECTROGRAMS_SAVE_DIR)
    autoencoder = train(x_train, LEARNING_RATE, BATCH_SIZE, EPOCHS)
    # "model" is a relative folder (resolved against the current working
    # directory); the audio generation cell loads from the same relative path.
    autoencoder.save("model")



+++

## 3. **Audio Generation Component**

In [None]:
#@title **Sound Generator** { form-width: "50%" }

import librosa

class SoundGenerator:
    """Turns spectrograms into audio signals.

    Spectrograms can either be converted directly or first passed through
    the VAE (encode + decode) and then converted.
    """

    def __init__(self, vae, hop_length):
        self.vae = vae
        self.hop_length = hop_length
        # private helper used to undo the [0, 1] normalisation
        self._min_max_normaliser = MinMaxNormaliser(0, 1)

    def generate(self, spectrograms, min_max_values):
        """Reconstruct ``spectrograms`` with the VAE and convert the result to audio.

        Returns:
            (signals, latent_representations)
        """
        reconstructed, latent_representations = self.vae.reconstruct(spectrograms)
        audio_signals = self.convert_spectrograms_to_audio(reconstructed,
                                                           min_max_values)
        return audio_signals, latent_representations

    def convert_spectrograms_to_audio(self, spectrograms, min_max_values):
        """Convert each spectrogram, paired with its min/max entry, to a signal."""
        return [
            self._spectrogram_to_signal(spectrogram, min_max_value)
            for spectrogram, min_max_value in zip(spectrograms, min_max_values)
        ]

    def _spectrogram_to_signal(self, spectrogram, min_max_value):
        """Invert one normalised log spectrogram into a time-domain signal."""
        # drop the trailing channel axis
        log_spectrogram = spectrogram[:, :, 0]
        # undo the min-max normalisation using this spectrogram's stored range
        denormalised = self._min_max_normaliser.denormalise(
            log_spectrogram, min_max_value["min"], min_max_value["max"])
        # dB -> linear amplitude
        amplitude = librosa.db_to_amplitude(denormalised)
        # inverse short-time Fourier transform of the amplitude spectrogram
        return librosa.istft(amplitude, hop_length=self.hop_length)


In [None]:
#@title **Audio Generation** { form-width: "50%" }

import os
import pickle

import numpy as np
import soundfile as sf


# Hop length handed to SoundGenerator (same value as in the preprocessing cell).
HOP_LENGTH = 256
# Output folders for the audio rendered from the sampled spectrograms and from
# their VAE reconstructions; one .wav per sampled spectrogram is written to each.
# NOTE(review): no visible code creates these folders - confirm that they
# exist on Drive before running this cell.
SAVE_DIR_ORIGINAL = "/content/gdrive/MyDrive/ai_music_projects/VAE-for-Audio-v.1/samples/original/"          # audio converted directly from the sampled spectrograms
SAVE_DIR_GENERATED = "/content/gdrive/MyDrive/ai_music_projects/VAE-for-Audio-v.1/samples/generated/"        # audio converted from the VAE reconstructions of the sampled spectrograms
# Files written by the preprocessing cell.
MIN_MAX_VALUES_PATH = "/content/gdrive/MyDrive/musicdata/vae_for_audio/min_max_values.pkl"
SPECTROGRAMS_PATH = "/content/gdrive/MyDrive/musicdata/vae_for_audio/spectrograms"

def load_fsdd(spectrograms_path):
    """Load all spectrograms below ``spectrograms_path`` together with their paths.

    Returns:
        (x_train, file_paths): the stacked spectrograms with a trailing
        channel axis added, e.g. (3000, 256, 64, 1), and the list of file
        paths in the same order.
    """
    # Collect the paths in os.walk order, then load them in that same order.
    file_paths = [
        os.path.join(root, file_name)
        for root, _, file_names in os.walk(spectrograms_path)
        for file_name in file_names
    ]
    spectrograms = [np.load(path) for path in file_paths]  # each (n_bins, n_frames)
    x_train = np.array(spectrograms)[..., np.newaxis]
    return x_train, file_paths


def select_spectrograms(spectrograms,
                        file_paths,
                        min_max_values,
                        num_spectrograms=2):
    """Randomly pick ``num_spectrograms`` spectrograms and their min/max entries.

    Indexes are drawn with ``np.random.choice``; the file path at each index
    is used as the key into ``min_max_values``. The chosen paths and min/max
    entries are printed.

    Returns:
        (sampled_spectrograms, sampled_min_max_values)
    """
    chosen_indexes = np.random.choice(range(len(spectrograms)), num_spectrograms)
    chosen_paths = []
    chosen_min_max_values = []
    for index in chosen_indexes:
        path = file_paths[index]
        chosen_paths.append(path)
        chosen_min_max_values.append(min_max_values[path])
    print(chosen_paths)
    print(chosen_min_max_values)
    return spectrograms[chosen_indexes], chosen_min_max_values


def save_signals(signals, save_dir, sample_rate=22050):
    """Write each signal to ``<save_dir>/<index>.wav``.

    Args:
        signals: iterable of audio signals; the position in the iterable
            becomes the file name (0.wav, 1.wav, ...).
        save_dir: output directory. It is created (with parents) when it
            does not exist yet, so writing no longer fails on a fresh Drive.
        sample_rate: sample rate passed to ``sf.write``.
    """
    if save_dir:
        # an empty save_dir means "current directory": nothing to create
        os.makedirs(save_dir, exist_ok=True)
    for i, signal in enumerate(signals):
        save_path = os.path.join(save_dir, str(i) + ".wav")
        sf.write(save_path, signal, sample_rate)


# Generation driver: reconstruct a few randomly chosen spectrograms with the
# trained VAE and write both the reconstructions and the originals as audio.
if __name__ == "__main__":
    # initialise sound generator
    # ("model" is the relative folder the training cell saved to)
    vae = VAE.load("model")
    sound_generator = SoundGenerator(vae, HOP_LENGTH)

    # load spectrograms + min max values
    # NOTE(security): pickle.load can execute arbitrary code; this file is
    # expected to be the one written by the preprocessing cell.
    with open(MIN_MAX_VALUES_PATH, "rb") as f:     # open in reading binary mode
        min_max_values = pickle.load(f)

    specs, file_paths = load_fsdd(SPECTROGRAMS_PATH)

    # sample spectrograms + min max values (5 of them)
    sampled_specs, sampled_min_max_values = select_spectrograms(specs,
                                                                file_paths,
                                                                min_max_values,
                                                                5)

    # generate audio for sampled spectrograms (passed through the VAE)
    signals, _ = sound_generator.generate(sampled_specs,
                                          sampled_min_max_values)

    # convert the original (un-reconstructed) spectrogram samples to audio
    original_signals = sound_generator.convert_spectrograms_to_audio(
        sampled_specs, sampled_min_max_values)

    # save audio signals
    save_signals(signals, SAVE_DIR_GENERATED)
    save_signals(original_signals, SAVE_DIR_ORIGINAL)



## 4. **Evaluation**

In [None]:
# NOTE(review): "package." looks like a placeholder rather than a real
# distribution name - confirm what should be installed here.
!pip install package.  # creating a package of class of vae 

In [None]:
# The import keyword must be lower-case `from`; a capitalised `From` is a SyntaxError.
# NOTE(review): `package` looks like a placeholder module name - confirm the
# real package before running this cell.
from package import VAE
from package import trainer
from package import preprocessor

In [None]:
    # Build a small VAE (28 x 28 x 1 input, 2-D latent space) and print its
    # encoder, decoder and full-model summaries.
    model = VAE(
        input_shape=(28, 28, 1),
        conv_filters=(32, 64, 64, 64),
        conv_kernels=(3, 3, 3, 3),
        conv_strides=(1, 2, 2, 1),
        latent_space_dim=2
    )
    model.summary()