In [1]:
import os
from glob import glob
import numpy as np
from tqdm import tqdm

from models.VAE import VariationalAutoencoder
from keras.preprocessing.image import ImageDataGenerator

Using TensorFlow backend.


In [2]:
# Set the TF_FORCE_GPU_ALLOW_GROWTH environment variable for this process.
# NOTE(review): presumably this makes TensorFlow allocate GPU memory on demand
# instead of reserving it all up front; it has to be set before TensorFlow
# first initialises the GPU -- confirm against the TensorFlow GPU guide.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [3]:
import os
import glob

In [5]:
# Spectrogram dimensions used as the VAE input shape.
# Each spectrogram is 336 x 1024; the trailing 1 is a single trailing axis
# (data_gen reshapes every loaded array to (rows, cols, 1) to match).
INPUT_DIM = (336,1024,1)
# Display just the two spatial dimensions as a quick check.
INPUT_DIM[:2]

(336, 1024)

In [6]:
# Run parameters: together they name the folder that holds this run's artefacts.
section = 'music'
run_id = '0001'
data_name = 'musicdata'
RUN_FOLDER = 'musicVAE/{}/'.format(section)
RUN_FOLDER += '_'.join([run_id, data_name])

# Create the run folder and its sub-folders.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir because:
#   * the parent 'musicVAE/<section>' may not exist yet on a fresh checkout
#     (os.mkdir would raise FileNotFoundError), and
#   * a sub-folder may be missing even though RUN_FOLDER itself already exists
#     (the old "only if RUN_FOLDER is absent" guard would never recreate it).
# The call is idempotent, so re-running this cell is safe.
for _subfolder in ('viz', 'images', 'weights'):
    os.makedirs(os.path.join(RUN_FOLDER, _subfolder), exist_ok=True)

# 'build' -> start from a freshly constructed model;
# any other value (e.g. 'load') -> load previously saved weights.
mode = 'build'  # or 'load'

In [33]:
# Number of spectrogram files consumed per pass of data_gen.
# Hard-coded: it must not exceed the number of '*.npy' files found in
# path_to_data (807 files were found in the recorded run).
nb_train_samples = 807
# Number of spectrograms stacked into each batch yielded by data_gen.
batch_size = 1
# Number of epochs passed to vae.train_with_generator.
epochs = 100
# Folder that is searched for '*.npy' spectrogram files.
path_to_data = './musicVAE/data/'

In [15]:
# Collect the path of every '*.npy' file directly inside the data folder.
# `glob` here is the module (bound by the `import glob` cell), hence glob.glob.
path = path_to_data
npy_pattern = os.path.join(path, '*.npy')
# tqdm only wraps the listing to show a progress bar; the paths are kept as-is.
source_images = list(tqdm(glob.glob(npy_pattern)))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 807/807 [00:00<00:00, 398399.64it/s]


In [18]:
# Sanity check: how many spectrogram files were found.
len(source_images)

807

In [19]:
# Put the spectrogram paths into a random but *reproducible* order.
#
# glob returns files in a filesystem-dependent order, so the list is sorted
# first; a dedicated, seeded Random instance then shuffles it. Sorting before
# shuffling also makes this cell idempotent: re-running it yields the same
# order instead of shuffling an already shuffled list again.
from random import Random, shuffle  # `shuffle` stays imported so the name remains defined as before

SHUFFLE_SEED = 42
source_images.sort()
Random(SHUFFLE_SEED).shuffle(source_images)
# Shuffling is in place; the count must be unchanged.
len(source_images)

807

In [51]:
def data_gen(paths=None, batch=None, n_samples=None):
    """Yield ``(x, y)`` training batches of spectrograms, looping forever.

    Each sample is loaded from a ``.npy`` file, cast to float64 and reshaped
    from ``(rows, cols)`` to ``(rows, cols, 1)``. Batches are produced in list
    order; after the last (possibly shorter) batch the generator starts over
    from the first sample.

    Parameters
    ----------
    paths : sequence of str, optional
        Paths of the ``.npy`` files. Defaults to the notebook-level
        ``source_images`` list.
    batch : int, optional
        Number of samples per batch. Defaults to the notebook-level
        ``batch_size``.
    n_samples : int, optional
        How many leading entries of ``paths`` to use. Defaults to the
        notebook-level ``nb_train_samples`` when ``paths`` is omitted, else to
        ``len(paths)``. It is clamped to ``len(paths)`` so that a stale
        hard-coded count can no longer cause an IndexError.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``x`` with shape ``(k, rows, cols, 1)`` and ``y`` with shape ``(k, 1)``
        holding the string ``"0"`` for every sample, where ``k <= batch``.

    Raises
    ------
    ValueError
        On the first ``next()`` if there are no samples to serve or ``batch``
        is smaller than 1. (Previously an empty dataset made the generator
        spin forever without yielding anything.)
    """
    if paths is None:
        paths = source_images
        if n_samples is None:
            n_samples = nb_train_samples
    if batch is None:
        batch = batch_size

    available = len(paths)
    n_samples = available if n_samples is None else min(n_samples, available)
    if n_samples < 1:
        raise ValueError("data_gen: no samples to generate batches from")
    if batch < 1:
        raise ValueError("data_gen: batch size must be >= 1, got {}".format(batch))

    while True:
        for start in range(0, n_samples, batch):
            stop = min(start + batch, n_samples)  # last batch may be shorter
            x_batch = []
            for idx in range(start, stop):
                img = np.load(paths[idx]).astype('float64')
                # Append the trailing axis so the sample is (rows, cols, 1).
                x_batch.append(np.reshape(img, (img.shape[0], img.shape[1], 1)))
            # NOTE(review): the target is a dummy "0" label per sample, kept
            # exactly as before. Presumably the VAE loss ignores it or the
            # model reconstructs from its own input -- confirm in models.VAE;
            # an autoencoder would normally be trained with y == x.
            y_batch = [["0"]] * len(x_batch)
            yield (np.array(x_batch), np.array(y_batch))

In [89]:
# Architecture of the VAE, gathered in one place before construction.
# Encoder and decoder each have four 3x3 stages with stride 2; the last
# decoder stage has a single filter so the output has one channel again.
vae_config = dict(
    input_dim=INPUT_DIM,
    encoder_conv_filters=[32, 32, 32, 32],
    encoder_conv_kernel_size=[3, 3, 3, 3],
    encoder_conv_strides=[2, 2, 2, 2],
    decoder_conv_t_filters=[32, 32, 32, 1],
    decoder_conv_t_kernel_size=[3, 3, 3, 3],
    decoder_conv_t_strides=[2, 2, 2, 2],
    z_dim=200,
    use_batch_norm=True,
    use_dropout=True,
)
vae = VariationalAutoencoder(**vae_config)

# Any mode other than 'build' resumes from the weights stored in the run
# folder; 'build' calls vae.save(RUN_FOLDER) for the new model instead.
if mode != 'build':
    weights_file = os.path.join(RUN_FOLDER, 'weights/weights.h5')
    vae.load_weights(weights_file)
else:
    vae.save(RUN_FOLDER)

In [90]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the encoder.
vae.encoder.summary()

Model: "model_38"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_input (InputLayer)      (None, 336, 1024, 1) 0                                            
__________________________________________________________________________________________________
encoder_conv_0 (Conv2D)         (None, 168, 512, 32) 320         encoder_input[0][0]              
__________________________________________________________________________________________________
batch_normalization_68 (BatchNo (None, 168, 512, 32) 128         encoder_conv_0[0][0]             
__________________________________________________________________________________________________
leaky_re_lu_68 (LeakyReLU)      (None, 168, 512, 32) 0           batch_normalization_68[0][0]     
___________________________________________________________________________________________

In [91]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the decoder.
vae.decoder.summary()

Model: "model_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
decoder_input (InputLayer)   (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 43008)             8644608   
_________________________________________________________________
reshape_10 (Reshape)         (None, 21, 64, 32)        0         
_________________________________________________________________
decoder_conv_t_0 (Conv2DTran (None, 42, 128, 32)       9248      
_________________________________________________________________
batch_normalization_72 (Batc (None, 42, 128, 32)       128       
_________________________________________________________________
leaky_re_lu_72 (LeakyReLU)   (None, 42, 128, 32)       0         
_________________________________________________________________
dropout_72 (Dropout)         (None, 42, 128, 32)       0  

In [92]:
# First argument passed to vae.compile.
LEARNING_RATE = 0.0005
# Second argument passed to vae.compile.
# NOTE(review): presumably the weight of the reconstruction loss relative to
# the KL loss -- confirm in models.VAE.
R_LOSS_FACTOR = 10000
# NOTE(review): EPOCHS is not referenced by any visible cell; the training call
# passes the lowercase `epochs` (100) instead, so this 500 has no effect.
EPOCHS = 500
# Passed to vae.train_with_generator as print_every_n_batches.
PRINT_EVERY_N_BATCHES = 100
# Passed to vae.train_with_generator as initial_epoch.
INITIAL_EPOCH = 0

In [93]:
# Compile the VAE with the learning rate and reconstruction-loss factor set above.
vae.compile(LEARNING_RATE, R_LOSS_FACTOR)

In [None]:
# data_gen yields ceil(nb_train_samples / batch_size) batches per pass over the
# data (its last batch may be shorter), so the number of steps per epoch is
# rounded *up*. Floor division would drop the final partial batch from every
# epoch and let epoch boundaries drift relative to passes over the data
# whenever batch_size does not divide nb_train_samples. For batch_size == 1 the
# value is unchanged.
steps_per_epoch = (nb_train_samples + batch_size - 1) // batch_size

# Train from the endless batch generator; run artefacts go to RUN_FOLDER.
vae.train_with_generator(
    data_gen(),
    epochs=epochs,
    steps_per_epoch=steps_per_epoch,
    run_folder=RUN_FOLDER,
    print_every_n_batches=PRINT_EVERY_N_BATCHES,
    initial_epoch=INITIAL_EPOCH,
)

Epoch 1/100
 58/807 [=>............................] - ETA: 40:16 - loss: 2415.6760 - vae_r_loss: 2021.8387 - vae_kl_loss: 393.8373