In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import h5py
from data_prep import one_hot_decode
from predictions import define_models
import tensorflow as tf
import keras.backend as K

# ----------------------------- Data dimensions -----------------------------------
# Every constant is defined exactly once; names that must agree are derived
# from each other so they cannot silently drift apart.
timesteps = 16        # time steps per bar (second axis of a song array)
cardinality = 131     # length of one encoded note vector (last axis of a song array)
latent_dim = 64       # size of the latent vector z produced by the encoder
avg_song_len = 113    # average song length in bars (derived in "Some details" below)


vae_weights_path = "weights/512_64/weights-improvement-2455-1.00.hdf5"

# CUDA_VISIBLE_DEVICES is set before the session is created so that the
# session only sees the selected GPU.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand.
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)

############################ VAE initialisation ###################################
n_features = cardinality          # the VAE input feature size is the note-vector length
learning_rate = 0.0001
n_encoder_units = 512
n_decoder_units = n_encoder_units
dropout=0.3
beta=1

Using TensorFlow backend.


In [2]:
# Load a single encoded song and display its array shape next to its length
# (len() of an array is the size of its first axis).
f = np.load('songs_encoder_inputs/id-7688.npy')
f.shape, len(f)

((97, 16, 131), 97)

### Some details

The MDN will predict sequences of bars, and all sequences must have the same length, so some preparation is needed. Some songs in the dataset are long and some are short, which calls for some decisions to be made.  
  
Here's the solution:  
The [average song length](https://www.statcrunch.com/5.0/viewreport.php?groupid=948&reportid=28647) is 226 seconds.  
  
The [average bpm](https://learningmusic.ableton.com/make-beats/tempo-and-genre.html) for a song is 120. 

The [semiquaver length](http://bradthemad.org/guitar/tempo_explanation.php) in seconds is 15 / bpm.  
  
15 / 120 = 0.125 seconds per semiquaver  
0.125 * 16 notes per vector = 2 seconds per bar   
226 / 2 = 113 bars per song.  
 
  
#### But first, let's remove empty lists and songs that are too long.

In [None]:
# Disabled one-off cleanup, kept as a bare string literal so it is never executed.
# If re-enabled, it walks songs_encoder_inputs and calls os.remove on every file
# whose loaded array has length 0 or more than 3 * avg_song_len entries, while
# recording the lengths in pre_process / post_process3.
# NOTE(review): destructive -- it deletes input files in place. A later cell
# builds file names as id-<i>.npy for i in range(file_count), so deleting files
# may leave gaps in that numbering -- confirm before re-enabling.
"""post_process3 = []
pre_process = []

for path, dirs, files in os.walk("songs_encoder_inputs"):
    for file in files:
        song_len = len(np.load(path + os.sep + file))
        
        # Removing songs shorter than one bar
        if song_len == 0:
            pre_process.append(len(np.load(path + os.sep + file)))
            os.remove(path + os.sep + file)
            print("Removing empty list.")
            
        elif song_len > 3 * avg_song_len:
            pre_process.append(len(np.load(path + os.sep + file)))
            os.remove(path + os.sep + file)
            print("Song too long. {} bars. Removing.".format(song_len))
            
        else: 
            post_process3.append(song_len)
            pre_process.append(len(np.load(path + os.sep + file)))"""

In [None]:
# Sanity scan: print the length of every stored song array longer than 80000.
for root, _subdirs, filenames in os.walk("songs_encoder_inputs"):
    # Lazy generator: each file is loaded only when its length is inspected.
    lengths = (len(np.load(root + os.sep + name)) for name in filenames)
    for n_bars in lengths:
        if n_bars > 80000:
            print(n_bars)

## Loading model and weights

In [3]:
# Build the training model and the inference encoder from the shared
# hyper-parameters; the third object returned by define_models is not needed here.
train, infenc, _ = define_models(n_encoder_units=n_encoder_units,
                                 n_decoder_units=n_decoder_units,
                                 latent_dim=latent_dim,
                                 timesteps=timesteps,
                                 n_features=n_features,
                                 learning_rate=learning_rate,
                                 dropout=dropout,
                                 beta=beta,
                                 epsilon_std=1.)

# Restore the trained weights into the full training model.
train.load_weights(vae_weights_path)
print("Loaded VAE weights from disk")

# Copy the leading slice of the full weight list into the inference encoder.
# NOTE(review): this assumes the encoder's weights come first in
# train.get_weights() -- confirm against define_models.
full_weights = train.get_weights()
n_encoder_tensors = len(infenc.get_weights())
infenc.set_weights(full_weights[:n_encoder_tensors])

Loaded VAE weights from disk


### Get no of files

In [4]:
# next(os.walk(...)) returns only the first walk entry, i.e. the top-level
# directory, so files inside sub-directories are not counted.
path, dirs, files = next(os.walk("songs_encoder_inputs"))
# file_count later bounds the id-<i>.npy loop that builds the z-dataset.
file_count = len(files)

file_count

9977

### Check the data

In [5]:
# one song file
f = np.load("songs_encoder_inputs/id-0.npy")

# Axis 0 is reported as the number of slices and axis 1 as the timesteps.
print("Shape of first song file: ", f.shape)
print("No. of slices: ", f.shape[0])
print("Timesteps: ", f.shape[1])
# f[0][0] is the first timestep of the first slice, shown in its raw encoded form.
print("One encoded note: ", f[0][0])
# Slice 31 is decoded with one_hot_decode (imported from data_prep). The index is
# hard-coded, so the file must hold at least 32 slices.
print("One decoded bar :", one_hot_decode(f[31]))

Shape of first song file:  (72, 16, 131)
No. of slices:  72
Timesteps:  16
One encoded note:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
One decoded bar : [67, 129, 129, 129, 66, 129, 66, 129, 66, 129, 64, 129, 64, 129, 66, 129]


### Define function to get z-list from song

In [6]:
def create_z_array(infenc, song, latent_dim):
    """Encode every bar of a song into a latent vector.

    Parameters
    ----------
    infenc : object with a ``predict(batch)`` method
        The inference encoder. Element ``[2]`` of its prediction is used as
        the latent vector (presumably the encoder returns
        ``(z_mean, z_log_var, z)`` -- confirm against ``define_models``).
    song : iterable of numpy arrays
        One array per bar. Each bar is reshaped to
        ``(1, timesteps, cardinality)``, where ``timesteps`` and
        ``cardinality`` are the notebook-level constants.
    latent_dim : int
        Size of the latent vector; each z is reshaped to ``(1, 1, latent_dim)``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_bars, 1, 1, latent_dim)``. An empty song yields an
        empty array of shape ``(0, 1, 1, latent_dim)`` so that the rank is the
        same for every song (previously an empty song produced shape ``(0,)``).
    """
    z_list = []

    for bar in song:
        # The encoder is called once per bar, with a batch of size one.
        encoder_output = infenc.predict(bar.reshape(1, timesteps, cardinality))

        z = encoder_output[2]
        z_list.append(z.reshape(1, 1, latent_dim))

    if not z_list:
        # Keep the output rank consistent for songs without any bars.
        return np.empty((0, 1, 1, latent_dim))

    return np.array(z_list)

### Create the z-dataset

(This takes time)

In [7]:
# Encode every song into its per-bar latent vectors and store one HDF5 dataset
# per song under the key "z_list<i>".
# NOTE: input files are expected to be named id-0.npy ... id-<file_count - 1>.npy
# without gaps; a missing id makes np.load raise.
# The context manager closes the file even if encoding fails part-way through
# this long-running loop (a bare close() at the end would be skipped).
with h5py.File('z_dataset_pre.h5', 'w') as h5f:

    for i in tqdm(range(file_count)):

        song_from_file = np.load("songs_encoder_inputs/id-" + str(i) + ".npy")
        song_len = song_from_file.shape[0]

        # reshaping to work as input to lstm; this also raises early if a file
        # does not hold (song_len, timesteps, cardinality) values
        song = song_from_file.reshape(song_len, timesteps, cardinality)

        # predicting list of z's
        z_array = create_z_array(infenc, song, latent_dim)

        # appending list of z's to dataset
        h5f.create_dataset("z_list" + str(i), data=z_array)

100%|██████████| 9977/9977 [4:22:32<00:00,  1.58s/it]  


### Slice the z's into the required sequence length for the MDN

In [8]:
SEQ_LEN = 16

# Running id shared by each (x, y) pair written to the MDN file.
counter = 0

# Both files are opened through a context manager so they are closed even if
# slicing fails half-way through.
with h5py.File('z_dataset_pre.h5', 'r') as hf_zs, \
        h5py.File('mdn_dataset_pre.h5', 'w') as hf_mdn:

    for i in range(len(hf_zs.keys())):

        # Indexing (instead of .get) raises a KeyError naming the missing
        # dataset if the z_list<i> numbering has a gap.
        z_i = np.array(hf_zs['z_list' + str(i)])

        len_list = z_i.shape[0]

        num_steps = len_list // SEQ_LEN

        # don't keep short songs
        if num_steps == 0:
            continue

        # avoiding errors if the number of steps leaves no
        # room for an extra +1 for the target
        if len_list % SEQ_LEN == 0:
            num_steps = num_steps - 1

        # Non-overlapping windows of SEQ_LEN steps; the target is the same
        # window shifted forward by one step.
        for idx in range(0, num_steps * SEQ_LEN, SEQ_LEN):

            data = z_i[idx : idx + SEQ_LEN]
            target = z_i[idx + 1 : idx + SEQ_LEN + 1]

            hf_mdn.create_dataset("z_x_id-" + str(counter), data=data)
            hf_mdn.create_dataset("z_y_id-" + str(counter), data=target)

            counter += 1

In [9]:
# Sanity-check the MDN dataset: count the stored slices and compare y with x
# shifted by one step. The context manager closes the file even if a lookup fails.
with h5py.File('mdn_dataset_pre.h5', 'r') as hf_mdn:

    # Every slice is stored under two keys (z_x_id-* and z_y_id-*), hence the / 2.
    print("Number of sliced z's: ", len(hf_mdn.keys())/2)

    # Rows are read straight from the datasets instead of first converting each
    # whole dataset into a Python list.
    print("Is the first time step of y the same as the second time step of x? \n" ,\
          hf_mdn["z_x_id-1"][1] == hf_mdn["z_y_id-1"][0])

Number of sliced z's:  54415.0
Is the first time step of y the same as the second time step of x? 
 [[[ True  True  True  True  True  True  True  True  True  True  True
    True  True  True  True  True  True  True  True  True  True  True
    True  True  True  True  True  True  True  True  True  True  True
    True  True  True  True  True  True  True  True  True  True  True
    True  True  True  True  True  True  True  True  True  True  True
    True  True  True  True  True  True  True  True  True]]]
