# Deep Music Prior using MusicVAE
For the theoretical background, see the GitHub repo.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1L3fry0_WJS2f1OtQc3C2uXcDNO9Tldo6?usp=sharing)

## Setup

In [9]:
import glob

BASE_DIR = "gs://download.magenta.tensorflow.org/models/music_vae/colab2"

print('Installing dependencies...')
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev
!pip install -q pyfluidsynth
!pip install -qU magenta

# Hack to allow python to pick up the newly-installed fluidsynth lib.
# This is only needed for the hosted Colab environment.
import ctypes.util
orig_ctypes_util_find_library = ctypes.util.find_library
def proxy_find_library(lib):
  """Resolve a shared-library name, special-casing fluidsynth.

  'fluidsynth' maps to the fixed soname 'libfluidsynth.so.1'; every other
  name is forwarded to the saved original ctypes.util.find_library.
  """
  if lib != 'fluidsynth':
    return orig_ctypes_util_find_library(lib)
  return 'libfluidsynth.so.1'
ctypes.util.find_library = proxy_find_library


print('Importing libraries and defining some helper functions...')
import magenta.music as mm
from magenta.models.music_vae import configs
from magenta.models.music_vae.trained_model import TrainedModel

import numpy as np
import tensorflow.compat.v1 as tf

import os
from google.colab import files
from time import perf_counter
from copy import deepcopy
from tqdm import tqdm

# tf.disable_v2_behavior()

# Necessary until pyfluidsynth is updated (>1.2.5).
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

def play(note_sequence):
  """Plot `note_sequence`, then play it back using the fluidsynth synth."""
  mm.plot_sequence(note_sequence)
  synth = mm.fluidsynth
  mm.play_sequence(note_sequence, synth=synth)

def download(note_sequence, filename):
  """Write `note_sequence` to `filename` as MIDI and trigger a Colab download."""
  midi_path = filename
  mm.sequence_proto_to_midi_file(note_sequence, midi_path)
  files.download(midi_path)

print('Done')

Installing dependencies...
Importing libraries and defining some helper functions...
Done


## Generate MusicVAE melodies experiments

In [10]:
# Look up both 16-bar melody configs, then build one pretrained model per
# config from the checkpoints under BASE_DIR, keyed by a short model name.
hierdec_mel_16bar_config = configs.CONFIG_MAP['hierdec-mel_16bar']
flat_mel_16bar_config = configs.CONFIG_MAP['flat-mel_16bar']

mel_16bar_models = {
    'hierdec_mel_16bar': TrainedModel(
        hierdec_mel_16bar_config,
        batch_size=4,
        checkpoint_dir_or_path=BASE_DIR + '/checkpoints/mel_16bar_hierdec.ckpt'),
    'baseline_flat_mel_16bar': TrainedModel(
        flat_mel_16bar_config,
        batch_size=4,
        checkpoint_dir_or_path=BASE_DIR + '/checkpoints/mel_16bar_flat.ckpt'),
}

INFO:tensorflow:Building MusicVAE model with BidirectionalLstmEncoder, HierarchicalLstmDecoder, and hparams:
{'max_seq_len': 256, 'z_size': 512, 'free_bits': 256, 'max_beta': 0.2, 'beta_rate': 0.0, 'batch_size': 4, 'grad_clip': 1.0, 'clip_mode': 'global_norm', 'grad_norm_clip_to_zero': 10000, 'learning_rate': 0.001, 'decay_rate': 0.9999, 'min_learning_rate': 1e-05, 'conditional': True, 'dec_rnn_size': [1024, 1024], 'enc_rnn_size': [2048, 2048], 'dropout_keep_prob': 1.0, 'sampling_schedule': 'constant', 'sampling_rate': 0.0, 'use_cudnn': False, 'residual_encoder': False, 'residual_decoder': False, 'control_preprocessing_rnn_size': [256]}
INFO:tensorflow:
Encoder Cells (bidirectional):
  units: [2048, 2048]

INFO:tensorflow:
Hierarchical Decoder:
  input length: 256
  level output lengths: [16, 16]

INFO:tensorflow:
Decoder Cells:
  units: [1024, 1024]



  name=name),
  return layer.apply(inputs)
  self._names["W"], [input_size + self._num_units, self._num_units * 4])
  initializer=tf.constant_initializer(0.0))
  kernel_initializer=tf.random_normal_initializer(stddev=0.001))
  kernel_initializer=tf.random_normal_initializer(stddev=0.001))


INFO:tensorflow:Restoring parameters from gs://download.magenta.tensorflow.org/models/music_vae/colab2/checkpoints/mel_16bar_hierdec.ckpt
INFO:tensorflow:Building MusicVAE model with BidirectionalLstmEncoder, CategoricalLstmDecoder, and hparams:
{'max_seq_len': 256, 'z_size': 512, 'free_bits': 256, 'max_beta': 0.2, 'beta_rate': 0.0, 'batch_size': 4, 'grad_clip': 1.0, 'clip_mode': 'global_norm', 'grad_norm_clip_to_zero': 10000, 'learning_rate': 0.001, 'decay_rate': 0.9999, 'min_learning_rate': 1e-05, 'conditional': True, 'dec_rnn_size': [2048, 2048, 2048], 'enc_rnn_size': [2048, 2048], 'dropout_keep_prob': 1.0, 'sampling_schedule': 'constant', 'sampling_rate': 0.0, 'use_cudnn': False, 'residual_encoder': False, 'residual_decoder': False, 'control_preprocessing_rnn_size': [256]}
INFO:tensorflow:
Encoder Cells (bidirectional):
  units: [2048, 2048]

INFO:tensorflow:
Decoder Cells:
  units: [2048, 2048, 2048]

INFO:tensorflow:Restoring parameters from gs://download.magenta.tensorflow.org/m

In [11]:
#@title Generate 4 samples from the selected model prior.
# The `#@param` annotations below are Colab form markers; keep them intact.
mel_sample_model = "hierdec_mel_16bar" #@param ["hierdec_mel_16bar", "baseline_flat_mel_16bar"]
temperature = 0.5 #@param {type:"slider", min:0.1, max:1.5, step:0.1}
# Draw 4 sequences (length=256) from the selected model at the chosen temperature.
mel_16_samples = mel_16bar_models[mel_sample_model].sample(n=4, length=256, temperature=temperature)
for ns in mel_16_samples:
  # play() already plots the sequence before playing it, so no separate plot call.
  play(ns)

In [12]:
# Separate instance of the hierarchical model with batch_size=1, loaded from the
# same checkpoint path as mel_16bar_models['hierdec_mel_16bar'] (batch_size=4).
model = TrainedModel(hierdec_mel_16bar_config, batch_size=1, checkpoint_dir_or_path=BASE_DIR + '/checkpoints/mel_16bar_hierdec.ckpt')

INFO:tensorflow:Building MusicVAE model with BidirectionalLstmEncoder, HierarchicalLstmDecoder, and hparams:
{'max_seq_len': 256, 'z_size': 512, 'free_bits': 256, 'max_beta': 0.2, 'beta_rate': 0.0, 'batch_size': 1, 'grad_clip': 1.0, 'clip_mode': 'global_norm', 'grad_norm_clip_to_zero': 10000, 'learning_rate': 0.001, 'decay_rate': 0.9999, 'min_learning_rate': 1e-05, 'conditional': True, 'dec_rnn_size': [1024, 1024], 'enc_rnn_size': [2048, 2048], 'dropout_keep_prob': 1.0, 'sampling_schedule': 'constant', 'sampling_rate': 0.0, 'use_cudnn': False, 'residual_encoder': False, 'residual_decoder': False, 'control_preprocessing_rnn_size': [256]}
INFO:tensorflow:
Encoder Cells (bidirectional):
  units: [2048, 2048]

INFO:tensorflow:
Hierarchical Decoder:
  input length: 256
  level output lengths: [16, 16]

INFO:tensorflow:
Decoder Cells:
  units: [1024, 1024]



  name=name),
  return layer.apply(inputs)
  self._names["W"], [input_size + self._num_units, self._num_units * 4])
  initializer=tf.constant_initializer(0.0))


INFO:tensorflow:Restoring parameters from gs://download.magenta.tensorflow.org/models/music_vae/colab2/checkpoints/mel_16bar_hierdec.ckpt


  kernel_initializer=tf.random_normal_initializer(stddev=0.001))
  kernel_initializer=tf.random_normal_initializer(stddev=0.001))


### Sampled Z with increasing variance 
The musical structure gets worse as we increase the variance, and is worst when z is sampled from the uniform distribution.

In [13]:
# Decode latents drawn from zero-mean Gaussians of increasing standard
# deviation and play the first decoded sequence of each draw.
for std in range(1, 1000, 100):
  latent = tf.random.normal(shape=(4, 512), stddev=std)
  note_seq = model.decode(latent, length=256)[0]
  print(f"Sampled Z from Gaussian with mean: 0, std: {std}")
  play(note_seq)

# Same experiment with a latent drawn by tf.random.uniform (default range).
uniform_latent = tf.random.uniform(shape=(4, 512))
note_seq = model.decode(uniform_latent, length=256)[0]
print("Sampled Z uniformly")
play(note_seq)

# All-zero latent: decode the zero array itself, not `uniform_latent`.
# A plain NumPy array is used instead of tf.get_variable so that no TF variable
# named 'latent_code' is registered here; optimize_latent_codes requests a
# variable with that same name (and a different shape) later in the notebook.
print("Z = zeros")
latent_code = np.zeros(shape=(4, 512), dtype=np.float32)
note_seq = model.decode(latent_code, length=256)[0]
play(note_seq)

Sampled Z from Gaussian with mean: 0, std: 1


Sampled Z from Gaussian with mean: 0, std: 101


Sampled Z from Gaussian with mean: 0, std: 201


Sampled Z from Gaussian with mean: 0, std: 301


Sampled Z from Gaussian with mean: 0, std: 401


Sampled Z from Gaussian with mean: 0, std: 501


Sampled Z from Gaussian with mean: 0, std: 601


Sampled Z from Gaussian with mean: 0, std: 701


Sampled Z from Gaussian with mean: 0, std: 801


Sampled Z from Gaussian with mean: 0, std: 901


Sampled Z uniformly


Z = zeros


### Test reconstruction capabilities

In [14]:
# Decode a batch of 4 latents drawn with tf.random.normal's default parameters
# and play the first decoded sequence.
latent = tf.random.normal(shape=(4, 512))
note_seq = model.decode(latent, length=256)
play(note_seq[0])

In [15]:
# Re-encode the decoded sequences. The result is indexed with [0] in the next
# cell — presumably the latent codes; TODO confirm against TrainedModel.encode.
encoded = model.encode(note_seq)

In [16]:
# Round trip: decode the re-encoded result and play the first sequence so it
# can be compared by ear with the sequence played before encoding.
play(model.decode(encoded[0], length=256)[0])

In [17]:
import tensorflow_probability as tfp
from magenta.models.music_vae.base_model import MusicVAE
from magenta.models.music_vae.lstm_models import BaseLstmDecoder

ds = tfp.distributions

# BaseLstmDecoder()
# r_loss, metric_map = MusicVAE.decoder.reconstruction_loss(x_input, x_target, x_length, z, control_sequence)[0:2]

### ...


In [18]:
# The latent tensor `_z` appears to be created in the model constructor
# (TODO confirm). `_z` is a private attribute, so this is for inspection only.
z = mel_16bar_models[mel_sample_model]._z
mel_16bar_models[mel_sample_model]._z.shape

TensorShape([4, 512])

In [19]:
# NOTE: this rebinds the global `model` to the batch_size=4 hierarchical model
# stored in mel_16bar_models, replacing the batch_size=1 instance created
# earlier; later cells that use `model` see this one.
rand_latent = tf.random.uniform(shape=(4, 512))
model = mel_16bar_models['hierdec_mel_16bar']
note_seq = model.decode(rand_latent, length=256)

In [20]:
# Report how many sequences were decoded, then play the one at index 3.
print(len(note_seq))
play(note_seq[3])

4


In [21]:
# download() writes a MIDI file, so give it a .mid extension; without one the
# downloaded file is not recognised as MIDI by most players.
download(note_seq[1], "bad_one.mid")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Melody Prior  - fine tuning

In [None]:
from magenta.models.music_vae.trained_model import TrainedModel
from magenta.models.music_vae.base_model import MusicVAE

# Switch the config's data converter to 'train' mode ahead of fine-tuning.
# NOTE(review): this mutates the same config object that was passed to the
# TrainedModel constructors above; whether those models keep a reference to it
# is not visible here — confirm this has no effect on them.
hierdec_mel_16bar_config.data_converter.set_mode('train')


## Melody Prior - latent optimization

In [None]:
class ReconstructionModel(object):
  """Placeholder for the feature extractor used by the reconstruction loss.

  optimize_latent_codes constructs this class with `sig_size=...` and then
  calls the instance on a tensor. The stub therefore exposes that interface
  and fails with an explicit error until a real implementation exists,
  instead of a confusing TypeError at construction time.
  """

  def __init__(self, sig_size=None):
    # Signal size requested by the caller; the None default keeps the
    # argument-free constructor working.
    self.sig_size = sig_size

  def __call__(self, signal):
    """Map `signal` to a feature tensor. Not implemented yet."""
    # todo: implement the actual feature extraction.
    raise NotImplementedError(
        'ReconstructionModel is a stub; feature extraction is not implemented.')

In [None]:
def optimize_latent_codes(model,
                          target,
                          input_signal_size,
                          reconstruct_sig_size,
                          latents_dir,
                          reconstructions_dir,
                          midi_dir,
                          optimizer_algo='adam',
                          learning_rate=1e-3,
                          total_iterations=1000,
                          piece_name="reconstruction"
                          ):
  """Optimize a latent code so that the model output matches `target`.

  Work in progress: the loss pipeline below still follows an image-style
  layout (3-channel placeholder, tf.image resizing) and depends on TODOs that
  convert decoded note sequences into tensors.

  Args:
    model: object exposing `decode(latents)`, used to turn the latent code
      into notes.
    target: array fed to the target placeholder after a leading batch axis is
      added with `target[np.newaxis, ...]`.
    input_signal_size: two-element size used for the placeholder's middle axes.
    reconstruct_sig_size: size both signals are resized to before feature
      extraction; also passed to ReconstructionModel as `sig_size`.
    latents_dir: directory that receives `<piece_name>.npz`; created if missing.
    reconstructions_dir: currently unused.
    midi_dir: currently unused.
    optimizer_algo: 'adam' selects AdamOptimizer; any other value selects
      GradientDescentOptimizer.
    learning_rate: learning rate handed to the optimizer.
    total_iterations: number of optimization steps to run.
    piece_name: progress-bar label and stem of the saved latent file.

  Raises:
    RuntimeError: if no default TensorFlow session is active.
  """
  # Fail early with a clear message instead of an AttributeError on None later.
  sess = tf.get_default_session()
  if sess is None:
    raise RuntimeError(
        'optimize_latent_codes needs an active default tf.Session '
        '(call it inside `with tf.Session().as_default():`).')

  latent_code = tf.get_variable(
      name='latent_code', shape=(1, 18, 512), dtype='float32', initializer=tf.initializers.zeros()
  )

  # Latents go through decode(); encode() is what this notebook uses in the
  # opposite direction (note sequences -> latents).
  # NOTE(review): elsewhere in this notebook decode() results are passed to
  # play(), i.e. they are note sequences rather than graph tensors, so the
  # tensor ops below still depend on the conversion TODO — confirm.
  generated_notes = model.decode(latent_code)
  generated_notes_for_display = deepcopy(generated_notes)
  # todo: convert notes sequence into tensors

  # todo
  target_sig = tf.placeholder(tf.float32, [None, input_signal_size[0], input_signal_size[1], 3])

  # Perceptual loss
  target_sig_resized = tf.image.resize_images(
      target_sig, tuple(reconstruct_sig_size), method=tf.image.ResizeMethod.NEAREST_NEIGHBOR
  )
  generated_sig_resized = tf.image.resize_images(
      generated_notes, tuple(reconstruct_sig_size), method=tf.image.ResizeMethod.NEAREST_NEIGHBOR
  )

  recon_model = ReconstructionModel(sig_size=reconstruct_sig_size)
  generated_sig_features = recon_model(generated_sig_resized)
  target_sig_features = recon_model(target_sig_resized)

  # Mean absolute difference between the two feature tensors.
  loss_op = tf.reduce_mean(tf.abs(generated_sig_features - target_sig_features))

  if optimizer_algo == 'adam':
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
  else:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)

  # Only the latent code is trainable here.
  train_op = optimizer.minimize(loss_op, var_list=[latent_code])

  sess.run(tf.variables_initializer([latent_code] + optimizer.variables()))

  progress_bar_iterator = tqdm(
      iterable=range(total_iterations),
      bar_format='{desc}: {percentage:3.0f}% |{bar}| {n_fmt}/{total_fmt}{postfix}',
      desc=piece_name
  )

  # The same feed is used for every step and for the final fetch.
  feed = {target_sig: target[np.newaxis, ...]}

  for _step in progress_bar_iterator:
    loss, _ = sess.run(fetches=[loss_op, train_op], feed_dict=feed)
    # Updated inside the loop so the bar shows the live loss, and so `loss`
    # is never referenced when total_iterations is 0.
    progress_bar_iterator.set_postfix_str('loss=%.2f' % loss)

  reconstructed_sigs, latent_codes = sess.run(
      fetches=[generated_notes_for_display, latent_code],
      feed_dict=feed
  )

  play(reconstructed_sigs[0])
  # np.savez does not create missing directories.
  os.makedirs(latents_dir, exist_ok=True)
  np.savez(file=os.path.join(latents_dir, piece_name + '.npz'), latent_code=latent_codes[0])


In [None]:
# Start with the general structure
def invert_signal(degradation_mode, target, outdir, num_steps=1000):
  """Recover a latent code for `target` and save it under `outdir` (draft).

  Times a call to optimize_latent_codes, takes the last element of its result
  as the inverted latent and writes it to `<outdir>/inverted_latent.npz`.

  Args:
    degradation_mode: not used yet; see the TODO in the body.
    target: signal passed on to optimize_latent_codes.
    outdir: directory in which `inverted_latent.npz` is written.
    num_steps: forwarded to optimize_latent_codes as `num_steps`.

  NOTE(review): the call below does not match optimize_latent_codes as defined
  in this notebook (that function has no `num_steps` parameter, requires more
  positional arguments and contains no return statement). `.unsqueeze(0).cpu()`
  also looks like a PyTorch tensor API, although torch is never imported here.
  This function appears to be an unported sketch — confirm before use.
  """
  # setup
  # (device selection from the original torch-based sketch is disabled)
  # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  
  # TODO: apply the degradation selected by degradation_mode.

  start_time = perf_counter()
  optimization_steps = optimize_latent_codes(target, num_steps=num_steps)
  print (f'Elapsed: {(perf_counter()-start_time):.1f} s')

  # The last element is treated as the final optimization state.
  inverted_latent = optimization_steps[-1]
  np.savez(f'{outdir}/inverted_latent.npz', latent=inverted_latent.unsqueeze(0).cpu().numpy())

In [8]:
# Display the bound decode method; the trailing dot was a syntax error.
model.decode

<bound method TrainedModel.decode of <magenta.models.music_vae.trained_model.TrainedModel object at 0x7f2cfd475890>>