# WGAN-GP with DCGAN layers
Code is mainly based upon the DCGAN implementation in the TensorFlow tutorials

In [1]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf
#import memory_saving_gradients
# monkey patch tf.gradients to point to our custom version, with automatic checkpoint selection
#tf.__dict__["gradients"] = memory_saving_gradients.gradients_memory
import librosa
import os
import functools
import subprocess
import time
import numpy as np
import matplotlib.pyplot as plt
import PIL
#import imageio
#import simpleaudio as sa
import math
import gc
from IPython import display
import time
import scipy.io.wavfile as wavfile
from IPython.display import clear_output, Image, display, HTML
from tensorflow.layers import dense, flatten
from tensorflow.nn import relu, leaky_relu
from tensorflow import tanh
from tensorflow.image import ResizeMethod
#from google.cloud import storage
#import google.auth
#from google.auth import compute_engine

resize_images = lambda x, size: tf.transpose(tf.image.resize_bilinear(tf.transpose(x, [0, 2, 3, 1]), size, align_corners=True), [0, 3, 1, 2])
conv2d_transpose = functools.partial(tf.layers.conv2d_transpose, padding='same', data_format='channels_first')
conv2d = functools.partial(tf.layers.conv2d, padding='same', data_format='channels_first')
batch_norm = functools.partial(tf.layers.batch_normalization, axis=1)
layer_norm = lambda x: tf.transpose(tf.contrib.layers.layer_norm(tf.transpose(x, [0, 2, 3, 1])), [0, 3, 1, 2])
pool = functools.partial(tf.nn.pool, window_shape=(2, 2), pooling_type='AVG', padding='SAME', data_format='NCHW')

In [2]:
def playAudio(audio, sr):
    audio = audio.astype(np.int16)
    play_obj = sa.play_buffer(audio, 1, 2, sr)
    play_obj.wait_done()

# Hyperparameters

In [3]:
# Number for large nsynth-train dataset
TOTAL_NUM = 102165
# Number for small nsynth-test dataset
# TOTAL_NUM = 1689
BUFFER_SIZE = 2048
BATCH_SIZE = 64
PREFETCH_BUFFER_SIZE = 2 * BATCH_SIZE
# EPOCHS = 150
LAMBDA = 10
ALPHA = 0.0002
BETA1 = 0.5
BETA2 = 0.999
CRITIC_UPDATES_PER_GEN_UPDATE = 5
noise_dim = 100
num_steps = 20
num_examples_to_generate = 16
padded_spec_dim = (1, 512, 256)
spec_dim = (1, 512, 251)
epoch_proportion_counter = 0.0
model_dir = 'gs://jz-model-checkpoints/gan-tpu/'

# Defining models

In [4]:
def generate_images(images, source='fake', save=True):
    # make sure the training parameter is set to False because we
    # don't want to train the batchnorm layer when doing inference.
    
    if(source=='fake'):
        disp_images = images['fake_images']
    elif(source=='real'):
        disp_images = images['real_images']
    else:
        raise ValueError
    fig = plt.figure(figsize=(4,4))
    
    for i in range(16):
        plt.subplot(4, 4, i+1)
        plt.imshow(disp_images[i, 0, :, :] * 127.5, cmap="magma", origin="lower", aspect="auto")
        plt.axis('off')
    if(save):
        plt.savefig('images/image_at_epoch_{:.2f}.png'.format(epoch_counter))
    plt.show()

In [5]:
def residBlockUpscale(num, x, num_filters, filter_size=5, strides=(2, 2), 
                      num_filters_intermed=None, training=True):
    with tf.variable_scope('resid-block-upscale-{}'.format(num)):
        if(num_filters_intermed is None):
            num_filters_intermed = num_filters
        x_orig = resize_images(x, (x.shape[2]*strides[0], x.shape[3]*strides[1]))
        x_orig = conv2d(x_orig, num_filters, 1, 1)
        x = batch_norm(x, training=training)
        x = relu(x)
        x = resize_images(x, (x.shape[2]*strides[0], x.shape[3]*strides[1]))
        x = conv2d(x, num_filters_intermed, filter_size, 1, use_bias=False)
        x = batch_norm(x, training=training)
        x = relu(x)
        x = conv2d(x, num_filters, filter_size, 1, use_bias=False)
        x = x_orig + x
        return x

def residBlockDownscale(num, x, num_filters, filter_size=5, strides=(2, 2), 
                        num_filters_intermed=None, training=True):
    with tf.variable_scope('resid-block-downscale-{}'.format(num)):
        if(num_filters_intermed is None):
            num_filters_intermed = num_filters
        x_orig = pool(x, window_shape=strides)
        x_orig = conv2d(x_orig, num_filters, 1, 1)
        x = layer_norm(x)
        x = relu(x)
        x = conv2d(x, num_filters_intermed, filter_size, 1)
        x = layer_norm(x)
        x = relu(x)
        x = conv2d(x, num_filters_intermed, filter_size, 1)
        x = pool(x)
        x = x_orig + x
        return x
    
def upsampleConvBatchNorm(num, x, num_filters, filter_size=5, strides=(2, 2), training=True):
    with tf.variable_scope('upsample-conv-batch-norm-{}'.format(num)):
        x = resize_images(x, (x.shape[2]*strides[0], x.shape[3]*strides[1]))
        x = conv2d(x, num_filters, filter_size, 1, use_bias=False)
        x = batch_norm(x, training=training)
        x = relu(x)
        return x

def convLayerNorm(num, x, num_filters, filter_size=5, strides=(2, 2), training=True):
    with tf.variable_scope('conv-layer-norm-{}'.format(num)):
        x = conv2d(x, num_filters, filter_size)
        x = layer_norm(x)
        x = leaky_relu(x)
        return x

In [6]:
def generator(x, training=True):
    # Input x is noise vector
    with tf.variable_scope('generator', reuse=tf.AUTO_REUSE):
        # x_orig = tf.transpose(tf.image.resize_bicubic(tf.transpose(x, [0, 2, 3, 1]), 512*256), [0, 3, 1, 2])
        x = dense(x, 256*32*16)
        # MAKE SURE SHAPES ARE EQUAL, OR ELSE SIZE -1 WILL CAUSE BATCH SIZE MISMATCH
        x = tf.reshape(x, shape = (-1, 256, 32, 16))
        x = upsampleConvBatchNorm(1, x, 256, training=training)
        x = upsampleConvBatchNorm(2, x, 256, training=training)
        x = upsampleConvBatchNorm(3, x, 128, training=training)
        x = upsampleConvBatchNorm(4, x, 128, training=training)
        x = batch_norm(x, training=training)
        x = relu(x)
        x = conv2d(x, 3, 5, 1)
        x = tanh(x)
        x = x[:, :, :, 0:251]
        return x
    
def critic(x, training=True):
    with tf.variable_scope('critic', reuse=tf.AUTO_REUSE):
        x = convLayerNorm(1, x, 128, strides=(1, 1), training=training)
        x = convLayerNorm(2, x, 128, training=training)
        x = convLayerNorm(3, x, 256, training=training)
        x = convLayerNorm(4, x, 256, training=training)
        x = convLayerNorm(5, x, 256, training=training)
        x = flatten(x)
        x = dense(x, 1)
        return x

# Making a TPUEstimator

In [7]:
def record_parser(raw_data):
    read_features = {
            'note': tf.FixedLenFeature([], dtype=tf.int64),
            'note_str': tf.FixedLenFeature([], dtype=tf.string),
            'instrument': tf.FixedLenFeature([], dtype=tf.int64),
            'instrument_str': tf.FixedLenFeature([], dtype=tf.string),
            'pitch': tf.FixedLenFeature([], dtype=tf.int64),
            'velocity': tf.FixedLenFeature([], dtype=tf.int64),
            'sample_rate': tf.FixedLenFeature([], dtype=tf.int64),
            'spectrogram': tf.FixedLenFeature([385536], dtype=tf.float32),
            #'spectrogram': tf.FixedLenFeature([128512], dtype=tf.float32),
            'instrument_family': tf.FixedLenFeature([], dtype=tf.int64),
            'instrument_family_str': tf.FixedLenFeature([], dtype=tf.string),
            'instrument_source': tf.FixedLenFeature([], dtype=tf.int64),
            'instrument_source_str': tf.FixedLenFeature([], dtype=tf.string)
    }
    return tf.reshape(tf.parse_single_example(serialized=raw_data, features=read_features)['spectrogram'], (3, 512, 251))

params = {
    'batch_size': BATCH_SIZE,
}

def input_fn(params):
    with tf.variable_scope('input-pipeline'):
        # Reading features of TFRecord file
        files = tf.data.Dataset.list_files('gs://jz-datasets/spec-files/*.tfrecord')
        specs = files.apply(tf.data.experimental.parallel_interleave(tf.data.TFRecordDataset, cycle_length=2))
        specs = specs.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=BUFFER_SIZE))
        # specs = specs.map(map_func=(lambda raw_data: tf.reshape(tf.parse_single_example(serialized=raw_data, features=read_features)['spectrogram'], spec_dim)), num_parallel_calls=-1)
        specs = specs.apply(tf.data.experimental.map_and_batch(
            map_func=lambda x: (record_parser(x), tf.zeros(params['batch_size'])), 
            num_parallel_calls=-1, 
            batch_size=params['batch_size'], 
            drop_remainder=True))
        specs = specs.prefetch(buffer_size=2*params['batch_size'])
        return specs
    
def model_fn(features, labels, mode, params):
    global_step = tf.train.get_or_create_global_step()
    with tf.variable_scope('runs'):
        real_images = features
        noise = tf.random_normal([int(BATCH_SIZE/8), noise_dim])
        fake_images = generator(noise)
        critic_real = critic(real_images)
        critic_fake = critic(fake_images)
        # keeping the random vector constant for generation (prediction) so
        # it will be easier to see the improvement of the gan.
    random_vector_for_generation = lambda: tf.Variable(
        tf.random_normal([16, noise_dim]), 
        trainable=False, 
        name='constant-latent-space'
    )
    if mode == tf.estimator.ModeKeys.PREDICT:
        test_images = {
            'fake_images': generator(random_vector_for_generation, training=False),
            'real_images': features
        }
        return tf.contrib.tpu.TPUEstimatorSpec(mode, predictions=test_images)
    with tf.variable_scope('costs'):
            gen_cost = -tf.reduce_mean(critic_fake)
            critic_cost_naive = tf.reduce_mean(critic_fake)-tf.reduce_mean(critic_real)

            with tf.variable_scope('gradient-penalty'):
                alpha = tf.random_uniform(shape=[int(BATCH_SIZE/8), spec_dim[0], spec_dim[1], spec_dim[2]], minval=0., maxval=1.)
                differences = fake_images-real_images
                interpolates = real_images+(alpha*differences)
                gradients = tf.gradients(critic(interpolates), [interpolates])[0]
                slopes = tf.sqrt(tf.reduce_sum(tf.square(gradients), reduction_indices=[1]))
                gradient_penalty = tf.reduce_mean((slopes-1.)**2)
                critic_cost = critic_cost_naive + LAMBDA*gradient_penalty

            costs = {
                'gen_cost': gen_cost,
                'critic_cost': critic_cost
            }

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.contrib.tpu.TPUEstimatorSpec(mode, loss=0) # , eval_metric_ops=costs)

    if mode == tf.estimator.ModeKeys.TRAIN:
        with tf.variable_scope('optimizers'):
            with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
                gen_opt = tf.contrib.tpu.CrossShardOptimizer(tf.train.AdamOptimizer(ALPHA, BETA1, BETA2)).minimize(gen_cost, global_step=global_step, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='runs/generator'))
                critic_opt = tf.contrib.tpu.CrossShardOptimizer(tf.train.AdamOptimizer(ALPHA, BETA1, BETA2)).minimize(critic_cost, global_step=global_step, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='runs/critic'))
            opt = tf.cond(
                tf.equal(tf.mod(tf.train.get_or_create_global_step(), CRITIC_UPDATES_PER_GEN_UPDATE+1), CRITIC_UPDATES_PER_GEN_UPDATE+1), 
                lambda: gen_opt, 
                lambda: critic_opt
            )
        return tf.contrib.tpu.TPUEstimatorSpec(mode, loss=critic_cost, train_op=opt)

    return
    #return generator, gen_opt, critic_opt, real_images, test_images, ranEpoch, getEpoch, increment, merged, global_step

In [8]:
cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
    tpu=["asianzhang812",], 
    zone="us-central1-f", 
    project="jz-cloud-test"
)

tpu_run_config = tf.contrib.tpu.RunConfig(
    cluster=cluster_resolver, 
    model_dir=model_dir,
    session_config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True),
    tpu_config=tf.contrib.tpu.TPUConfig(num_steps, 8)
)  

In [9]:
def runOneEpoch(model, epoch_proportion=1):
    start = time.time()
    steps_to_take = np.ceil((epoch_proportion_counter+epoch_proportion) * float(TOTAL_NUM)/BATCH_SIZE)# - sess.run(global_step)
    
    model.train(input_fn, steps=num_steps)
    
    epoch_counter += epoch_proportion
    clear_output(wait=True)
    print("Finished epoch {:.2f}".format(epoch_counter))

    generate_images(model.predict(input_fn)['images'])
    # saving (checkpoint) the model every 15 epochs
    #if (epoch + 1) % 15 == 0:
        #checkpoint.save(file_prefix = checkpoint_prefix)

    print('Time taken {} sec'.format(time.time()-start))

In [10]:
def testAudio(batch, prefix):
    specs = sess.run(batch)
    for i in range(specs.shape[0]):
        spec = specs[i]
        mag = spec[:, :, 0]
        angles = spec[:, :, 1]
        mag = ((mag+1)/2)*48-32
        angles = angles*math.pi
        ft =(np.exp(mag)-1.2664166e-14)*np.exp(1j*angles)
        newaudio = librosa.istft(ft, 512, 2048).astype(np.int16)
        #print('Generated audio')
        #print('Interval of audio: [{}, {}]'.format(np.amin(newaudio), np.amax(newaudio)))
        wavfile.write("audio/{}-{}.wav".format(prefix, i), 16000, newaudio)

# Running the model

In [11]:
model = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn, 
    config=tpu_run_config, 
    use_tpu=True, 
    train_batch_size=BATCH_SIZE, 
    predict_batch_size=16
)
global_step = tf.train.get_or_create_global_step()

INFO:tensorflow:Using config: {'_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f21e44642b0>, '_evaluation_master': 'grpc://10.240.1.4:8470', '_save_checkpoints_steps': None, '_master': 'grpc://10.240.1.4:8470', '_train_distribute': None, '_session_config': allow_soft_placement: true
log_device_placement: true
cluster_def {
  job {
    name: "worker"
    tasks {
      value: "10.240.1.4:8470"
    }
  }
}
, '_num_ps_replicas': 0, '_save_summary_steps': 100, '_num_worker_replicas': 1, '_task_type': 'worker', '_tpu_config': TPUConfig(iterations_per_loop=20, num_shards=8, num_cores_per_replica=None, per_host_input_for_training=2, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_log_step_count_steps': None, '_eval_distribute': None, '_experimental_distribute': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_secs': 600, '_protocol': None, '_is_chief': True, '_model_dir': 'gs://jz-model-

In [None]:
# Waits for another program to remove the .lock file before continuing execution
!rm .lock
# Comment out next line to disable lock
# !touch .lock
while(subprocess.check_output('test -f .lock ; echo $?', shell=True).decode('utf-8')[0]=='0'):
    time.sleep(5)
print('Unlocked', end='\r')

while True:
        runOneEpoch(model, 0.0001)

rm: cannot remove '.lock': No such file or directory
INFO:tensorflow:Querying Tensorflow master (grpc://10.240.1.4:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 15343209410290932223)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 9001183881914023182)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 17179869184, 11194082492982754393)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 10262883742944767923)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 12561902318491508

In [None]:
fake_images, real_images = model.predict(input_fn)
print(fake_images)
print(type(prediction))
generate_images(model.predict(input_fn), source='real', save=False)

In [None]:
print(sess.run(images).shape)

In [None]:
testAudio(images, "real")
testAudio(generated, "fake")

In [None]:
while(kernel is dead):
    plant_a_new_seed()