In [1]:
import sys
import os
import tensorflow as tf
import numpy as np
import librosa
import math
import time
sys.path.append('..')
from wavenet.model import WaveNetModel
from wavenet.ops import mu_law_decode, mu_law_encode
from IPython.display import Audio
from time import time

In [2]:
# Start from an empty default graph so re-running this cell does not stack
# a second copy of the model on top of the previous one.
tf.reset_default_graph()

# ---- Hyper-parameters ------------------------------------------------------
batch_size = 1
filter_width = 3
n_stack = 2
max_dilation = 7
# n_stack repetitions of the dilation cycle 1, 2, 4, ..., 2 ** (max_dilation - 1).
dilations = [2 ** i for i in range(max_dilation)] * n_stack

residual_channels = dilation_channels = skip_channels = 128
use_biases = True
quantization_channels = 256
gc_cardinality = None
gc_channels = None
scalar_input = False
initial_filter_width = 2

# ---- Model -----------------------------------------------------------------
net = WaveNetModel(
    batch_size=batch_size,
    dilations=dilations,
    filter_width=filter_width,
    initial_filter_width=initial_filter_width,
    scalar_input=scalar_input,
    residual_channels=residual_channels,
    dilation_channels=dilation_channels,
    skip_channels=skip_channels,
    quantization_channels=quantization_channels,
    use_biases=use_biases,
    global_condition_channels=gc_channels,
    global_condition_cardinality=gc_cardinality,
)

# ---- Training graph --------------------------------------------------------
# Float audio input with both dimensions left unspecified.
input_placeholder = tf.placeholder(tf.float32, shape=(None, None))

# Integer codes in, mu-law-decoded signal out (used to turn generated samples
# back into a waveform).
waveform = tf.placeholder(tf.int32)
decode = mu_law_decode(waveform, net.quantization_channels)

# Mu-law-encoded version of the training input.
quantized = mu_law_encode(input_placeholder, net.quantization_channels)

# The two trailing None arguments are passed positionally; their meaning is
# defined by WaveNetModel.loss (not visible here) -- TODO confirm.
loss = net.loss(input_placeholder, None, None)
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
# var_list is captured here, before the generation ops below are added.
optim = optimizer.minimize(loss, var_list=tf.trainable_variables())

# ---- Incremental generation graph ------------------------------------------
generation_batch_size = 1
sample_placeholder = tf.placeholder(tf.int32)
gen_num = tf.placeholder(tf.int32)

(next_sample_prob, layers_out, qs) = net.predict_proba_incremental(
    sample_placeholder, gen_num, batch_size=generation_batch_size)

# Placeholders through which layer outputs are fed back into the update ops.
initial = tf.placeholder(tf.float32)
others = tf.placeholder(tf.float32)
update_q_ops = net.create_update_q_ops(
    qs, initial, others, gen_num, batch_size=generation_batch_size)

var_q = net.get_vars_of_q()

# quantization_channels evenly spaced values over [-1, 1]; not referenced by
# the other cells visible in this notebook.
bins = np.linspace(-1, 1, quantization_channels)

print("created.")

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

created.


In [3]:
# Target sample rate handed to librosa and to the audio players below.
sr = 22050

# librosa.load returns (samples, sample_rate); only the samples are kept.
data = librosa.load("voice.wav", sr=sr)[0]

# Prepend net.receptive_field zeros to the 1-D signal.
data = np.pad(data, (net.receptive_field, 0), 'constant')

# Total length of the padded signal, used as the generation length later on.
n_samples = data.shape[0]

In [4]:
# Render an audio player for the zero-padded training clip.
Audio(data, rate=sr)

In [5]:
# Limit this process to a fraction of the GPU's memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.12)

n_train_steps = 10000    # upper bound on optimisation steps
loss_threshold = 1e-3    # generation starts once the loss falls below this


def _generation_step(sess, sample, step):
    """Run one incremental prediction and the matching queue update.

    Evaluates next_sample_prob and layers_out for `sample` at position `step`,
    then runs update_q_ops, feeding the first layer output to `initial` and the
    remaining ones to `others`.

    Args:
        sess: the tf.Session to run in.
        sample: value fed to sample_placeholder.
        step: value fed to gen_num.

    Returns:
        The evaluated next_sample_prob.
    """
    prob, _layers = sess.run([next_sample_prob, layers_out],
                             feed_dict={sample_placeholder: sample, gen_num: step})
    sess.run(update_q_ops,
             feed_dict={initial: _layers[0],
                        others: np.array(_layers[1:]),
                        gen_num: step})
    return prob


with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(n_train_steps):
        # One optimiser step on the whole clip; also fetch its quantized form,
        # which seeds the generator below.
        _loss, _, targets = sess.run([loss, optim, quantized],
                                     feed_dict={input_placeholder: data.reshape(1, -1)})

        if i % 100 == 0:
            print(i, _loss)

        if _loss < loss_threshold:
            print("Start to generate...")

            t = time()

            # Re-initialise the generation queue variables before a fresh pass.
            sess.run(tf.variables_initializer(var_q))

            # Prime with the start of the quantized clip. Clamp to the clip
            # length so a very short clip cannot index past the end of `targets`.
            n_seeds = min(net.receptive_field + 500, n_samples)
            seeds = targets[:, :n_seeds]
            samples = []
            for j in range(n_seeds):
                # Predictions are discarded while priming; only the queue
                # updates matter here.
                _generation_step(sess, seeds[:, j], j)
                samples.append(seeds[:, j])

            # Free-running generation: feed back the most likely class each step.
            for j in range(n_seeds, n_samples):
                if j % 1000 == 0:
                    print('\t generated samples', j)
                prob = _generation_step(sess, samples[-1], j)
                next_sample = np.argmax(prob[0], axis=-1)
                samples.append(np.array([next_sample]))

            samples = np.array(samples).reshape(-1)
            wav = sess.run(decode, feed_dict={waveform: samples})

            elapsed = time() - t
            print("elapsed:", elapsed)
            break
    else:
        # The loop ran to completion without ever generating, so `wav` was
        # not (re)assigned by this run.
        print("Loss never fell below", loss_threshold, "in", n_train_steps,
              "steps; no audio was generated.")

0 5.56841
100 0.0263413
200 0.00309637
300 0.00160965
400 0.00111115
Start to generate...
	 generated samples 2000
	 generated samples 3000
	 generated samples 4000
	 generated samples 5000
	 generated samples 6000
	 generated samples 7000
	 generated samples 8000
	 generated samples 9000
	 generated samples 10000
	 generated samples 11000
	 generated samples 12000
	 generated samples 13000
	 generated samples 14000
	 generated samples 15000
	 generated samples 16000
	 generated samples 17000
	 generated samples 18000
elapsed: 136.20520663261414


In [6]:
# Render an audio player for the decoded generated waveform.
# NOTE: `wav` is only assigned once the training loop's loss drops below its
# threshold, so this cell raises NameError if generation never ran.
Audio(wav, rate=sr)