In [1]:
import sys
import os
import tensorflow as tf
import numpy as np
import argparse
from time import time

# Put the local wavenet checkout on the import path so the `apps.*` and
# `wavenet.*` imports below resolve. The membership test keeps re-running
# this cell from stacking duplicate entries onto sys.path.
module_path = os.path.expanduser("~/martin/wavenet")
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)
    
from apps.vocoder.model import Vocoder, Generator, optimizer_factory
from apps.vocoder.hparams import hparams
from apps.vocoder.datasets.data_feeder import ensure_divisible
import apps.vocoder.audio as audio
from wavenet.mixture import discretized_mix_logistic_loss, sample_from_discretized_mix_logistic

import IPython
from IPython.display import Audio

In [2]:
def load_data(fpath):
    """Load an audio file and return a waveform aligned with its mel-spectrogram.

    The waveform is read with ``audio.load_wav``; when ``hparams.rescaling``
    is truthy it is peak-normalised to ``hparams.rescaling_max``. The
    mel-spectrogram is computed from that waveform, cast to float32 and
    transposed. The waveform is then zero-padded by the amounts reported by
    ``audio.lws_pad_lr`` and truncated so that it holds exactly
    ``mel.shape[0] * hop_size`` samples.

    Args:
        fpath: Path of the audio file, passed straight to ``audio.load_wav``.

    Returns:
        Tuple ``(wav, mel)``. ``wav`` has ``mel.shape[0] * hop_size`` samples.
        ``mel`` is float32; its first axis is presumably the frame index
        (the caller reshapes the second axis to ``hparams.num_mels``) --
        confirm against ``audio.melspectrogram``.

    Raises:
        AssertionError: If the padded waveform is shorter than
            ``mel.shape[0] * hop_size`` samples.
    """
    wav = audio.load_wav(fpath)
    if hparams.rescaling:
        peak = np.abs(wav).max()
        # A completely silent clip has a peak of 0; dividing by it would
        # turn the whole signal into NaN/inf, so leave such a clip untouched.
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max
    mel = audio.melspectrogram(wav).astype(np.float32).T

    hop_size = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(wav, hparams.fft_size, hop_size)
    wav = np.pad(wav, (pad_left, pad_right), mode="constant", constant_values=0.)

    N = mel.shape[0]
    expected_len = N * hop_size
    assert len(wav) >= expected_len, (
        "padded waveform has %d samples, need at least %d" % (len(wav), expected_len)
    )
    # Drop surplus padding so each mel frame maps to exactly hop_size samples.
    wav = wav[:expected_len]
    return wav, mel

In [3]:
# Build the inference graph from scratch. Resetting the default graph first
# means re-running this cell does not add a second copy of the ops.
tf.reset_default_graph()
vocoder = Vocoder()
# Generator wrapping the vocoder, configured with gc_enable=False and a batch
# of one; it is used later through generator.generate(...).
generator = Generator(vocoder, gc_enable=False, batch_size=1)
# Placeholders are declared without a static shape. Later cells feed `inputs`
# with the waveform reshaped to (1, -1, 1) and `l` with the mel features
# reshaped to (1, -1, hparams.num_mels).
inputs = tf.placeholder(tf.float32)
l = tf.placeholder(tf.float32)
# Conditioning features after vocoder.create_upsample; presumably brought up
# to the waveform's time resolution -- TODO confirm in Vocoder.
lc = vocoder.create_upsample(l)
# Third argument is None; presumably the (disabled) global condition, in line
# with gc_enable=False above -- verify against create_network's signature.
raw_output = vocoder.net.create_network(inputs, lc, None)
# Draw samples from the discretized mixture-of-logistics described by the
# raw network output.
output = sample_from_discretized_mix_logistic(raw_output)

In [4]:
# Load one utterance and cut a random excerpt whose waveform and
# conditioning frames stay aligned.
data_raw, data_lc = load_data(os.path.expanduser("~/data/ljspeech/wavs/LJ019-0063.wav"))
hop_size = audio.get_hop_size()

# Requested excerpt length in samples, adjusted by ensure_divisible with
# respect to the hop size so it corresponds to a whole number of frames.
sample_size = 10000
sample_size = ensure_divisible(sample_size, hop_size, True)
max_frames = sample_size // hop_size

if len(data_lc) < max_frames:
    raise ValueError(
        "clip has only %d frames but the excerpt needs %d" % (len(data_lc), max_frames)
    )

# np.random.randint excludes its upper bound, so add 1: otherwise the last
# valid start frame is never picked, and a clip of exactly max_frames frames
# fails with an opaque "low >= high" error.
s = np.random.randint(0, len(data_lc) - max_frames + 1)
ts = s * hop_size  # start position in samples matching start frame `s`
wav = data_raw[ts:ts + hop_size * max_frames]
local_condition = data_lc[s:s + max_frames, :]

In [5]:
# Give both arrays a leading batch axis of size one: the waveform becomes
# (1, samples, 1) and the conditioning features (1, frames, num_mels).
wav = np.reshape(wav, (1, -1, 1))
local_condition = np.reshape(local_condition, (1, -1, hparams.num_mels))

In [6]:
# A GPU device count of zero keeps TensorFlow on the CPU for this session.
sess_config = tf.ConfigProto(device_count={'GPU': 0})

with tf.Session(config=sess_config) as sess:
    # Initialise every variable first, then load the saved checkpoint
    # from the log directory.
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    vocoder.load(sess, "../logs")

    # One pass through the graph with the real waveform as network input;
    # the same run also evaluates the upsampled conditioning features.
    feed = {inputs: wav, l: local_condition}
    result, _lc = sess.run([output, lc], feed_dict=feed)

    # Generate as many samples as the excerpt holds, conditioned on the
    # upsampled features only (the final argument is None).
    n_samples = wav.shape[1]
    samples = generator.generate(sess, n_samples, _lc, None)

Trying to restore saved checkpoints from ../logs ...  Checkpoint found: ../logs/model.ckpt-5000
  Global step was: 5000
  Restoring...INFO:tensorflow:Restoring parameters from ../logs/model.ckpt-5000
 Done.


100%|██████████| 10489/10489 [02:56<00:00, 59.51it/s]


In [7]:
# Listen to the single-pass network output (real waveform fed as input).
# NOTE(review): 22050 is hard-coded; presumably the dataset sample rate -- confirm.
result_flat = result.reshape(-1)
Audio(result_flat, rate=22050)

In [8]:
# Listen to the waveform produced by generator.generate.
# NOTE(review): 22050 is hard-coded; presumably the dataset sample rate -- confirm.
samples_flat = samples.reshape(-1)
Audio(samples_flat, rate=22050)