In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import librosa
import numpy as np
import os
import glob
import json

import paddle
from text import text_to_sequence, cmudict
from text.symbols import symbols
import commons
import attentions
import modules
import models
import utils

In [2]:
# To run your own trained model instead, derive both values from its directory:
# model_dir = "/home1/zhaoyh/audio/glow-tts/checkpoints/pretrained.pth"
# hps = utils.get_hparams_from_dir(model_dir)
# checkpoint_path = utils.latest_checkpoint_path(model_dir)

# Provided pretrained model: select the GPU first, then read the
# hyper-parameters and point at the converted Paddle weights.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
paddle.set_device("gpu")
config_path = "/home1/zhaoyh/audio/glow-tts/configs/base.json"
hps = utils.get_hparams_from_file(config_path)
checkpoint_path = "/home1/zhaoyh/paddlemodel/glow-tts_paddle/checkpoints/pretrained.pdparams"

# The input vocabulary grows by one id when the config enables "add_blank"
# (a missing attribute counts as False, i.e. +0).
n_vocab = len(symbols) + getattr(hps.data, "add_blank", False)
model = models.FlowGenerator(
    n_vocab,
    out_channels=hps.data.n_mel_channels,
    **hps.model,
)

# Restore the weights, then put the model into inference configuration.
utils.load_checkpoint(checkpoint_path, model)
model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
_ = model.eval()

# Pronunciation dictionary handed to text_to_sequence in the next cell.
cmu_dict = cmudict.CMUDict(hps.data.cmudict_path)

W1224 21:29:11.853161 2066281 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 6.1, Driver API Version: 11.4, Runtime API Version: 11.2
W1224 21:29:11.862488 2066281 gpu_resources.cc:164] device: 0, cuDNN Version: 8.2.


INFO:root:Loaded checkpoint '/home1/zhaoyh/paddlemodel/glow-tts_paddle/checkpoints/pretrained.pdparams' (iteration 0)


In [3]:
# Sentence to synthesize.
tst_stn = "Glow TTS is really awesome !"

# Convert the sentence into a sequence of symbol ids.
if getattr(hps.data, "add_blank", False):
    text_norm = text_to_sequence(tst_stn.strip(), ['english_cleaners'], cmu_dict)
    # The extra id len(symbols) is the blank token (shown as "<BNK>" below);
    # presumably intersperse() places it between the real symbols -- confirm in commons.
    text_norm = commons.intersperse(text_norm, len(symbols))
else:
    # If not using "add_blank" option during training, adding spaces at the
    # beginning and the end of utterance improves quality.
    # The padded string is passed as-is: calling .strip() on it again (as the
    # previous version did) removed the padding that was just added.
    tst_stn = " " + tst_stn.strip() + " "
    text_norm = text_to_sequence(tst_stn, ['english_cleaners'], cmu_dict)

# Add a leading batch axis of size 1.
sequence = np.array(text_norm)[None, :]
# Echo the symbols that will be fed to the model; ids outside `symbols` are blanks.
print("".join([symbols[c] if c < len(symbols) else "<BNK>" for c in sequence[0]]))
x_tst = paddle.to_tensor(sequence).astype('int64')
# Length of the single utterance in the batch (number of ids along axis 1).
x_tst_lengths = paddle.to_tensor([x_tst.shape[1]])

@G@L@OW1 @T@IY1@T@IY1@EH1@S @IH1@Z @R@IH1@L@IY0 @AO1@S@AH0@M !


In [4]:
# Values forwarded to the model call below. Their names suggest a sampling
# temperature and a duration multiplier -- confirm against models.FlowGenerator.
noise_scale = .667
length_scale = 1.0

# Run the generator without tracking gradients. Only the first element of the
# first returned group (the generated mel) and the first element of the last
# returned group (the attention) are kept; everything else is discarded.
with paddle.no_grad():
    (y_gen_tst, *_), *_, (attn_gen, *_) = model(
        x_tst,
        x_tst_lengths,
        gen=True,
        noise_scale=noise_scale,
        length_scale=length_scale,
    )

# save mel-frames
# exist_ok=True replaces the separate exists() check, so re-running the cell
# is safe and there is no gap between the check and the creation.
mel_dir = './hifigan/test_mel_files'
os.makedirs(mel_dir, exist_ok=True)
np.save(os.path.join(mel_dir, "sample.npy"), y_gen_tst.cpu().numpy())

In [5]:
# Use finetuned HiFi-GAN with Tacotron 2, which is provided in the repo of HiFi-GAN.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
!python ./hifigan/inference_e2e.py --checkpoint_file /home1/zhaoyh/paddlemodel/glow-tts_paddle/checkpoints/LJ_V1/generator_v1.pdparams

Initializing Inference Process..
W1224 21:29:36.399806 2067621 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 6.1, Driver API Version: 11.4, Runtime API Version: 11.2
W1224 21:29:36.406845 2067621 gpu_resources.cc:164] device: 0, cuDNN Version: 8.2.
Loading '/home1/zhaoyh/paddlemodel/glow-tts_paddle/checkpoints/LJ_V1/generator_v1.pdparams'
Complete.
Removing weight norm...
generated_files_from_mel/sample_generated_e2e.wav
[0m