In [1]:
# Imports and global configuration for the inference demo.
# NOTE: statement order is kept as-is on purpose; imports and the sys.path
# change below have side effects that may depend on it.
from waveglow.denoiser import Denoiser
import sys
import torch
import numpy as np
from scipy.io.wavfile import write

from model import TacotronSTFT
from tacotron2 import Tacotron2
from text import text_to_sequence
from utils import load_wav_to_torch

from hparams import create_hparams
# Hyper-parameters shared by the cells below (model construction, playback rate).
hparams = create_hparams()

# Make modules inside the local "waveglow" directory importable by bare name.
# NOTE(review): presumably needed so that loading the pickled WaveGlow
# checkpoint can resolve its module references -- confirm.
sys.path.append("waveglow")

from g2pM import G2pM
import IPython.display as ipd

In [2]:
# Checkpoint locations.
# NOTE(review): absolute, machine-specific paths -- adjust for your environment.
tacotron2_pth = "D:/backup/etts/pt/full_train/checkpoint_120000.pt"
waveglow_pth = "D:/backup/waveglow/Mandarin/waveglow_280000"

# NOTE: torch.load unpickles arbitrary Python objects; only load checkpoints
# that come from a trusted source.

# Tacotron2 synthesizer: build it on the GPU, restore the weights stored under
# the checkpoint's 'state_dict' key (strict key matching), then switch to
# evaluation mode.
synthesizer = Tacotron2(hparams).cuda()
synthesizer.load_state_dict(
    torch.load(tacotron2_pth)['state_dict'],
    strict=True,
)
synthesizer.eval()

# WaveGlow vocoder: the checkpoint stores the whole model object under 'model'.
# remove_weightnorm is called with the model itself and its return value is
# rebound to `waveglow` before the model is moved to the GPU and put in
# evaluation mode.
waveglow = torch.load(waveglow_pth)['model']
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow.cuda()
waveglow.eval()

# Denoiser built from the loaded WaveGlow model, placed on the GPU.
denoiser = Denoiser(waveglow).cuda()

#### Prepare Inputs

In [3]:
# --- Text input -------------------------------------------------------------
# mandarin: run the sentence through G2pM (tones kept, no per-character split)
# and join the returned tokens with single spaces.
text = "你可以离我远一点吗"
text = " ".join(G2pM()(text, tone=True, char_split=False))
# english
# text = "She had said, so that one could keep up a conversation."
cleaners = "mandarin_cleaners" # english_cleaners
# Speaker index; turned into the `speaker_id` tensor in the synthesis cell.
speaker = 1

# --- Reference audio --------------------------------------------------------
# get reference audio mel spectrogram
# NOTE(review): absolute, machine-specific path -- adjust for your environment.
reference_audio = "D:/Downloads/Emotional Speech Dataset (ESD)/0002/Sad/train/0002_001108.wav"
# NOTE(review): the sampling rate is hard-coded to 16000 here while playback
# uses hparams.sampling_rate -- confirm the two agree.
stft = TacotronSTFT(
    filter_length=1024,
    hop_length=256,
    win_length=1024,
    sampling_rate=16000
)
audio, sampling_rate = load_wav_to_torch(reference_audio)
# Refuse to continue when the file's rate differs from the STFT's rate.
if sampling_rate != stft.sampling_rate:
    raise ValueError("{} SR doesn't match target {} SR".format(sampling_rate, stft.sampling_rate))
# Add a leading batch axis for the STFT call and drop it again afterwards.
# The tensor is passed directly: the torch.autograd.Variable wrapper is
# deprecated and no longer needed.
ref_melspec = stft.mel_spectrogram(audio.unsqueeze(0))
ref_melspec = torch.squeeze(ref_melspec, 0)

#### Generating Emotional Sample

In [4]:
# Build the model inputs on the GPU.
# text_input: symbol ids for `text` with a leading batch axis, as int64.
text_input = torch.from_numpy(
    np.array(text_to_sequence(text, [cleaners]))[None, :]
).long().cuda()
# ref_mel: reference mel spectrogram as float32. `ref_melspec` is already a
# tensor, so the legacy torch.FloatTensor(...) re-wrapping is not needed.
ref_mel = ref_melspec.float().cuda()
# speaker_id: single-element int64 tensor, created with an explicit dtype
# instead of the legacy torch.IntTensor constructor followed by a cast.
speaker_id = torch.tensor([speaker], dtype=torch.long).cuda()

# Inference only: no autograd graph is needed.
with torch.no_grad():
    # The third and fourth return values of `inference` are not used here.
    mel_outputs, mel_outputs_postnet, _, _ = synthesizer.inference(
        text=text_input, mels=ref_mel, speaker_id=speaker_id,
    )
    # Vocode the post-net mel spectrogram, then denoise the waveform.
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.8)
    audio = denoiser(audio, strength=0.9)  # denoise

# Drop singleton axes and move the waveform to host memory for playback.
audio = audio.squeeze()
audio = audio.cpu().numpy()
ipd.Audio(audio, rate=hparams.sampling_rate)

#### Generating emotional samples with manual control

In [None]:
# Manually chosen statistics forwarded to `synthesizer.inference` together
# with cg=True.
# NOTE(review): the exact meaning of `cg`, `mean` and `std` is defined by the
# model's inference method, which is not shown here -- confirm before tuning.
mean, std = -0.05, 0.32

# Reuses `text_input`, `ref_mel` and `speaker_id` from the previous synthesis
# cell; run that cell first.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, _ = synthesizer.inference(
        text=text_input,
        mels=ref_mel,
        speaker_id=speaker_id,
        cg=True,
        mean=mean,
        std=std,
    )
    # Vocode the post-net mel spectrogram and denoise the result in one step.
    audio = denoiser(
        waveglow.infer(mel_outputs_postnet, sigma=0.8),
        strength=0.9,
    )

# Drop singleton axes and convert to a host-side NumPy array for playback.
audio = audio.squeeze().cpu().numpy()
ipd.Audio(audio, rate=hparams.sampling_rate)