In [1]:
import sys
from pathlib import Path

import numpy as np
import torch
import torchaudio
from tqdm.notebook import tqdm

# Project-local modules live in ../tts/, so it must be on sys.path before they are imported.
sys.path.append("../tts/")
from datasets import LJSPEECHList

In [2]:
# Download the dataset if you haven't already (uncomment the line below and run this cell once)
# torchaudio.datasets.LJSPEECH("../tts/", download=True)

<torchaudio.datasets.ljspeech.LJSPEECH at 0x7ff8c06fb430>

# torchaudio implementation

In [31]:
from torchaudio.models import Tacotron2, WaveRNN
sys.path.append("../tts/wavernn/")
from processing import NormalizeDB
from wavernn_inference_wrapper import WaveRNNInferenceWrapper
from text.text_preprocessing import (
    text_to_sequence,
)

class NormalizeDB(torch.nn.Module):
    r"""Normalize the spectrogram with a minimum db value
    """

    def __init__(self, min_level_db, normalization):
        super().__init__()
        self.min_level_db = min_level_db
        self.normalization = normalization

    def forward(self, specgram):
        specgram = torch.log10(torch.clamp(specgram.squeeze(0), min=1e-5))
        if self.normalization:
            return torch.clamp(
                (self.min_level_db - 20 * specgram) / self.min_level_db, min=0, max=1
            )
        return specgram

# inverse of the normalization done when training Tacotron2
# needed for WaveRNN and Griffin-Lim as WaveGlow also does the same
# normalization
class InverseSpectralNormalization(torch.nn.Module):
    def forward(self, input):
        return torch.exp(input)



In [44]:
device = "cuda"

res = torch.load("./models/torchaudio_tacotron2_ckpt.pth")
tacotron2 = Tacotron2(n_symbol=38).eval().to(device)
tacotron2.load_state_dict({k.replace("module.", ""): v for k, v, in res['state_dict'].items()})

res = torch.load("./models/parallel_wavernn_nvidia_ckpt.pt")
wavernn_model = WaveRNN(upsample_scales=[5, 5, 11], n_classes=2**8, hop_length=275, n_freq=80)
wavernn_model.load_state_dict({k.replace("module.", ""): v for k, v, in res['state_dict'].items()})
wavernn_inference_model = WaveRNNInferenceWrapper(wavernn_model).eval().to(device)

transforms = torch.nn.Sequential(
    InverseSpectralNormalization(),
    NormalizeDB(min_level_db=-100, normalization=True),
)

val_dset = LJSPEECHList(root="../tts/", metadata_path="../tts/data/ljs_audio_text_test_filelist.txt")
index = np.random.RandomState(0).choice(np.arange(len(val_dset)), replace=False, size=100)

for sample_no, i in tqdm(enumerate(index), total=len(index)):
    (waveform, sample_rate, text, _) = val_dset[i]
    torchaudio.save(filepath=f"./audio_samples/original/original_{sample_no:04d}.wav", src=waveform, sample_rate=sample_rate)
    sequence = text_to_sequence(text)
    lengths = torch.LongTensor([len(sequence)])
    sequences = torch.LongTensor(sequence[:]).reshape(1, -1)
    with torch.no_grad():
        mel, _, _ = tacotron2.infer(sequences.to(device), lengths.to(device))
        mel = transforms(mel)
        audio = wavernn_inference_model(mel, mulaw=True, batched=False).cpu()
    audio_numpy = audio[0]
    torchaudio.save(filepath=f"./audio_samples/torchaudio/torchaudio_{sample_no:04d}.wav", src=audio, sample_rate=sample_rate)

  0%|          | 0/100 [00:00<?, ?it/s]

NameError: name 'mel_specgram' is not defined

# Nvidia's implementation

In [1]:
tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
tacotron2 = tacotron2.to('cuda')
tacotron2.eval()

waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow.eval()

utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')

Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at  /pytorch/aten/src/ATen/native/BatchLinearAlgebra.cpp:1940.)
  W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [3]:
val_dset = LJSPEECHList(root="../tts/", metadata_path="../tts/data/ljs_audio_text_test_filelist.txt")

In [8]:
index = np.random.RandomState(0).choice(np.arange(len(val_dset)), replace=False, size=100)

for sample_no, i in tqdm(enumerate(index), total=len(index)):
    (waveform, sample_rate, text, _) = val_dset[i]
    sequences, lengths = utils.prepare_input_sequence([text])
    with torch.no_grad():
        mel, _, _ = tacotron2.infer(sequences, lengths)
        audio = waveglow.infer(mel).cpu()
    audio_numpy = audio[0]
    torchaudio.save(filepath=f"./audio_samples/nvidia/nvidia_{sample_no:04d}.wav", src=audio, sample_rate=sample_rate)

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
# Sanity check: `sample_no` is the enumerate counter left over from the synthesis loop.
# With 100 sampled indices it is 99 once the loop has run to completion.
sample_no

99