In [1]:
import sys
sys.path.append("../tts/")
import torch
import torchaudio
from tqdm.notebook import tqdm
import numpy as np

from datasets import LJSPEECHList

In [2]:
# Download the dataset if you haven't
# torchaudio.datasets.LJSPEECH("../tts/", download=True)

# torchaudio implementation

In [3]:
from torchaudio.models import Tacotron2, WaveRNN, wavernn
sys.path.append("../tts/wavernn/")
from processing import NormalizeDB
#from wavernn_inference_wrapper import WaveRNNInferenceWrapper
from wavernn_inference_wrapper_3 import WaveRNNInferenceWrapper
from text.text_preprocessing import (
    text_to_sequence,
)

class NormalizeDB(torch.nn.Module):
    r"""Normalize the spectrogram with a minimum db value
    """

    def __init__(self, min_level_db, normalization):
        super().__init__()
        self.min_level_db = min_level_db
        self.normalization = normalization

    def forward(self, specgram):
        specgram = torch.log10(torch.clamp(specgram.squeeze(0), min=1e-5))
        if self.normalization:
            return torch.clamp(
                (self.min_level_db - 20 * specgram) / self.min_level_db, min=0, max=1
            )
        return specgram

# inverse of the normalization done when training Tacotron2
# needed for WaveRNN and Griffin-Lim as WaveGlow also does the same
# normalization
class InverseSpectralNormalization(torch.nn.Module):
    def forward(self, input):
        return torch.exp(input)

In [4]:
device = "cuda"

res = torch.load("./models/torchaudio_tacotron2_wavernn_ckpt.pth")
tacotron2 = Tacotron2(n_symbol=38).eval().to(device)
tacotron2.load_state_dict({k.replace("module.", ""): v for k, v, in res['state_dict'].items()})

res = torch.load("./models/wave_step550K_weights.pth")
del res['module.step']
state_dict = {k.replace("module.", ""): v for k, v, in res.items()}
state_dict = {k.replace("upsample.resnet.melresnet_model.0.batch_norm1.running_mean", "upsample.resnet.melresnet_model.1"): v for k, v, in state_dict.items()}
state_dict = {k.replace("I.", "fc."): v for k, v, in state_dict.items()}
state_dict = {k.replace("upsample.resnet.conv_in.", "upsample.resnet.melresnet_model.0."): v for k, v, in state_dict.items()}
state_dict = {k.replace("upsample.resnet.batch_norm.", "upsample.resnet.melresnet_model.1."): v for k, v, in state_dict.items()}
for i in range(10):
    state_dict = {k.replace(f"upsample.resnet.layers.{i}.conv1", f"upsample.resnet.melresnet_model.{i+3}.resblock_model.0"): v for k, v, in state_dict.items()}
    state_dict = {k.replace(f"upsample.resnet.layers.{i}.conv2", f"upsample.resnet.melresnet_model.{i+3}.resblock_model.3"): v for k, v, in state_dict.items()}
    state_dict = {k.replace(f"upsample.resnet.layers.{i}.batch_norm1", f"upsample.resnet.melresnet_model.{i+3}.resblock_model.1"): v for k, v, in state_dict.items()}
    state_dict = {k.replace(f"upsample.resnet.layers.{i}.batch_norm2", f"upsample.resnet.melresnet_model.{i+3}.resblock_model.4"): v for k, v, in state_dict.items()}
state_dict = {k.replace(f"upsample.resnet.conv_out", f"upsample.resnet.melresnet_model.13"): v for k, v, in state_dict.items()}
for i in [1, 3, 5]:
    state_dict = {k.replace(f"upsample.up_layers.{i}", f"upsample.upsample_layers.{i}"): v for k, v, in state_dict.items()}


#res = torch.load("./models/parallel_wavernn_nvidia_ckpt.pt")
#state_dict = {k.replace("module.", ""): v for k, v, in res['state_dict'].items()}
wavernn_model = WaveRNN(upsample_scales=[5, 5, 11], n_classes=2**8, hop_length=275, n_freq=80).eval()
wavernn_model.load_state_dict(state_dict)
wavernn_inference_model = WaveRNNInferenceWrapper(wavernn_model).eval().to(device)

transforms = torch.nn.Sequential(
    InverseSpectralNormalization(),
    NormalizeDB(min_level_db=-100, normalization=True),
)

val_dset = LJSPEECHList(root="../tts/", metadata_path="../tts/data/ljs_audio_text_test_filelist.txt")
index = np.random.RandomState(0).choice(np.arange(len(val_dset)), replace=False, size=100)

for sample_no, i in tqdm(enumerate(index[27:]), total=len(index)):
    (waveform, sample_rate, text, _) = val_dset[i]
    #torchaudio.save(filepath=f"./audio_samples/original/original_{sample_no:04d}.wav", src=waveform, sample_rate=sample_rate)
    sequence = text_to_sequence(text)
    lengths = torch.LongTensor([len(sequence)])
    sequences = torch.LongTensor(sequence[:]).reshape(1, -1)
    with torch.no_grad():
        mel, _, _ = tacotron2.infer(sequences.to(device), lengths.to(device))
        mel = transforms(mel)
        #audio = wavernn_inference_model(mel, mulaw=True, batched=False).cpu()
        audio = torch.from_numpy(wavernn_inference_model.generate(mel.unsqueeze(0), mu_law=True, batched=False)).float()
        import ipdb; ipdb.set_trace()
    audio = audio.reshape(1, -1)
    torchaudio.save(filepath=f"./audio_samples/temp_{sample_no:04d}.wav", src=audio, sample_rate=sample_rate)
    #torchaudio.save(filepath=f"./audio_samples/torchaudio2/torchaudio2_{sample_no:04d}.wav", src=audio, sample_rate=sample_rate)

  0%|          | 0/100 [00:00<?, ?it/s]

> [0;32m/home/arbiter/projects/torchaudio-benchmark/tts/wavernn/wavernn_inference_wrapper_3.py[0m(412)[0;36mgenerate[0;34m()[0m
[0;32m    411 [0;31m[0;34m[0m[0m
[0m[0;32m--> 412 [0;31m            [0mb_size[0m[0;34m,[0m [0mseq_len[0m[0;34m,[0m [0m_[0m [0;34m=[0m [0mmels[0m[0;34m.[0m[0msize[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    413 [0;31m[0;34m[0m[0m
[0m


ipdb>  mels.shape


torch.Size([1, 80, 189750])


ipdb>  aux.shape


torch.Size([1, 128, 189750])


ipdb>  exit


BdbQuit: 

In [6]:
audio.max()

tensor(0.6748)

In [5]:
import IPython
IPython.display.Audio("./audio_samples/temp_0000.wav")

# Tacotron2 + nvidia's WaveGlow

In [72]:
device = "cuda"

res = torch.load("./models/torchaudio_tacotron2_ckpt.pth")
tacotron2 = Tacotron2(n_symbol=38).eval().to(device)
tacotron2.load_state_dict({k.replace("module.", ""): v for k, v, in res['state_dict'].items()})

waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow.eval()

val_dset = LJSPEECHList(root="../tts/", metadata_path="../tts/data/ljs_audio_text_test_filelist.txt")
index = np.random.RandomState(0).choice(np.arange(len(val_dset)), replace=False, size=100)

for sample_no, i in tqdm(enumerate(index), total=len(index)):
    (_, sample_rate, text, _) = val_dset[i]
    sequence = text_to_sequence(text)
    lengths = torch.LongTensor([len(sequence)])
    sequences = torch.LongTensor(sequence[:]).reshape(1, -1)
    with torch.no_grad():
        mel, _, _ = tacotron2.infer(sequences.to(device), lengths.to(device))
        audio = waveglow.infer(mel).cpu()
    audio = audio.reshape(1, -1)
    torchaudio.save(filepath=f"./audio_samples/tacotron2waveglow/tacotron2waveglow_{sample_no:04d}.wav", src=audio, sample_rate=sample_rate)

Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at  ../aten/src/ATen/native/BatchLinearAlgebra.cpp:1937.)
  W = torch.qr(torch.FloatTensor(c, c).normal_())[0]


  0%|          | 0/100 [00:00<?, ?it/s]

# Nvidia's implementation

In [1]:
tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
tacotron2 = tacotron2.to('cuda')
tacotron2.eval()

waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
waveglow = waveglow.remove_weightnorm(waveglow)
waveglow = waveglow.to('cuda')
waveglow.eval()

utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')

Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
The boolean parameter 'some' has been replaced with a string parameter 'mode'.
Q, R = torch.qr(A, some)
should be replaced with
Q, R = torch.linalg.qr(A, 'reduced' if some else 'complete') (Triggered internally at  /pytorch/aten/src/ATen/native/BatchLinearAlgebra.cpp:1940.)
  W = torch.qr(torch.FloatTensor(c, c).normal_())[0]
Using cache found in /home/arbiter/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [3]:
val_dset = LJSPEECHList(root="../tts/", metadata_path="../tts/data/ljs_audio_text_test_filelist.txt")

In [8]:
index = np.random.RandomState(0).choice(np.arange(len(val_dset)), replace=False, size=100)

for sample_no, i in tqdm(enumerate(index), total=len(index)):
    (waveform, sample_rate, text, _) = val_dset[i]
    sequences, lengths = utils.prepare_input_sequence([text])
    with torch.no_grad():
        mel, _, _ = tacotron2.infer(sequences, lengths)
        audio = waveglow.infer(mel).cpu()
    torchaudio.save(filepath=f"./audio_samples/nvidia/nvidia_{sample_no:04d}.wav", src=audio, sample_rate=sample_rate)

  0%|          | 0/100 [00:00<?, ?it/s]

# Evaluation

In [67]:
import random

import joblib
from torchaudio.transforms import Resample
from tqdm.notebook import tqdm
import numpy as np

from pesq import pesq
from pystoi import stoi

In [69]:
all_stois, pesqs_wb, pesqs_nb = [], [], []
for i in tqdm(range(100)):
    pred, sample_rate = torchaudio.load(f"./audio_samples/torchaudio2/torchaudio2_{i:04d}.wav")
    ref, sample_rate = torchaudio.load(f"./audio_samples/original/original_{i:04d}.wav")

    resampler = Resample(sample_rate, 16000, dtype=ref.dtype)
    re_pred = resampler(pred).numpy()
    re_ref = resampler(ref).numpy()

    pesqs_nb.append(pesq(16000, re_ref[0], re_pred[0], 'nb'))
    pesqs_wb.append(pesq(16000, re_ref[0], re_pred[0], 'wb'))
    continue

    pred, _ = torchaudio.load(f"./audio_samples/torchaudio2/torchaudio2_{i:04d}.wav")
    ref, _ = torchaudio.load(f"./audio_samples/original/original_{i:04d}.wav")
    pred, ref = pred.numpy(), ref.numpy()
    len_diff = pred.shape[1] - ref.shape[1]
    stois = []
    if len_diff == 0:
        all_stois.append(stoi(ref[0], pred[0], sample_rate, extended=False))
    else:
        for j in range(abs(len_diff)):
            if len_diff > 0:
                stois.append(stoi(ref[0], pred[0, j: j + ref.shape[1]], sample_rate, extended=False))
            else:
                stois.append(stoi(ref[0, j: j + pred.shape[1]], pred[0], sample_rate, extended=False))
        all_stois.append(np.max(stois))

print(np.mean(pesqs_nb))
print(np.mean(pesqs_wb))
print(np.mean(all_stois))

  0%|          | 0/100 [00:00<?, ?it/s]

1.291838254928589
1.166411771774292
nan


In [64]:
len_diff

-359