# Setup packages

In [None]:
# Install required packages
!pip3 install pycuda
!pip3 install nvidia-dllogger
!pip3 install nvidia-pyindex
!pip3 install nvidia-tensorrt==8.4.3.1
# !pip3 install torch-tensorrt -f https://github.com/NVIDIA/Torch-TensorRT/releases
!pip3 install numpy scipy librosa unidecode inflect librosa colored
!pip3 install onnxruntime onnx_graphsurgeon
!pip3 install parallel_wavegan

!apt-get update
!apt-get install -y libsndfile1

# Download models

### Tacotron2

In [None]:
# Download tacotron2 amp checkpoint from NGC
!mkdir -p checkpoints
# -nc: do not re-download when the zip is already present; -O: fixed local file name.
!wget -nc --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/dle/tacotron2__pyt_ckpt/versions/19.12.0_amp/zip -O tacotron2__pyt_ckpt_19.12.0_amp.zip

# Unzip (overwriting any previous extraction) and move the checkpoint to ./checkpoints.
# The mv below assumes the archive contains nvidia_tacotron2pyt_fp16.pt.
!unzip -o tacotron2__pyt_ckpt_19.12.0_amp.zip
!mv nvidia_tacotron2pyt_fp16.pt ./checkpoints
# Remove the archive once the checkpoint has been moved out of it.
!rm tacotron2__pyt_ckpt_19.12.0_amp.zip

### Waveglow

You might need to set up the **NGC** CLI in advance.

In [None]:
# Download waveglow amp checkpoint
!mkdir -p checkpoints
# NOTE: the download command below is commented out, so the mv/rmdir lines only
# succeed if the waveglow256pyt_fp16_v2/ directory has already been fetched
# (with the NGC CLI command below, or manually from the URL next to it).
# !ngc registry model download-version "nvidia/waveglow256pyt_fp16:2"  # or download the ckpt directly from https://ngc.nvidia.com/models/nvidia:waveglow256pyt_fp16

# mv to ./checkpoints, renaming the file to the name the loading cells expect
!mv waveglow256pyt_fp16_v2/waveglow_1076430_14000_amp checkpoints/nvidia_waveglow256pyt_fp16.pt
# rmdir only removes the download directory if it is empty after the move.
!rmdir waveglow256pyt_fp16_v2

### ParallelWaveGan

See https://github.com/kan-bayashi/ParallelWaveGAN for details.

In [None]:
# Fetch the pretrained ParallelWaveGAN checkpoint
from parallel_wavegan.utils import PRETRAINED_MODEL_LIST, download_pretrained_model

# List the tags of every pretrained model the package knows about.
print(PRETRAINED_MODEL_LIST.keys())

# Second argument is the target directory (the later cells read from
# checkpoints/ljspeech_parallel_wavegan.v1.long/).
download_pretrained_model("ljspeech_parallel_wavegan.v1.long", "checkpoints")

# Simple inference test from the ckpts

### Tacotron2 + WaveGlow

In [None]:
## Simple Inference FP16 test(Tacotron2 + Waveglow)
# Define config parser
class SoftDict:
    """Attribute-style view over a (possibly nested) configuration dict.

    Every key of ``user_dict`` becomes an attribute of the instance. Values
    that are themselves dicts (including dict subclasses such as
    ``OrderedDict``) are wrapped recursively, so nested settings can be read
    as ``cfg.outer.inner``. The original mapping stays available as
    ``_user_dict``.

    Args:
        user_dict: Mapping of option names to values.
    """

    def __init__(self, user_dict):
        self._user_dict = user_dict
        self._parse()

    def _parse(self):
        """Copy each entry of the wrapped dict onto the instance as an attribute."""
        for key, value in self._user_dict.items():
            # isinstance (rather than an exact type comparison) so that dict
            # subclasses are wrapped as well.
            if isinstance(value, dict):
                value = SoftDict(value)
            setattr(self, key, value)

# Get Tacotron2 from the checkpoint
import re

import torch
import models  # project-local module providing get_model_config / get_model

tacotron2_ckpt = torch.load("checkpoints/nvidia_tacotron2pyt_fp16.pt")
config = tacotron2_ckpt['config']
config['fp16_run'] = True
# get_model_config is handed an attribute-style object, so wrap the config dict.
args = SoftDict(config)

# Strip only the leading "module." prefix(es) from the parameter names (such a
# prefix is typically added when a model is saved from a DataParallel-style
# wrapper). Anchoring the pattern at the start leaves any "module." substring
# occurring later in a key untouched, which a plain str.replace would not.
state_dict = {
    re.sub(r"^(?:module\.)+", "", key): value
    for key, value in tacotron2_ckpt['state_dict'].items()
}

model_config = models.get_model_config("Tacotron2", args)
tacotron2 = models.get_model("Tacotron2", model_config, False, forward_is_infer=True)
tacotron2.load_state_dict(state_dict)
tacotron2.eval()

# Get Waveglow from the checkpoint
import re

waveglow_ckpt = torch.load("checkpoints/nvidia_waveglow256pyt_fp16.pt")
config = waveglow_ckpt['config']
# NOTE(review): `args` is not consumed by get_model below (the raw config dict
# is passed instead); it is kept so the `args` global still reflects the most
# recently loaded config, as before.
args = SoftDict(config)

# Strip only the leading "module." prefix(es) from the parameter names; the
# anchored pattern leaves any later "module." substring in a key untouched.
state_dict = {
    re.sub(r"^(?:module\.)+", "", key): value
    for key, value in waveglow_ckpt['state_dict'].items()
}

waveglow = models.get_model("WaveGlow", config, False, forward_is_infer=True)
waveglow.load_state_dict(state_dict)
waveglow.eval()

In [None]:
# Imported explicitly so the cell does not depend on the `display` name that
# IPython injects implicitly.
from IPython.display import Audio, display

# Setup Sample text
text = "Hello my name is Woojin. Nice to meet you."
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
sequences, sequence_lengths = utils.prepare_input_sequence([text])

# Sample rate handed to the audio player below.
rate = 22050

# Infer: text -> mel spectrogram (Tacotron2) -> waveform (WaveGlow)
with torch.no_grad():
    mel, mel_lengths, alignments = tacotron2(sequences, sequence_lengths)
    audio = waveglow.infer(mel)
    # First (and only) item of the batch, copied to host memory as a NumPy array.
    audio_numpy = audio[0].detach().cpu().numpy()

# To keep the result on disk instead:
#     from scipy.io.wavfile import write
#     write("audio.wav", rate, audio_numpy)

display(Audio(audio_numpy, rate=rate))

### Tacotron2 + ParallelWaveGan

The mel basis might differ between the pretrained Tacotron2 (from NGC) and ParallelWaveGAN (from https://github.com/kan-bayashi/ParallelWaveGAN), which could cause a lot of noise in the synthesis result. This section is only meant to check whether the inference flow works.

In [None]:
# Define config parser
class SoftDict:
    """Attribute-style view over a (possibly nested) configuration dict.

    Every key of ``user_dict`` becomes an attribute of the instance. Values
    that are themselves dicts (including dict subclasses such as
    ``OrderedDict``) are wrapped recursively, so nested settings can be read
    as ``cfg.outer.inner``. The original mapping stays available as
    ``_user_dict``.

    Args:
        user_dict: Mapping of option names to values.
    """

    def __init__(self, user_dict):
        self._user_dict = user_dict
        self._parse()

    def _parse(self):
        """Copy each entry of the wrapped dict onto the instance as an attribute."""
        for key, value in self._user_dict.items():
            # isinstance (rather than an exact type comparison) so that dict
            # subclasses are wrapped as well.
            if isinstance(value, dict):
                value = SoftDict(value)
            setattr(self, key, value)

# Get Tacotron2 from the checkpoint
import re

import torch
import models  # project-local module providing get_model_config / get_model

tacotron2_ckpt = torch.load("checkpoints/nvidia_tacotron2pyt_fp16.pt")
config = tacotron2_ckpt['config']
config['fp16_run'] = True
# get_model_config is handed an attribute-style object, so wrap the config dict.
args = SoftDict(config)

# Strip only the leading "module." prefix(es) from the parameter names (such a
# prefix is typically added when a model is saved from a DataParallel-style
# wrapper). Anchoring the pattern at the start leaves any "module." substring
# occurring later in a key untouched, which a plain str.replace would not.
state_dict = {
    re.sub(r"^(?:module\.)+", "", key): value
    for key, value in tacotron2_ckpt['state_dict'].items()
}

model_config = models.get_model_config("Tacotron2", args)
tacotron2 = models.get_model("Tacotron2", model_config, False, forward_is_infer=True)
tacotron2.load_state_dict(state_dict)
tacotron2.eval()

# Build the ParallelWaveGAN generator and load its pretrained weights
import torch
import yaml
from parallel_wavegan.models import ParallelWaveGANGenerator

# The generator's constructor arguments come from the YAML config stored next
# to the checkpoint.
with open("checkpoints/ljspeech_parallel_wavegan.v1.long/config.yml") as config_file:
    cfg = yaml.safe_load(config_file)
generator_cfg = cfg['generator_params']

# The checkpoint nests the generator weights under model -> generator.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
pwg_ckpt = torch.load("checkpoints/ljspeech_parallel_wavegan.v1.long/checkpoint-1000000steps.pkl")
state_dict = pwg_ckpt['model']['generator']

parallelwavegan = ParallelWaveGANGenerator(**generator_cfg)
parallelwavegan.load_state_dict(state_dict)
# Cast to half precision, move to the GPU and switch to eval mode; each call
# returns the module, so the chain leaves the same object as the cell's result.
parallelwavegan.half().cuda().eval()

In [None]:
import numpy as np
from IPython.display import display, Audio
from tacotron2_common.audio_processing import dynamic_range_decompression
from parallel_wavegan.utils import read_hdf5

# Setup Sample text
text = "Hello my name is Woojin. Nice to meet you."
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
sequences, sequence_lengths = utils.prepare_input_sequence([text])

# Sample rate handed to the audio player below.
rate = 22050
# Number of waveform samples generated per conditioning frame.
# NOTE(review): hard-coded; presumably has to agree with the vocoder config — confirm.
upsample_factor = 256

# Infer
with torch.no_grad():
    # Generate Spectrogram
    mel, mel_lengths, alignments = tacotron2(sequences, sequence_lengths)

    # Decompress and log10 the output.
    # torch.log10 operates on the tensor directly, so there is no GPU -> CPU -> GPU
    # round trip and no reliance on NumPy ufuncs accepting/returning torch tensors.
    # The trailing .cuda() is a no-op when the tensor already lives on the GPU.
    decompressed = dynamic_range_decompression(mel)
    decompressed_log10 = torch.log10(decompressed).cuda()

    # Normalise with the mean / scale statistics stored next to the vocoder
    # checkpoint; view(1, -1, 1) lets them broadcast along the middle axis.
    stats_path = "checkpoints/ljspeech_parallel_wavegan.v1.long/stats.h5"
    mu = read_hdf5(stats_path, "mean")
    sigma = read_hdf5(stats_path, "scale")
    mu_t = torch.from_numpy(mu).view(1, -1, 1).cuda()
    sigma_t = torch.from_numpy(sigma).view(1, -1, 1).cuda()
    decompressed_log10_norm = (decompressed_log10 - mu_t) / sigma_t

    # Prepare for inputs
    # The single-utterance batch keeps its 3-D layout; the frame count is the
    # size of the last axis (the same value the previous
    # squeeze/transpose/len sequence produced).
    c = decompressed_log10_norm
    num_frames = c.size(-1)
    # Noise input: one sample per output waveform sample.
    x = torch.randn(1, 1, num_frames * upsample_factor).to("cuda")
    # Pad two frames on each side of the conditioning features.
    # NOTE(review): presumably the generator's aux_context_window — confirm against config.yml.
    c = torch.nn.ReplicationPad1d(2)(c)

    # The generator was cast to half precision, so its inputs must match.
    x = x.half()
    c = c.half()

    # Synthesis
    pred = parallelwavegan(x, c).squeeze(0).transpose(1, 0)

display(Audio(pred.view(-1).cpu().detach().numpy(), rate=rate))