## HiFiGAN Validation

In [1]:
import json
import numpy as np
import torch
import soundfile as sf

import wandb

from pathlib import Path

import pytorch_lightning as pl
from hydra import compose, initialize

from nemo.utils.exp_manager import exp_manager
from nemo.collections.tts.models import FastPitchModel, HifiGanModel

import IPython.display as ipd
import pandas as pd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt

[NeMo W 2022-12-08 14:32:45 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2022-12-08 14:32:45 experimental:27] Module <class 'nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2022-12-08 14:32:45 experimental:27] Module <class 'nemo.collections.tts.models.radtts.RadTTSModel'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [2]:
# Weights & Biases destination used by wandb.init and the artifact names
WANDB_ENTITY = "capecape"  # replace with your wandb username or team
WANDB_PROJECT = "tts-lukas"

# Speaker whose validation manifest is loaded, and the base model identifier
SPEAKER_ID = "lukas"
MODEL_NAME = "tts_en_fastpitch"

In [5]:
# Dataset split to validate against. The artifact name is derived from
# SPEAKER_ID so it stays in sync with the manifest filename, which is also
# built from SPEAKER_ID.
validation_split_artifact = f'{WANDB_ENTITY}/{WANDB_PROJECT}/{SPEAKER_ID}_split:latest'

# Model checkpoints to evaluate, pinned to explicit artifact versions
fastpitch_artifact = f'{WANDB_ENTITY}/{WANDB_PROJECT}/model-2022-12-08_13-54-17:v3'
hifigan_artifact = f'{WANDB_ENTITY}/{WANDB_PROJECT}/model-m9x5wwpw:v3'

In [7]:
# Open the W&B run under which the artifacts are consumed and the results logged
wandb.init(
    project=WANDB_PROJECT,
    entity=WANDB_ENTITY,
    job_type="hifigan_validation",
)

In [9]:
# Declare each artifact as an input of the current run and download it locally.
split_artifact = wandb.use_artifact(validation_split_artifact)
split_artifact_dir = split_artifact.download()

# The Artifact object gets its own name so that `fastpitch_artifact` keeps
# holding the artifact *name* (a string). Rebinding it to the object made this
# cell self-referential and inconsistent with the hifigan names below.
fastpitch_model_artifact = wandb.use_artifact(fastpitch_artifact, type='model')
fastpitch_artifact_dir = fastpitch_model_artifact.download()

hifi_artifact = wandb.use_artifact(hifigan_artifact, type='model')
hifi_artifact_dir = hifi_artifact.download()

[34m[1mwandb[0m:   2 of 2 files downloaded.  
[34m[1mwandb[0m: Downloading large artifact model-2022-12-08_13-54-17:v3, 524.07MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.0
[34m[1mwandb[0m: Downloading large artifact model-m9x5wwpw:v3, 969.73MB. 1 files... 
[34m[1mwandb[0m:   1 of 1 files downloaded.  
Done. 0:0:0.7


In [11]:
def ls(path):
    """Return the entries of directory `path` as a sorted list of `Path` objects.

    `Path.iterdir()` yields entries in arbitrary order, so the result is sorted
    to make positional access such as `ls(path)[0]` deterministic across runs
    and platforms.

    Args:
        path: Directory to list (string or `Path`).

    Returns:
        A list of `Path` objects, one per directory entry, in sorted order.
    """
    return sorted(Path(path).iterdir())

In [12]:
# The checkpoint is taken to be the first entry of the downloaded FastPitch artifact directory
last_ckpt = ls(fastpitch_artifact_dir)[0]
last_ckpt = str(last_ckpt)
print(last_ckpt)

# Restore the spectrogram generator, then put it in eval mode and move it to the GPU
spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
spec_model.eval()
spec_model.cuda();

artifacts/model-2022-12-08_13-54-17:v3/model.ckpt
[NeMo I 2022-12-08 14:35:23 tokenize_and_classify:87] Creating ClassifyFst grammars.


[NeMo W 2022-12-08 14:35:54 experimental:27] Module <class 'nemo_text_processing.g2p.modules.IPAG2P'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2022-12-08 14:35:55 modules:95] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2022-12-08 14:35:55 modelPT:142] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: lukas_manifest_train_local.json
      sample_rate: 22050
      sup_data_path: ./fastpitch_sup_data
      sup_data_types:
      - align_prior_matrix
      - pitch
      

[NeMo I 2022-12-08 14:35:55 features:267] PADDING: 1


In [13]:
# The checkpoint is taken to be the first entry of the downloaded HiFiGAN artifact directory
last_ckpt = ls(hifi_artifact_dir)[0]
last_ckpt = str(last_ckpt)
print(last_ckpt)

# Restore the vocoder, then put it in eval mode and move it to the GPU
vocoder_model = HifiGanModel.load_from_checkpoint(last_ckpt)
vocoder_model.eval()
vocoder_model.cuda();

artifacts/model-m9x5wwpw:v3/model.ckpt


[NeMo W 2022-12-08 14:35:58 modelPT:142] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.VocoderDataset
      manifest_filepath: lukas_hifigan_train_ft.json
      sample_rate: 22050
      n_segments: 8192
      max_duration: null
      min_duration: 0.75
      load_precomputed_mel: true
      hop_length: 256
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 16
      num_workers: 4
      pin_memory: true
    
[NeMo W 2022-12-08 14:35:58 modelPT:149] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.VocoderDataset

[NeMo I 2022-12-08 14:35:58 features:267] PADDING: 0
[NeMo I 2022-12-08 14:35:58 features:275] STFT using exact pad
[NeMo I 2022-12-08 14:35:58 features:267] PADDING: 0
[NeMo I 2022-12-08 14:35:58 features:275] STFT using exact pad


In [14]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """
    Run text -> spectrogram -> waveform synthesis with the two given models.

    Args:
        spec_gen_model: Spectrogram generator (FastPitch in our case); must provide
            `parse`, `generate_spectrogram` and a `device` attribute.
        vocoder_model: Vocoder (HiFiGAN in our case); must provide
            `convert_spectrogram_to_audio`.
        str_input: Text to synthesize.
        speaker: Optional speaker ID; when given it is wrapped in a one-element
            long tensor placed on the spectrogram model's device.

    Returns:
        A `(spectrogram, audio)` tuple. Tensor results are converted to NumPy
        arrays on the CPU, and a 3-dimensional spectrogram is reduced to its
        first element along the leading axis.
    """
    # Inference only: no gradients are needed for either model.
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        speaker_tensor = (
            None
            if speaker is None
            else torch.tensor([speaker]).long().to(device=spec_gen_model.device)
        )
        spectrogram = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker_tensor)
        audio = vocoder_model.convert_spectrogram_to_audio(spec=spectrogram)

    # Bring the outputs back to the host as NumPy arrays.
    if isinstance(spectrogram, torch.Tensor):
        spectrogram = spectrogram.to('cpu').numpy()
    if spectrogram is not None and len(spectrogram.shape) == 3:
        # Drop the leading axis by keeping only its first element.
        spectrogram = spectrogram[0]
    if isinstance(audio, torch.Tensor):
        audio = audio.to('cpu').numpy()
    return spectrogram, audio

### View results in the notebook

In [15]:
# Read the speaker's validation manifest (one JSON record per line) from the split artifact
valid_df = pd.read_json(
    Path(split_artifact_dir).joinpath(f"{SPEAKER_ID}_manifest_valid_local.json"),
    lines=True,
)
valid_df

Unnamed: 0,audio_filepath,text,duration,text_no_preprocessing,text_normalized
0,lukas/seg238.wav,"is yes, then you really do have a machine lea...",5,"is yes, then you really do have a machine lea...","is yes, then you really do have a machine lear..."
1,lukas/seg239.wav,excited enough about all the applications of ...,4,excited enough about all the applications of ...,excited enough about all the applications of m...
2,lukas/seg240.wav,videos that explain actually how to build the...,4,videos that explain actually how to build the...,videos that explain actually how to build thes...
3,lukas/seg241.wav,we're going to keep creating these videos so ...,4,we're going to keep creating these videos so ...,we're going to keep creating these videos so y...
4,lukas/seg242.wav,first to know when a new video comes out.,21,first to know when a new video comes out.,first to know when a new video comes out.


In [16]:
def generate_audio(text, speaker_id):
    """Synthesize `text` for `speaker_id` with the loaded FastPitch and HiFiGAN models.

    Returns:
        A `(spectrogram, audio)` tuple as produced by `infer`, with the audio
        flattened to one dimension.
    """
    mel, waveform = infer(spec_model, vocoder_model, text, speaker=speaker_id)
    flat_waveform = waveform.flatten()
    return mel, flat_waveform

In [17]:
# Speaker id handed to the spectrogram model when synthesizing
new_speaker_id = 42

# Further settings; not referenced by the cells visible in this notebook
original_speaker_id = "ljspeech"
duration_mins = 5
mixing = False

In [18]:
sample_rate = 22050

table = wandb.Table(
    columns=['Text', 'Real validation audio', f'Audio Speaker {new_speaker_id}', 'Spec'],
)

# One row per validation utterance: the reference text, the recorded audio,
# the synthesized audio and the synthesized spectrogram.
for _, val_record in valid_df.iterrows():
    speaker_spec, speaker_audio = generate_audio(val_record['text'], speaker_id=new_speaker_id)
    table.add_data(
        val_record["text_no_preprocessing"],
        wandb.Audio(val_record['audio_filepath'], sample_rate=sample_rate),
        wandb.Audio(speaker_audio.flatten(), sample_rate=sample_rate),
        wandb.Image(speaker_spec),
    )

wandb.log({"hifigan_predictions": table})

In [19]:
# Close the W&B run that was opened with wandb.init
wandb.finish()

VBox(children=(Label(value='2.495 MB of 2.496 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.999749…

## A random phrase!

In [67]:
# Free-form sentence to synthesize with the fine-tuned voice
t = "Today, in our workshop with NVIDIA. You will learn how to create a text to speech model."

In [68]:
# Synthesize the phrase with the same speaker id that was used for the validation table
speaker_spec, speaker_audio = generate_audio(t, speaker_id=new_speaker_id)

In [69]:
# Play the synthesized waveform inline. `sample_rate` is the value defined
# alongside the W&B table, reused here instead of repeating the literal 22050.
ipd.display(ipd.Audio(speaker_audio, rate=sample_rate))