## Improving Speech Quality

Fine-tuning FastPitch lets us generate audio in the new (male) speaker's voice, but the audio quality is not yet as good as we would like. We recommend two steps to improve it:

* Finetuning HiFi-GAN
* Adding more data

We'll focus on the former for this workshop.

### Finetuning HiFi-GAN
The synthesized samples may contain audible crackling. To fix this, we need to fine-tune HiFi-GAN on the new speaker's data. HiFi-GAN improves most when it is fine-tuned on **synthesized mel spectrograms** (the kind of input it receives at inference time) rather than on ground-truth ones, so the first step is to generate mel spectrograms with our fine-tuned FastPitch model to use as input.

The code below uses our finetuned model to generate synthesized mels for the training and validation sets we have been using.

Import the necessary modules

In [None]:
import json
import numpy as np
import torch
import soundfile as sf

import IPython.display as ipd
from matplotlib.pyplot import imshow
from matplotlib import pyplot as plt

from pathlib import Path

from nemo.collections.tts.torch.helpers import BetaBinomialInterpolator

from nemo.collections.tts.models import FastPitchModel

from nemo.collections.tts.models import HifiGanModel

Reload our fine-tuned FastPitch spectrogram generation model

In [None]:
def get_best_ckpt_from_last_run(
        base_dir,
        new_speaker_id,
        duration_mins,
        mixing_enabled,
        original_speaker_id,
        model_name="FastPitch"
    ):
    """Return the path of the ``*-last.ckpt`` file from the most recent run.

    Despite the function name, this returns the *last* checkpoint of the run
    (the file matching ``*-last.ckpt``), not the one with the best metric.

    Runs are looked up under
    ``<base_dir>/<original>_to_<new>_<mixing|no_mixing>_<duration>_mins/<model_name>/``
    and the "most recent" run is the sub-directory that sorts last by name.

    Args:
        base_dir: Directory that contains the experiment folders.
        new_speaker_id: ID of the speaker the model was fine-tuned on.
        duration_mins: Amount of fine-tuning data, in minutes, as used in the
            experiment folder name.
        mixing_enabled: Whether the run mixed in original-speaker data; selects
            the ``mixing`` / ``no_mixing`` part of the folder name.
        original_speaker_id: ID of the speaker of the pretrained model.
        model_name: Name of the model sub-directory inside the experiment folder.

    Returns:
        The checkpoint path as a string.

    Raises:
        ValueError: If there is no run directory, or the latest run has no
            ``*-last.ckpt`` file.
    """
    mixing = "mixing" if mixing_enabled else "no_mixing"
    run_name = f"{original_speaker_id}_to_{new_speaker_id}_{mixing}_{duration_mins}_mins"
    model_dir = Path(base_dir) / run_name / model_name

    exp_dirs = sorted(p for p in model_dir.iterdir() if p.is_dir())
    if not exp_dirs:
        # Without this guard, indexing the empty list raises an opaque IndexError.
        raise ValueError(f"There are no experiment runs in {model_dir}.")

    last_checkpoint_dir = exp_dirs[-1] / "checkpoints"
    last_ckpts = list(last_checkpoint_dir.glob('*-last.ckpt'))
    if not last_ckpts:
        raise ValueError(f"There is no last checkpoint in {last_checkpoint_dir}.")

    # glob() order is arbitrary; if several files match, pick the newest one
    # so the result is deterministic.
    return str(max(last_ckpts, key=lambda p: p.stat().st_mtime))

In [None]:
# Parameters identifying the FastPitch fine-tuning run we want to reload.
original_speaker_id = "ljspeech"
new_speaker_id = 9017
duration_mins = 5
mixing = False

last_ckpt = get_best_ckpt_from_last_run(
    base_dir="./",
    new_speaker_id=new_speaker_id,
    duration_mins=duration_mins,
    mixing_enabled=mixing,
    original_speaker_id=original_speaker_id,
)
print(last_ckpt)

# Restore the spectrogram generator, switch it to eval mode and move it to the GPU.
spec_model = FastPitchModel.load_from_checkpoint(last_ckpt)
spec_model.eval().cuda()

Helper function for loading (and optionally resampling) `.wav` files

In [None]:
def load_wav(audio_file, target_sr=None):
    """Load an audio file as float32 samples, optionally resampling it.

    Args:
        audio_file: Audio file to open with ``soundfile``.
        target_sr: Desired sample rate. If ``None`` or equal to the file's own
            sample rate, the samples are returned without resampling.

    Returns:
        The float32 sample array read by ``soundfile``, transposed (a no-op for
        1-D mono audio) and resampled to ``target_sr`` when requested.
    """
    with sf.SoundFile(audio_file, 'r') as f:
        samples = f.read(dtype='float32')
        sample_rate = f.samplerate

    # Transpose BEFORE resampling: soundfile returns multi-channel audio as
    # (frames, channels), while librosa resamples along the last axis. Doing
    # it the other way round would resample across channels instead of time.
    samples = samples.transpose()

    if target_sr is not None and target_sr != sample_rate:
        # Imported here because the notebook's import cell does not import librosa.
        import librosa

        samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return samples

This helper function takes a FastPitch manifest file (located at `manifest_path`), uses the spectrogram model (`spec_model`) to generate spectrograms and store them in `mel_dir`, then generates an associated HiFi-GAN manifest file at `hifigan_manifest_path`

In [None]:
def generate_spectrograms(manifest_path, mel_dir, hifigan_manifest_path, spec_model=spec_model):
    """Synthesize mel spectrograms for a manifest and write a HiFi-GAN manifest.

    For every record in the JSON-lines manifest at ``manifest_path`` the audio
    is loaded, run through ``spec_model`` using the real audio's spectrogram
    for alignment, and the resulting mel is saved as ``mel_<index>.npy`` in
    ``mel_dir``. Each record gets a ``"mel_filepath"`` entry pointing to its
    saved mel, and all records are written to ``hifigan_manifest_path``.

    Args:
        manifest_path: Input manifest, one JSON object per line. Each record
            must have ``"audio_filepath"`` and either ``"normalized_text"`` or
            ``"text"``; ``"speaker"`` is optional.
        mel_dir: Directory for the ``.npy`` mel files (created if missing).
        hifigan_manifest_path: Output path of the HiFi-GAN manifest.
        spec_model: Spectrogram model used for synthesis. Defaults to the
            ``spec_model`` that exists when this function is defined.
    """
    # Get records from the training manifest (at manifest_path)
    with open(manifest_path, "r") as f:
        records = [json.loads(line) for line in f]

    beta_binomial_interpolator = BetaBinomialInterpolator()
    spec_model.eval()

    device = spec_model.device

    save_dir = Path(mel_dir)
    save_dir.mkdir(exist_ok=True, parents=True)

    # Generate spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
    for i, r in enumerate(records):
        audio = load_wav(r["audio_filepath"])
        audio = torch.from_numpy(audio).unsqueeze(0).to(device)
        audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)

        # Again, our finetuned FastPitch model doesn't use multiple speakers,
        # but we keep the code to support it here for reference
        if spec_model.fastpitch.speaker_emb is not None and "speaker" in r:
            speaker = torch.tensor([r['speaker']]).to(device)
        else:
            speaker = None

        with torch.no_grad():
            if "normalized_text" in r:
                text = spec_model.parse(r["normalized_text"], normalize=False)
            else:
                text = spec_model.parse(r['text'])

            text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)

            # Spectrogram of the real audio, passed to the model as `spec` below.
            spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

            # Generate attention prior and spectrogram inputs for HiFi-GAN
            attn_prior = torch.from_numpy(
                beta_binomial_interpolator(spect_len.item(), text_len.item())
            ).unsqueeze(0).to(text.device)

            # The first element of the forward() output is the spectrogram.
            spectrogram = spec_model.forward(
                text=text,
                input_lens=text_len,
                spec=spect,
                mel_lens=spect_len,
                attn_prior=attn_prior,
                speaker=speaker,
            )[0]

            save_path = save_dir / f"mel_{i}.npy"
            np.save(save_path, spectrogram[0].to('cpu').numpy())
            r["mel_filepath"] = str(save_path)

    # Write to the path the caller asked for. The previous code overwrote this
    # argument with "hifigan_train_ft.json", so the validation call clobbered
    # the training manifest and never produced its own file.
    with open(hifigan_manifest_path, "w") as f:
        for r in records:
            f.write(json.dumps(r) + '\n')

Generate the mel-spectrograms and HiFi-GAN manifest files for the training and validation datasets

In [None]:
# Training split: synthesize mels and write the matching HiFi-GAN manifest.
dataset = "train"
hifigan_manifest_path = f"hifigan_{dataset}_ft.json"
manifest_path = f"{new_speaker_id}_manifest_{dataset}_dur_{duration_mins}_mins_local.json"
mel_dir = f"{new_speaker_id}_manifest_{dataset}_dur_{duration_mins}_mins_local_mels"
generate_spectrograms(
    manifest_path=manifest_path,
    mel_dir=mel_dir,
    hifigan_manifest_path=hifigan_manifest_path,
)

In [None]:
# Validation split: synthesize mels and write the matching HiFi-GAN manifest.
dataset = "val"
hifigan_manifest_path = f"hifigan_{dataset}_ft.json"
manifest_path = f"{new_speaker_id}_manifest_dev_ns_all_local.json"
mel_dir = f"{new_speaker_id}_manifest_{dataset}_ns_all_local_mels"
generate_spectrograms(
    manifest_path=manifest_path,
    mel_dir=mel_dir,
    hifigan_manifest_path=hifigan_manifest_path,
)

Fine-tune HiFi-GAN

In [None]:
# Fine-tune HiFi-GAN for 1000 steps, starting from the pretrained `tts_hifigan` model and
# using the HiFi-GAN train/val manifests. Run outputs go under ./hifigan_finetune and the
# run is also logged to Weights & Biases (project `tts-workshop`).
# NOTE(review): `~model.optim.sched` presumably removes the scheduler section from the
# config (Hydra `~` override) - confirm against hifigan.yaml.
! python hifigan_finetune.py \
    --config-name=hifigan.yaml \
    model.max_steps=1000 \
    ~model.optim.sched \
    train_dataset=./hifigan_train_ft.json \
    validation_datasets=./hifigan_val_ft.json \
    exp_manager.exp_dir=./hifigan_finetune \
    +exp_manager.create_wandb_logger=True \
    +exp_manager.wandb_logger_kwargs='{project:tts-workshop, job_type:training, log_model:True}' \
    +init_from_pretrained_model=tts_hifigan 

Load the last checkpoint from the latest HiFi-GAN fine-tuning run and use it to generate a synthesized voice

In [None]:
def get_best_ckpt_from_last_hifigan_run(base_dir, exp_dir="hifigan_finetune"):
    """Return the path of the ``*-last.ckpt`` file from the most recent HiFi-GAN run.

    Despite the function name, this returns the *last* checkpoint of the run
    (the file matching ``*-last.ckpt``), not the one with the best metric.

    Runs are looked up under ``<base_dir>/<exp_dir>/HifiGan/`` and the "most
    recent" run is the sub-directory that sorts last by name.

    Args:
        base_dir: Directory that contains the experiment directory.
        exp_dir: Experiment directory name. The default matches the
            ``exp_manager.exp_dir=./hifigan_finetune`` setting used by the
            fine-tuning command in this notebook; the previously hard-coded
            ``hifigan_ft`` did not.

    Returns:
        The checkpoint path as a string.

    Raises:
        ValueError: If there is no run directory, or the latest run has no
            ``*-last.ckpt`` file.
    """
    model_dir = Path(base_dir) / exp_dir / "HifiGan"

    exp_dirs = sorted(p for p in model_dir.iterdir() if p.is_dir())
    if not exp_dirs:
        # Without this guard, indexing the empty list raises an opaque IndexError.
        raise ValueError(f"There are no experiment runs in {model_dir}.")

    last_checkpoint_dir = exp_dirs[-1] / "checkpoints"
    last_ckpts = list(last_checkpoint_dir.glob('*-last.ckpt'))
    if not last_ckpts:
        raise ValueError(f"There is no last checkpoint in {last_checkpoint_dir}.")

    # glob() order is arbitrary; if several files match, pick the newest one
    # so the result is deterministic.
    return str(max(last_ckpts, key=lambda p: p.stat().st_mtime))

In [None]:
# Locate the last checkpoint of the latest HiFi-GAN fine-tuning run.
last_hifigan_ckpt = get_best_ckpt_from_last_hifigan_run(base_dir="./")

# Restore the vocoder, switch it to eval mode and move it to the GPU.
vocoder_model = HifiGanModel.load_from_checkpoint(last_hifigan_ckpt)
vocoder_model.eval().cuda()

Inference helper function

In [None]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """
    Turn a text string into a spectrogram and a waveform.

    The text is parsed and converted to a spectrogram by ``spec_gen_model``;
    ``vocoder_model`` then converts that spectrogram to audio. No gradients
    are tracked.

    Args:
        spec_gen_model: Spectrogram generator model (FastPitch in our case)
        vocoder_model: Vocoder model (HiFiGAN in our case)
        str_input: Text to synthesize
        speaker: Optional speaker ID; when given it is wrapped in a long
            tensor on the spectrogram model's device

    Returns:
        Tuple ``(spectrogram, audio)``. Tensors are converted to NumPy arrays
        on the CPU, and a 3-D spectrogram is reduced to its first item.
    """
    speaker_arg = speaker
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        if speaker is not None:
            speaker_arg = torch.tensor([speaker]).long().to(device=spec_gen_model.device)
        mel = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker_arg)
        waveform = vocoder_model.convert_spectrogram_to_audio(spec=mel)

    # Move results off the device for plotting and playback.
    if isinstance(mel, torch.Tensor):
        mel = mel.to('cpu').numpy()
    if mel is not None and len(mel.shape) == 3:
        # Drop the leading dimension of a 3-D spectrogram.
        mel = mel[0]

    if isinstance(waveform, torch.Tensor):
        waveform = waveform.to('cpu').numpy()

    return mel, waveform

Run inference

In [None]:
num_val = 2  # Number of validation samples
val_records = []
with open(f"hifigan_val_ft.json", "r") as f:
    for i, line in enumerate(f):
        val_records.append(json.loads(line))
        if len(val_records) >= num_val:
            break
            
for val_record in val_records:
    print("Real validation audio")
    ipd.display(ipd.Audio(val_record['audio_filepath'], rate=22050))
    print(f"SYNTHESIZED FOR -- Speaker: {new_speaker_id} | Dataset size: {duration_mins} mins | Mixing:{mixing} | Text: {val_record['text']}")
    spec, audio = infer(spec_model, vocoder_model, val_record['text'], speaker=speaker_id)
    ipd.display(ipd.Audio(audio, rate=22050))
    %matplotlib inline
    imshow(spec, origin="lower", aspect="auto")
    plt.show()