### Installation

In [16]:
!pip install -q espnet==202308 pypinyin==0.44.0 parallel_wavegan==0.5.4 gdown==4.4.0 espnet_model_zoo

In [17]:
#@title English model { run: "auto" }
lang = 'English'
tag = 'kan-bayashi/ljspeech_vits' #@param ["kan-bayashi/ljspeech_tacotron2", "kan-bayashi/ljspeech_fastspeech", "kan-bayashi/ljspeech_fastspeech2", "kan-bayashi/ljspeech_conformer_fastspeech2", "kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_vits"] {type:"string"}
vocoder_tag = "none" #@param ["none", "parallel_wavegan/ljspeech_parallel_wavegan.v1", "parallel_wavegan/ljspeech_full_band_melgan.v2", "parallel_wavegan/ljspeech_multi_band_melgan.v2", "parallel_wavegan/ljspeech_hifigan.v1", "parallel_wavegan/ljspeech_style_melgan.v1"] {type:"string"}

### Model Setup

In [18]:
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    device="cuda",
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
    noise_scale=0.333,
    noise_scale_dur=0.333,
)

  WeightNorm.apply(module, name, dim)
  model.load_state_dict(torch.load(model_file, map_location=device))
 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm

See also:
 - https://github.com/espnet/espnet/pull/5240
 - https://github.com/espnet/espnet/pull/5249


### Synthesis

In [19]:
import time
import torch

print(f"Input your favorite sentence in {lang}.")
x = input()

with torch.no_grad():
    start = time.time()
    wav = text2speech(x)["wav"]
rtf = (time.time() - start) / (len(wav) / text2speech.fs)
print(f"RTF = {rtf:5f}")

from IPython.display import display, Audio
display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))

### Evaluation

In [41]:
!pip install datasets
!pip install python_speech_features
!pip install pesq

Collecting pesq
  Downloading pesq-0.0.4.tar.gz (38 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pesq
  Building wheel for pesq (setup.py) ... [?25l[?25hdone
  Created wheel for pesq: filename=pesq-0.0.4-cp310-cp310-linux_x86_64.whl size=262876 sha256=b71c0e1b8f0cf0dcbc5e372e045ab0190535da81bd1217fe63d7540d192b3e98
  Stored in directory: /root/.cache/pip/wheels/c5/4e/2c/251524370c0fdd659e99639a0fbd0ca5a782c3aafcd456b28d
Successfully built pesq
Installing collected packages: pesq
Successfully installed pesq-0.0.4


In [72]:
from datasets import load_dataset
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none
from IPython.display import Audio
from python_speech_features import mfcc
from pesq import pesq
import torchaudio
import numpy as np
import torch
import time

In [91]:
def mel_cepstral_distortion(reference_wav, generated_wav, sample_rate):
    ref_mfcc = mfcc(reference_wav, samplerate=sample_rate, numcep=13)
    gen_mfcc = mfcc(generated_wav, samplerate=sample_rate, numcep=13)

    min_len = min(ref_mfcc.shape[0], gen_mfcc.shape[0])
    ref_mfcc = ref_mfcc[:min_len]
    gen_mfcc = gen_mfcc[:min_len]

    mcd = np.mean(np.sqrt(np.sum((ref_mfcc - gen_mfcc) ** 2, axis=1)))
    return mcd


def calculate_pesq(reference_wav, generated_wav, sample_rate):
    if sample_rate not in [8000, 16000]:
        reference_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(reference_wav)
        generated_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(generated_wav)
        sample_rate = 16000

    reference_wav_flat = reference_wav.numpy().flatten()
    generated_wav_flat = generated_wav.numpy().flatten()

    min_length = min(len(reference_wav_flat), len(generated_wav_flat))
    reference_wav_flat = reference_wav_flat[:min_length]
    generated_wav_flat = generated_wav_flat[:min_length]

    pesq_score = pesq(sample_rate, reference_wav_flat, generated_wav_flat, 'wb')

    return pesq_score


def calculate_mos(pesq_score):
    return 1 + 0.035 * pesq_score + 7.10e-6 * (pesq_score ** 2)


def evaluate_tts_metrics(reference_wav, generated_wav, sample_rate):
    if sample_rate not in [8000, 16000]:
        reference_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(reference_wav)
        generated_wav = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(generated_wav)
        sample_rate = 16000

    mcd_score = mel_cepstral_distortion(reference_wav.numpy(), generated_wav.numpy(), sample_rate)

    pesq_score = calculate_pesq(reference_wav, generated_wav, sample_rate)
    mos_score = calculate_mos(pesq_score)

    return {
        "MCD": mcd_score,
        "PESQ": pesq_score,
        "MOS": mos_score
    }


def evaluate_tts(dataset, num_samples=10):
    total_mcd = 0
    total_pesq = 0
    total_mos = 0

    for i in range(num_samples):
        text = dataset["train"][i]["text"]
        reference_file = dataset["train"][i]["file"]

        reference_wav, sample_rate = torchaudio.load(reference_file)

        with torch.no_grad():
            generated_wav = text2speech(text)["wav"].view(-1).cpu()

        generated_wav_resampled = torchaudio.transforms.Resample(
            orig_freq=text2speech.fs, new_freq=sample_rate
        )(generated_wav)

        metrics = evaluate_tts_metrics(reference_wav, generated_wav_resampled, sample_rate)
        print(f"Sample {i + 1}: MCD = {metrics['MCD']:.2f}, PESQ = {metrics['PESQ']:.2f}, MOS = {metrics['MOS']:.2f}")

        total_mcd += metrics['MCD']
        total_pesq += metrics['PESQ']
        total_mos += metrics['MOS']

    avg_mcd = total_mcd / num_samples
    avg_pesq = total_pesq / num_samples
    avg_mos = total_mos / num_samples

    print(f"\nAverage MCD: {avg_mcd:.2f}, Average PESQ: {avg_pesq:.2f}, Average MOS: {avg_mos:.2f}")

In [69]:
dataset = load_dataset("keithito/lj_speech")

In [70]:
tag = "espnet/kan-bayashi_ljspeech_vits"
vocoder_tag = None

text2speech = Text2Speech.from_pretrained(
    model_tag=tag,
    vocoder_tag=None,
    device="cuda" if torch.cuda.is_available() else "cpu",
    threshold=0.5,
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    speed_control_alpha=1.0,
    noise_scale=0.333,
    noise_scale_dur=0.333,
)

Fetching 27 files:   0%|          | 0/27 [00:00<?, ?it/s]

 - discriminator_params.follow_official_norm
 - discriminator_params.scale_discriminator_params.use_weight_norm
 - discriminator_params.scale_discriminator_params.use_spectral_norm

See also:
 - https://github.com/espnet/espnet/pull/5240
 - https://github.com/espnet/espnet/pull/5249


In [94]:
test_values = int(len(dataset["train"]) * 0.2)
evaluate_tts(dataset, num_samples=test_values)

Sample 1: MCD = 65.41, PESQ = 1.26, MOS = 1.04
Sample 2: MCD = 42.95, PESQ = 1.22, MOS = 1.04
Sample 3: MCD = 48.98, PESQ = 1.19, MOS = 1.04
Sample 4: MCD = 61.43, PESQ = 1.15, MOS = 1.04
Sample 5: MCD = 46.82, PESQ = 1.28, MOS = 1.04
Sample 6: MCD = 56.91, PESQ = 1.19, MOS = 1.04
Sample 7: MCD = 53.59, PESQ = 1.14, MOS = 1.04
Sample 8: MCD = 49.17, PESQ = 1.45, MOS = 1.05
Sample 9: MCD = 61.11, PESQ = 1.23, MOS = 1.04
Sample 10: MCD = 64.66, PESQ = 1.24, MOS = 1.04
Sample 11: MCD = 45.94, PESQ = 1.33, MOS = 1.05
Sample 12: MCD = 53.99, PESQ = 1.17, MOS = 1.04
Sample 13: MCD = 65.57, PESQ = 1.15, MOS = 1.04
Sample 14: MCD = 67.40, PESQ = 1.21, MOS = 1.04
Sample 15: MCD = 50.68, PESQ = 1.19, MOS = 1.04
Sample 16: MCD = 50.90, PESQ = 1.21, MOS = 1.04
Sample 17: MCD = 46.87, PESQ = 1.23, MOS = 1.04
Sample 18: MCD = 50.30, PESQ = 1.21, MOS = 1.04
Sample 19: MCD = 72.33, PESQ = 1.16, MOS = 1.04
Sample 20: MCD = 65.10, PESQ = 1.24, MOS = 1.04
Sample 21: MCD = 59.62, PESQ = 1.24, MOS = 1.04
S