<a href="https://colab.research.google.com/github/y-kamiya/machine-learning-samples/blob/master/scripts/tts/Glow_TTS_MultiBandMelGAN_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🐸 [Coqui TTS](https://github.com/coqui-ai/TTS) on CPU Real-Time Speech Synthesis 

original notebook: https://colab.research.google.com/drive/1NC4eQJFvVEqD8L4Rd8CVK25_Z-ypaBHD?usp=sharing

## Glow-TTS
Paper: https://arxiv.org/abs/2005.11129

This model differs from Tacotron by using a **greedy search algorithm** instead of an attention mechanism. In our experiments, it produces less **natural speech** but is **easier to train**, especially with lower-quality datasets. It is also
**faster than Tacotron** models since it does not rely on auto-regression and **computes output with a single pass**. You can also **control speech pace and variation** with certain model parameters as shown below.

## MultiBand-MelGAN
Paper: https://arxiv.org/abs/2005.05106 


### Download Models

In [None]:
from google.colab import drive
drive.mount('/gdrive')
!ls /gdrive

!ln -s "/gdrive/My Drive" /mydrive
DATAROOT='/mydrive/machine-learning/tts/data/jsut_ver1.1_ljspeech_structure/model_zoo'

### Setup Libraries

In [None]:
#%%script false --no-raise-error
# (Uncomment the first line to skip this cell.)
# Build espeak-ng from a patched fork branch and install it system-wide.
!sudo apt-get install automake libtool
!git clone https://github.com/tset-tset-tset/espeak-ng -b tset-tset-tset-patch-1
%cd espeak-ng/
!./autogen.sh && ./configure --libdir=/usr/lib/x86_64-linux-gnu
!make
!sudo make install

## Success if the text below is converted without garbage in the output.
!echo 'スウェーデン' | espeak-ng -x -v ja

# Return to the directory the cell started in.
%cd ..

In [None]:
# Clone the Coqui TTS sources into ./TTS_repo (checked out and installed in the next cell).
!git clone https://github.com/coqui-ai/TTS TTS_repo

In [4]:
# Check out a pinned commit of TTS and install it in editable mode.
%cd TTS_repo
!git checkout 4132240
!pip install -e .
!pip install numba
# Running the editable install a second time is intentional.
!pip install -e .
%cd /content

/content/TTS_repo
Note: checking out '4132240'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at 41322408 Merge branch 'dev' of https://github.com/mozilla/TTS into dev
Obtaining file:///content/TTS_repo
Collecting numba==0.48
  Downloading numba-0.48.0-1-cp37-cp37m-manylinux2014_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.1 MB/s 
[?25hCollecting librosa==0.7.2
  Downloading librosa-0.7.2.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 56.2 MB/s 
[?25hCollecting phonemizer>=2.2.0
  Downloading phonemizer-2.2.2-py3-none-any.whl (49 kB)
[K     |████████████████████████████████|

### Define TTS function

In [5]:
def interpolate_vocoder_input(scale_factor, spec):
    """Rescale a spectrogram to bridge the sampling-rate gap between the
    TTS model and the vocoder.

    Args:
        scale_factor: Per-axis scale factors forwarded to
            ``torch.nn.functional.interpolate``.
        spec: Array-like spectrogram; two leading axes are added before
            interpolation (assumed 2-D, since bilinear mode needs 4-D input
            -- TODO confirm against callers).

    Returns:
        The interpolated tensor with the first added axis removed again.
    """
    print(" > before interpolation :", spec.shape)
    # Add two leading singleton axes before resizing.
    batched = torch.tensor(spec)[None, None]
    resized = torch.nn.functional.interpolate(
        batched,
        scale_factor=scale_factor,
        mode='bilinear',
    )
    # Drop only the outermost added axis; the second one stays.
    spec = resized.squeeze(0)
    print(" > after interpolation :", spec.shape)
    return spec

def tts(model, text, CONFIG, use_cuda, ap, use_gl, figures=True):
    """Synthesize ``text``, play the audio inline and print timing figures.

    Runs ``synthesis`` with the TTS model and, unless ``use_gl`` is set,
    regenerates the waveform from the predicted mel spectrogram with the
    notebook-level vocoder.

    Relies on names defined in other notebook cells: ``synthesis``,
    ``speaker_id``, ``VOCODER_CONFIG``, ``ap_vocoder``, ``vocoder_model``,
    ``scale_factor`` and ``interpolate_vocoder_input``.

    Args:
        model: TTS model passed to ``synthesis``.
        text: Input sentence.
        CONFIG: TTS config; ``audio['sample_rate']`` and
            ``enable_eos_bos_chars`` are read from it.
        use_cuda: Forwarded to ``synthesis``; when true the vocoder output is
            moved to the CPU before conversion to numpy.
        ap: Audio processor of the TTS model (used to denormalize the mel).
        use_gl: Forwarded to ``synthesis``. When true the vocoder is skipped
            and the waveform returned by ``synthesis`` is used as is.
        figures: Unused; kept so existing calls keep working.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)`` where
        ``mel_postnet_spec`` is the denormalized spectrogram.
    """
    start_time = time.time()
    # run tts
    target_sr = CONFIG.audio['sample_rate']
    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens, inputs = synthesis(
        model,
        text,
        CONFIG,
        use_cuda,
        ap,
        speaker_id,
        None,
        False,
        CONFIG.enable_eos_bos_chars,
        use_gl)

    # run vocoder
    mel_postnet_spec = ap._denormalize(mel_postnet_spec.T).T
    if not use_gl:
        # The vocoder output is at the vocoder's own sample rate.
        target_sr = VOCODER_CONFIG.audio['sample_rate']
        vocoder_input = ap_vocoder._normalize(mel_postnet_spec.T)
        if scale_factor[1] != 1:
            vocoder_input = interpolate_vocoder_input(scale_factor, vocoder_input)
        else:
            vocoder_input = torch.tensor(vocoder_input).unsqueeze(0)
        waveform = vocoder_model.inference(vocoder_input)
        # format output
        if use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
    waveform = waveform.squeeze()

    # compute run-time performance
    # Measure once so all three figures describe the same interval, and use
    # the sample rate of the waveform actually produced (target_sr) so the
    # real-time factor stays correct when the vocoder rate differs from the
    # TTS rate.
    elapsed = time.time() - start_time
    rtf = elapsed / (len(waveform) / target_sr)
    tps = elapsed / len(waveform)
    print(waveform.shape)
    print(" > Run-time: {}".format(elapsed))
    print(" > Real-time factor: {}".format(rtf))
    print(" > Time per step: {}".format(tps))
    # display audio
    IPython.display.display(IPython.display.Audio(waveform, rate=target_sr))

    return alignment, mel_postnet_spec, stop_tokens, waveform

### Load Models

In [7]:
# Fails once with an error saying numpy-1.19.5 is missing; running this cell again succeeds.
import sys
import os
import torch
import time
import IPython

# for some reason TTS installation does not work on Colab
sys.path.append('TTS_repo')

from TTS.utils.io import load_config
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.generic_utils import setup_model
from TTS.tts.utils.text.symbols import symbols, phonemes
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.io import load_checkpoint

In [8]:
# runtime settings
# use_cuda is passed to load_checkpoint() and tts(), and gates vocoder_model.cuda().
use_cuda = False

In [9]:
# Copy the Glow-TTS checkpoint, its scale statistics and its config from
# DATAROOT into the working directory under the local file names used below.
!cp $DATAROOT/glow-tts-residual_bn_conv-jsut-stats/checkpoint_440000.pth.tar tts_model.pth.tar
!cp $DATAROOT/glow-tts-residual_bn_conv-jsut-stats/scale_stats.npy scale_stats.npy
!cp $DATAROOT/glow-tts-residual_bn_conv-jsut-stats/config.json config.json

In [10]:
%%script false --no-raise-error
# Optional: use a vocoder you trained yourself.
# The cell magic must stay on the first line of the cell to take effect; it
# skips the whole cell. Remove it to enable the copies below.
!cp $DATAROOT/multiband-melgan/checkpoint_440000.pth.tar vocoder_model.pth.tar
!cp $DATAROOT/multiband-melgan/config.json config_vocoder.json

In [None]:
#%%script false --no-raise-error
# Download the vocoder checkpoint, its config and its scale statistics by
# file id, saving them under the local file names used by the cells below.
!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O vocoder_model.pth.tar
!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O config_vocoder.json
!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O scale_stats_vocoder.npy

In [12]:
# Local file names of the checkpoints and configs fetched by the cells above.
# NOTE: TTS_CONFIG and VOCODER_CONFIG are later rebound to the loaded config
# objects by the "load configs" cell.
TTS_MODEL, TTS_CONFIG = "tts_model.pth.tar", "config.json"
VOCODER_MODEL, VOCODER_CONFIG = "vocoder_model.pth.tar", "config_vocoder.json"

In [13]:
# load configs
# TTS_CONFIG / VOCODER_CONFIG start out as file names and are rebound here to
# the loaded config objects. Only call load_config while they are still path
# strings, so that re-running this cell does not pass an already-loaded config
# object to load_config. To reload from disk, re-run the "model paths" cell first.
if isinstance(TTS_CONFIG, str):
    TTS_CONFIG = load_config(TTS_CONFIG)
# Override the audio settings to use the locally copied normalization stats.
TTS_CONFIG.audio['stats_path'] = "./scale_stats.npy"
TTS_CONFIG.audio['signal_norm'] = True

if isinstance(VOCODER_CONFIG, str):
    VOCODER_CONFIG = load_config(VOCODER_CONFIG)
VOCODER_CONFIG.audio['signal_norm'] = True
VOCODER_CONFIG.audio['stats_path'] = "./scale_stats_vocoder.npy"


In [14]:
# load the audio processor
# Built from the TTS config's audio section, including the stats_path and
# signal_norm values assigned in the "load configs" cell.
ap = AudioProcessor(**TTS_CONFIG.audio)         

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.1
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:./scale_stats.npy
 | > hop_length:256
 | > win_length:1024


In [None]:
# LOAD TTS MODEL
# multi speaker
# No speakers are configured; speaker_id is read as a global by tts().
speakers = []
speaker_id = None

# When the config defines its own character set, rebuild the symbol tables
# from it instead of using the defaults imported from TTS.
if 'characters' in TTS_CONFIG.keys():
    # Imported here because the notebook's import cell does not bring it in.
    from TTS.tts.utils.text.symbols import make_symbols
    symbols, phonemes = make_symbols(**TTS_CONFIG.characters)

# load the model
# The input vocabulary size depends on whether the model was trained on phonemes.
num_chars = len(phonemes) if TTS_CONFIG.use_phonemes else len(symbols)
model = setup_model(num_chars, len(speakers), TTS_CONFIG)

# load model state
model, _ = load_checkpoint(model, TTS_MODEL, use_cuda=use_cuda)
model.eval();
model.store_inverse();

In [16]:
# NOTE: imported in this cell rather than with the other imports near the top.
from TTS.vocoder.utils.generic_utils import setup_generator

# LOAD VOCODER MODEL
vocoder_model = setup_generator(VOCODER_CONFIG)
# The checkpoint is mapped onto the CPU; its "model" entry is the state dict.
vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
vocoder_model.remove_weight_norm()
vocoder_model.inference_padding = 0

# scale factor for sampling rate difference
# [1, vocoder_rate / tts_rate]; tts() interpolates the vocoder input whenever
# the second entry is not 1.
scale_factor = [1,  VOCODER_CONFIG['audio']['sample_rate'] / ap.sample_rate]
print(f"scale_factor: {scale_factor}")

# Separate audio processor for the vocoder; tts() uses it to normalize the mel.
ap_vocoder = AudioProcessor(**VOCODER_CONFIG['audio'])    
if use_cuda:
    vocoder_model.cuda()
vocoder_model.eval();

 > Generator Model: multiband_melgan_generator
scale_factor: [1, 1.0]
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:./scale_stats_vocoder.npy
 | > hop_length:256
 | > win_length:1024


In [17]:
%%script false --no-raise-error
# Optional: use this cell if you only want to convert arbitrary audio files
# to mel spectrograms and run them through the vocoder.
# The cell magic must stay on the first line of the cell to take effect; it
# skips the whole cell. Remove it to run the cell.
remote_dir='/mydrive/machine-learning/tts/data/sample'
!mkdir -p /content/sample
!cp $remote_dir/*.wav /content/sample/

import glob
from TTS.vocoder.datasets.gan_dataset import GANDataset
from TTS.vocoder.utils.generic_utils import plot_results
import matplotlib.pyplot as plt
from TTS.utils.tensorboard_logger import TensorboardLogger
from torch.utils.data import DataLoader

# Build an evaluation-mode dataset over the copied wav files, using the
# vocoder's audio processor and config values.
wav_paths = glob.glob(os.path.join("/content/sample/", "**", "*.wav"), recursive=True)
dataset = GANDataset(
    ap=ap_vocoder, 
    items=wav_paths,
    seq_len=VOCODER_CONFIG.seq_len,
    hop_len=ap_vocoder.hop_length,
    pad_short=VOCODER_CONFIG.pad_short,
    conv_pad=VOCODER_CONFIG.conv_pad,
    is_training=False,
    return_segments=False,
    use_noise_augment=VOCODER_CONFIG.use_noise_augment,
    use_cache=VOCODER_CONFIG.use_cache,
    verbose=False)

# Take the first item and add a leading axis to each element before inference.
data = dataset[0]
c_G, y_G = data
c_G = c_G.unsqueeze(0)
y_G = y_G.unsqueeze(0)

y_hat = vocoder_model.inference(c_G)
print(y_hat.shape, y_G.shape)

# Start from a clean log directory, then log comparison figures and audio.
!rm -rf ./sample/test
figures = plot_results(y_hat, y_G, ap_vocoder, 0, "test")
tb_logger = TensorboardLogger("/content/sample/test", model_name="vocoder_test")
tb_logger.tb_eval_figures(0, figures)

# Use the vocoder's configured sample rate for both logging and playback so
# the two cannot disagree.
vocoder_sr = VOCODER_CONFIG.audio["sample_rate"]
sample_voice = y_hat[0].squeeze(0).detach().cpu().numpy()
real_voice = y_G[0].squeeze(0).cpu().numpy()
tb_logger.tb_eval_audios(0, {'eval/audio': sample_voice, 'eval/real': real_voice}, vocoder_sr)

IPython.display.display(IPython.display.Audio(sample_voice, rate=vocoder_sr))
IPython.display.display(IPython.display.Audio(real_voice, rate=vocoder_sr))

%load_ext tensorboard
%tensorboard --logdir "/content/sample"

## Run Inference

In [19]:
# Input text for synthesis; the commented-out line is an alternative sentence.
#sentence = 'ミズヲ マレーシアカラ カワナクテハ ナラナイ ノデス.'
sentence = 'サイキンノナヤミハ メンバーニマダ シタノナマエヲ オボエラレテイナイコトデスネ.'

In [None]:
# baseline settings
model.length_scale = 1.0  # set speed of the speech.
model.noise_scale = 0.33  # set speech variation

# use_gl=False makes tts() produce the waveform with the vocoder.
# NOTE(review): `stop_tokedns` looks like a typo of `stop_tokens`; left as is
# because it is a notebook-level name.
align, spec, stop_tokedns, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)

In [None]:
# faster speech
model.length_scale = 0.3  # set speed of the speech.
model.noise_scale = 0.33  # set speech variation

# use_gl=False makes tts() produce the waveform with the vocoder.
align, spec, stop_tokedns, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)

In [None]:
# faster than the 1.0 baseline with less variation
# (length_scale is larger here than the 0.3 used in the previous cell)
model.length_scale = 0.6  # set speed of the speech.
model.noise_scale = 0.01  # set speech variation

# use_gl=False makes tts() produce the waveform with the vocoder.
align, spec, stop_tokedns, wav = tts(model, sentence, TTS_CONFIG, use_cuda, ap, use_gl=False, figures=True)