<a href="https://colab.research.google.com/github/y-kamiya/machine-learning-samples/blob/feature%2Ftts-scripts/TTS_jsut_multiband_melgan_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Hands-on example for 🐸 [Coqui TTS](https://github.com/coqui-ai/TTS)

This notebook trains a Multi-band MelGAN vocoder on the JSUT dataset (arranged in the LJSpeech directory structure).

In [None]:
# Show which GPU (if any) is attached to this runtime before training.
!nvidia-smi

In [None]:
from google.colab import drive
drive.mount('/gdrive')
!ls /gdrive

!ln -s "/gdrive/My Drive" /mydrive
DATAROOT='/mydrive/machine-learning/tts/data'

In [3]:
# zipに固めてあった学習データを展開
!cp -r "$DATAROOT/jsut_ljspeech_structure_22050.zip" /content/
!unzip -q /content/jsut_ljspeech_structure_22050.zip -d /content/

!cp "$DATAROOT/jsut_ver1.1_ljspeech_structure/scale_stats.npy" /content/

In [None]:
# Clone the Coqui TTS repository into the current working directory.
!git clone https://github.com/coqui-ai/TTS

In [None]:
%cd TTS
!git checkout v0.0.13
!pip install -e .
!pip install numba==0.48

In [6]:
# Start from the stock Multi-band MelGAN vocoder config shipped with the TTS
# repo, override paths and a few settings, and write the result to config.json
# in the current working directory.
import json
from TTS.utils.io import load_config

# Local (fast) copy of the dataset, and the Drive folder used for outputs.
DATAROOT = '/content/ljspeech_structure_22050'
DATAROOT_DRIVE = '/mydrive/machine-learning/tts/data/jsut_ver1.1_ljspeech_structure'

CONFIG = load_config('TTS/vocoder/configs/multiband_melgan_config.json')

# Nested audio setting: clear the stats file path.
CONFIG['audio']['stats_path'] = None

# Top-level overrides, applied in one go.
CONFIG.update({
    # Read wavs from local disk; write checkpoints/logs to Drive.
    'data_path': f"{DATAROOT}/wavs/",
    'output_path': f"{DATAROOT_DRIVE}/output",
    # Data loader parallelism.
    'num_loader_workers': 4,
    'num_val_loader_workers': 1,
    'test_sentences_file': f"{DATAROOT_DRIVE}/test_sentences_file",
    # Logging and checkpoint cadence, in steps.
    'print_step': 1000,
    'save_step': 5000,
    # Loss / sampling switches turned off for this run.
    'use_l1_spec_loss': False,
    'diff_samples_for_G_and_D': False,
})

with open('config.json', 'w') as config_file:
    json.dump(CONFIG, config_file)


In [None]:
%%script false --no-raise-error
# Fresh training run from config.json. The cell magic on the first line feeds
# this cell to `false`, so nothing below executes; comment that line out to
# start training from scratch.
!CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_vocoder_gan.py --config_path config.json

In [None]:
#%%script false --no-raise-error
# Resume training. (Uncomment the first line to skip this cell instead.)
# The freshly written config.json is copied into the run directory first;
# presumably --continue_path picks its config up from there — confirm.
!cp config.json $DATAROOT_DRIVE/output/multiband-melgan
!CUDA_VISIBLE_DEVICES="0" python TTS/bin/train_vocoder_gan.py --continue_path $DATAROOT_DRIVE/output/multiband-melgan

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
     | > avg_D_mse_gan_real_loss: 0.13808
     | > avg_D_mse_gan_fake_loss: 0.13824
     | > avg_D_loss: 0.44636
     | > avg_loader_time: 0.86925
     | > avg_step_time: 0.62491

[1m > EVALUATION [0m

  [1m--> EVAL PERFORMANCE[0m
     | > avg_G_stft_loss_mg:[91m 0.78846 [0m(+0.00223)
     | > avg_G_stft_loss_sc:[91m 0.37229 [0m(+0.00812)
     | > avg_G_subband_stft_loss_mg:[91m 0.70276 [0m(+0.00310)
     | > avg_G_subband_stft_loss_sc:[91m 0.39029 [0m(+0.01677)
     | > avg_G_mse_fake_loss:[91m 0.35986 [0m(+0.00595)
     | > avg_G_loss:[91m 2.02654 [0m(+0.03000)
     | > avg_G_gen_loss:[91m 1.12690 [0m(+0.01511)
     | > avg_G_adv_loss:[91m 0.89964 [0m(+0.01488)
     | > avg_D_mse_gan_loss:[91m 0.44544 [0m(+0.00003)
     | > avg_D_mse_gan_real_loss:[92m 0.17272 [0m(-0.00232)
     | > avg_D_mse_gan_fake_loss:[91m 0.10046 [0m(+0.00031)
     | > avg_D_loss:[91m 0.44544 [0m(+0.00003)
     | > avg_loader_time:[92m 0

In [None]:
%load_ext tensorboard
%tensorboard --logdir "$DATAROOT/jsut_ver1.1_ljspeech_structure/output/"