In [66]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Union, Text,List
from tests import get_test_input_path, get_test_data_path, get_test_output_path

In [67]:
DATA_PATH = os.path.join(get_test_data_path(),"vispeech")
OUT_PATH = os.path.join(get_test_output_path(), "output_train")

In [68]:
from IPython.display import Audio, display

display(Audio(os.path.join(DATA_PATH,"wavs/0/09981.wav"), autoplay=True))

In [69]:
from vitts.utils.config.config_share import BaseDatasetConfig

In [70]:
dataset_config = BaseDatasetConfig(
    name="vispeech",
    meta_file_train="metadata_dummy.csv",
    path=DATA_PATH,
    language="vi"
)

In [71]:
dataset_config

BaseDatasetConfig(name='vispeech', path='/home/truc/Documents/ViTTS/tests/data/vispeech', meta_file_train='metadata_dummy.csv', ignored_speakers=None, language='vi', meta_file_val='', meta_file_attn_mask='')

In [72]:
from vitts.components.vitts.configs.glow_tts_config import GlowTTSConfig


In [73]:
config_glowtts = GlowTTSConfig(
    batch_size= 32,
    eval_batch_size= 16,
    num_loader_workers= 0,
    num_eval_loader_workers=0,
    run_eval= True,
    test_delay_epochs= -1,
    epochs=2,
    text_cleaner="vi_cleaners",
    use_phonemes= False,
    print_step= 25,
    print_eval= False,
    mixed_precision= True,
    output_path= OUT_PATH,
    datasets=[dataset_config],
    save_step= 1000
)

In [74]:
config_glowtts

GlowTTSConfig(output_path='/home/truc/Documents/ViTTS/tests/output/output_train', logger_uri=None, run_name='run', project_name=None, run_description='🐸Coqui trainer run.', print_step=25, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', log_model_step=None, save_step=1000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=10000, target_loss=None, print_eval=False, test_delay_epochs=-1, run_eval=True, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=True, epochs=2, batch_size=32, eval_batch_size=16, grad_clip=5.0, scheduler_after_epoch=True, lr=0.001, optimizer='RAdam', optimizer_params={'betas': [0.9, 0.998], 'weight_decay': 1e-06}, lr_scheduler='NoamLR', lr_scheduler_params={'warmup_steps': 4000}, use_grad_scaler=False, cudnn_enable=True, cudnn_deterministic=False, cudnn_benchmark=False, training_seed=54321, model='glow_tts', num_loader_workers=0, num_eval_loader_workers=0, us

In [75]:
config_glowtts.characters

In [76]:
from vitts.utils.audio import AudioProcessor

In [77]:
ap = AudioProcessor.init_from_config(config_glowtts, verbose= False)

In [78]:
ap.resample = True

In [79]:
ap

<vitts.utils.audio.AudioProcessor at 0x7f0be2c8ac10>

In [80]:
from vitts.components.vitts.utils.text.tokenizer import TTSTokenizer
tokenizer, config_glowtts = TTSTokenizer.init_from_config(config_glowtts)

In [81]:
config_glowtts

GlowTTSConfig(output_path='/home/truc/Documents/ViTTS/tests/output/output_train', logger_uri=None, run_name='run', project_name=None, run_description='🐸Coqui trainer run.', print_step=25, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', log_model_step=None, save_step=1000, save_n_checkpoints=5, save_checkpoints=True, save_all_best=False, save_best_after=10000, target_loss=None, print_eval=False, test_delay_epochs=-1, run_eval=True, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=True, epochs=2, batch_size=32, eval_batch_size=16, grad_clip=5.0, scheduler_after_epoch=True, lr=0.001, optimizer='RAdam', optimizer_params={'betas': [0.9, 0.998], 'weight_decay': 1e-06}, lr_scheduler='NoamLR', lr_scheduler_params={'warmup_steps': 4000}, use_grad_scaler=False, cudnn_enable=True, cudnn_deterministic=False, cudnn_benchmark=False, training_seed=54321, model='glow_tts', num_loader_workers=0, num_eval_loader_workers=0, us

In [82]:
config_glowtts.characters

CharactersConfig(characters_class='vitts.components.vitts.utils.text.characters.Graphemes', vocab_dict=None, pad='<PAD>', eos='<EOS>', bos='<BOS>', blank='<BLNK>', characters='abcdefghijklmnopqrstuvwxyzáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ', punctuations="!'(),-.:;? ", phonemes=None, is_unique=False, is_sorted=True)

In [83]:
from vitts.components.vitts.datasets import load_tts_samples

train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split= True,
    eval_split_size=0.2,
)

 | > Found 48 files in /home/truc/Documents/ViTTS/tests/data/vispeech


In [84]:
train_samples

[{'text': 'tiết kiêm nhiên liệu giảm khí thải ô nhiễm môi trường giữ sạch buồng đốt và các chi tiết bên trong động cơ\n',
  'audio_file': '/home/truc/Documents/ViTTS/tests/data/vispeech/wavs/1/14914.wav',
  'speaker_name': 'vispeech',
  'language': 'vi'},
 {'text': 'kiên trì uống nước hạt ươi mỗi ngày có thể điều trị được bệnh gai cột sống\n',
  'audio_file': '/home/truc/Documents/ViTTS/tests/data/vispeech/wavs/1/14916.wav',
  'speaker_name': 'vispeech',
  'language': 'vi'},
 {'text': 'kết quả nội soi ban đầu về khối u thanh quản phải của chị hoàng hà\n',
  'audio_file': '/home/truc/Documents/ViTTS/tests/data/vispeech/wavs/0/09985.wav',
  'speaker_name': 'vispeech',
  'language': 'vi'},
 {'text': 'giới phân tích cho biết ngay cả hai nhà lãnh đạo quyền lực nhất trong quá khứ là ông mao trạch đông và ông đặng tiểu bình cũng không dám\n',
  'audio_file': '/home/truc/Documents/ViTTS/tests/data/vispeech/wavs/0/09992.wav',
  'speaker_name': 'vispeech',
  'language': 'vi'},
 {'text': 'dạt né 

In [85]:
from vitts.components.vitts.models.glow_tts import GlowTTS

model = GlowTTS(
    config_glowtts,
    ap,
    tokenizer = tokenizer,
    speaker_manager= None
)

In [86]:
from trainer import Trainer, TrainerArgs

trainer = Trainer(
    TrainerArgs(),
    config_glowtts,
    output_path= OUT_PATH,
    model = model,
    train_samples= train_samples,
    eval_samples= eval_samples

)

 > Using CUDA: False
 > Number of GPUs: 0

 > Model has 28605841 parameters


In [87]:
# trainer.fit()

In [88]:
# from phonemizer.phonemize import phonemize

In [89]:
# text1 = ['một', 'chuyện', 'tình yêu']
# phn1 = phonemize(text1, language='vi', backend='espeak', strip=True)

In [90]:
from vitts.inference.synthesizer import Synthesizer

In [91]:
def test_in_out():
    tts_checkpoint = "/home/truc/Documents/ViTTS/tests/output/output_train/run-July-19-2022_10+33PM-56a0cb4/best_model.pth"
    tts_config = "/home/truc/Documents/ViTTS/tests/output/output_train/run-July-19-2022_10+33PM-56a0cb4/config.json"
    synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
    wav = synthesizer.tts("Xin chào bạn em")
    synthesizer.save_wav(wav, "hihi.wav")

In [92]:
test_in_out()

ModuleNotFoundError: ! Config for glow_tts cannot be found.