In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.configs.vits_config import VitsConfig

In [2]:
output_path = os.path.dirname(os.path.abspath('__file__'))


dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="Mixed_formatted.txt",
    # meta_file_train = "No_Shouting_formatted.txt",
    path=os.path.join(output_path, "data/"),
    language = "en"
)

In [3]:
audio_config = VitsAudioConfig(
    sample_rate=22050, win_length=1024, hop_length=256, num_mels=80, mel_fmin=0, mel_fmax=None
)

In [4]:
character_config = CharactersConfig(
    characters_class= "TTS.tts.models.vits.VitsCharacters",
    characters= "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890",
    punctuations=""" !,.?-"'""",
    pad= "<PAD>",
    eos= "<EOS>",
    bos= "<BOS>",
    blank= "<BLNK>",
)

In [5]:
config = VitsConfig(
    audio=audio_config,
    # characters=character_config, # Comment out if with phonemes
    run_name="vits_tyler1_phonemes",
    # run_name = "vits_tyler1_noshouting_phonemes",
    batch_size=2,
    eval_batch_size=2,
    # batch_group_size=4,
    num_loader_workers=4,
    num_eval_loader_workers=2,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=5000,
    text_cleaner="english_cleaners",
    use_phonemes=True, # Replace with False if no phonemes
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path + "/vitstts_checkpoint",
    datasets=[dataset_config],
    cudnn_benchmark=False,
    test_sentences = ["Mic test one two three", "I am Tyler1", "Welcome chat!"]
)

In [6]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Assumes each line as ```<filename>|<transcription>```
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "Tyler1"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.dirname(os.path.abspath('__file__')) + f"/data/wavs/{cols[0]}.wav"
            text = cols[1]
            # print(text)
            items.append({"text":text, "audio_file":wav_file, "speaker_name":speaker_name, "root_path": root_path})
    return items

In [7]:
ap = AudioProcessor.init_from_config(config)

tokenizer, config = TTSTokenizer.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [8]:
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    # eval_split_max_size=config.eval_split_max_size,
    eval_split_size=0.1,
    formatter = formatter
)

 | > Found 76 files in /home/sagemaker-user/voiceclone/data


In [9]:
# init model
model = Vits(config, ap, tokenizer, speaker_manager=None)

In [10]:
# Make sure to change phoneme in config cell when training a phoneme model

# model_path = "vitstts_checkpoint/vits_tyler1-January-16-2024_01+41PM-d5007e2/" # no phonemes
# model_path = "vitstts_checkpoint/vits_tyler1_phonemes-January-26-2024_03+51PM-7e6c2b3/" # with phonemes
# model_path = "vitstts_checkpoint/vits_tyler1_noshouting_phonemes-January-30-2024_12+17PM-7e6c2b3/" # with phonemes no shouting

trainer = Trainer(
    # TrainerArgs(restore_path = model_path + "best_model.pth"), # Load from checkpoint
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 2
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2024-02-12 15:12:47.063416: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-12 15:12:47.113956: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 > Start Tensorboard

In [11]:
trainer.fit()


[4m[1m > EPOCH: 0/5000[0m
 --> /home/sagemaker-user/voiceclone/vitstts_checkpoint/vits_tyler1_phonemes-February-12-2024_03+12PM-7e6c2b3
 ! Run is removed from /home/sagemaker-user/voiceclone/vitstts_checkpoint/vits_tyler1_phonemes-February-12-2024_03+12PM-7e6c2b3
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/mutagen/_util.py", line 251, in _openfile
    fileobj = open(filename, "rb+" if writable else "rb")
FileNotFoundError: [Errno 2] No such file or directory: '/home/sagemaker-user/voiceclone/data/wavs/60. Tyler1 about his Brother_s Daughter Emmy 5.wav'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 1833, in fit
    self._fit()
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 1785, in _fit
    self.train_epoch()
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 1483,



> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 69


AttributeError: 'tuple' object has no attribute 'tb_frame'