In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.configs.vits_config import VitsConfig



In [2]:
# Project root = the notebook's working directory.
# The previous expression, os.path.dirname(os.path.abspath('__file__')), passed
# the *string* '__file__' (a notebook defines no __file__ variable), so it
# resolved to exactly this directory; os.getcwd() states that intent directly.
output_path = os.getcwd()

# Dataset description: manifest "Mixed_formatted.txt" under <output_path>/data/.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="Mixed_formatted.txt",
    path=os.path.join(output_path, "data/"),
    language="en",
)

In [3]:
# Audio settings handed to VitsConfig(audio=...) further down.
audio_config = VitsAudioConfig(
    sample_rate=22050,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

In [4]:
# Character vocabulary for training on raw characters (no phonemes).
# It only takes effect when passed as `characters=` to VitsConfig.
character_config = CharactersConfig(
    characters_class="TTS.tts.models.vits.VitsCharacters",
    # Special tokens.
    pad="<PAD>",
    eos="<EOS>",
    bos="<BOS>",
    blank="<BLNK>",
    # Symbol inventory: upper/lower-case ASCII letters, digits, basic punctuation.
    characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890",
    punctuations=""" !,.?-"'""",
)

In [5]:
# Training configuration for the phoneme-based run.
config = VitsConfig(
    # --- run identity and output location ---
    run_name="vits_tyler1_phonemes",
    output_path=f"{output_path}/vitstts_checkpoint",
    # --- data and audio ---
    datasets=[dataset_config],
    audio=audio_config,
    # characters=character_config,  # enable only when use_phonemes=False
    # --- text front-end ---
    text_cleaner="english_cleaners",
    use_phonemes=True,  # set to False to train without phonemes
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    # --- batching and data loading ---
    batch_size=4,
    eval_batch_size=2,
    # batch_group_size=4,
    num_loader_workers=8,
    num_eval_loader_workers=2,
    # --- schedule and precision ---
    epochs=5000,
    mixed_precision=True,
    cudnn_benchmark=False,
    # --- evaluation, test synthesis and logging ---
    run_eval=True,
    test_delay_epochs=-1,
    print_step=25,
    print_eval=True,
    test_sentences=["Mic test one two three", "I am Tyler1", "Welcome chat!"],
)

In [6]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Build TTS sample dicts from a ``<filename>|<transcription>`` manifest.

    Each non-blank line of the manifest yields one sample. The audio path is
    ``<root_path>/wavs/<filename>.wav``; it is derived from ``root_path``
    rather than from the current working directory, so the function works
    for any dataset location.

    Args:
        root_path: Dataset root directory. The manifest and the ``wavs``
            folder are resolved relative to it.
        manifest_file: Manifest file name, relative to ``root_path``.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        list[dict]: One dict per sample with the keys ``text``,
        ``audio_file``, ``speaker_name`` (always ``"Tyler1"``) and
        ``root_path``.
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "Tyler1"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            # Drop the line terminator so it does not end up in the transcription.
            line = line.rstrip("\r\n")
            # Blank lines (e.g. a trailing newline at end of file) carry no sample
            # and would otherwise fail on cols[1].
            if not line.strip():
                continue
            cols = line.split("|")
            wav_file = os.path.join(root_path, "wavs", f"{cols[0]}.wav")
            text = cols[1]
            items.append(
                {
                    "text": text,
                    "audio_file": wav_file,
                    "speaker_name": speaker_name,
                    "root_path": root_path,
                }
            )
    return items

In [7]:
# Build the audio processor from the training config.
ap = AudioProcessor.init_from_config(config)

# Build the tokenizer. The call returns a config object as well, and `config`
# is rebound to it, so later cells use the returned config.
tokenizer, config = TTSTokenizer.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [8]:
# Read the manifest through this notebook's own `formatter` and split off an
# evaluation set. With the 60 files reported in the recorded output, a split
# size of 0.05 gave 57 training and 3 evaluation samples.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    formatter=formatter,
    eval_split=True,
    eval_split_size=0.05,
    # eval_split_max_size=config.eval_split_max_size,
)

 | > Found 60 files in /home/ubuntu/tyler1/data


In [9]:
# init model
# Built from the config, audio processor and tokenizer created above;
# no speaker manager is supplied.
model = Vits(config, ap, tokenizer, speaker_manager=None)

In [10]:
# Make sure to change phoneme in config cell when training a phoneme model

# Checkpoint directories of earlier runs; only needed when resuming training.
#   no phonemes:   "vitstts_checkpoint/vits_tyler1-January-16-2024_01+41PM-d5007e2/"
#   with phonemes: "vitstts_checkpoint/vits_tyler1-January-17-2024_09+38AM-d5007e2/"
# The no-phoneme path used to be assigned first and then immediately
# overwritten; it is kept above as a comment instead of a dead assignment.
model_path = "vitstts_checkpoint/vits_tyler1-January-17-2024_09+38AM-d5007e2/"  # with phonemes

trainer = Trainer(
    # To resume from a checkpoint, replace `TrainerArgs()` below with:
    # TrainerArgs(restore_path=os.path.join(model_path, "best_model.pth")),
    # (os.path.join avoids the doubled slash that `model_path + "/best_model.pth"`
    # produced, since model_path already ends with "/").
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 30
 | > Num. of Torch Threads: 30
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2024-01-23 09:21:01.016354: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-23 09:21:01.059846: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 > Start Tensorboard: tensorboard --logdir=/h

In [None]:
# Start training with the trainer configured in the previous cell.
trainer.fit()


[4m[1m > EPOCH: 0/5000[0m
 --> /home/ubuntu/tyler1/vitstts_checkpoint/vits_tyler1_phonemes-January-23-2024_09+21AM-1edd3ad


[*] Pre-computing phonemes...


  4%|▎         | 2/57 [00:00<00:03, 16.74it/s]

ɡˈaɪz oʊkˈeɪ lˈɪsən lˈɛts tˈɔːk ɐbˌaʊt sˈʌmθɪŋ ɹˈiːəl kwˈɪk. ˈaɪ swˈɛɹ tə ɡˈɑːd lˈaɪk jˈɛh maɪ kˈɑːɹdɪˌoʊ wˈʌzn̩t ðə bˈɛst bˌʌt bɹˈoʊ aɪm nˌɑːt lˈaɪk aɪm nˌɑːt kˈoʊp lˈaɪk aɪm nˌɑːt kɹˈeɪzi. ˈaɪ hæv ðɪs ʃˈɑːɹp pˈeɪn ɪn maɪ hˈɑːɹt fɹʌm tˈaɪm tə tˈaɪm. lˈaɪk ˈæktʃuːəl ʃˈɑːɹp, lˈaɪk ɹˈaɪt wɛn ˈaɪ lˈaɪk dˈuː ɐ θˈɜːɾi sˈɛkənd dʒˈɑːɡ ɔːɹ wʌtˈɛvɚɹ ɔːɹ wɛn ˈaɪ snˈiːz ɔːɹ wɛn ˈaɪ dˈuː bˈɛntʃ. ˈaɪ fˈiːl ɪɾ ɐ lˈɑːt. aɪm nˌɑːt kɹˈeɪzi lˈaɪk aɪm nˌɑːt ˈaɪ swˈɛɹ tə ɡˈɑːd ˈæktʃuːəli.
 [!] Character '̩' not found in the vocabulary. Discarding it.


100%|██████████| 57/57 [00:08<00:00,  6.94it/s]

[1m > TRAINING (2024-01-23 09:21:11) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
	| > 1 not found characters:
	| > ̩
| > Number of instances : 57
 | > Preprocessing samples
 | > Max text length: 1462
 | > Min text length: 131
 | > Avg text length: 685.6140350877193
 | 
 | > Max audio length: 7408822.0
 | > Min audio length: 578318.0
 | > Avg audio length: 3487299.50877193
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:863.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]

[1m   --> TIME: 2024-01-23 09:21:14 -- STEP: 0/15 -- GLOBAL_STEP: 0[0m
     | > loss_disc: 5.958283424377441  (5.958283424377441)
     | > loss_disc_real_0: 1.0065133571624756  (1.0065133571624756)
     | > loss_disc_real_1: 1.022413730621338  (1.022413730621338)
     | > loss_disc_real_2: 0.9817718863487244  (0.9817718863487244)
     | > loss_disc_real_3: 0.9567512273788452  (0.9567512273788452)
     | > loss_disc_real_4: 1.0066996812820435  (1.0066996812820435)
     | > loss_disc_real_5: 0.9833380579948425  (0.9833380579948425)
     | > loss_0: 5.958283424377441  (5.958283424377441)
     | > grad_norm_0: 0  (0)
     | > loss_gen: 5.957746982574463  (5.957746982574463)
     | > loss_kl: 160.70339965820312  (160.7033996582



> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: espeak
	| > 1 not found characters:
	| > ̩
| > Number of instances : 3
 | > Preprocessing samples
 | > Max text length: 1148
 | > Min text length: 261
 | > Avg text length: 681.3333333333334
 | 
 | > Max audio length: 6162562.0
 | > Min audio length: 1429014.0
 | > Avg audio length: 3654345.3333333335
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.


[1m   --> STEP: 0[0m
     | > loss_disc: 3.053609848022461  (3.053609848022461)
     | > loss_disc_real_0: 0.29904425144195557  (0.29904425144195557)
     | > loss_disc_real_1: 0.2782394289970398  (0.2782394289970398)
     | > loss_disc_real_2: 0.28470495343208313  (0.28470495343208313)
     | > loss_disc_real_3: 0.26765817403793335  (0.26765817403793335)
     | > loss_disc_real_4: 0.2995864152908325  (0.2995864152908325)
     | > loss_disc_real_5: 0.2713654339313507  (0.2713654339313507)
     | > loss_0: 3.053609848022461  (3.053609848022461)
     | > loss_gen: 1.7278966903686523  (1.7278966903686523)
     | > loss_kl: 17.80422592163086  (17.80422592163086)
     | > loss_feat: 0.20188529789447784  (0.20188529789447784)
     | > loss_mel: 114.70222473144531  (114.70222473144531)
     | > loss_duration: 2.957717180252075  (2.957717180252075)
     | > loss_1: 137.39395141601562  (137.39395141601562)

[1m   --> STEP: 1[0m
     | > loss_disc: 3.0608999729156494  (3.0608999729156494)
  

 | > Synthesizing test sentences.


  test_figures["{}-alignment".format(idx)] = plot_alignment(alignment.T, output_fig=False)

  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.006366252899169922 [0m(+0)
     | > avg_loss_disc: 3.0608999729156494 [0m(+0)
     | > avg_loss_disc_real_0: 0.29910117387771606 [0m(+0)
     | > avg_loss_disc_real_1: 0.2792060077190399 [0m(+0)
     | > avg_loss_disc_real_2: 0.2866024374961853 [0m(+0)
     | > avg_loss_disc_real_3: 0.26852667331695557 [0m(+0)
     | > avg_loss_disc_real_4: 0.3008735775947571 [0m(+0)
     | > avg_loss_disc_real_5: 0.27346551418304443 [0m(+0)
     | > avg_loss_0: 3.0608999729156494 [0m(+0)
     | > avg_loss_gen: 1.7277858257293701 [0m(+0)
     | > avg_loss_kl: 15.162726402282715 [0m(+0)
     | > avg_loss_feat: 0.17718160152435303 [0m(+0)
     | > avg_loss_mel: 64.30608367919922 [0m(+0)
     | > avg_loss_duration: 2.9432733058929443 [0m(+0)
     | > avg_loss_1: 84.31704711914062 [0m(+0)

 > BEST MODEL : /home/ubuntu/tyler1/vitstts_checkpoint

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[92m 0.005904436111450195 [0m(-0.00046181678771972656)
     | > avg_loss_disc:[91m 3.1309821605682373 [0m(+0.07008218765258789)
     | > avg_loss_disc_real_0:[92m 0.24920599162578583 [0m(-0.04989518225193024)
     | > avg_loss_disc_real_1:[91m 0.40514233708381653 [0m(+0.1259363293647766)
     | > avg_loss_disc_real_2:[91m 0.3895493447780609 [0m(+0.10294690728187561)
     | > avg_loss_disc_real_3:[91m 0.33496612310409546 [0m(+0.06643944978713989)
     | > avg_loss_disc_real_4:[91m 0.3695238530635834 [0m(+0.0686502754688263)
     | > avg_loss_disc_real_5:[91m 0.35567277669906616 [0m(+0.08220726251602173)
     | > avg_loss_0:[91m 3.1309821605682373 [0m(+0.07008218765258789)
     | > avg_loss_gen:[91m 2.1165528297424316 [0m(+0.3887670040130615)
     | > avg_loss_kl:[92m 4.365316390991211 [0m(-10.797410011291504)
     | > avg_loss_feat:[92m 0.0339265875518322 [0m(-0.14325501397252083)
     | > avg_loss_mel:[91

 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time:[91m 0.0062410831451416016 [0m(+0.00033664703369140625)
     | > avg_loss_disc:[92m 3.0899081230163574 [0m(-0.04107403755187988)
     | > avg_loss_disc_real_0:[92m 0.2353300005197525 [0m(-0.013875991106033325)
     | > avg_loss_disc_real_1:[92m 0.23704499006271362 [0m(-0.1680973470211029)
     | > avg_loss_disc_real_2:[92m 0.2671225666999817 [0m(-0.12242677807807922)
     | > avg_loss_disc_real_3:[92m 0.32696661353111267 [0m(-0.007999509572982788)
     | > avg_loss_disc_real_4:[91m 0.3918340504169464 [0m(+0.022310197353363037)
     | > avg_loss_disc_real_5:[92m 0.14259199798107147 [0m(-0.2130807787179947)
     | > avg_loss_0:[92m 3.0899081230163574 [0m(-0.04107403755187988)
     | > avg_loss_gen:[92m 1.619883418083191 [0m(-0.4966694116592407)
     | > avg_loss_kl:[92m 2.281982898712158 [0m(-2.0833334922790527)
     | > avg_loss_feat:[91m 0.09966837614774704 [0m(+0.06574178859591484)
     | > avg_loss_mel: