In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.configs.vits_config import VitsConfig

from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.configs.glow_tts_config import GlowTTSConfig



In [2]:
# Project root for data, phoneme cache and checkpoints.
# Notebooks have no `__file__`, so the kernel's working directory is used explicitly
# (this is the same directory the earlier quoted-'__file__' expression resolved to).
output_path = os.getcwd()

# Dataset location and manifest; the manifest is parsed by the custom `formatter`
# passed to `load_tts_samples` further down.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="Mixed_formatted.txt",
    path=os.path.join(output_path, "data/"),
)

In [3]:
# GlowTTS training configuration; keyword arguments are grouped by purpose.
config = GlowTTSConfig(
    # --- dataset and text front-end ---
    datasets=[dataset_config],
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    # --- batching and data loaders ---
    batch_size=8,
    eval_batch_size=2,
    num_loader_workers=16,
    num_eval_loader_workers=2,
    # --- optimisation ---
    epochs=5000,
    lr=1e-3,
    mixed_precision=True,
    # --- evaluation ---
    run_eval=True,
    test_delay_epochs=-1,
    # --- logging and checkpoints ---
    print_step=50,
    print_eval=False,
    output_path=f"{output_path}/glowtts_checkpoint",
)

In [4]:
# Build the audio processor from the training config.
ap = AudioProcessor.init_from_config(config)

# `init_from_config` returns a (tokenizer, config) pair; `config` is rebound to the
# returned object, so every later cell uses that version of the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


In [5]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Parse a pipe-separated manifest into the sample dicts used for training.

    Each non-blank line is expected to look like ``<filename>|<transcription>``;
    any additional ``|``-separated columns are ignored. Blank lines are skipped.

    Args:
        root_path: Dataset directory holding the manifest and a ``wavs/`` folder.
        manifest_file: Manifest file name, relative to ``root_path``.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A list with one dict per sample, with the keys ``text``, ``audio_file``,
        ``speaker_name`` and ``root_path``.

    Raises:
        ValueError: If a non-blank line does not contain a ``|`` separator.
    """
    txt_file = os.path.join(root_path, manifest_file)
    speaker_name = "Tyler1"
    items = []
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line_no, raw_line in enumerate(ttf, start=1):
            line = raw_line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at the end of the file).
                continue
            cols = line.split("|")
            if len(cols) < 2:
                raise ValueError(
                    f"{txt_file}:{line_no}: expected '<filename>|<transcription>', got {line!r}"
                )
            # Resolve audio relative to the dataset root rather than the working directory,
            # so the formatter keeps working when the notebook is started elsewhere.
            wav_file = os.path.join(root_path, "wavs", f"{cols[0].strip()}.wav")
            # Strip so the transcription does not carry the line's trailing newline.
            text = cols[1].strip()
            items.append(
                {
                    "text": text,
                    "audio_file": wav_file,
                    "speaker_name": speaker_name,
                    "root_path": root_path,
                }
            )
    return items

In [6]:
# Load the dataset through the notebook's own `formatter` and split off an
# evaluation subset (5% of the samples, capped by the config's max size).
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    formatter=formatter,
    eval_split=True,
    eval_split_size=0.05,
    eval_split_max_size=config.eval_split_max_size,
)

 | > Found 60 files in /home/ubuntu/tyler1/data


In [7]:
# Build the GlowTTS model from the config, audio processor and tokenizer;
# no speaker manager is supplied.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# Create the trainer with default trainer arguments and hand it the model
# together with the train/eval sample lists.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    eval_samples=eval_samples,
    train_samples=train_samples,
    model=model,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 30
 | > Num. of Torch Threads: 30
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
2024-01-13 10:24:18.264151: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-13 10:24:18.307533: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.
 > Start Tensorboard: tensorboard --logdir=/h

In [None]:
# Start training with the trainer built in the previous cell.
# NOTE(review): the recorded output shows ten "`train_step()` retuned `None` outputs"
# messages across the first steps; presumably these are GlowTTS's data-dependent
# initialisation steps rather than a failure — confirm against the model's
# `data_dep_init_steps` setting.
trainer.fit()


[4m[1m > EPOCH: 0/5000[0m
 --> /home/ubuntu/tyler1/glowtts_checkpoint/run-January-13-2024_10+24AM-f3397c1

[1m > TRAINING (2024-01-13 10:24:20) [0m




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 57
 | > Preprocessing samples
 | > Max text length: 1462
 | > Min text length: 131
 | > Avg text length: 685.6491228070175
 | 
 | > Max audio length: 7408822.0
 | > Min audio length: 578318.0
 | > Avg audio length: 3487299.50877193
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.



[1m   --> TIME: 2024-01-13 10:24:24 -- STEP: 0/8 -- GLOBAL_STEP: 0[0m
     | > current_lr: 2.5e-07 
     | > step_time: 1.7199  (1.719942569732666)
     | > loader_time: 2.0312  (2.031230926513672)

 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.

[1m > EVALUATION [0m





> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: en-us
		| > phoneme backend: gruut
| > Number of instances : 3
 | > Preprocessing samples
 | > Max text length: 1148
 | > Min text length: 261
 | > Avg text length: 681.3333333333334
 | 
 | > Max audio length: 6162562.0
 | > Min audio length: 1429014.0
 | > Avg audio length: 3654345.3333333335
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
 | > Synthesizing test sentences.



  [1m--> EVAL PERFORMANCE[0m
     | > avg_loader_time: 0.0023560523986816406 [0m(+0)
     | > avg_loss: 3.2444334030151367 [0m(+0)
     | > avg_log_mle: 1.7664422988891602 [0m(+0)
     | > avg_loss_dur: 1.4779911041259766 [0m(+0)

 > BEST MODEL : /home/ubuntu/tyler1/glowtts_checkpoint/run-January-13-2024_10+24AM-f3397c1/best_model_8.pth

[4m[1m > EPOCH: 1/5000[0m
 --> /home/ubuntu/tyler1/glowtts_checkpoint/run-January-13-2024_10+24AM-f3397c1

[1m > TRAINING (2024-01-13 10:24:56) [0m
 [!] `train_step()` retuned `None` outputs. Skipping training step.
 [!] `train_step()` retuned `None` outputs. Skipping training step.

[1m > EVALUATION [0m

 > Keyboard interrupt detected.
 > Saving model before exiting...

 > CHECKPOINT : /home/ubuntu/tyler1/glowtts_checkpoint/run-January-13-2024_10+24AM-f3397c1/checkpoint_16.pth
 ! Run is kept in /home/ubuntu/tyler1/glowtts_checkpoint/run-January-13-2024_10+24AM-f3397c1


In [None]:
# Preview only the first few samples; echoing the whole list floods the cell output.
train_samples[:3]