## Train

In [1]:
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager

2024-02-07 14:15:08.344124: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-07 14:15:08.397726: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
OUT_PATH = os.path.dirname(os.path.abspath("__file__"))

config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="Mixed_formatted.txt",
    # meta_file_train = "No_Shouting_formatted.txt",
    path=os.path.join(OUT_PATH, "data/"),
    language = "en"
)
# Define here the dataset that you want to use for the fine-tuning on.
# config_dataset = BaseDatasetConfig(
#     formatter="ljspeech",
#     dataset_name="ljspeech",
#     path="/raid/datasets/LJSpeech-1.1_24khz/",
#     meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
#     language="en",
# )

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

In [3]:
# define audio config
audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
# training parameters config

In [10]:
# Define the path where XTTS v1.1.1 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "xttsv2_checkpoint", "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)

# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# XTTS transfer learning parameters: You we need to provide the paths of XTTS model checkpoint that you want to do the fine tuning.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
    )

 > Downloading DVAE files!


  0%|          | 0.00/1.07k [00:00<?, ?iB/s]
100%|██████████| 1.07k/1.07k [00:00<00:00, 3.73kiB/s]

  5%|▌         | 11.1M/211M [00:00<00:01, 111MiB/s][A
 11%|█         | 22.1M/211M [00:00<00:02, 86.5MiB/s][A
 16%|█▌        | 33.4M/211M [00:00<00:01, 97.0MiB/s][A
 21%|██▏       | 44.8M/211M [00:00<00:01, 103MiB/s] [A
 27%|██▋       | 57.3M/211M [00:00<00:01, 111MiB/s][A
 33%|███▎      | 69.8M/211M [00:00<00:01, 115MiB/s][A
 39%|███▊      | 81.5M/211M [00:00<00:01, 91.6MiB/s][A
 44%|████▍     | 93.4M/211M [00:00<00:01, 98.8MiB/s][A
 50%|█████     | 105M/211M [00:01<00:01, 104MiB/s]  [A
 55%|█████▌    | 116M/211M [00:01<00:00, 102MiB/s][A
 60%|██████    | 127M/211M [00:01<00:00, 98.9MiB/s][A
 65%|██████▌   | 137M/211M [00:01<00:00, 78.7MiB/s][A
 71%|███████   | 149M/211M [00:01<00:00, 87.8MiB/s][A
 76%|███████▋  | 161M/211M [00:01<00:00, 96.5MiB/s][A
 82%|████████▏ | 173M/211M [00:01<00:00, 103MiB/s] [A
 88%|████████▊ | 185M/211M [00:01<00:00, 108MiB/s][A
 94%|█████████▎|

 > Downloading XTTS v2.0 files!


100%|██████████| 211M/211M [00:02<00:00, 90.3MiB/s]

100%|██████████| 361k/361k [00:00<00:00, 1.18MiB/s]

  1%|          | 11.0M/1.87G [00:00<00:16, 110MiB/s][A
  1%|          | 22.9M/1.87G [00:00<00:16, 115MiB/s][A
  2%|▏         | 35.7M/1.87G [00:00<00:15, 121MiB/s][A
  3%|▎         | 47.8M/1.87G [00:00<00:15, 118MiB/s][A
  3%|▎         | 59.7M/1.87G [00:00<00:15, 117MiB/s][A
  4%|▍         | 71.4M/1.87G [00:00<00:15, 117MiB/s][A
  4%|▍         | 83.1M/1.87G [00:00<00:15, 115MiB/s][A
  5%|▌         | 94.7M/1.87G [00:00<00:15, 115MiB/s][A
  6%|▌         | 107M/1.87G [00:00<00:14, 118MiB/s] [A
  6%|▋         | 119M/1.87G [00:01<00:14, 118MiB/s][A
  7%|▋         | 131M/1.87G [00:01<00:14, 119MiB/s][A
  8%|▊         | 144M/1.87G [00:01<00:14, 122MiB/s][A
  8%|▊         | 156M/1.87G [00:01<00:13, 123MiB/s][A
  9%|▉         | 169M/1.87G [00:01<00:13, 124MiB/s][A
 10%|▉         | 182M/1.87G [00:01<00:13, 125MiB/s][A
 10%|█         | 195M/1.87G [00:01<00:13, 125MiB/s][A
 11%|

In [11]:
# init args and config
model_args = GPTArgs(
    max_conditioning_length=132300*2,  # 6 secs
    min_conditioning_length=66150*2,  # 3 secs
    debug_loading_failures=False,
    max_wav_length=255995*4,  # ~11.6 seconds
    max_text_length=700,
    # mel_norm_file=MEL_NORM_FILE,
    # dvae_checkpoint=DVAE_CHECKPOINT,
    # xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
    # tokenizer_file=TOKENIZER_FILE,
    gpt_num_audio_tokens=1026,
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
)

In [12]:
# Training sentences generations
SPEAKER_REFERENCE = [
    "data/wavs/1. Tyler1_THE_WORST_JUNGLER_EVER.wav"  # speaker reference to be used in training test sentences
]
LANGUAGE = config_dataset.language

config = GPTTrainerConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name="tyler1_xttsv2",
    project_name="tyler1",
    dashboard_logger="tensorboard",
    # logger_uri=None,
    audio=audio_config,
    batch_size=2,
    batch_group_size=48,
    eval_batch_size=2,
    num_loader_workers=2,
    # eval_split_max_size=256,
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=10000,
    save_n_checkpoints=1,
    save_checkpoints=True,
    # target_loss="loss",
    print_eval=True,
    # Optimizer values like tortoise, pytorch implementation with modifications to not apply WD to non-weight parameters.
    optimizer="AdamW",
    optimizer_wd_only_on_weights=True, # for multi-gpu training please make it False
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=5e-06,  # learning rate
    lr_scheduler="MultiStepLR",
    # it was adjusted accordly for the new step scheme
    lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
    test_sentences=[
        {
            "text": "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        },
        {
            "text": "This cake is great. It's so delicious and moist.",
            "speaker_wav": SPEAKER_REFERENCE,
            "language": LANGUAGE,
        },
    ],
)

In [17]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Assumes each line as ```<filename>|<transcription>```
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    speaker_name = "Tyler1"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            cols = line.split("|")
            wav_file = os.path.dirname(os.path.abspath('__file__')) + f"/data/wavs/{cols[0]}.wav"
            text = cols[1]
            # print(text)
            items.append({"text":text, "audio_file":wav_file, "speaker_name":speaker_name, "root_path": root_path})
    return items

# load training samples
# train_samples, eval_samples = load_tts_samples(
#     DATASETS_CONFIG_LIST,
#     eval_split=True,
#     eval_split_max_size=config.eval_split_max_size,
#     eval_split_size=config.eval_split_size,
# )

train_samples, eval_samples = load_tts_samples(
    # config_dataset,
    DATASETS_CONFIG_LIST,
    eval_split=True,
    # eval_split_max_size=config.eval_split_max_size,
    eval_split_size=0.2,
    formatter = formatter
)

 | > Found 76 files in /home/sagemaker-user/voiceclone/data


In [19]:
# init the model from config
model = GPTTrainer.init_from_config(config)

# init the trainer and 🚀
trainer = Trainer(
    TrainerArgs(
        restore_path=None,  # xtts checkpoint is restored via xtts_checkpoint key so no need of restore it using Trainer restore_path parameter
        skip_train_epoch=False,
        start_with_eval=True,
        grad_accum_steps=128, # batch_size * grad_accum_steps >= 256
    ),
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 4
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/home/sagemaker-user/voiceclone/tyler1_xttsv2-February-07-2024_03+13PM-7e6c2b3


>> DVAE weights restored from: /home/sagemaker-user/voiceclone/xttsv2_checkpoint/XTTS_v2.0_original_model_files/dvae.pth



 > Model has 518442047 parameters


In [20]:
trainer.fit()


[4m[1m > EPOCH: 0/1000[0m
 --> /home/sagemaker-user/voiceclone/tyler1_xttsv2-February-07-2024_03+13PM-7e6c2b3
 ! Run is removed from /home/sagemaker-user/voiceclone/tyler1_xttsv2-February-07-2024_03+13PM-7e6c2b3


 > Filtering invalid eval samples!!
 > Total eval samples after filtering: 0


Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 1833, in fit
    self._fit()
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 1787, in _fit
    self.eval_epoch()
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 1628, in eval_epoch
    self.get_eval_dataloader(
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 990, in get_eval_dataloader
    return self._get_loader(
  File "/opt/conda/lib/python3.10/site-packages/trainer/trainer.py", line 914, in _get_loader
    len(loader) > 0
AssertionError:  ❗ len(DataLoader) returns 0. Make sure your dataset is not empty or len(dataset) > 0. 


AttributeError: 'tuple' object has no attribute 'tb_frame'

##  Generate

In [None]:
from TTS.api import TTS

# using the default version set in 🐸TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# using a specific version
# 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main
# ❗some versions might be incompatible with the API
# tts = TTS("xtts_v2.0.2", gpu=True)

# getting the latest XTTS_v2
tts = TTS("xtts").to("cuda")

# tts = TTS("tts_models/en/ljspeech/vits").to("cuda")

In [None]:
from subprocess import getoutput
wavs = getoutput("ls data/wavs/*.wav").split("\n")

In [None]:
# generate speech by cloning a voice using default settings
tts.tts_to_file(text="It's Tyler 1 baby! Finally with the voice!",
                file_path="generated_audio/output.wav",
                speaker_wav=wavs,
                language="en"
               )

In [None]:
from IPython.display import Audio
Audio("generated_audio/output.wav")