## Train

In [12]:
import os

# CUDA debugging switches, placed in the environment before torch is imported.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid and is
# expected to slow training down - confirm it should stay on for real runs.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1', 'TORCH_USE_CUDA_DSA': '1'})

import torch

# Ask torch to release any GPU memory it is still caching in this kernel.
torch.cuda.empty_cache()

In [13]:
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager
from TTS.config import load_config

In [14]:
# Project root = the kernel's working directory. The previous spelling,
# dirname(abspath("__file__")), resolved the *literal string* "__file__"
# against the working directory, so it produced this same path.
OUT_PATH = os.getcwd()

# Dataset used for fine-tuning: <OUT_PATH>/data/ with a pipe-separated manifest.
config_dataset = BaseDatasetConfig(
    path=os.path.join(OUT_PATH, "data/"),
    meta_file_train="Mixed_formatted.txt",  # alternative manifest: "No_Shouting_formatted.txt"
    formatter="ljspeech",
    language="en",
)

# Example of pointing the config at a different dataset instead:
# config_dataset = BaseDatasetConfig(
#     formatter="ljspeech",
#     dataset_name="ljspeech",
#     path="/raid/datasets/LJSpeech-1.1_24khz/",
#     meta_file_train="/raid/datasets/LJSpeech-1.1_24khz/metadata.csv",
#     language="en",
# )

# Every dataset config that takes part in training goes in this list.
DATASETS_CONFIG_LIST = [config_dataset]

In [15]:
# Audio settings: input, DVAE and output sample rates are all set to 22050 Hz.
# NOTE(review): the upstream Coqui XTTS v2 fine-tuning recipe uses
# output_sample_rate=24000 - confirm that 22050 is intentional here.
audio_config = XttsAudioConfig(
    output_sample_rate=22050,
    dvae_sample_rate=22050,
    sample_rate=22050,
)

In [16]:
# Folder that receives the original (not fine-tuned) XTTS v2.0 model files.
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "xttsv2_checkpoint", "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)

# Remote locations of the DVAE checkpoint and the mel-normalisation statistics.
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Local copies keep the file name used in the URL.
DVAE_CHECKPOINT, MEL_NORM_FILE = (
    os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(link))
    for link in (DVAE_CHECKPOINT_LINK, MEL_NORM_LINK)
)

# Download both files unless both are already present on disk.
if not (os.path.isfile(DVAE_CHECKPOINT) and os.path.isfile(MEL_NORM_FILE)):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files(
        [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK],
        CHECKPOINTS_OUT_PATH,
        progress_bar=True,
    )

# Remote locations of the XTTS v2.0 tokenizer vocabulary and model checkpoint.
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# XTTS transfer-learning inputs: local paths of the vocab.json and model.pth
# files of the checkpoint to fine-tune (same file names as in the URLs).
TOKENIZER_FILE, XTTS_CHECKPOINT = (
    os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(link))
    for link in (TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK)
)

# Download both files unless both are already present on disk.
if not (os.path.isfile(TOKENIZER_FILE) and os.path.isfile(XTTS_CHECKPOINT)):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK],
        CHECKPOINTS_OUT_PATH,
        progress_bar=True,
    )

In [17]:
# Model arguments for fine-tuning.

# Start from the best checkpoint of an earlier fine-tuning run; this rebinds
# XTTS_CHECKPOINT to a path relative to the working directory.
XTTS_CHECKPOINT = "xttsv2_checkpoint/tyler1_xttsv2-February-27-2024_11+49AM-2b31060/best_model.pth"

model_args = GPTArgs(
    # Checkpoint and auxiliary files.
    xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint that gets fine-tuned
    tokenizer_file=TOKENIZER_FILE,
    dvae_checkpoint=DVAE_CHECKPOINT,
    mel_norm_file=MEL_NORM_FILE,
    # Audio length limits (durations below assume 22050 samples per second).
    min_conditioning_length=66150,  # 3 s
    max_conditioning_length=264600,  # 2 * 132300 = 12 s
    max_wav_length=767985,  # 3 * 255995, ~34.8 s
    max_text_length=700,
    # GPT audio-token layout and conditioning options.
    gpt_num_audio_tokens=1026,
    gpt_start_audio_token=1024,
    gpt_stop_audio_token=1025,
    gpt_use_masking_gt_prompt_approach=True,
    gpt_use_perceiver_resampler=True,
    debug_loading_failures=True,
    # Left unset on purpose: gpt_max_text_tokens (500), gpt_max_prompt_tokens (100).
)

In [18]:
# Speaker reference clips for the test sentences generated during training.
# A single clip can be used instead, e.g.:
# SPEAKER_REFERENCE = ["data/wavs/1. Tyler1 THE_WORST_JUNGLER_EVER 1.wav"]
#
# Every file in data/wavs/ whose name ends in ".wav" is used.
# - endswith(".wav") replaces the old substring test ("wav" in name), which
#   also picked up non-audio files whose name merely contains "wav".
# - sorted() makes the reference order reproducible, because os.listdir
#   returns directory entries in arbitrary order.
SPEAKER_REFERENCE = sorted(
    "data/wavs/" + name for name in os.listdir('data/wavs/') if name.endswith(".wav")
)

# Test sentences are synthesised in the dataset's language.
LANGUAGE = config_dataset.language

# Trainer configuration for the fine-tuning run.
# Options left at their defaults on purpose: logger_uri, batch_group_size,
# eval_split_max_size, target_loss, mixed_precision.
config = GPTTrainerConfig(
    # Run identity and output location.
    output_path=f"{OUT_PATH}/xttsv2_checkpoint",
    run_name="tyler1_xttsv2",
    project_name="tyler1",
    dashboard_logger="tensorboard",
    # Model and audio.
    model_args=model_args,
    audio=audio_config,
    # Data loading.
    batch_size=1,
    eval_batch_size=1,
    num_loader_workers=2,
    # Logging and checkpointing cadence (in steps).
    print_step=50,
    plot_step=100,
    log_model_step=1000,
    save_step=10000,
    save_n_checkpoints=1,
    save_checkpoints=True,
    print_eval=True,
    # Optimizer values like tortoise: AdamW with weight decay applied to
    # weight parameters only.
    optimizer="AdamW",
    optimizer_wd_only_on_weights=True,  # set to False for multi-GPU training
    optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
    lr=5e-06,
    lr_scheduler="MultiStepLR",
    # Milestones are the original 50k / 150k / 300k values multiplied by 18.
    # NOTE(review): the config dump saved further down in this notebook shows
    # scheduler_after_epoch=True and epochs=1000; if the scheduler really steps
    # once per epoch these milestones are never reached - confirm.
    lr_scheduler_params={"milestones": [900000, 2700000, 5400000], "gamma": 0.5, "last_epoch": -1},
    # Sentences synthesised during training, all with the same speaker
    # references and language.
    test_sentences=[
        {"text": sentence, "speaker_wav": SPEAKER_REFERENCE, "language": LANGUAGE}
        for sentence in (
            "So he starts off level one just doing this. Like oh he's gonna like bro he losing, doesn't hit a single thing. Look at it like- what the fuck. Like bro okay whatever. Watch this top dive bro. I solo made Vayne one HP, right? Just wait out and fucking ghost, you twat.",
            "Okay whatever you're auto-ing a ward sure it's fine. Bro what are you d- just wait you fucking freak. Where's he walking to by the way? What the fuck! It's not a win-trade this guy played as our Jarvan too. He's a one-trick like bro.",
            "Hey! Sup? It's me, Tyler1 ready for the pre-alpha. We're back baby!",
        )
    ],
)

In [19]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Parse a pipe-separated manifest into a list of TTS sample dicts.

    Each non-blank line is expected to look like ``<filename>|<transcription>``;
    any further ``|``-separated columns are ignored. Blank lines are skipped and
    the transcription is stripped of surrounding whitespace, so the line's
    trailing newline no longer ends up in ``text``.

    Args:
        root_path: Directory that contains the manifest; also copied into every
            returned item.
        manifest_file: Manifest file name, joined onto ``root_path``.
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        One dict per manifest entry, in file order, with the keys ``text``,
        ``audio_file``, ``speaker_name`` and ``root_path``.

    Raises:
        IndexError: If a non-blank line contains no ``|`` separator.
    """
    txt_file = os.path.join(root_path, manifest_file)
    # Audio paths are built from the current working directory rather than
    # from root_path: abspath('__file__') resolves the literal string against
    # the working directory. Computed once instead of once per line.
    wav_dir = os.path.dirname(os.path.abspath('__file__')) + "/data/wavs/"
    speaker_name = "Tyler1"
    items = []
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for raw_line in ttf:
            line = raw_line.rstrip("\r\n")
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing empty line.
                continue
            cols = line.split("|")
            items.append(
                {
                    "text": cols[1].strip(),
                    "audio_file": f"{wav_dir}{cols[0]}.wav",
                    "speaker_name": speaker_name,
                    "root_path": root_path,
                }
            )
    return items


# Load the samples through the custom `formatter` and split off an evaluation
# set (eval_split_size=0.1). Only the single dataset config is passed here,
# not DATASETS_CONFIG_LIST, and eval_split_max_size is left unset.
train_samples, eval_samples = load_tts_samples(
    config_dataset,
    formatter=formatter,
    eval_split_size=0.1,
    eval_split=True,
)

 | > Found 558 files in /home/sagemaker-user/voiceclone/data


In [20]:
# Build the GPT trainer model from the training `config` defined earlier in
# this notebook. `model` is what the Trainer cell receives.
model = GPTTrainer.init_from_config(config)

KeyboardInterrupt: 

In [None]:
# init the trainer and 🚀

# The XTTS checkpoint is restored through the `xtts_checkpoint` model argument,
# so no Trainer-level restore is configured. To resume a previous run instead,
# pass continue_path, e.g.
# "xttsv2_checkpoint/tyler1_xttsv2-February-19-2024_05+30AM-2b31060/".
trainer_args = TrainerArgs(
    grad_accum_steps=256,  # keep batch_size * grad_accum_steps >= 256
    start_with_eval=True,
    skip_train_epoch=False,
)

trainer = Trainer(
    trainer_args,
    config,
    output_path=OUT_PATH,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

In [None]:
# Start training with the trainer built in the previous cell.
trainer.fit()

##  Generate

In [30]:
from TTS.api import TTS
from subprocess import getoutput
from IPython.display import Audio

# using the default version set in 🐸TTS
# tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=True)

# using a specific version
# 👀 see the branch names for versions on https://huggingface.co/coqui/XTTS-v2/tree/main
# ❗some versions might be incompatible with the API
# tts = TTS("xtts_v2.0.2", gpu=True)

# getting the latest XTTS_v2
tts = TTS("xtts").to("cuda")

# Speaker reference clips: every file in data/wavs/ whose name ends in ".wav".
# endswith(".wav") replaces the old substring test ("wav" in name), which also
# matched non-audio files; sorted() makes the order reproducible because
# os.listdir returns entries in arbitrary order.
SPEAKER_REFERENCE = sorted(
    "data/wavs/" + name for name in os.listdir('data/wavs/') if name.endswith(".wav")
)

# One name for the output file, so the player at the end of the cell loads the
# clip that was just synthesised. Previously the cell wrote this file but
# played "generated_audio/xtts_output.wav", i.e. a different recording.
# NOTE(review): the file name says "es" although the text and language below
# are German - rename if that is not intended.
ONE_SHOT_WAV_PATH = "generated_audio/xtts_output_es_1shot.wav"
# Make sure the output folder exists before synthesis tries to write into it.
os.makedirs(os.path.dirname(ONE_SHOT_WAV_PATH), exist_ok=True)

tts.tts_to_file(
    text="Das ist meine Stimme auf Deutsch. Es ist Tyler 1 Baby! Scheiße, ja, ich spreche Deutsch. Ich lasse mich nicht zum Schweigen bringen!!",
    file_path=ONE_SHOT_WAV_PATH,
    speaker_wav=SPEAKER_REFERENCE,
    language="de",
)

Audio(ONE_SHOT_WAV_PATH)

 > Using model: xtts
 > Text splitted to sentences.
['Esta es mi voz en español.', '¡Es Tyler1, bebé!', 'Joder, si, hablo español.']
 > Processing time: 115.97198724746704
 > Real-time factor: 14.642592297335366


In [2]:
# Display the current `config` object.
# NOTE(review): the saved output of this cell is an XttsConfig, which looks
# like the object returned by load_config(CONFIG_PATH) in the inference-setup
# cell further down rather than the GPTTrainerConfig built for training. The
# cell was presumably executed out of order - confirm, and move it below the
# cell that defines the config it is meant to show.
config

XttsConfig(output_path='/home/sagemaker-user/voiceclone/xttsv2_checkpoint', logger_uri=None, run_name='tyler1_xttsv2', project_name='tyler1', run_description='🐸Coqui trainer run.', print_step=50, plot_step=100, model_param_stats=False, wandb_entity=None, dashboard_logger='tensorboard', save_on_interrupt=True, log_model_step=1000, save_step=10000, save_n_checkpoints=1, save_checkpoints=True, save_all_best=False, save_best_after=0, target_loss=None, print_eval=True, test_delay_epochs=0, run_eval=True, run_eval_steps=None, distributed_backend='nccl', distributed_url='tcp://localhost:54321', mixed_precision=False, precision='fp16', epochs=1000, batch_size=1, eval_batch_size=1, grad_clip=0.0, scheduler_after_epoch=True, lr=5e-06, optimizer='AdamW', optimizer_params={'betas': [0.9, 0.96], 'eps': 1e-08, 'weight_decay': 0.01}, lr_scheduler='MultiStepLR', lr_scheduler_params={'milestones': [900000, 2700000, 5400000], 'gamma': 0.5, 'last_epoch': -1}, use_grad_scaler=False, allow_tf32=False, cudn

In [1]:
import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from subprocess import getoutput
from IPython.display import Audio
from TTS.tts.models import setup_model as setup_tts_model
from TTS.config import load_config
import time

# Folder of the fine-tuned run used for inference. It is defined once here,
# and the config and checkpoint paths are derived from it.
CHECKPOINT_DIR = "xttsv2_checkpoint/tyler1_xttsv2-February-27-2024_11+49AM-2b31060/"
# Add here the xtts_config path
CONFIG_PATH = os.path.join(CHECKPOINT_DIR, "config.json")
# Add here the vocab file that you have used to train the model
TOKENIZER_PATH = "xttsv2_checkpoint/XTTS_v2.0_original_model_files/vocab.json"
# Add here the checkpoint that you want to do inference with
XTTS_CHECKPOINT = os.path.join(CHECKPOINT_DIR, "best_model.pth")

# List of all wavs as reported by the shell. Kept for compatibility; nothing
# in the visible cells reads it.
wavs = getoutput("ls data/wavs/*.wav").split("\n")
# Speaker reference clips: every file in data/wavs/ whose name ends in ".wav".
# endswith(".wav") replaces the old substring test ("wav" in name), which also
# matched non-audio files; sorted() makes the order reproducible because
# os.listdir returns entries in arbitrary order.
SPEAKER_REFERENCE = sorted(
    "data/wavs/" + name for name in os.listdir('data/wavs/') if name.endswith(".wav")
)

# output wav path
OUTPUT_WAV_PATH = "generated_audio/xtts_output.wav"

# Time model loading plus computation of the conditioning latents.
start = time.time()
# Alternative way of building the model, kept for reference:
# config = XttsConfig()
# config.load_json(CONFIG_PATH)
# model = Xtts.init_from_config(config)
config = load_config(CONFIG_PATH)
model = setup_tts_model(config)
model.load_checkpoint(
    config,
    checkpoint_dir=CHECKPOINT_DIR,
    checkpoint_path=XTTS_CHECKPOINT,
    vocab_path=TOKENIZER_PATH,
    use_deepspeed=False,
)
model.to("cuda")

# Conditioning inputs computed once from the reference clips; the inference
# cell reuses them for every generated sentence.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=SPEAKER_REFERENCE)

end = time.time()
print(end - start)

2024-03-05 07:29:26.868322: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 07:29:26.920049: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


 > Using model: xtts
86.21867299079895


In [9]:
import time

# Where the generated audio is written.
OUTPUT_WAV_PATH = "generated_audio/xtts_output.wav"

# Time synthesis plus writing the file.
start = time.time()

# Positional arguments: text, language code, then the conditioning latents and
# speaker embedding computed in the setup cell. Spanish ("es"), German ("de")
# and French ("fr") prompts were tried here as well.
out = model.inference(
    "So he starts off level one just doing this. Like oh he's gonna like bro he losing, doesn't hit a single thing. Look at it like- what the fuck.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    top_p=model.config.top_p,
    top_k=model.config.top_k,
    temperature=0.2,  # Add custom parameters here
)

# torchaudio.save receives the waveform with an extra leading dimension.
# NOTE(review): the Coqui XTTS documentation example saves model.inference
# output at 24000 Hz - confirm that 22050 is the right rate for this model.
waveform = torch.tensor(out["wav"]).unsqueeze(0)
torchaudio.save(OUTPUT_WAV_PATH, waveform, 22050)

end = time.time()
print(end - start)

Audio(OUTPUT_WAV_PATH)

3.3867244720458984
