In [1]:
import os
import torch
from trainer import Trainer, TrainerArgs

from TTS.bin.compute_embeddings import compute_embeddings
from TTS.bin.resample import resample_files
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig
from TTS.utils.downloaders import download_vctk

In [2]:
#torch.set_num_threads(24)

"""
    이 레시피는 YourTTS 논문(https://arxiv.org/abs/2112.02418)에 제안된 첫 번째 실험을 복제합니다.
    YourTTS 모델은 VITS 모델을 기반으로 하지만, 사전 학습된 스피커 인코더에서 추출된 외부 스피커 임베딩을 사용하며, 소규모 아키텍처 변경이 있습니다.
    또한, YourTTS는 다국어 데이터를 사용하여 학습할 수 있지만, 이 레시피는 VCTK 데이터셋을 사용한 단일 언어 학습을 복제합니다.
    다국어 학습에 관심이 있다면 VitsArgs 클래스 인스턴스에서 주석 처리된 매개변수를 활성화해야 합니다.
    또한, VCTK를 예로 들어 추가 데이터셋을 추가해야 합니다.
"""
CURRENT_PATH = '/home/research'

# 실행을 위한 이름
RUN_NAME = "YourTTS-EN-VCTK"

# 모델 출력(구성, 체크포인트, 텐서보드 로그)을 저장할 경로
OUT_PATH = '/home/research'  # "/raid/coqui/Checkpoints/original-YourTTS/"

# 전이 학습을 수행하고 학습 속도를 높이고 싶다면, 여기에서 원래 YourTTS 모델의 경로를 설정할 수 있습니다.
RESTORE_PATH = None  # "/root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts/model_file.pth"

# 이 매개변수는 디버깅에 유용합니다. 학습 epoch를 건너뛰고 평가를 수행하며 테스트 문장을 생성합니다.
SKIP_TRAIN_EPOCH = False

# 학습 및 평가에 사용할 배치 크기를 여기에서 설정합니다.
BATCH_SIZE = 16

# 학습 샘플링 비율과 다운로드된 데이터셋을 리샘플링할 대상 샘플링 비율 설정
# 참고: 이 값을 변경하면 데이터셋을 다시 다운로드해야 할 수 있습니다.
# 참고: 새 데이터셋을 추가하는 경우 데이터셋 샘플링 비율이 이 값과 일치하는지 확인해야 하며, 그렇지 않으면 오디오를 리샘플링해야 합니다.
SAMPLE_RATE = 16000

# 학습에 사용되는 최대 오디오 길이(초 단위). 이보다 큰 오디오는 무시됩니다.
MAX_AUDIO_LEN_IN_SECONDS = 10

### VCTK 데이터셋 다운로드
VCTK_DOWNLOAD_PATH = os.path.join(CURRENT_PATH, "VCTK")
# 오디오 리샘플링에 사용할 스레드 수 정의
NUM_RESAMPLE_THREADS = 10

In [3]:
# VCTK 데이터셋이 이미 다운로드되지 않았다면 다운로드
if not os.path.exists(VCTK_DOWNLOAD_PATH):
    print(">>> VCTK 데이터셋을 다운로드 중:")
    download_vctk(VCTK_DOWNLOAD_PATH, use_kaggle=True)
    resample_files(VCTK_DOWNLOAD_PATH, SAMPLE_RATE, file_ext="wav", n_jobs=NUM_RESAMPLE_THREADS)

In [4]:
# 구성 초기화
vctk_config = BaseDatasetConfig(
    formatter="vctk_old",
    dataset_name="vctk",
    meta_file_train="",
    meta_file_val="",
    path=VCTK_DOWNLOAD_PATH,
    language="en",
    ignored_speakers=[
        "p261",
        "p225",
        "p294",
        "p347",
        "p238",
        "p234",
        "p248",
        "p335",
        "p245",
        "p326",
        "p302",
    ],  # 논문 실험을 완전히 복제하기 위해 테스트 화자를 무시
)

# 모든 데이터셋 구성을 여기에 추가합니다. 현재 VCTK 데이터셋만으로 학습하고자 하므로 VCTK만 추가합니다.
# 참고: 새 데이터셋을 추가하고자 할 경우, 여기에 추가하면 자동으로 이 새 데이터셋의 스피커 임베딩(d-벡터)이 계산됩니다.
DATASETS_CONFIG_LIST = [vctk_config]

In [5]:
## 스피커 임베딩 추출
SPEAKER_ENCODER_CHECKPOINT_PATH = (
    "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar"
)
SPEAKER_ENCODER_CONFIG_PATH = "https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json"

D_VECTOR_FILES = []  # 학습 중에 사용할 스피커 임베딩/d-벡터 목록

In [6]:
# 모든 데이터셋 구성을 반복하여 스피커 임베딩이 이미 계산되었는지 확인하고, 계산되지 않았으면 계산
for dataset_conf in DATASETS_CONFIG_LIST:
    # 임베딩이 아직 계산되지 않았다면 계산
    embeddings_file = os.path.join(dataset_conf.path, "speakers.pth")
    if not os.path.isfile(embeddings_file):
        print(f">>> Computing the speaker embeddings for the {dataset_conf.dataset_name} dataset")
        compute_embeddings(
            SPEAKER_ENCODER_CHECKPOINT_PATH,
            SPEAKER_ENCODER_CONFIG_PATH,
            embeddings_file,
            old_speakers_file=None,
            config_dataset_path=None,
            formatter_name=dataset_conf.formatter,
            dataset_name=dataset_conf.dataset_name,
            dataset_path=dataset_conf.path,
            meta_file_train=dataset_conf.meta_file_train,
            meta_file_val=dataset_conf.meta_file_val,
            disable_cuda=False,
            no_eval=False,
        )
    D_VECTOR_FILES.append(embeddings_file)

In [7]:
# 학습에 사용되는 오디오 구성
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    hop_length=256,
    win_length=1024,
    fft_size=1024,
    mel_fmin=0.0,
    mel_fmax=None,
    num_mels=80,
)

# YourTTS 모델에 필요한 매개변수를 설정하여 VITSArgs 초기화
model_args = VitsArgs(
    d_vector_file=D_VECTOR_FILES,
    use_d_vector_file=True,
    d_vector_dim=512,
    num_layers_text_encoder=10,
    speaker_encoder_model_path=SPEAKER_ENCODER_CHECKPOINT_PATH,
    speaker_encoder_config_path=SPEAKER_ENCODER_CONFIG_PATH,
    resblock_type_decoder="2",  # 논문에서는 실수로 ResNet 블록 유형 2를 사용하여 YourTTS를 학습했지만, 원한다면 VITS 모델처럼 유형 1을 사용할 수 있음
    # 논문에서 설명된 스피커 일관성 손실(SCL)을 활성화하는 데 유용한 매개변수
    # use_speaker_encoder_as_loss=True,
    # 다국어 학습을 활성화하는 데 유용한 매개변수
    use_language_embedding=True,
    embedded_language_dim=6,
)

In [8]:
# 일반 학습 구성. 여기에서 배치 크기 및 기타 유용한 매개변수를 변경할 수 있음
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="YourTTS",
    run_description="""
            - Original YourTTS trained using VCTK dataset
        """,
    dashboard_logger="tensorboard",
    logger_uri=None,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    eval_split_max_size=256,
    print_step=25,
    plot_step=100,
    log_model_step=1000,
    save_step=5000,
    save_n_checkpoints=2,
    save_checkpoints=False,
    target_loss="loss_1",
    print_eval=True,
    use_phonemes=False,
    phonemizer="espeak",
    phoneme_language="en",
    compute_input_seq_cache=True,
    add_blank=True,
    text_cleaner="multilingual_cleaners",
    characters=CharactersConfig(
        characters_class="TTS.tts.models.vits.VitsCharacters",
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\u00af\u00b7\u00df\u00e0\u00e1\u00e2\u00e3\u00e4\u00e6\u00e7\u00e8\u00e9\u00ea\u00eb\u00ec\u00ed\u00ee\u00ef\u00f1\u00f2\u00f3\u00f4\u00f5\u00f6\u00f9\u00fa\u00fb\u00fc\u00ff\u0101\u0105\u0107\u0113\u0119\u011b\u012b\u0131\u0142\u0144\u014d\u0151\u0153\u015b\u016b\u0171\u017a\u017c\u01ce\u01d0\u01d2\u01d4\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f\u0451\u0454\u0456\u0457\u0491\u2013!'(),-.:;? ",
        punctuations="!'(),-.:;? ",
        phonemes="",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=None,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    test_sentences=[
        [
            "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
            "VCTK_old_p277",
            None,
            "en",
        ],
        [
            "Be a voice, not an echo.",
            "VCTK_old_p239",
            None,
            "en",
        ],
        [
            "I'm sorry Dave. I'm afraid I can't do that.",
            "VCTK_old_p258",
            None,
            "en",
        ],
        [
            "This cake is great. It's so delicious and moist.",
            "VCTK_old_p244",
            None,
            "en",
        ],
        [
            "Prior to November 22, 1963.",
            "VCTK_old_p305",
            None,
            "en",
        ],
    ],
    # Enable the weighted sampler
    use_weighted_sampler=True,
    # Ensures that all speakers are seen in the training batch equally no matter how many samples each speaker has
    weighted_sampler_attrs={"speaker_name": 1.0},
    weighted_sampler_multipliers={"speaker_name":{}},
    # It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
    speaker_encoder_loss_alpha=9.0,
)

In [9]:
# 학습 샘플 및 평가 샘플 로드
train_samples, eval_samples = load_tts_samples(
    config.datasets,
    eval_split=True,  # 학습 및 평가 데이터를 분리
    eval_split_max_size=config.eval_split_max_size,  # 평가 데이터셋의 최대 크기
    eval_split_size=config.eval_split_size,  # 평가 데이터셋의 비율
)

 | > Found 39846 files in /home/research/VCTK


In [10]:
train_samples

[{'text': '"It will be too hot."',
  'audio_file': '/home/research/VCTK/wav48/p376/p376_127.wav',
  'speaker_name': 'VCTK_old_p376',
  'root_path': '/home/research/VCTK',
  'language': 'en',
  'audio_unique_name': 'vctk#wav48/p376/p376_127'},
 {'text': "And it's more subversive.",
  'audio_file': '/home/research/VCTK/wav48/p336/p336_199.wav',
  'speaker_name': 'VCTK_old_p336',
  'root_path': '/home/research/VCTK',
  'language': 'en',
  'audio_unique_name': 'vctk#wav48/p336/p336_199'},
 {'text': 'To the Hebrews it was a token that there would be no more universal floods.',
  'audio_file': '/home/research/VCTK/wav48/p313/p313_014.wav',
  'speaker_name': 'VCTK_old_p313',
  'root_path': '/home/research/VCTK',
  'language': 'en',
  'audio_unique_name': 'vctk#wav48/p313/p313_014'},
 {'text': 'It was good.\n',
  'audio_file': '/home/research/VCTK/wav48/p266/p266_289.wav',
  'speaker_name': 'VCTK_old_p266',
  'root_path': '/home/research/VCTK',
  'language': 'en',
  'audio_unique_name': 'vctk#

In [11]:
# 모델 초기화
model = Vits.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location="cpu")
  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > initialization of language-embedding layers.


In [12]:
#from trainer import Trainer, TrainerArgs

: 

In [None]:
# 학습기 초기화 및 🚀 시작
trainer = Trainer(
    TrainerArgs(restore_path=RESTORE_PATH, continue_path='/home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000', skip_train_epoch=SKIP_TRAIN_EPOCH, gpu=6), # gpu번호 설정
    config,  # 모델 구성
    output_path=OUT_PATH,  # 출력 경로
    model=model,  # 모델 객체
    train_samples=train_samples,  # 학습 샘플
    eval_samples=eval_samples,  # 평가 샘플
)
trainer.fit()  # 학습 시작

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 6
 | > Num. of GPUs: 15
 | > Num. of CPUs: 64
 | > Num. of Torch Threads: 32
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000
 > Restoring from checkpoint_769132.pth ...


 > `speakers.pth` is saved to /home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000/speakers.pth.
 > `speakers_file` is updated in the config.json.
 > `language_ids.json` is saved to /home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000/language_ids.json.
 > `language_ids_file` is updated in the config.json.


  return torch.load(f, map_location=map_location, **kwargs)
 > Restoring Model...
 > Restoring Optimizer...
 > Model restored from step 769132
  self.scaler = torch.cuda.amp.GradScaler()

 > Model has 86941642 parameters
 > Restoring best loss from best_model_769081.pth ...
 > Starting with loaded last best loss {'train_loss': 60.88365840911865, 'eval_loss': None}

[4m[1m > EPOCH: 0/1000[0m
 --> /home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000




> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 39590
 | > Preprocessing samples
 | > Max text length: 180
 | > Min text length: 9
 | > Avg text length: 39.19906186612576
 | 
 | > Max audio length: 159830.0
 | > Min audio length: 9665.0
 | > Avg audio length: 56672.46181541582
 | > Num. instances discarded samples: 150
 | > Batch group size: 768.
 > Using weighted sampler for attribute 'speaker_name' with alpha '1.0'
{}



[1m > TRAINING (2025-02-05 07:56:40) [0m


 > Attribute weights for '['VCTK_old_p226', 'VCTK_old_p227', 'VCTK_old_p228', 'VCTK_old_p229', 'VCTK_old_p230', 'VCTK_old_p231', 'VCTK_old_p232', 'VCTK_old_p233', 'VCTK_old_p236', 'VCTK_old_p237', 'VCTK_old_p239', 'VCTK_old_p240', 'VCTK_old_p241', 'VCTK_old_p243', 'VCTK_old_p244', 'VCTK_old_p246', 'VCTK_old_p247', 'VCTK_old_p249', 'VCTK_old_p250', 'VCTK_old_p251', 'VCTK_old_p252', 'VCTK_old_p253', 'VCTK_old_p254', 'VCTK_old_p255', 'VCTK_old_p256', 'VCTK_old_p257', 'VCTK_old_p258', 'VCTK_old_p259', 'VCTK_old_p260', 'VCTK_old_p262', 'VCTK_old_p263', 'VCTK_old_p264', 'VCTK_old_p265', 'VCTK_old_p266', 'VCTK_old_p267', 'VCTK_old_p268', 'VCTK_old_p269', 'VCTK_old_p270', 'VCTK_old_p271', 'VCTK_old_p272', 'VCTK_old_p273', 'VCTK_old_p274', 'VCTK_old_p275', 'VCTK_old_p276', 'VCTK_old_p277', 'VCTK_old_p278', 'VCTK_old_p279', 'VCTK_old_p280', 'VCTK_old_p281', 'VCTK_old_p282', 'VCTK_old_p283', 'VCTK_old_p284', 'VCTK_old_p285', 'VCTK_old_p286', 'VCTK_old_p287', 'VCTK_old_p288', 'VCTK_old_p292', 'VCT



tokens
tensor([[164,  56, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0],
        ...,
        [164,  38, 164,  ...,   0,   0,   0],
        [164,  60, 164,  ...,   0,   0,   0],
        [164,  39, 164,  ...,   0,   0,   0]])
token_lens
tensor([ 77,  99,  95, 129,  57,  57, 117,  91,  85,  63,  69,  85,  59, 125,
         75,  97])
token_rel_lens
tensor([0.5969, 0.7674, 0.7364, 1.0000, 0.4419, 0.4419, 0.9070, 0.7054, 0.6589,
        0.4884, 0.5349, 0.6589, 0.4574, 0.9690, 0.5814, 0.7519])
waveform
tensor([[[-0.0008, -0.0014, -0.0013,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0038,  0.0069,  0.0061,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0042, -0.0078, -0.0068,  ...,  0.0000,  0.0000,  0.0000]],

        ...,

        [[ 0.0008,  0.0017,  0.0016,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0008,  0.0015,  0.0014,  ...,  0.0064,  0.0000,  0.0000]],

        [[-0.0044, -0.0080, -0.0068,  ...,  0.0

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.)
  return _VF.stft(  # type: ignore[attr-defined]


tokens
tensor([[164,  56, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0],
        ...,
        [164,  38, 164,  ...,   0,   0,   0],
        [164,  60, 164,  ...,   0,   0,   0],
        [164,  39, 164,  ...,   0,   0,   0]], device='cuda:6')
token_lens
tensor([ 77,  99,  95, 129,  57,  57, 117,  91,  85,  63,  69,  85,  59, 125,
         75,  97], device='cuda:6')
token_rel_lens
tensor([0.5969, 0.7674, 0.7364, 1.0000, 0.4419, 0.4419, 0.9070, 0.7054, 0.6589,
        0.4884, 0.5349, 0.6589, 0.4574, 0.9690, 0.5814, 0.7519],
       device='cuda:6')
waveform
tensor([[[-0.0008, -0.0014, -0.0013,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0038,  0.0069,  0.0061,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0042, -0.0078, -0.0068,  ...,  0.0000,  0.0000,  0.0000]],

        ...,

        [[ 0.0008,  0.0017,  0.0016,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0008,  0.0015,  0.0014,  ...,  0.0064,  0.0000,  

  with autocast(enabled=False):  # use float32 for the criterion
  with autocast(enabled=False):
  with autocast(enabled=False):  # use float32 for the criterion


tokens
tensor([[164,  57, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0],
        ...,
        [164,  52, 164,  ...,   0,   0,   0],
        [164,  60, 164,  ...,   0,   0,   0],
        [164,  57, 164,  ...,   0,   0,   0]])
token_lens
tensor([ 75,  95, 101,  79,  61, 119,  95,  63, 109,  71, 115,  71,  85, 111,
         67,  57])
token_rel_lens
tensor([0.6303, 0.7983, 0.8487, 0.6639, 0.5126, 1.0000, 0.7983, 0.5294, 0.9160,
        0.5966, 0.9664, 0.5966, 0.7143, 0.9328, 0.5630, 0.4790])
waveform
tensor([[[ 0.0030,  0.0053,  0.0046,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0005,  0.0009,  0.0008,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0015, -0.0026, -0.0022,  ...,  0.0000,  0.0000,  0.0000]],

        ...,

        [[-0.0008, -0.0015, -0.0013,  ..., -0.0024,  0.0000,  0.0000]],

        [[-0.0016, -0.0032, -0.0031,  ..., -0.0024, -0.0024, -0.0026]],

        [[-0.0028, -0.0052, -0.0045,  ..., -0.0

 > Keyboard interrupt detected.
 > Saving model before exiting...

 > CHECKPOINT : /home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000/checkpoint_769136.pth
 ! Run is kept in /home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000
