In [1]:
import os
import shutil

import torch
from trainer import Trainer, TrainerArgs
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import CharactersConfig, Vits, VitsArgs, VitsAudioConfig

In [2]:
# Optional cap on the number of PyTorch CPU threads (left disabled):
# torch.set_num_threads(24)

# Base path (not referenced by the other cells shown in this notebook)
CURRENT_PATH = "/home/research"

# Folder that receives the model outputs (config, checkpoints, TensorBoard logs)
OUT_PATH = "/home/research"  # alternative: "/raid/coqui/Checkpoints/original-YourTTS/"

# Name of this run
RUN_NAME = "YourTTS-ko-standard"

# Audio sampling rate, in samples per second
SAMPLE_RATE = 16000

# Longest audio clip (in seconds) used for training; longer clips are ignored
MAX_AUDIO_LEN_IN_SECONDS = 6

# Batch size used for both training and evaluation
BATCH_SIZE = 32

In [3]:
# Describe the single dataset used for this run.
data_config = BaseDatasetConfig(
    # Identification of the corpus and of the formatter used to read it
    dataset_name="korean_standard_dataset",
    formatter="ksd",
    language="standard",
    # Dataset root and the metadata files for the train / validation splits
    path='/data2/personal/sungjin/korean_standard',
    meta_file_train="trainset.txt",
    meta_file_val="validset.txt",
)

# The training config expects a list of dataset configs; only one is used here.
DATASETS_CONFIG_LIST = [data_config]

In [4]:
D_VECTOR_FILES = []  # Speaker-embedding / d-vector files to use during training

In [5]:
# One "speakers.json" embedding file is expected at the root of every dataset.
# The list is rebuilt from scratch (instead of appended to) so that re-running
# this cell does not accumulate duplicate paths.
D_VECTOR_FILES = [
    os.path.join(dataset_conf.path, "speakers.json")
    for dataset_conf in DATASETS_CONFIG_LIST
]

In [6]:
# Audio settings used for training.
audio_config = VitsAudioConfig(
    sample_rate=SAMPLE_RATE,
    # STFT framing
    fft_size=1024,
    win_length=1024,
    hop_length=256,
    # Mel filter bank
    num_mels=80,
    mel_fmin=0,
    mel_fmax=8000,
)

# Build the VitsArgs with the parameters needed for the YourTTS model.
model_args = VitsArgs(
    # Network size
    hidden_channels=196,
    num_layers_text_encoder=10,
    num_layers_flow=4,
    resblock_type_decoder="1",
    # Speaker conditioning from the d-vector files collected earlier
    use_d_vector_file=True,
    d_vector_file=D_VECTOR_FILES,
    d_vector_dim=192,
    # Parameters useful for enabling multilingual training
    use_language_embedding=True,
    embedded_language_dim=6,
    num_languages=6,
)

In [7]:
# Folder that holds the phoneme cache for this run.
phoneme_cache_folder_path = os.path.join(OUT_PATH, "phoneme_cache")

# Start from an empty cache folder: delete whatever a previous run left behind.
# Sub-directories are removed recursively; plain os.remove() would raise on
# them and leave the cache half-cleared.
if os.path.isdir(phoneme_cache_folder_path):
    for cache_entry in os.scandir(phoneme_cache_folder_path):
        if cache_entry.is_dir(follow_symlinks=False):
            shutil.rmtree(cache_entry.path)
        else:
            os.remove(cache_entry.path)

# Create the folder when it is missing; a no-op when it already exists.
os.makedirs(phoneme_cache_folder_path, exist_ok=True)

# Symbol inventory: the Hangul conjoining jamo, built from code-point ranges
#   U+1100..U+1112  leading consonants
#   U+1161..U+1175  vowels
#   U+11A8..U+11C2  trailing consonants
# A literal string is fragile here: NFC normalisation (editors, copy/paste)
# merges the adjacent pair U+1112 + U+1161 into the single syllable U+D558,
# which would silently drop both jamo from the vocabulary.
KOREAN_JAMO = "".join(
    chr(code_point)
    for first, last in ((0x1100, 0x1112), (0x1161, 0x1175), (0x11A8, 0x11C2))
    for code_point in range(first, last + 1)
)

# General training configuration. Batch size and other useful parameters can
# be changed here.
config = VitsConfig(
    output_path=OUT_PATH,
    model_args=model_args,
    run_name=RUN_NAME,
    project_name="YourTTS",
    run_description="",
    dashboard_logger="tensorboard",
    logger_uri=None,
    audio=audio_config,
    batch_size=BATCH_SIZE,
    batch_group_size=48,
    eval_batch_size=BATCH_SIZE,
    num_loader_workers=8,
    print_step=25,
    plot_step=100,
    log_model_step=1000,
    save_step=28250,
    save_n_checkpoints=1,
    save_checkpoints=True,
    target_loss="loss_1",
    print_eval=False,
    use_phonemes=True,
    phonemizer="ko_kr_phonemizer",
    compute_input_seq_cache=True,
    add_blank=False,
    characters=CharactersConfig(
        pad="_",
        eos="&",
        bos="*",
        blank=None,
        characters=KOREAN_JAMO,
        punctuations=".,!?~ ",
        is_unique=True,
        is_sorted=True,
    ),
    phoneme_cache_path=phoneme_cache_folder_path,
    precompute_num_workers=12,
    start_by_longest=True,
    datasets=DATASETS_CONFIG_LIST,
    cudnn_benchmark=False,
    # Maximum audio length expressed in samples
    max_audio_len=SAMPLE_RATE * MAX_AUDIO_LEN_IN_SECONDS,
    mixed_precision=False,
    # Each entry: [text, speaker name, <None>, language name]
    # NOTE(review): the third slot is presumably a style reference - confirm.
    test_sentences=[
        [
            "안녕하세요, 테스트 문장입니다.",
            "0005_G1A3E7_KYG",
            None,
            "standard",
        ],
        [
            "제 이름은 김성진이라고 합니다.",
            "0028_G1A2E7_CDL",
            None,
            "standard",
        ],
        [
            "저는 현재 서울대학교에 재학중이에요.",
            "0101_G1A4E7_HDH",
            None,
            "standard",
        ],
        [
            "제 나이가 올해로 스물네살입니다.",
            "0413_G2A6E7_PHK",
            None,
            "standard",
        ],
        [
            "오늘도 힘찬 하루가 되세요.",
            "0388_G2A2E7_CHJ",
            None,
            "standard",
        ],
    ],
    # It defines the Speaker Consistency Loss (SCL) α to 9 like the paper
    # NOTE(review): model_args does not set use_speaker_encoder_as_loss, so this
    # weight presumably has no effect as configured - confirm.
    speaker_encoder_loss_alpha=9.0,
)

In [8]:
# Load the training and evaluation samples; eval_split=True asks the loader to
# return the two sets separately.
train_samples, eval_samples = load_tts_samples(config.datasets, eval_split=True)

 | > Found 91671 files in /data2/personal/sungjin/korean_standard


In [9]:
# Initialise the model from the training configuration
model = Vits.init_from_config(config)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:8000
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of language-embedding layers.


: 

In [None]:
# Build the trainer and launch training 🚀
# gpu selects the GPU number to train on. To resume an earlier run instead, use e.g.:
# trainer_args = TrainerArgs(continue_path='/home/research/YourTTS-EN-VCTK-January-28-2025_09+47AM-0000000', gpu=6)
trainer_args = TrainerArgs(gpu=2)

trainer = Trainer(
    trainer_args,
    config,  # training configuration
    output_path=OUT_PATH,  # where outputs are written
    model=model,  # model object
    train_samples=train_samples,  # training samples
    eval_samples=eval_samples,  # evaluation samples
)

# Start training
trainer.fit()

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Current device: 2
 | > Num. of GPUs: 15
 | > Num. of CPUs: 64
 | > Num. of Torch Threads: 32
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=/home/research/YourTTS-ko-standard-February-06-2025_08+35AM-0000000

 > Model has 91007858 parameters


 > `speakers.pth` is saved to /home/research/YourTTS-ko-standard-February-06-2025_08+35AM-0000000/speakers.pth.
 > `speakers_file` is updated in the config.json.
 > `language_ids.json` is saved to /home/research/YourTTS-ko-standard-February-06-2025_08+35AM-0000000/language_ids.json.
 > `language_ids_file` is updated in the config.json.



[4m[1m > EPOCH: 0/1000[0m
 --> /home/research/YourTTS-ko-standard-February-06-2025_08+35AM-0000000




> DataLoader initialization
| > Tokenizer:
	| > add_blank: False
	| > use_eos_bos: False
	| > use_phonemes: True
	| > phonemizer:
		| > phoneme language: ko-kr
		| > phoneme backend: ko_kr_phonemizer
| > Number of instances : 91671



[1m > TRAINING (2025-02-06 08:35:42) [0m


 | > Preprocessing samples
 | > Max text length: 54
 | > Min text length: 6
 | > Avg text length: 19.92432540077666
 | 
 | > Max audio length: 95996.0
 | > Min audio length: 25546.0
 | > Avg audio length: 48011.9193689358
 | > Num. instances discarded samples: 1284
 | > Batch group size: 1536.


Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:873.)
  return _VF.stft(  # type: ignore[attr-defined]
  with autocast(enabled=False):  # use float32 for the criterion
  with autocast(enabled=False):
  with autocast(enabled=False):  # use float32 for the criterion

[1m   --> TIME: 2025-02-06 08:35:56 -- STEP: 0/2825 -- GLOBAL_STEP: 0[0m
     | > loss_disc: 6.11601448059082  (6.11601448059082)
     | > loss_disc_real_0: 1.00130295753479  (1.00130295753479)
     | > loss_disc_real_1: 0.983552873134613  (0.983552873134613)
     | > loss_disc_real_2: 1.0362628698349  (1.0362628698349)
     | > loss_disc_real_3: 1.0310255289077759  (1.0310255289077759)
     | > loss_disc_real_4: 1.030104160308838  (1.030104160308838)
     | > loss_disc_real_5: 1.0326470136642456  (1.0326470136642456)
     | > loss_0: 6.11601448059082  (6.11601448059082)
     | > grad_norm_0: tensor(6.8593, 

: 