In [46]:
from models.edenTTS import EdenTTS
from hparams import hparams as hp
from utils.paths import Paths
import torch
import time
import soundfile as sf
import os
from utils.log_util import get_logger
import numpy as np
from text.en_util import text_to_sequence
from qwen_tts import Qwen3TTSTokenizer
import soundfile as sf
# THÊM THƯ VIỆN ĐỂ NGHE AUDIO TRỰC TIẾP TRÊN JUPYTER
from IPython.display import Audio, display 

device = "cuda" if torch.cuda.is_available() else "cpu"
log = get_logger(__name__)

In [2]:
# ==========================================
# 1. LOAD TOKENIZER VÀ MODEL (Chỉ cần chạy 1 lần)
# ==========================================
log.info("Loading Qwen3 TTS Tokenizer...")
tokenizer = Qwen3TTSTokenizer.from_pretrained(
    "Qwen/Qwen3-TTS-Tokenizer-12Hz",
    device_map=device,
)

2026-02-20 09:07:32,108 INFO 2871649631.py-4 Loading Qwen3 TTS Tokenizer...


In [51]:
# Khởi tạo và load trọng số EdenTTS
tts_model_id = hp.tts_model_id
paths = Paths(hp.data_path, tts_model_id, speaker=hp.speaker)
tts_model_path = paths.tts_latest_weights
print(f"Đường dẫn checkpoint: {tts_model_path}")

if not os.path.exists(tts_model_path):
    print(f"⚠️ LỖI: Không tìm thấy file checkpoint tại: {tts_model_path}")
else:
    tts_model = EdenTTS(hp).to(device)
    tts_model.load(tts_model_path)
    tts_model.eval() # Bắt buộc set eval mode
    log.info("EdenTTS Model loaded successfully!")

out_path = "./"
os.makedirs(out_path, exist_ok=True)

Đường dẫn checkpoint: checkpoints/ljs/tts_eden/latest_weights.pyt


  WeightNorm.apply(module, name, dim)
2026-02-20 09:27:49,782 INFO net_utils.py-25  number of weights:28449408, tol params:104, params requires_grad:103
2026-02-20 09:27:49,783 INFO net_utils.py-25  number of weights:8141328, tol params:22, params requires_grad:22
2026-02-20 09:27:49,783 INFO net_utils.py-25  number of weights:591617, tol params:10, params requires_grad:10
2026-02-20 09:27:49,784 INFO net_utils.py-25  number of weights:41126289, tol params:147, params requires_grad:146
2026-02-20 09:27:49,784 INFO edenTTS.py-70 tol_params: 41126289, text_encoder:28449408,decoder:8141328,dur:591617,tol_infer:37182353
2026-02-20 09:27:49,889 INFO 1863235423.py-13 EdenTTS Model loaded successfully!


In [52]:
enc = tokenizer.encode("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/tokenizer_demo_1.wav")
OutputClass = enc.__class__

In [53]:
# ==========================================
# 2. ĐỊNH NGHĨA HÀM SUY LUẬN
# ==========================================
def synthesize_text(text):
    if not os.path.exists(tts_model_path):
        return
        
    log.info(f"Processing: '{text}'")
    phones = text_to_sequence(text, token_type=hp.token_type) 
    phones = torch.tensor(phones).long().unsqueeze(0).to(device)
    
    s1 = time.time()
    # Dự đoán Latent vector liên tục
    mel_pred = tts_model.inference(phones)
    log.info(f"Acoustic model inference time: {time.time() - s1:.3f}s")
    safe_name = "".join([c if c.isalnum() else "_" for c in text[:15]])
    file_name = f"{safe_name}_{int(time.time())}.wav"
    file_path = os.path.join(out_path, file_name)
    with torch.no_grad():
        # De-normalize: Nhân 2048, làm tròn, ép kiểu integer và kẹp trong khoảng codebook
        tokens = torch.round(mel_pred * 2048).long()
        tokens = torch.clamp(tokens, min=0, max=2047)
        new_output = OutputClass(audio_codes=[tokens[0]])
        print(new_output)
        # print({"audio_codes": [tokens[0]]})
        
        # Bọc tensor vào dictionary với key "audio_codes"
        wav, sr = tokenizer.decode(new_output)
        print(wav)
        sf.write("test.wav", wav[0], sr)
    
    log.info(f"Saved at: {file_path}")
    
    # Render thanh Audio ngay trên Jupyter Cell
    display(Audio(wav, rate=sr))

In [55]:
# Ô CHẠY THỬ NGHIỆM (Chạy bao nhiêu lần tùy thích)
text_to_test = "in being comparatively modern."
synthesize_text(text_to_test)

2026-02-20 09:32:36,886 INFO 481343099.py-8 Processing: 'in being comparatively modern.'
2026-02-20 09:32:36,904 INFO 481343099.py-15 Acoustic model inference time: 0.016s
2026-02-20 09:32:36,971 INFO 481343099.py-32 Saved at: ./in_being_compar_1771579956.wav


Qwen3TTSTokenizerV2EncoderOutput(audio_codes=[tensor([[ 803, 1068, 1040, 1117,  959, 1265, 1166,  989, 1073,  938, 1154, 1074,
          990, 1068,  983, 1042],
        [1018, 1040, 1009, 1048, 1142, 1139, 1140,  993, 1077, 1025, 1117, 1079,
         1006, 1075,  959, 1015],
        [1124, 1070, 1051, 1050, 1135, 1176, 1130, 1042, 1103, 1017, 1144, 1109,
         1021, 1089,  974, 1021],
        [1137, 1064, 1062, 1053, 1112, 1166, 1117, 1041, 1099, 1017, 1135, 1104,
         1025, 1097,  978, 1031],
        [1144, 1054, 1069, 1061, 1100, 1156, 1111, 1046, 1097, 1020, 1130, 1105,
         1033, 1107,  984, 1039],
        [1150, 1049, 1075, 1064, 1095, 1154, 1111, 1051, 1097, 1024, 1130, 1109,
         1041, 1112,  988, 1043],
        [1151, 1047, 1078, 1066, 1092, 1154, 1110, 1053, 1097, 1026, 1129, 1111,
         1045, 1114,  991, 1044],
        [1152, 1046, 1079, 1067, 1091, 1154, 1110, 1053, 1097, 1027, 1129, 1111,
         1046, 1115,  992, 1045],
        [1152, 1046, 1079, 1067, 1