In [None]:
from models.edenTTS import EdenTTS
from hparams import hparams as hp
from utils.paths import Paths
import torch
import time
import soundfile as sf
import os
from utils.log_util import get_logger
import numpy as np
from text.en_util import text_to_sequence
from qwen_tts import Qwen3TTSTokenizer
import soundfile as sf
# THÊM THƯ VIỆN ĐỂ NGHE AUDIO TRỰC TIẾP TRÊN JUPYTER
from IPython.display import Audio, display 

device = "cuda" if torch.cuda.is_available() else "cpu"
log = get_logger(__name__)

In [2]:
# ==========================================
# 1. LOAD TOKENIZER VÀ MODEL (Chỉ cần chạy 1 lần)
# ==========================================
log.info("Loading Qwen3 TTS Tokenizer...")
tokenizer = Qwen3TTSTokenizer.from_pretrained(
    "Qwen/Qwen3-TTS-Tokenizer-12Hz",
    device_map=device,
)

2026-02-20 15:34:35,280 INFO 2871649631.py-4 Loading Qwen3 TTS Tokenizer...


In [12]:
# Khởi tạo và load trọng số EdenTTS
tts_model_id = hp.tts_model_id
paths = Paths(hp.data_path, tts_model_id, speaker=hp.speaker)
tts_model_path = paths.tts_latest_weights
print(f"Đường dẫn checkpoint: {tts_model_path}")

if not os.path.exists(tts_model_path):
    print(f"⚠️ LỖI: Không tìm thấy file checkpoint tại: {tts_model_path}")
else:
    tts_model = EdenTTS(hp).to(device)
    tts_model.load("/kaggle/edentts-with-qwen3-tokenizer/checkpoints/ljs/tts_eden/step_10000_weights.pyt")
    tts_model.eval() # Bắt buộc set eval mode
    log.info("EdenTTS Model loaded successfully!")

out_path = "./"
os.makedirs(out_path, exist_ok=True)

Đường dẫn checkpoint: checkpoints/ljs/tts_eden/latest_weights.pyt


2026-02-20 15:36:24,052 INFO net_utils.py-25  number of weights:28449408, tol params:104, params requires_grad:103
2026-02-20 15:36:24,053 INFO net_utils.py-25  number of weights:34124800, tol params:45, params requires_grad:45
2026-02-20 15:36:24,053 INFO net_utils.py-25  number of weights:591617, tol params:10, params requires_grad:10
2026-02-20 15:36:24,054 INFO net_utils.py-25  number of weights:69198209, tol params:170, params requires_grad:169
2026-02-20 15:36:24,054 INFO edenTTS.py-74 tol_params: 69198209, text_encoder:28449408,decoder:34124800,dur:591617,tol_infer:63165825
2026-02-20 15:36:24,205 INFO 3107919420.py-13 EdenTTS Model loaded successfully!


In [13]:
enc = tokenizer.encode("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-TTS-Repo/tokenizer_demo_1.wav")
OutputClass = enc.__class__

In [14]:
# ==========================================
# 2. ĐỊNH NGHĨA HÀM SUY LUẬN
# ==========================================
def synthesize_text(text):
    if not os.path.exists(tts_model_path):
        return
        
    log.info(f"Processing: '{text}'")
    phones = text_to_sequence(text, token_type=hp.token_type) 
    phones = torch.tensor(phones).long().unsqueeze(0).to(device)
    
    s1 = time.time()
    # Dự đoán Latent vector liên tục
    mel_pred = tts_model.inference(phones)
    print(mel_pred.shape)
    log.info(f"Acoustic model inference time: {time.time() - s1:.3f}s")
    safe_name = "".join([c if c.isalnum() else "_" for c in text[:15]])
    file_name = f"{safe_name}_{int(time.time())}.wav"
    file_path = os.path.join(out_path, file_name)
    with torch.no_grad():
        # De-normalize: Nhân 2048, làm tròn, ép kiểu integer và kẹp trong khoảng codebook
        tokens = torch.round(mel_pred).long()
        tokens = torch.clamp(tokens, min=0, max=2047)
        new_output = OutputClass(audio_codes=[tokens[0]])
        print(new_output)
        # print({"audio_codes": [tokens[0]]})
        
        # Bọc tensor vào dictionary với key "audio_codes"
        wav, sr = tokenizer.decode(new_output)
        print(wav)
        sf.write("test.wav", wav[0], sr)
    
    log.info(f"Saved at: {file_path}")
    
    # Render thanh Audio ngay trên Jupyter Cell
    display(Audio(wav, rate=sr))

In [15]:
# Ô CHẠY THỬ NGHIỆM (Chạy bao nhiêu lần tùy thích)
text_to_test = "in being comparatively modern."
synthesize_text(text_to_test)

2026-02-20 15:36:30,596 INFO 21558280.py-8 Processing: 'in being comparatively modern.'
2026-02-20 15:36:30,607 INFO 21558280.py-16 Acoustic model inference time: 0.010s
2026-02-20 15:36:30,668 INFO 21558280.py-33 Saved at: ./in_being_compar_1771601790.wav


torch.Size([1, 24, 16])
Qwen3TTSTokenizerV2EncoderOutput(audio_codes=[tensor([[   9, 1501,  540, 2018,  681, 1463, 1658,  226,  255, 1919, 1847, 1537,
          305, 1861, 1878, 1772],
        [  70, 1537,  201, 2010, 1634,  504,   32,  789, 1401,  251,  362,  180,
          252, 1136,  909,  974],
        [ 181, 1234, 1250, 1676, 1746, 1451,  462, 1168, 1206,  566,  712,  213,
          699, 1735,  119, 1298],
        [1534,   65,  352, 1828, 1503, 1309, 1229, 1986, 1448,   20, 1176,  357,
          661, 1654, 2031,  162],
        [ 744, 1757,   78, 1934,   17,  488, 1171,  789,  392, 1944, 1857,  714,
         1151,  101,  390, 1196],
        [1134,  885, 1543, 2016, 1634,  886, 1171, 1168, 1401,  387, 1471, 1216,
         1315,  355, 1454, 1904],
        [ 191,  930, 2045, 1224, 1321, 1431,  442,  825, 2044,  391, 1100,  284,
          535, 1971, 1854,    0],
        [ 191,  105, 1983,   64, 1634, 1315, 1579, 1130, 1401,  387, 1538,  896,
          291,  490, 1521, 1261],
        [1