In [1]:
# (optional) Silence torchtext's deprecation warning.
try:
    import torchtext; torchtext.disable_torchtext_deprecation_warning()
except Exception:
    # Best-effort only: if torchtext cannot be imported or the call fails,
    # the notebook carries on without silencing the warning.
    pass

import IPython.display as ipd
import torch

from utils.hparams import get_hparams_from_file
from utils.task import load_checkpoint, load_vocab          # CHANGED: vocab loader
from model.models import SynthesizerTrn
from text import tokenizer      

In [2]:
# ---- Config ----
# `model` names the directory under ./datasets that holds config.yaml and the logs.
model = "custom_base"
logdir = f"./datasets/{model}/logs"                        # point this at the logs directory, if there is one
checkpoint = f"{logdir}/G_1950000_uz.pth"

In [3]:
# ---- HParams & Vocab ----
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")
vocab = load_vocab(hps.data.vocab_file)                    # CHANGED: load the vocab.txt that was used for training
print("Vocab size:", len(vocab))

Vocab size: 90


In [4]:
# ---- Model ----
# Second constructor argument: n_mels when the config enables mel features,
# otherwise n_fft // 2 + 1.
if hps.data.use_mel:
    filter_length = hps.data.n_mels
else:
    filter_length = hps.data.n_fft // 2 + 1
segment_size = hps.train.segment_size // hps.data.hop_length
device = "cuda" if torch.cuda.is_available() else "cpu"

# CHANGED: the first argument is len(vocab) instead of len(symbols).
net_g = SynthesizerTrn(len(vocab), filter_length, segment_size, **hps.model)
net_g = net_g.to(device).eval()



In [5]:
# Load the checkpoint into net_g (the optimizer is not needed, so None is passed).
_ = load_checkpoint(checkpoint, net_g, None)

INFO:root:Loaded checkpoint './datasets/custom_base/logs/G_1950000_uz.pth' (iteration 6725)


In [6]:
# ---- Text -> ids ----
def get_text(text: str) -> torch.LongTensor:
    """Tokenize ``text`` and return the resulting ids as a ``torch.LongTensor``.

    CHANGED: tokenization goes through the notebook-level ``vocab``; the
    cleaners and language are read from ``hps.data``.
    """
    return torch.LongTensor(
        tokenizer(text, vocab, hps.data.text_cleaners, language=hps.data.language)
    )

In [7]:
# # ---- Inference ----
# text = "Single-stage text-to-speech models are improving rapidly, but evaluation still requires careful listening and measurement. The meeting starts at 7:45 p.m. on Friday, October 3rd, 2025. The total cost was $1,249.50, excluding tax and shipping. Dr. Smith emailed Prof. Johnson re: the NLP seminar at MIT. Please read items No. 12–17; skip Fig. 3(b) and Eq. (2). “Quality is not an act,” Aristotle said, “it is a habit.” Can you record the session and record the results in the log? (homograf: RE-cord vs re-CORD). The lead pipes lead to lower water pressure. (homograf: LEED vs leed). In 2024–2025, productivity rose by 3.7%, then fell by 1.2%. E-mail, co-operate, and state-of-the-art are written with hyphens. NASA’s JWST observed a planet 1,000 light-years away. “He whispered, ‘Don’t move,’” she said. Wait… is that twenty-two or twenty-two thousand?. 0 is not number, 10 is number "

In [11]:
# ---- Inference ----
# Uzbek sample passage to synthesize.
# NOTE(review): the string mixes two apostrophe code points (ʻ and ‘) — confirm
# the cleaners/vocab treat both the same way.
text = "akfa universitetiga qariyb oʻttiz gektar yerni doimiy foydalanishga berish ko‘zda tutilgan qaror loyihasi tanqid qilindi, sog‘liqni saqlash vazirligi tomonidan normativ-huquqiy hujjatlar loyihalari muhokamasi portaliga qo‘yilgan. akfa universitetining moddiy-texnik bazasini mustahkamlash bo‘yicha qo‘shimcha chora-tadbirlar to‘g‘risidagi prezident qarori loyihasi kelishish uchun agentligimizga yuborilmagan."

In [7]:
# Conversational sample containing digits and a date; this assignment replaces
# whatever `text` held before.
text = "Assalomu alaykum Komil aka, yaxshimisiz, ishlariz yaxshimi, axvollariz qaley. Ertaga vaqtiz bo'lsa bir telefon qiling, 25 minutlarcha gaplashamiz. sana 2025.09.12."

In [12]:
with torch.no_grad():
    # Token ids with a leading batch dimension, placed on `device`.
    x = get_text(text).unsqueeze(0).to(device)
    x_lens = torch.LongTensor([x.shape[1]]).to(device)

    # noise_scale, noise_scale_w, length_scale — tune these to taste.
    y_hat, attn, mask, *_ = net_g.infer(
        x,
        x_lens,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1.0,
    )
    # First item, first channel of the output, as a float NumPy array on the CPU.
    audio = y_hat[0, 0].float().cpu().numpy()

ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False)

In [33]:
import os, soundfile as sf
import torch
from text import tokenizer

# English test sentences for batch synthesis: times, dates, currency, abbreviations,
# number ranges, homographs, hyphenated words, quotes, and long multi-clause passages.
texts = [
    "Single-stage text-to-speech models are improving rapidly, but evaluation still requires careful listening and measurement.",
    "The meeting starts at 7:45 p.m. on Friday, October 3rd, 2025.",
    "The total cost was $1,249.50, excluding tax and shipping.",
    "Dr. Smith emailed Prof. Johnson re: the NLP seminar at MIT.",
    "Please read items No. 12–17; skip Fig. 3(b) and Eq. (2).",
    "Quality is not an act, Aristotle said, it is a habit.",
    "Can you record the session and record the results in the log?",
    "The lead pipes lead to lower water pressure.",
    "In 2024–2025, productivity rose by 3.7%, then fell by 1.2%.",
    "E-mail, co-operate, and state-of-the-art are written with hyphens.",
    "NASA's JWST observed a planet 1,000 light-years away.",
    "He whispered, 'Don't move,' she said.",
    "Wait… is that twenty-two or twenty-two thousand?",
    "This is a long sentence to stress attention stability; it contains multiple clauses, commas, and pauses, and it should remain fluent without skipping words or repeating syllables unnecessarily.",
    "The quick brown fox jumps over the lazy dog. Pack my box with five dozen liquor jugs. While synthetic voices improve each month, natural prosody remains challenging—especially for long, punctuation-rich passages."
]

def to_ids(text: str):
    """Tokenize ``text`` with the notebook-level ``vocab`` and wrap the ids in a ``torch.LongTensor``.

    The cleaners and language are taken from ``hps.data``.
    """
    token_ids = tokenizer(text, vocab, hps.data.text_cleaners, language=hps.data.language)
    return torch.LongTensor(token_ids)

# Imported once here instead of on every loop iteration; keeping it in this cell
# means the inline preview below does not depend on an import made elsewhere.
import IPython.display as ipd

os.makedirs("tts_out", exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
net_g.eval()

# Inference parameters — adjust to taste:
noise_scale   = 0.667     # timbre/variability
noise_scale_w = 0.8       # duration randomness
length_scale  = 1.0       # >1.0 = slower, <1.0 = faster
MAX_LEN       = 2000      # forwarded to net_g.infer as max_len
N_PREVIEW     = 3         # number of leading samples that are also played inline

with torch.no_grad():
    # Samples are numbered from 1 so the file names start at sample_01.wav.
    for i, t in enumerate(texts, 1):
        x = to_ids(t).unsqueeze(0).to(device)
        x_lens = torch.LongTensor([x.size(1)]).to(device)
        y_hat, attn, mask, *_ = net_g.infer(
            x, x_lens,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
            max_len=MAX_LEN
        )
        wav = y_hat[0, 0].float().cpu().numpy()
        sf.write(f"tts_out/sample_{i:02d}.wav", wav, samplerate=hps.data.sample_rate, subtype="PCM_16")
        if i <= N_PREVIEW:
            # Play the first few results inside the notebook.
            ipd.display(ipd.Audio(wav, rate=hps.data.sample_rate, normalize=False))

print("Saved wavs in ./tts_out")






Saved wavs in ./tts_out


In [14]:
import IPython.display as ipd
import torch
from torch.utils.data import DataLoader

from utils.task import load_checkpoint
from utils.hparams import get_hparams_from_file
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from model.models import SynthesizerTrn
from text.symbols import symbols
from text import tokenizer


def get_text(text: str, hps, vocab=None) -> torch.LongTensor:
    """Tokenize ``text`` according to ``hps`` and return the ids as a ``torch.LongTensor``.

    The tokenizer is called as ``tokenizer(text, vocab, cleaners, language=...)``,
    the same way the earlier cells of this notebook call it. The previous body
    left the ``vocab`` argument out.

    Args:
        text: Raw input text.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.language`` are read, plus ``hps.data.vocab_file`` when
            ``vocab`` is not supplied.
        vocab: Optional, already loaded vocabulary. When omitted it is loaded
            from ``hps.data.vocab_file``.

    Returns:
        The token ids as a ``torch.LongTensor``.
    """
    if vocab is None:
        # Local import: the import cell right above this definition does not
        # bring in load_vocab.
        from utils.task import load_vocab

        # NOTE(review): assumes this config exposes data.vocab_file like the
        # custom_base config used earlier in the notebook — confirm for the
        # ljs / madasr23 / vctk configs.
        vocab = load_vocab(hps.data.vocab_file)
    token_ids = tokenizer(text, vocab, hps.data.text_cleaners, language=hps.data.language)
    return torch.LongTensor(token_ids)

## LJ Speech


In [6]:
# LJ Speech: dataset directory name under ./datasets and checkpoint file name.
model = "ljs_base"
checkpoint = "G_2600.pth"

In [7]:
# Build the LJ Speech synthesizer and restore its weights from the checkpoint.
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")
# n_mels when the config enables mel features, otherwise n_fft // 2 + 1.
filter_length = hps.data.n_mels if hps.data.use_mel else hps.data.n_fft // 2 + 1
segment_size = hps.train.segment_size // hps.data.hop_length
# Fall back to the CPU when CUDA is unavailable instead of failing on .cuda().
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): the output recorded for this cell reports enc_p.emb.weight as
# 129 rows in the checkpoint vs 6 in the model, so len(symbols) does not match
# this checkpoint — confirm which symbol table / vocab it was trained with.
net_g = SynthesizerTrn(len(symbols), filter_length, segment_size, **hps.model).to(device)
_ = net_g.eval()
_ = load_checkpoint(f"./datasets/{model}/logs0/{checkpoint}", net_g, None)

RuntimeError: Error(s) in loading state_dict for SynthesizerTrn:
	size mismatch for enc_p.emb.weight: copying a param with shape torch.Size([129, 192]) from checkpoint, the shape in current model is torch.Size([6, 192]).

In [8]:
# Synthesize one English sentence with the LJ Speech model and play it inline.
stn_tst = get_text("Single-stage text-to-speech models have been actively studied recently, and their results have outperformed two-stage pipeline systems.", hps)
# Put the inputs on whatever device the model's parameters live on, rather than assuming CUDA.
model_device = next(net_g.parameters()).device
with torch.no_grad():
    x_tst = stn_tst.to(model_device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(model_device)

    out = net_g.infer(x_tst, x_tst_lengths, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)
    # First returned value, first item, first channel -> float NumPy array on the CPU.
    audio = out[0][0, 0].cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False))

Failed to create secure directory (/pulse): Permission denied
Failed to create secure directory (/pulse): Permission denied
Failed to create secure directory (/pulse): Permission denied
Failed to create secure directory (/pulse): Permission denied


## MADASR23


In [2]:
# MADASR23: dataset directory name under ./datasets and checkpoint file name.
model = "madasr23_base"
checkpoint = "G_1000.pth"

In [56]:
# Build the multi-speaker MADASR23 synthesizer and restore its weights from the checkpoint.
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")
# n_mels when the config enables mel features, otherwise n_fft // 2 + 1.
filter_length = hps.data.n_mels if hps.data.use_mel else hps.data.n_fft // 2 + 1
segment_size = hps.train.segment_size // hps.data.hop_length
# Fall back to the CPU when CUDA is unavailable instead of failing on .cuda().
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): earlier cells of this notebook size the model with len(vocab)
# rather than len(symbols) — confirm which one matches this checkpoint.
net_g = SynthesizerTrn(len(symbols), filter_length, segment_size, n_speakers=hps.data.n_speakers, **hps.model).to(device)
_ = net_g.eval()
_ = load_checkpoint(f"./datasets/{model}/logs/{checkpoint}", net_g, None)

INFO:root:Loaded checkpoint './logs/madasr23_base/G_55000.pth' (iteration 31)


In [64]:
# Synthesize one sentence with the MADASR23 model for speaker id 78 and play it inline.
stn_tst = get_text("রোপক বা প্ল্যান্টার মেশিন দুই ধরনের হয় বৈদ্যুতিন এবং হাইড্রলিক যান্ত্রিক", hps)
# Put the inputs on whatever device the model's parameters live on, rather than assuming CUDA.
model_device = next(net_g.parameters()).device
with torch.no_grad():
    x_tst = stn_tst.to(model_device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(model_device)
    sid = torch.LongTensor([78]).to(model_device)

    out = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)
    # First returned value, first item, first channel -> float NumPy array on the CPU.
    audio = out[0][0, 0].cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False))

## VCTK


In [None]:
# VCTK: dataset directory name under ./datasets and checkpoint file name.
model = "vctk_base"
checkpoint = "G_1000.pth"

In [None]:
# Build the multi-speaker VCTK synthesizer and restore its weights from the checkpoint.
hps = get_hparams_from_file(f"./datasets/{model}/config.yaml")
# n_mels when the config enables mel features, otherwise n_fft // 2 + 1.
filter_length = hps.data.n_mels if hps.data.use_mel else hps.data.n_fft // 2 + 1
segment_size = hps.train.segment_size // hps.data.hop_length
# Fall back to the CPU when CUDA is unavailable instead of failing on .cuda().
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): earlier cells of this notebook size the model with len(vocab)
# rather than len(symbols) — confirm which one matches this checkpoint.
net_g = SynthesizerTrn(len(symbols), filter_length, segment_size, n_speakers=hps.data.n_speakers, **hps.model).to(device)
_ = net_g.eval()
_ = load_checkpoint(f"./datasets/{model}/logs/{checkpoint}", net_g, None)

In [None]:
# Synthesize one sentence with the VCTK model for speaker id 4 and play it inline.
stn_tst = get_text("VITS is Awesome!", hps)
# Put the inputs on whatever device the model's parameters live on, rather than assuming CUDA.
model_device = next(net_g.parameters()).device
with torch.no_grad():
    x_tst = stn_tst.to(model_device).unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(model_device)
    sid = torch.LongTensor([4]).to(model_device)

    out = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)
    # First returned value, first item, first channel -> float NumPy array on the CPU.
    audio = out[0][0, 0].cpu().float().numpy()
ipd.display(ipd.Audio(audio, rate=hps.data.sample_rate, normalize=False))

### Voice Conversion


In [17]:
# These cells are for multi-speaker models.

In [15]:
# Validation files wrapped in an unshuffled loader that yields one item per batch.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)
# Materialize every batch up front so individual batches can be indexed.
data_list = list(loader)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xc9 in position 60: ordinal not in range(128)

In [16]:
# Convert the first loaded batch to three target speakers and play the original
# followed by each conversion.
# Use the device the model's parameters live on rather than assuming CUDA.
model_device = next(net_g.parameters()).device


def _convert_to(sid_tgt):
    """Convert the current `spec` from `sid_src` to `sid_tgt`; return the waveform as a NumPy array."""
    converted = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)
    # First returned value, first item, first channel -> float array on the CPU.
    return converted[0][0, 0].cpu().float().numpy()


with torch.no_grad():
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [t.to(model_device) for t in data_list[0]]
    # Target speaker ids 1, 2 and 4, kept under their original names.
    sid_tgt1, sid_tgt2, sid_tgt3 = (torch.LongTensor([sid]).to(model_device) for sid in (1, 2, 4))
    # The generator is consumed by the unpacking, i.e. still inside no_grad.
    audio1, audio2, audio3 = (_convert_to(sid_tgt) for sid_tgt in (sid_tgt1, sid_tgt2, sid_tgt3))

print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sample_rate, normalize=False))
for sid_tgt, converted_audio in ((sid_tgt1, audio1), (sid_tgt2, audio2), (sid_tgt3, audio3)):
    print("Converted SID: %d" % sid_tgt.item())
    ipd.display(ipd.Audio(converted_audio, rate=hps.data.sample_rate, normalize=False))

NameError: name 'data_list' is not defined