In [None]:
from model.vf import VFEstimator
from model.textencoder import TextEncoder
import torch
import librosa
from utils.feature import TorchAudioFbank, TorchAudioFbankConfig
from tokenizerown import LibriTTSTokenizer

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Shared hyperparameters for the feature extractor and the models below.
sample_rate = 24000
n_mels      = 100
d_model     = 256
depth       = 5
num_heads   = 4
downsample_factors = [1, 2, 4, 2, 1]

# Freshly initialised (untrained) networks, moved to the selected device.
text_encoder = TextEncoder(vocab_size=160, emb_dim=128).to(device)
vf_estimator = VFEstimator(dim_in=n_mels, dim_model=d_model, conv_hidden=1024, num_heads=num_heads, Nm=depth, downsample_factors=downsample_factors).to(device)

# Reference audio file and the text used for the smoke test.
ref_audio_path = './test.wav'
script = "Hello, world!"

tokenizer = LibriTTSTokenizer(
    special_tokens=["<filler>"],
    token_file="./vocab_small.txt",
    lowercase=True,
    oov_policy="skip",        # drop OOV tokens (alternatives: "use_unk", "error")
    unk_token="[UNK]",        # only needed when oov_policy="use_unk"
)
fbank = TorchAudioFbank(config=TorchAudioFbankConfig(sampling_rate=sample_rate, n_mels=n_mels, n_fft=1024, hop_length=256))

# Load the reference audio at the same rate the filterbank is configured for
# (previously a separate literal 24000, which could drift from `sample_rate`).
audio, sr = librosa.load(ref_audio_path, sr=sample_rate, mono=True)
audio = torch.from_numpy(audio)
# Add a leading batch axis to the extracted features.
logmel = fbank.extract(audio, sr).unsqueeze(0)

In [None]:
# `audio`, `sr` and `logmel` are already computed at the end of the setup cell
# above; the identical load/extract lines that used to be repeated here were
# redundant work and have been dropped.
print("logmel : ", logmel.shape)
B, T_ref, _ = logmel.shape

# Tokenize the script as a batch of one and move the ids to the model device.
token_ids = tokenizer.texts_to_token_ids([script])
token_ids = torch.tensor(token_ids, device=device)
print("token_ids : ", token_ids)

In [None]:
# Encode the token ids and report the shape of the resulting embeddings.
text_emb_128 = text_encoder(token_ids)
print(f"text_emb_128 :  {text_emb_128.shape}")

In [None]:
# Random inputs for a shape smoke test of the VF estimator: Gaussian latents
# with 100*4 frames and one uniform time value per batch item.
latent_shape = (B, 100 * 4, n_mels)
noisy_latents = torch.randn(latent_shape, device=device)
time_t = torch.rand(B, device=device)
print(f"time_t :  {time_t!s} {time_t.shape}")

output = vf_estimator(noisy_latents, time_t, text_emb_128)
print(f"output :  {output.shape}") # B, secs*100, n_mels

In [None]:
# Print the total parameter count of each network: VF estimator first, then
# the text encoder (both lines use the same label, as before).
for module in (vf_estimator, text_encoder):
    total_params = sum(weight.numel() for weight in module.parameters())
    print(f"전체 파라미터 수: {total_params:,}")

In [1]:
from datasets import load_dataset

# Fetch the LJSpeech dataset from the Hugging Face hub and show split sizes.
ds = load_dataset("atmansingh/ljspeech")
for split in ('train', 'validation'):
    print(len(ds[split]))

  from .autonotebook import tqdm as notebook_tqdm


11775
1309


In [2]:
from data.text_mel_datamodule import TextMelDataset, TextMelDataModule

# Mel-spectrogram extraction settings handed to the data module.
mel_kwargs = dict(
    n_fft=1024,
    n_feats=100,           # mel bins
    sample_rate=24000,
    hop_length=256,
    f_min=0,
    f_max=8000,
    data_statistics={"mel_mean": 0.0, "mel_std": 1.0},
)

# Batching / DataLoader settings.
loader_kwargs = dict(
    batch_size=16,
    num_workers=4,
    pin_memory=True,
    seed=42,
)

dm = TextMelDataModule(
    name="ljspeech",
    dataset=ds,
    n_spks=1,             # LJSpeech is single-speaker -> 1
    load_durations=False, # keep False when alignment info is not needed
    **loader_kwargs,
    **mel_kwargs,
)


In [3]:
# Set up the data module before asking it for a dataloader. 0 is passed as the
# stage argument; the recorded output shows the module echoing that value.
# NOTE(review): presumably this builds the train/validation datasets — confirm
# against TextMelDataModule.setup.
dm.setup(0)

stage ;  0


In [None]:
# Grab a single batch from the training loader and inspect the tensor shapes.
train_loader = dm.train_dataloader()
data = next(iter(train_loader))

print(data['text'].shape, data['audio'].shape)

  return {"x": torch.tensor(token_ids), "y": torch.tensor(mel), "durations":None, "text": data['text']}
  return {"x": torch.tensor(token_ids), "y": torch.tensor(mel), "durations":None, "text": data['text']}
  return {"x": torch.tensor(token_ids), "y": torch.tensor(mel), "durations":None, "text": data['text']}
  return {"x": torch.tensor(token_ids), "y": torch.tensor(mel), "durations":None, "text": data['text']}


torch.Size([16, 147]) torch.Size([16, 940, 100])


: 

In [None]:
from model.module import TTSModule

# Constructor arguments for TTSModule, grouped by what their names suggest
# they configure. The values are unchanged.
network_kwargs = dict(
    dim=256,
    depth=5,
    num_heads=4,
    attn_dropout=0.0,
    ff_dropout=0.0,
    min_span=10,
    n_mels=100,
    text_emb_dim=128,
    downsample_factors=[1, 2, 4, 2, 1],
)
audio_kwargs = dict(
    voco_type='vocos',
    sample_rate=24000,
    max_audio_len=2000,
)
ode_kwargs = dict(
    use_torchode=True,
    torchdiffeq_ode_method="midpoint",
    torchode_method_klass="tsit5",
)
optim_kwargs = dict(
    optimizer="AdamW",
    lr=1e-4,
    scheduler="linear_warmup_decay",
    max_steps=1_000_000,
    # additional (optional) hyperparameters
    warmup_ratio=0.05,
    min_lr_ratio=0.1,
    weight_decay=0.01,
    betas=(0.9, 0.95),
    grad_clip_val=1.0,
)

model = TTSModule(**network_kwargs, **audio_kwargs, **ode_kwargs, **optim_kwargs)

In [None]:
# Pull the four tensors the model consumes out of the batch dict.
batch_keys = ('text', 'audio', 'text_mask', 'audio_mask')
text, audio, text_mask, audio_mask = (data[key] for key in batch_keys)

print(text.shape, audio.shape, text_mask.shape, audio_mask.shape)

In [None]:
# Move the model to the GPU, switch it to eval mode, and call `solve` on GPU
# copies of the batch tensors. The CPU originals are left untouched.
model.to('cuda')
model.eval()
cuda_inputs = [tensor.to("cuda") for tensor in (text, audio, text_mask, audio_mask)]
model.solve(*cuda_inputs)

In [None]:
# Run the forward pass with every input on the same device as the model.
# The preceding cell moves the model to CUDA, but `text`, `audio` and the masks
# are still the CPU tensors taken from the batch, so passing them directly
# mixes devices. Resolving the device from the model's parameters keeps this
# cell working whether or not the model has been moved.
model_device = next(model.parameters()).device
model(*(tensor.to(model_device) for tensor in (text, audio, text_mask, audio_mask)))


In [None]:
import torch.nn as nn
import torch
from einops import repeat

# Scratch check: embed one random 10-token sequence tiled across a batch of 8.
embed = nn.Embedding(160, 128).to('cuda')

single_sequence = torch.randint(0, 160, (1, 10)).int().to('cuda')
text_ids = repeat(single_sequence, "1 n -> b n", b=8)
print(text_ids.shape)

embedded = embed(text_ids)
print(embedded.shape)

In [None]:
import torch

# Scratch check: 64 evenly spaced values from 0 to 1, created on the GPU.
torch.linspace(start=0, end=1, steps=64, device='cuda')

In [None]:
from huggingface_hub import hf_hub_download
from vocos import Vocos
import torch

def load_vocoder(is_local=False, local_path="", device='cuda', hf_cache_dir=None):
    """Build a Vocos vocoder, load its weights, and return it in eval mode.

    Args:
        is_local: When true, read ``config.yaml`` and ``pytorch_model.bin``
            from ``local_path``; otherwise download both files from the
            ``charactr/vocos-mel-24khz`` Hugging Face repository.
        local_path: Directory holding the two files (used only if ``is_local``).
        device: Device the returned vocoder is moved to.
        hf_cache_dir: Cache directory forwarded to ``hf_hub_download``.

    Returns:
        The Vocos instance with weights loaded, in eval mode, on ``device``.
    """
    if not is_local:
        print("Download Vocos from huggingface charactr/vocos-mel-24khz")
        repo_id = "charactr/vocos-mel-24khz"
        # Config is fetched first, then the weights.
        config_path, model_path = (
            hf_hub_download(repo_id=repo_id, cache_dir=hf_cache_dir, filename=filename)
            for filename in ("config.yaml", "pytorch_model.bin")
        )
    else:
        print(f"Load vocos from local path {local_path}")
        config_path = f"{local_path}/config.yaml"
        model_path = f"{local_path}/pytorch_model.bin"

    vocoder = Vocos.from_hparams(config_path)
    weights = torch.load(model_path, map_location="cpu", weights_only=True)
    from vocos.feature_extractors import EncodecFeatures

    extractor = vocoder.feature_extractor
    if isinstance(extractor, EncodecFeatures):
        # Copy the extractor's own encodec weights into the state dict under
        # the prefixed key names before it is loaded into the vocoder.
        for key, value in extractor.encodec.state_dict().items():
            weights["feature_extractor.encodec." + key] = value

    vocoder.load_state_dict(weights)
    return vocoder.eval().to(device)

# Download and instantiate the Vocos vocoder on the GPU. `load_vocoder` defined
# above has no `vocoder_name` parameter, so passing one raised a TypeError.
vocoder = load_vocoder(is_local=False, local_path="", device='cuda')

In [None]:
from utils.feature import TorchAudioFbank, TorchAudioFbankConfig
import librosa
import torch

# Feature extractor configured for 24 kHz audio and 100 mel bins.
fbank_config = TorchAudioFbankConfig(sampling_rate=24000, n_mels=100, n_fft=1024, hop_length=256)
fbank = TorchAudioFbank(config=fbank_config)

# Load the reference clip as a mono waveform resampled to 24 kHz.
ref_audio_path = './test.wav'
audio, sr = librosa.load(ref_audio_path, sr=24000, mono=True)
print(audio.shape)

# Convert to a tensor, extract features, and add a leading batch axis.
audio = torch.from_numpy(audio)
features = fbank.extract(audio, sr)
logmel = features.unsqueeze(0)
print(logmel.shape)

In [None]:
# Swap the last two axes of the mel before handing it to the vocoder on the GPU.
mel_for_vocoder = logmel.to('cuda').permute(0, 2, 1)
wav = vocoder.decode(mel_for_vocoder)
print(wav.shape)

from IPython.display import Audio

# Render an inline audio player for the decoded waveform at 24 kHz.
player = Audio(wav.cpu().numpy(), rate=24000)
display(player)

In [None]:
from hifigan.config import v1
from hifigan.denoiser import Denoiser
from hifigan.env import AttrDict
from hifigan.models import Generator as HiFiGAN

def load_vocoder(checkpoint_path, device='cuda'):
    """Load a HiFi-GAN generator from a checkpoint and prepare it for inference.

    NOTE: this definition rebinds the name ``load_vocoder`` that the Vocos
    loader earlier in the notebook also uses; after this cell runs, only this
    HiFi-GAN version is reachable under that name.

    Args:
        checkpoint_path: Path to a checkpoint whose ``'generator'`` entry holds
            the generator state dict.
        device: Device used both to map the checkpoint and to host the model.
            Defaults to ``'cuda'``, the value that was previously hard-coded.

    Returns:
        The generator in eval mode with weight norm removed.
    """
    h = AttrDict(v1)
    hifigan = HiFiGAN(h).to(device)
    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    hifigan.load_state_dict(checkpoint['generator'])
    hifigan.eval()
    hifigan.remove_weight_norm()
    return hifigan

# Load the HiFi-GAN generator checkpoint and build a denoiser around it.
hifigan_checkpoint = './generator_v1'
voco = load_vocoder(hifigan_checkpoint)
denoiser = Denoiser(voco, mode='zeros')

In [None]:

@torch.inference_mode()
def to_waveform(mel, vocoder, denoise_fn=None, strength=0.00025):
    """Turn a mel input into a denoised CPU waveform.

    Args:
        mel: Input passed unchanged to ``vocoder``.
        vocoder: Callable mapping ``mel`` to audio; its output is clamped to
            [-1, 1].
        denoise_fn: Callable invoked as ``denoise_fn(audio, strength=...)``.
            When omitted, the notebook-level ``denoiser`` is used, which is
            what this function always did before.
        strength: Denoising strength forwarded to ``denoise_fn``.

    Returns:
        The denoised audio on the CPU with all size-1 dimensions squeezed out.
    """
    if denoise_fn is None:
        # Fall back to the module-level denoiser built next to the vocoder.
        denoise_fn = denoiser
    audio = vocoder(mel).clamp(-1, 1)
    # Drop the leading axis before denoising, as the original code did.
    audio = denoise_fn(audio.squeeze(0), strength=strength)
    # A single transfer/squeeze is enough; the former second pass was a no-op.
    return audio.cpu().squeeze()

# Use the HiFi-GAN generator (`voco`) here: `denoiser` was built on it, whereas
# `vocoder` is the Vocos model. The mel is moved to the GPU, where the HiFi-GAN
# lives, and permuted the same way as for the Vocos decode above.
# NOTE(review): confirm that the HiFi-GAN checkpoint expects this mel layout
# and the same number of mel bins (100) as `logmel`.
to_waveform(logmel.to('cuda').permute(0, 2, 1), voco)