In [None]:
from datasets import load_dataset

# Dataset identifier passed to `datasets.load_dataset`.
LJSPEECH_DATASET_ID = "atmansingh/ljspeech"
ds = load_dataset(LJSPEECH_DATASET_ID)

In [None]:
# Display the object returned by load_dataset as the cell output.
ds

In [None]:
from utils.feature import TorchAudioFbank, TorchAudioFbankConfig
# Feature-extractor configuration, spelled out one setting per line.
fbank_config = TorchAudioFbankConfig(
    sampling_rate=24000,
    n_mels=100,
    n_fft=1024,
    hop_length=256,
)
fbank = TorchAudioFbank(config=fbank_config)

# Show the `audio` -> `array` field of the first training example.
first_train_example = ds['train'][0]
first_train_example['audio']['array']

In [None]:
import librosa
import torch
from utils.feature import TorchAudioFbank, TorchAudioFbankConfig
import time
import matplotlib.pyplot as plt
import numpy as np

# Build the filterbank extractor from an explicit config object.
fbank_config = TorchAudioFbankConfig(
    sampling_rate=24000,
    n_mels=100,
    n_fft=1024,
    hop_length=256,
)
fbank = TorchAudioFbank(config=fbank_config)

# Load the clip with librosa at sr=24000 (the same rate given to the
# extractor config) and hand it to the extractor as a torch tensor.
audio_path = './test.wav'
audio, sr = librosa.load(audio_path, sr=24000)
audio = torch.from_numpy(audio)

# Extract features; convert to NumPy for plotting when the result offers .numpy().
logmel = fbank.extract(audio, sr)
if hasattr(logmel, "numpy"):
    logmel_np = logmel.numpy()
else:
    logmel_np = logmel

# The array is transposed for display, so its first axis runs along x.
fig, ax = plt.subplots(figsize=(10, 4))
im = ax.imshow(
    logmel_np.T,
    aspect="auto",
    origin="lower",
    interpolation="none",
)
fig.colorbar(im, ax=ax, format="%+2.0f dB")
ax.set_title("Log-Mel Spectrogram")
ax.set_xlabel("Frames (Time)")
ax.set_ylabel("Mel bins")
fig.tight_layout()
plt.show()

In [None]:
from tokenizerown import LibriTTSTokenizer, EmiliaTokenizer

text = "Hey... 2015 <filler>"

# Constructor options for the LibriTTS tokenizer, gathered in one mapping.
libritts_tokenizer_options = dict(
    special_tokens=["<filler>"],
    token_file="./vocab_small.txt",
    lowercase=True,
    oov_policy="skip",   # drop OOV tokens (alternatives: "use_unk", "error")
    unk_token="[UNK]",   # only needed when oov_policy="use_unk"
)
tokenizer = LibriTTSTokenizer(**libritts_tokenizer_options)

# Tokenize a one-element batch and print: id count, tokens, ids.
tokens = tokenizer.texts_to_tokens([text])
token_ids = tokenizer.texts_to_token_ids([text])
num_ids = len(token_ids[0])
print(num_ids, tokens, token_ids)

In [None]:
from tokenizer import EmiliaTokenizer

text = "Hey... 2015 <filler>"

# Rebinds `tokenizer` to an EmiliaTokenizer built from ./vocab.txt.
tokenizer = EmiliaTokenizer(token_file="./vocab.txt")

# Tokenize a one-element batch and print: id count, tokens, ids.
tokens = tokenizer.texts_to_tokens([text])
token_ids = tokenizer.texts_to_token_ids([text])
num_ids = len(token_ids[0])
print(num_ids, tokens, token_ids)

In [None]:
# Convert `token_ids` back to text with the tokenizer's `token_ids_to_texts`
# and print the result.
texts = tokenizer.token_ids_to_texts(token_ids)
print(texts)

In [None]:
from datasets import load_dataset

ds = load_dataset("atmansingh/ljspeech")

# Print the number of examples in each split, one per line.
for split_name in ("train", "validation"):
    print(len(ds[split_name]))

from data.text_mel_datamodule import TextMelDataset, TextMelDataModule

# Keyword arguments for the data module, collected in one mapping.
datamodule_kwargs = dict(
    name="ljspeech",
    dataset=ds,
    batch_size=4,
    num_workers=4,
    pin_memory=True,
    n_spks=1,              # LJSpeech is single-speaker -> 1
    n_fft=1024,
    n_feats=100,           # mel bins
    sample_rate=24000,
    hop_length=256,
    f_min=0,
    f_max=8000,
    data_statistics={"mel_mean": 0.0, "mel_std": 1.0},
    seed=42,
    load_durations=False,  # False when alignment info is not needed
)
dm = TextMelDataModule(**datamodule_kwargs)
dm.setup(0)

# Pull a single batch from the training loader and print two of its shapes.
train_loader = dm.train_dataloader()
data = next(iter(train_loader))

print(data['text'].shape, data['audio'].shape)

In [21]:
import matplotlib.pyplot as plt
from einops import repeat
import numpy as np

def visualize_mel(data, title="Log-Mel Spectrogram"):
    """Show a 2-D feature array as an image, then release the figure.

    The input is transposed before plotting, so axis 0 of ``data`` runs along
    the x-axis (labelled "Frames (Time)") and axis 1 runs along the y-axis
    (labelled "Mel bins"), with index 0 at the bottom.

    Args:
        data: 2-D array-like that supports ``.T``.
        title: Title drawn above the plot. Defaults to the title this helper
            has always used, so existing calls are unaffected.
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    im = ax.imshow(
        data.T,
        aspect="auto",
        origin="lower",
        interpolation="none",
    )
    fig.colorbar(im, ax=ax, format="%+2.0f dB")
    ax.set_title(title)
    ax.set_xlabel("Frames (Time)")
    ax.set_ylabel("Mel bins")
    fig.tight_layout()
    plt.show()
    # This helper is called inside loops. Closing the figure explicitly keeps
    # open figures from piling up on backends that do not close them on
    # show(); closing an already-closed figure is a no-op.
    plt.close(fig)

# Walk over every item actually present in the batch instead of a hard-coded
# count, so the cell keeps working when the batch size changes.
for i in range(len(data['audio'])):
    # Tile the per-frame mask into a 16-column strip: (t,) -> (t, 16).
    masks = repeat(data['audio_mask'][i], 't -> b t', b=16).transpose(0, 1)
    logmel_np = data['audio'][i]
    print(logmel_np.shape, masks.shape)

    # Append the mask strip after the mel bins along the last axis; in the
    # transposed, origin="lower" plot it appears above the spectrogram.
    logmel = np.concatenate([logmel_np, masks], axis=-1)
    visualize_mel(logmel)

torch.Size([938, 100]) torch.Size([938, 16])
torch.Size([938, 100]) torch.Size([938, 16])
torch.Size([938, 100]) torch.Size([938, 16])
torch.Size([938, 100]) torch.Size([938, 16])


In [16]:
# Show the batch's `original_text` entry as the cell output.
data['original_text']

['as there had been before; as in the year eighteen forty-nine, a year memorable for the Rush murders at Norwich,',
 "Givens said to Oswald, quote, Boy are you going downstairs? It's near lunch time, end quote.",
 'nothing but a linear tract of specially modified protoplasm between two points of an organism',
 'The condition of the stone surface just mentioned assisted him in this, and he managed to get beyond the cistern to the railing below the chevaux-de-frise.']

In [17]:
from model.module import TTSModule

# Instantiate the TTS module. sample_rate=24000 and n_mels=100 match the
# values given to the data module earlier in this notebook.
model = TTSModule(
    dim=256,
    depth=5,
    num_heads=4,
    attn_dropout=0.0,
    ff_dropout=0.0,
    min_span=10,
    voco_type='vocos',
    sample_rate=24000,
    max_audio_len=2000,
    optimizer="AdamW",
    lr=1e-4,
    scheduler="linear_warmup_decay",
    use_torchode=True,
    torchdiffeq_ode_method="midpoint",
    torchode_method_klass="tsit5",
    max_steps=1_000_000,
    n_mels=100,
    text_emb_dim=128,
    downsample_factors=[1, 2, 4, 2, 1],
    # Additional hyperparameters (optional)
    warmup_ratio=0.05,
    min_lr_ratio=0.1,
    weight_decay=0.01,
    betas=(0.9, 0.95),
    grad_clip_val=1.0,
)

Download Vocos from huggingface charactr/vocos-mel-24khz


In [30]:
span_mask = model.get_span_mask(data['audio_mask'])
print(span_mask.shape)

# Number of columns each mask strip occupies next to the mel bins.
MASK_STRIP_WIDTH = 32

for i in range(3):
    # Tile each per-frame mask straight into (t, width) layout.
    # NOTE(review): `* 10 - 6` presumably shifts a 0/1 mask into a value range
    # that is visible next to the mel values - confirm.
    masks = repeat(data['audio_mask'][i], 't -> t b', b=MASK_STRIP_WIDTH) * 10 - 6
    span_masks = repeat(span_mask[i], 't -> t b', b=MASK_STRIP_WIDTH) * 10 - 6
    logmel_np = data['audio'][i]
    print(logmel_np.shape, masks.shape, span_masks.shape)

    # Column order along the last axis: mel bins, audio mask, span mask.
    plot_columns = [logmel_np, masks, span_masks]
    logmel = np.concatenate(plot_columns, axis=-1)
    visualize_mel(logmel)


torch.Size([4, 938])
torch.Size([938, 100]) torch.Size([938, 32]) torch.Size([938, 32])


  plt.figure(figsize=(10, 4))


torch.Size([938, 100]) torch.Size([938, 32]) torch.Size([938, 32])
torch.Size([938, 100]) torch.Size([938, 32]) torch.Size([938, 32])


In [31]:
# Plot the current `logmel` array through the object-oriented Matplotlib API.
# The array is transposed for display, so its first axis runs along x.
fig, ax = plt.subplots(figsize=(10, 4))
im = ax.imshow(
    logmel.T,
    aspect="auto",
    origin="lower",
    interpolation="none",
)
fig.colorbar(im, ax=ax, format="%+2.0f dB")
ax.set_title("Log-Mel Spectrogram")
ax.set_xlabel("Frames (Time)")
ax.set_ylabel("Mel bins")
fig.tight_layout()
plt.show()


In [32]:
# A bare plt.savefig() in its own cell acts on whatever figure is current.
# After plt.show() that can be a brand-new empty figure, which saves a blank
# image. Redraw `logmel` on an explicit figure and save that figure instead.
save_fig, save_ax = plt.subplots(figsize=(10, 4))
save_im = save_ax.imshow(
    logmel.T,
    aspect="auto",
    origin="lower",
    interpolation="none",
)
save_fig.colorbar(save_im, ax=save_ax, format="%+2.0f dB")
save_ax.set_title("Log-Mel Spectrogram")
save_ax.set_xlabel("Frames (Time)")
save_ax.set_ylabel("Mel bins")
save_fig.tight_layout()
save_fig.savefig("test.png")
# The figure exists only to be written to disk; close it so it is not kept open.
plt.close(save_fig)