In [1]:
import os

current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")

os.chdir("..")
current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")


The current working directory is: /home/thun/Documents/python_pj/accent_vn/notebooks
The current working directory is: /home/thun/Documents/python_pj/accent_vn


In [2]:
# src/config.py

# Model / tokenizer
HF_MODEL_NAME = "facebook/mms-tts-vie"
TOKENIZER_NAME = HF_MODEL_NAME

# Training hyperparams
PAD_ID = 0
BATCH_SIZE = 1
NUM_EPOCHS = 10
LR = 1e-4
ACCENT_EMB_DIM = 64
DEVICE = "cuda" 

# Audio / training targets
TARGET_SAMPLE_RATE = 16000
MAX_WAVEFORM_LENGTH = None  # in samples, None = don't truncate; set if needed


In [3]:
from transformers import VitsModel
import torch
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


## Load dataset

In [4]:
from datasets import load_from_disk

ds = load_from_disk("processed_dataset16k")

In [5]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_fn(batch):
    texts = [b["text"] for b in batch]
    accent_ids = torch.tensor([b["accent_label"] for b in batch])
    audio_tensors = [torch.tensor(b["audio"]["array"], dtype=torch.float32) for b in batch]
    audio_tensors = pad_sequence(audio_tensors, batch_first=True)
    return {"text": texts, "accent_ids": accent_ids, "audio_tensors": audio_tensors}


In [6]:
train_loader = DataLoader(ds["train"], batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(ds["valid"], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader  = DataLoader(ds["test"], batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [167]:
for batch in train_loader:
    texts = batch["text"]                # list of sentences
    accent_ids = batch["accent_ids"]     # tensor [batch_size]
    audio_tensors = batch["audio_tensors"]  # tensor [batch_size, max_audio_len]

In [5]:
import pickle
processed_ds_file = "train_data.pkl"

In [None]:
train_data = (texts, accent_ids, audio_tensors)

with open(processed_ds_file, "wb") as file:
    pickle.dump(train_data, file)

In [6]:
with open(processed_ds_file, "rb") as file:
    texts, accent_ids, audio_tensors = pickle.load(file)

In [7]:
from transformers import AutoTokenizer
from torch.nn.utils.rnn import pad_sequence

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-vie")

def batch_tokenize(batch):
    tokenized = [tokenizer(text, return_tensors="pt") for text in batch]
    input_ids = [t["input_ids"].squeeze(0) for t in tokenized]
    attention_masks = [t["attention_mask"].squeeze(0) for t in tokenized]

    # Pad sequences to the same length
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    return {"input_ids": input_ids, "attention_mask": attention_masks}



In [None]:
tokenizer.save_pretrained("./my_tokenizer")

('./my_tokenizer/tokenizer_config.json',
 './my_tokenizer/special_tokens_map.json',
 './my_tokenizer/vocab.json',
 './my_tokenizer/added_tokens.json')

: 

In [198]:
tokenized = batch_tokenize(["xin chào", "tôi là một sinh viên"])

tokenized['input_ids'].shape

torch.Size([2, 41])

In [204]:
print(tokenized['input_ids'].device)

cpu


In [205]:
tokenized['attention_mask'].to(DEVICE)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]], device='cuda:0')

In [201]:
batch["text"][0]

'Bây giờ ta nói ri nè: Cái rau này là rất là nhiều người, bảo là mày chụp cái hình up lên trên tiktok hay là facebook đó. Nhiều người nói chứ, rau này không đúng bài, rau này là sai. Rau sao đúng sai, không có khái niệm gì đúng sai hết trơn á. Tại răng biết hông? Ở ngoài Quảng Nam họ nói ở ngoài nhà ta cứ có rau gì ăn rau đó, mùa lụt đâu có cái rau gì đâu, trời mưa ngập'

## Training loop

In [8]:
import torch #torch==2.3.0
print(torch.__version__)
print(torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

2.8.0+cu128
12.8
CUDA available: True


In [10]:
base_model = VitsModel.from_pretrained("facebook/mms-tts-vie")

In [None]:
import torch
import torch.nn as nn

NUM_ACCENTS = 6
HIDDEN_SIZE = base_model.config.hidden_size  # 192
EMBED_DIM = 64  # can be smaller than HIDDEN_SIZE

class VitsAccentAdapter(nn.Module):
    def __init__(self, base_model, num_accents=NUM_ACCENTS, embed_dim=ACCENT_EMB_DIM):
        super().__init__()
        self.base_model = base_model
        self.accent_emb = nn.Embedding(NUM_ACCENTS, EMBED_DIM)
        self.proj = nn.Linear(EMBED_DIM, HIDDEN_SIZE)
        
    def forward(self, input_ids, accent_ids, attention_mask):

        padding_mask = (input_ids != PAD_ID).to(DEVICE)  
        padding_mask = padding_mask.unsqueeze(1).transpose(1, 2)  # [B, 1, T]

        # encode text
        encoder_out = self.base_model.text_encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            padding_mask=padding_mask,
            return_dict=True
        ).last_hidden_state  # [B, T, H]

        # accent embedding injection
        accent_vec = self.accent_emb(accent_ids)        # [B, E]
        accent_proj = self.proj(accent_vec).unsqueeze(1)  # [B, 1, H]
        encoder_out = encoder_out + accent_proj

        # transpose for flow
        encoder_out = encoder_out.transpose(1, 2)  # [B, H, T]
        padding_mask = padding_mask.transpose(1, 2)

        # flow + decoder
        z = self.base_model.flow(encoder_out, padding_mask)
        audio = self.base_model.decoder(z)
        audio = audio.squeeze(1)  # [B, T]
        print("Audio shape: ", audio.shape)

        return audio


In [12]:
model_with_adapter = VitsAccentAdapter(base_model).to(DEVICE)

optimizer = torch.optim.Adam(
    list(model_with_adapter.accent_emb.parameters()) +
    list(model_with_adapter.proj.parameters()),
    lr=1e-3
)

for param in base_model.parameters():
    param.requires_grad = False

base_model.eval()

VitsModel(
  (text_encoder): VitsTextEncoder(
    (embed_tokens): Embedding(95, 192)
    (encoder): VitsEncoder(
      (layers): ModuleList(
        (0-5): 6 x VitsEncoderLayer(
          (attention): VitsAttention(
            (k_proj): Linear(in_features=192, out_features=192, bias=True)
            (v_proj): Linear(in_features=192, out_features=192, bias=True)
            (q_proj): Linear(in_features=192, out_features=192, bias=True)
            (out_proj): Linear(in_features=192, out_features=192, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
          (layer_norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
          (feed_forward): VitsFeedForward(
            (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
            (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
            (dropout): Dropout(p=0.1, inplace=False)
            (act_fn): ReLU()
          )
          (final_layer_norm): LayerNorm((192,), eps=1e-05, eleme

In [None]:
import torch.nn.functional as F

def stft_loss(pred_audio, target_audio, n_fft=1024):
    # padding
    max_len = max(pred_audio.shape[-1], target_audio.shape[-1])
    if pred_audio.shape[-1] < max_len:
        pred_audio = F.pad(pred_audio, (0, max_len - pred_audio.shape[-1]))
    if target_audio.shape[-1] < max_len:
        target_audio = F.pad(target_audio, (0, max_len - target_audio.shape[-1]))

    # STFT transform
    pred_stft = torch.abs(torch.stft(pred_audio, n_fft=n_fft, return_complex=True))
    target_stft = torch.abs(torch.stft(target_audio, n_fft=n_fft, return_complex=True))

    loss = torch.mean((pred_stft - target_stft) ** 2)
    return loss


In [15]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [24]:
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()
model_with_adapter.train()

num_epochs = 5
accumulation_steps = 4
val_interval = 100  # do validation every 100 steps
global_step = 0

for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        audio_target = batch["audio_tensors"].to(DEVICE)
        accent_ids = batch["accent_ids"].to(DEVICE)
        tokenized = batch_tokenize(batch['text'])
        input_ids = tokenized['input_ids'].to(DEVICE)
        attention_mask = tokenized['attention_mask'].to(DEVICE)

        # Zero grad at start of accumulation cycle
        if global_step % accumulation_steps == 0:
            optimizer.zero_grad()

        with autocast():
            outputs = model_with_adapter(input_ids, accent_ids, attention_mask)
            loss = stft_loss(outputs, audio_target)
            loss = loss / accumulation_steps  # scale down loss for accumulation

        scaler.scale(loss).backward()

        # Only update weights after accumulation_steps batches
        if (global_step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()

        # Logging
        if global_step % 10 == 0:
            print(f"[Epoch {epoch}] Step {global_step} | Loss: {loss.item():.4f}")

        # Validation
        best_val_loss = float("inf")  # keep track of best model

        if global_step % val_interval == 0 and global_step != 0:
            model_with_adapter.eval()
            val_loss_total = 0.0
            val_steps = 0

            with torch.no_grad():
                for val_batch in valid_loader:
                    val_audio_target = val_batch["audio_tensors"].to(DEVICE)
                    val_accent_ids = val_batch["accent_ids"].to(DEVICE)

                    val_tokenized = batch_tokenize(val_batch['text'])
                    val_input_ids = val_tokenized['input_ids'].to(DEVICE)
                    val_attention_mask = val_tokenized['attention_mask'].to(DEVICE)

                    val_outputs = model_with_adapter(val_input_ids, val_accent_ids, val_attention_mask)
                    val_loss = stft_loss(val_outputs, val_audio_target)

                    val_loss_total += val_loss.item()
                    val_steps += 1

            avg_val_loss = val_loss_total / val_steps
            print(f"Validation — Step {global_step} | Avg Val Loss: {avg_val_loss:.4f}")

            # Save best checkpoint
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save({
                    'step': global_step,
                    'model_state_dict': model_with_adapter.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scaler_state_dict': scaler.state_dict(),
                    'best_val_loss': best_val_loss
                }, "best_model.pt")
                print(f"Saved best model at step {global_step} (val_loss={best_val_loss:.4f})")

            model_with_adapter.train()

        global_step += 1

torch.save(model_with_adapter.state_dict(), "accent_adapter.pt")


  scaler = GradScaler()
  with autocast():


OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 7.62 GiB of which 22.75 MiB is free. Including non-PyTorch memory, this process has 7.18 GiB memory in use. Of the allocated memory 6.86 GiB is allocated by PyTorch, and 157.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [21]:
# train loop

for batch in train_loader:
    print(batch)
    audio_target = batch["audio_tensors"].to(DEVICE) 
    accent_ids = batch["accent_ids"].to(DEVICE)     # tensor [batch_size]

    tokenized = batch_tokenize(batch['text'])
    input_ids = tokenized['input_ids'].to(DEVICE)
    attention_mask = tokenized['attention_mask'].to(DEVICE)
    print(audio_target.shape) #torch.Size([3, 192, 27])

    optimizer.zero_grad()

    outputs = model_with_adapter(input_ids, accent_ids, attention_mask)

    loss = stft_loss(outputs, audio_target)  # e.g. STFT loss
    loss.backward()
    optimizer.step()


{'text': ['Nguyện vọng của dân ở đây là mong muốn các cấp ở trên là làm răng tạo điều kiện, giúp cho dân có cái khu xử lý nước thải tập trung ở ngay tại địa phương. Để rồi tạo được, có được một cái đà để làng nghề được duy trì hơn.'], 'accent_ids': tensor([2]), 'audio_tensors': tensor([[ 0.0000,  0.0000,  0.0000,  ..., -0.0006,  0.0005,  0.0027]])}
torch.Size([1, 252000])
Audio shape:  torch.Size([1, 110848])


  return _VF.stft(  # type: ignore[attr-defined]


{'text': ['khuyến khích nâng cao về trình độ chuyên môn nghiệp vụ như ứng dụng công nghệ thông tin vào thực hiện các nhiệm vụ. Cụ thể trong sáu tháng đầu năm, xã cũng đã tiếp nhận được trên bốn mươi chín hồ sơ giao dịch qua cổng dịch vụ công.'], 'accent_ids': tensor([5]), 'audio_tensors': tensor([[ 0.0011,  0.0014,  0.0026,  ..., -0.0073, -0.0090, -0.0067]])}
torch.Size([1, 345378])
Audio shape:  torch.Size([1, 116992])
{'text': ['Trước hết thì ủy ban xã xây dựng kế hoạch và chỉ đạo cho hợp tác xã cũng như là các thôn, củng cố lại thời gian lội đồng của thôn. Sau đó thì chúng tôi cho triển khai nậu quét các cây mương..'], 'accent_ids': tensor([3]), 'audio_tensors': tensor([[7.7169e-08, 8.1652e-07, 1.0474e-06,  ..., 1.5764e-03, 1.3518e-03,
         1.4041e-03]])}
torch.Size([1, 243264])
Audio shape:  torch.Size([1, 96000])
{'text': ['Năm nay là năm đầu tiên Ban thường huyện Tỉnh Đoàn, Hội Đồng đội Tỉnh quyết định tổ chức Đêm Hội Trăng Rằm dành cho các cháu ở huyện Ngọc Hiển Đúng với tin

OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacity of 7.62 GiB of which 58.00 MiB is free. Including non-PyTorch memory, this process has 7.05 GiB memory in use. Of the allocated memory 6.15 GiB is allocated by PyTorch, and 765.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
for batch in train_loader:
    audio_target = batch["audio_tensors"].to(DEVICE)

In [8]:
audio_target.shape

torch.Size([1, 301376])

In [None]:
import torchaudio

# Suppose outputs is [B, T] and sampling_rate = 22050
generated_audio = outputs[0].detach().cpu()  # Take first sample in batch
torchaudio.save("generated.wav", generated_audio.unsqueeze(0), TARGET_SAMPLE_RATE)


  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)


In [11]:
from IPython.display import Audio
Audio(generated_audio.numpy(), rate=TARGET_SAMPLE_RATE)
