In [None]:
# Install the required libraries
!pip install --upgrade transformers datasets jiwer librosa evaluate matplotlib
!pip install -U ipywidgets

In [None]:
# Import các thư viện
import os
import pandas as pd
import numpy as np
import librosa
import matplotlib.pyplot as plt
import seaborn as sns
import wave
import contextlib
import torch
import torchaudio
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import shutil
import re
import json
import evaluate  # Thay thế load_metric bằng evaluate
from datasets import load_dataset, Audio, Dataset
from transformers import (Wav2Vec2ForCTC, Wav2Vec2Processor, Trainer, TrainingArguments,
                          Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, DataCollatorWithPadding)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
%matplotlib inline

In [None]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Sử dụng thiết bị: {device}")

# Locations of the VIVOS training split: audio folder, transcripts, speaker genders.
train_audio_path, train_prompts_path, train_genders_path = (
    '../input/vivos-vietnamese-speech-corpus-for-asr/vivos/train/waves',
    '../input/vivos-vietnamese-speech-corpus-for-asr/vivos/train/prompts.txt',
    '../input/vivos-vietnamese-speech-corpus-for-asr/vivos/train/genders.txt',
)

# Same three locations for the VIVOS test split.
test_audio_path, test_prompts_path, test_genders_path = (
    '../input/vivos-vietnamese-speech-corpus-for-asr/vivos/test/waves',
    '../input/vivos-vietnamese-speech-corpus-for-asr/vivos/test/prompts.txt',
    '../input/vivos-vietnamese-speech-corpus-for-asr/vivos/test/genders.txt',
)

In [None]:
# Read a prompts.txt file and return its contents as a DataFrame
def load_prompts(prompts_path):
    """Parse a prompts file into a DataFrame with columns ``id`` and ``text``.

    Each non-blank line is expected to look like ``<utterance id> <transcript>``:
    everything up to the first space is the id, the rest is the transcript,
    which is stored lower-cased.

    Args:
        prompts_path: Path of the UTF-8 encoded prompts file.

    Returns:
        pandas.DataFrame with one row per non-blank line. The ``id`` and
        ``text`` columns are always present, even when the file is empty.
    """
    records = []
    with open(prompts_path, 'r', encoding='utf-8') as prompts_file:
        for raw_line in prompts_file:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry no utterance.
                continue
            # partition() never raises: a line holding only an id yields an empty transcript
            # instead of the "not enough values to unpack" error that split() + unpacking gave.
            utterance_id, _, text = line.partition(' ')
            records.append({'id': utterance_id, 'text': text.lower()})
    # Explicit columns keep the schema stable when no record was read.
    return pd.DataFrame(records, columns=['id', 'text'])

# Build one transcript DataFrame per split (train first, then test)
train_transcripts, test_transcripts = [
    load_prompts(split_prompts_path)
    for split_prompts_path in (train_prompts_path, test_prompts_path)
]

In [None]:
# Helper used to add the audio file path to the transcript DataFrames
def get_audio_path(audio_base_path, audio_id):
    """Return the .wav path of an utterance inside a per-speaker folder layout.

    The speaker folder is the part of ``audio_id`` before its first underscore
    (the whole id when it has no underscore); the file name is the id plus
    the ``.wav`` extension.

    Args:
        audio_base_path: Directory that contains one sub-folder per speaker.
        audio_id: Utterance identifier.

    Returns:
        ``<audio_base_path>/<speaker>/<audio_id>.wav`` joined with os.path.join.
    """
    speaker_folder, _, _ = audio_id.partition('_')
    wav_name = audio_id + '.wav'
    return os.path.join(audio_base_path, speaker_folder, wav_name)

# Resolve the .wav path of every utterance id and store it in a new 'audio' column.
train_transcripts['audio'] = [
    get_audio_path(train_audio_path, utterance_id) for utterance_id in train_transcripts['id']
]
test_transcripts['audio'] = [
    get_audio_path(test_audio_path, utterance_id) for utterance_id in test_transcripts['id']
]

In [None]:
# Strip punctuation / special characters and lower-case the transcript
# Raw string: the backslashes are regex escapes, not Python string escapes. The non-raw
# form relied on invalid escape sequences such as "\," which newer Python versions warn about.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"“%‘”�]'

def remove_special_characters(batch):
    """Remove the ignored characters from ``batch["text"]`` and lower-case it.

    Args:
        batch: Mapping-like record (dict or DataFrame row) with a ``"text"`` string.

    Returns:
        The same ``batch`` object, with ``"text"`` replaced by the cleaned string.
    """
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    return batch

# Run the text clean-up row by row on both splits (train first, then test)
train_transcripts, test_transcripts = [
    transcripts.apply(remove_special_characters, axis=1)
    for transcripts in (train_transcripts, test_transcripts)
]

In [None]:
# Character-level vocabulary: maps each character to its token id (92 entries, ids 0-91).
# "|" (id 86) is used as the word delimiter, "[UNK]" (90) and "[PAD]" (91) as the
# unknown and padding tokens when the tokenizer is created from this mapping.
# NOTE(review): the ids presumably have to match the output layer of the pretrained
# checkpoint that is fine-tuned later — confirm before changing or re-ordering any entry.
vocab_dict = {"ẻ": 0, "ẵ": 1, "k": 2, "ặ": 3, "d": 4, "õ": 5, "á": 6, "ở": 7, "s": 8, "ả": 9, "u": 10, "ừ": 11, "ử": 12, "ạ": 13, "ổ": 14, "â": 15, "ệ": 16, "ủ": 17, "ầ": 18, "e": 19, "ẳ": 20, "ỡ": 21, "v": 22, "ú": 23, "à": 24, "ù": 25, "m": 26, "ờ": 27, "ớ": 28, "ỵ": 29, "ắ": 30, "ấ": 31, "ó": 32, "y": 33, "ỏ": 34, "ỳ": 35, "ỷ": 36, "ê": 37, "ĩ": 38, "ậ": 39, "ợ": 40, "l": 41, "ố": 42, "ữ": 43, "ỗ": 44, "h": 45, "ẹ": 46, "ò": 47, "ộ": 48, "è": 49, "ơ": 50, "ồ": 51, "é": 52, "ế": 53, "ự": 54, "ô": 55, "c": 56, "n": 57, "ẽ": 58, "ă": 59, "ũ": 60, "ứ": 61, "ọ": 62, "ụ": 63, "ể": 64, "t": 65, "q": 66, "ý": 67, "í": 68, "ẩ": 69, "ề": 70, "ỉ": 71, "ư": 72, "r": 73, "ỹ": 74, "ị": 75, "ì": 76, "ằ": 77, "ã": 78, "đ": 79, "a": 80, "g": 81, "ễ": 82, "i": 83, "x": 84, "ẫ": 85, "b": 87, "o": 88, "p": 89, "|": 86, "[UNK]": 90, "[PAD]": 91}

In [None]:
# Save the vocabulary as a JSON file (json is already imported in the notebook's import cell).
# ensure_ascii=False keeps the Vietnamese characters readable instead of \uXXXX escapes.
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file, ensure_ascii=False)

# Create the CTC tokenizer from the vocabulary file that was just written
vi_tokenizer = Wav2Vec2CTCTokenizer("vocab.json",
                                    unk_token="[UNK]",
                                    pad_token="[PAD]",
                                    word_delimiter_token="|")

In [None]:
# Load the feature extractor, processor and CTC model from the pre-trained checkpoint
pretrained_id = "CuongLD/wav2vec2-large-xlsr-vietnamese"

feature = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_id)

# NOTE(review): tokenizer= and feature_extractor= are passed as extra keyword arguments to
# from_pretrained; confirm that they really replace the components stored with the
# checkpoint rather than being ignored. Constructing
# Wav2Vec2Processor(feature_extractor=feature, tokenizer=vi_tokenizer) would make it explicit.
processor = Wav2Vec2Processor.from_pretrained(
    pretrained_id,
    tokenizer = vi_tokenizer,
    feature_extractor = feature,
)

# Dropout is raised to 0.15 to reduce overfitting; the CTC loss is averaged ("mean") and the
# model is told which token id is used for padding.
model = Wav2Vec2ForCTC.from_pretrained(
    pretrained_id,
    attention_dropout=0.15,
    hidden_dropout=0.15,
    activation_dropout=0.15,
    ctc_loss_reduction="mean",
    pad_token_id = processor.tokenizer.pad_token_id,
)

In [None]:
# Convert each DataFrame to a Hugging Face Dataset, then cast its 'audio' column
# (file paths) to the Audio feature with a 16000 Hz sampling rate.
train_dataset = Dataset.from_pandas(train_transcripts).cast_column(
    "audio", Audio(sampling_rate=16000)
)
test_dataset = Dataset.from_pandas(test_transcripts).cast_column(
    "audio", Audio(sampling_rate=16000)
)

In [None]:
# Per-example preprocessing: audio -> model inputs, transcript -> label ids
def prepare_dataset(batch):
    """Add ``input_values`` and ``labels`` to a single dataset example.

    Args:
        batch: Example with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"text"`` transcript.

    Returns:
        The same example, extended with ``"input_values"`` (first item of the
        processor output for the audio) and ``"labels"`` (token ids of the text).
    """
    clip = batch["audio"]
    encoded_audio = processor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_values"] = encoded_audio.input_values[0]

    encoded_text = processor.tokenizer(batch["text"])
    batch["labels"] = encoded_text.input_ids
    return batch

# Preprocess both splits; the original columns are dropped so that only the
# fields added by prepare_dataset remain.
train_dataset, test_dataset = [
    split.map(prepare_dataset, remove_columns=split.column_names)
    for split in (train_dataset, test_dataset)
]

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC training examples into one padded batch.

    Audio inputs and label sequences have different lengths, so they are padded
    separately: inputs with the processor's feature extractor, labels with its
    tokenizer. Padded label positions are set to -100.
    """

    # Object exposing ``feature_extractor`` and ``tokenizer``, each with a ``pad`` method.
    # The annotation is quoted so it is not evaluated when the class is created.
    processor: "Wav2Vec2Processor"
    # Padding strategy forwarded unchanged to both ``pad`` calls.
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad a list of examples and return the batch.

        Args:
            features: Examples, each holding ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an extra ``"labels"`` entry in which every
            padded position is -100.
        """
        # Split inputs and labels: they need different padding methods.
        input_features = [{"input_values": example["input_values"]} for example in features]
        label_features = [{"input_ids": example["labels"]} for example in features]

        # Pad the audio inputs.
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Pad the labels by calling the tokenizer directly. This replaces the deprecated
        # ``processor.as_target_processor()`` context manager, whose only effect here was
        # to route ``processor.pad`` to the tokenizer.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padded label positions (attention_mask != 1) with -100.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels

        return batch

# Collator instance used to batch examples; padding=True is forwarded to both pad() calls.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word error rate (WER) metric and the function that computes it
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate of a batch of predictions.

    Args:
        pred: Object with ``predictions`` (logits; the arg-max over the last axis
            gives the predicted token ids) and ``label_ids`` (reference token ids
            in which -100 marks positions to ignore).

    Returns:
        dict with a single key ``"wer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Decode the predictions
    pred_str = processor.batch_decode(pred_ids)

    # Decode the references. -100 is swapped for the pad token id on a copy (np.where),
    # so the caller's label array is no longer modified in place.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)
    # group_tokens=False: the references are decoded without merging repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Compute the WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [None]:
class MyTrainer:
    """Hand-written training loop with gradient accumulation, a one-cycle
    learning-rate schedule and single-slot checkpointing.

    The model is called as ``model(input_values, labels=...)`` (plus
    ``attention_mask`` when the batch contains one) and must return an object
    with a ``loss`` attribute.

    Args:
        model: Model to train; it is moved to the GPU when one is available.
        train_dataset: Dataset iterated by the training DataLoader.
        eval_dataset: Dataset used by :meth:`evaluate`.
        processor: Object with ``save_pretrained`` / ``from_pretrained``; it is
            stored next to every checkpoint.
        data_collator: ``collate_fn`` used by both DataLoaders.
        learning_rate: Peak learning rate (``max_lr``) of the one-cycle schedule.
        weight_decay: AdamW weight decay.
        num_train_epochs: Number of epochs to train.
        train_batch_size: Batch size of both DataLoaders.
        gradient_accumulation_steps: Number of batches accumulated per optimizer step.
        save_steps: Evaluate and checkpoint every this many optimizer steps.
        logging_steps: Refresh the progress-bar loss every this many optimizer steps.
        warmup_steps: Number of optimizer steps spent increasing the learning rate.
        save_dir: Folder receiving the checkpoint. Everything already inside it
            is deleted on every save.
        resume_from_checkpoint: When True, restore the latest checkpoint in ``__init__``.
        checkpoint_dir: Folder searched for checkpoints when resuming; falls back
            to ``save_dir`` when left as None.
    """

    def __init__(self, 
                 model = None, 
                 train_dataset = None,
                 eval_dataset = None,
                 processor = None,
                 data_collator = None, 
                 learning_rate = 1e-3,
                 weight_decay = 0.01,
                 num_train_epochs = None, 
                 train_batch_size = None, 
                 gradient_accumulation_steps = 1,
                 save_steps = 500,  
                 logging_steps = 500, 
                 warmup_steps = 500,
                 save_dir = "./wav2vec2-vivos/checkpoints", # path to folder to save result
                 resume_from_checkpoint = False,
                 checkpoint_dir = None): # path to the checkpoint folder, only used when resume_from_checkpoint = True
        
        self.model = model
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.processor = processor
        self.data_collator = data_collator
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.num_train_epochs = num_train_epochs
        self.train_batch_size = train_batch_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.save_steps = save_steps
        self.logging_steps = logging_steps
        self.warmup_steps = warmup_steps
        self.save_dir = save_dir
        self.checkpoint_dir = checkpoint_dir  
        self.resume_from_checkpoint = resume_from_checkpoint
        # Histories appended to on every checkpoint (kept for plotting).
        self.train_losses = []
        self.eval_losses = []
        self.learning_rate_values = []

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        # Optimizer
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate,weight_decay = self.weight_decay)

        # Both are (re)computed by train(), which knows the number of batches per epoch.
        self.scheduler = None
        self.total_steps = 0

        self.start_epoch = 0
        self.global_step = 0
        if self.resume_from_checkpoint:
            self.load_checkpoint()

    def _forward(self, batch):
        """Move one collated batch to the device and run the model on it."""
        inputs = batch['input_values'].to(self.device)
        labels = batch['labels'].to(self.device)
        model_kwargs = {'labels': labels}
        # When the collator produced an attention mask, hand it to the model so that
        # padded samples are not treated as real audio.
        attention_mask = batch.get('attention_mask')
        if attention_mask is not None:
            model_kwargs['attention_mask'] = attention_mask.to(self.device)
        return self.model(inputs, **model_kwargs)

    def _build_scheduler(self):
        """Create the one-cycle schedule, positioned at ``self.global_step``."""
        # pct_start must stay within [0, 1] even when warmup_steps exceeds the run length.
        warmup_fraction = min(1.0, self.warmup_steps / self.total_steps)
        return torch.optim.lr_scheduler.OneCycleLR(
            self.optimizer,
            max_lr=self.learning_rate,
            total_steps=self.total_steps,
            pct_start=warmup_fraction,
            # -1 starts a fresh schedule. Any other value resumes it and relies on the
            # 'initial_lr' / 'max_lr' / ... entries that a restored optimizer state carries.
            last_epoch=self.global_step - 1,
        )
            
    def train(self):
        """Train from ``start_epoch`` up to ``num_train_epochs``.

        A checkpoint (with a full evaluation pass) is written every ``save_steps``
        optimizer steps and at the end of every epoch.

        Raises:
            ValueError: If the training DataLoader yields no batch.
        """
        trainer_loader = DataLoader(self.train_dataset, batch_size = self.train_batch_size, collate_fn= self.data_collator, shuffle=True)
        accum_steps = self.gradient_accumulation_steps
        batches_per_epoch = len(trainer_loader)
        if batches_per_epoch == 0:
            raise ValueError("train_dataset produced no batches; nothing to train on")

        # The scheduler is stepped once per optimizer step, so it is sized in optimizer
        # steps. Ceil division: a partial accumulation group at the end of an epoch is
        # flushed as one more step.
        updates_per_epoch = (batches_per_epoch + accum_steps - 1) // accum_steps
        self.total_steps = updates_per_epoch * self.num_train_epochs

        # A resumed run restarts at the beginning of start_epoch (even when the checkpoint
        # was written mid-epoch), so the step counter is realigned with that epoch boundary
        # to keep the schedule from running past total_steps.
        self.global_step = self.start_epoch * updates_per_epoch
        self.scheduler = self._build_scheduler()

        for epoch in range(self.start_epoch, self.num_train_epochs):
            self.model.train()
            # Never carry gradients over from a previous epoch or an interrupted group.
            self.optimizer.zero_grad()
            last_loss = None
            progress_bar = tqdm(trainer_loader, desc = f"Epoch {epoch + 1}/{self.num_train_epochs}", leave=False)
            for step, batch in enumerate(progress_bar):
                loss = self._forward(batch).loss
                # Scale only what is back-propagated, so the accumulated gradient is an
                # average over the group; the reported loss stays unscaled.
                (loss / accum_steps).backward()
                last_loss = loss.detach()

                end_of_group = (step + 1) % accum_steps == 0
                end_of_epoch = (step + 1) == batches_per_epoch
                if not (end_of_group or end_of_epoch):
                    continue

                self.optimizer.step()
                self.scheduler.step()
                self.optimizer.zero_grad()
                self.global_step += 1

                if self.global_step % self.logging_steps == 0:
                    progress_bar.set_postfix(loss = last_loss.item())

                if self.global_step % self.save_steps == 0:
                    # Saved under the current epoch: resuming from it restarts this epoch.
                    self.save_checkpoint(epoch, last_loss.item(), self.evaluate())

            self.save_checkpoint(epoch + 1, last_loss.item(), self.evaluate())
        
    def evaluate(self):
        """Return the mean loss over ``eval_dataset`` as a float.

        The model is switched to eval mode for the pass and back to train mode
        afterwards. ``nan`` is returned when the evaluation set yields no batch.
        """
        self.model.eval()
        eval_loader = DataLoader(self.eval_dataset, batch_size = self.train_batch_size, collate_fn = self.data_collator)
        losses = []
        with torch.no_grad():
            for batch in tqdm(eval_loader, desc = "Evaluating", leave=False):
                losses.append(self._forward(batch).loss.item())
        self.model.train()
        # Plain float (not a numpy scalar) so the checkpoint holds only basic Python types.
        return float(np.mean(losses)) if losses else float("nan")

    def save_checkpoint(self, epoch, train_loss, eval_loss):
        """Write the training state to ``save_dir``, replacing whatever was there.

        Args:
            epoch: Epoch index stored in the checkpoint and used in its file name;
                a resumed run starts from this epoch.
            train_loss: Latest training loss, appended to ``train_losses``.
            eval_loss: Latest evaluation loss, appended to ``eval_losses``.
        """
        os.makedirs(self.save_dir, exist_ok = True)
        # Remove every old file and folder so that only the latest checkpoint is kept.
        # NOTE: the old checkpoint is deleted before the new one is written, so a failed
        # save leaves the folder without any checkpoint.
        for item in os.listdir(self.save_dir):
            item_path = os.path.join(self.save_dir, item)
            if os.path.isfile(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
        checkpoint_path = os.path.join(self.save_dir, f"checkpoint_epoch_{epoch}.pth")
        processor_path = os.path.join(self.save_dir, 'processor')
        
        self.train_losses.append(float(train_loss))
        self.eval_losses.append(float(eval_loss))
        
        self.learning_rate_values.append(float(self.scheduler.get_last_lr()[0]))
        
        # Only tensors and basic Python types are stored, so torch.load can read the file
        # with its default settings. The scheduler object is deliberately not pickled:
        # train() rebuilds the schedule from the hyper-parameters and the step counter.
        torch.save({
            'epoch': epoch,
            'global_step': self.global_step,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'train_losses': self.train_losses,
            'eval_losses': self.eval_losses,
            'learning_rate_values': self.learning_rate_values,
            'learning_rate': self.learning_rate,
            'weight_decay': self.weight_decay,
            'num_train_epochs': self.num_train_epochs, 
            'train_batch_size': self.train_batch_size, 
            'gradient_accumulation_steps': self.gradient_accumulation_steps,
            'save_steps': self.save_steps,  
            'logging_steps': self.logging_steps, 
            'warmup_steps': self.warmup_steps,
        }, checkpoint_path)

        self.processor.save_pretrained(processor_path)
        print(f"Checkpoint saved at {checkpoint_path}, step: {self.global_step}/{self.total_steps}")
        print(f"Processor saved at {processor_path}")
            

    def load_checkpoint(self):
        """Restore the newest ``checkpoint_epoch_<n>.pth`` found in the checkpoint folder.

        The folder is ``checkpoint_dir``, or ``save_dir`` when ``checkpoint_dir`` is
        None. Model and optimizer weights, the run hyper-parameters, the loss /
        learning-rate histories and the processor are restored. When no checkpoint
        is found a message is printed and the trainer keeps its fresh state.

        Checkpoints written by older versions of this class (which pickled the
        scheduler object) may need ``weights_only=False`` to load on recent PyTorch.
        """
        # Without the fallback a missing checkpoint_dir would make os.listdir(None)
        # silently scan the current working directory.
        checkpoint_dir = self.checkpoint_dir or self.save_dir
        if not checkpoint_dir or not os.path.isdir(checkpoint_dir):
            print(f"No checkpoint directory at {checkpoint_dir}, starting from scratch")
            return

        # Find the most recent checkpoint
        checkpoints = [f for f in os.listdir(checkpoint_dir)
                       if f.startswith('checkpoint_epoch_') and f.endswith('.pth')]
        if not checkpoints:
            print(f"No checkpoint found in {checkpoint_dir}, starting from scratch")
            return

        lastest_checkpoint = max(checkpoints, key=lambda x: int(x.split('_')[-1].split(".")[0]))
        checkpoint_path = os.path.join(checkpoint_dir, lastest_checkpoint)
        processor_path = os.path.join(checkpoint_dir, 'processor')
        
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        # load model's + optim's parameters
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # load the run hyper-parameters (they override the constructor arguments)
        self.start_epoch = checkpoint['epoch']
        self.global_step = checkpoint['global_step']
        self.learning_rate = checkpoint['learning_rate']
        self.weight_decay = checkpoint['weight_decay']
        self.num_train_epochs = checkpoint['num_train_epochs']
        self.train_batch_size = checkpoint['train_batch_size']
        self.gradient_accumulation_steps = checkpoint['gradient_accumulation_steps']
        self.save_steps = checkpoint['save_steps']
        self.logging_steps = checkpoint['logging_steps'] 
        self.warmup_steps = checkpoint['warmup_steps']

        # load plotting values
        self.train_losses = checkpoint['train_losses']
        self.eval_losses = checkpoint['eval_losses']
        self.learning_rate_values = checkpoint['learning_rate_values']

        # load processor (kept as passed to the constructor when none was saved)
        if os.path.isdir(processor_path):
            self.processor = self.processor.from_pretrained(processor_path)
        print(f"Resumed from checkpoint at epoch {self.start_epoch}, global step {self.global_step}")

In [None]:
# Configure the trainer. Arguments that are not listed keep the MyTrainer defaults
# (for example warmup_steps=500, save_steps=500, save_dir="./wav2vec2-vivos/checkpoints").
# The test split is used as the evaluation set. With a batch size of 4 and 2 accumulation
# steps, the optimizer is stepped once every 2 batches.
trainer = MyTrainer(model = model,
                    train_dataset = train_dataset,
                    eval_dataset = test_dataset,
                    processor = processor,
                    data_collator = data_collator,
                    learning_rate = 1e-4, 
                    num_train_epochs=30,
                    train_batch_size=4,
                    gradient_accumulation_steps=2)

In [None]:
# Start training
trainer.train()