In [None]:
!pip install evaluate jiwer

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting jiwer
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting rapidfuzz>=3.9.7 (from jiwer)
  Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiwer-4.0.0-py3-none-any.whl (23 kB)
Downloading rapidfuzz-3.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer, evaluate
Successfully installed evaluate-0.4.4 jiwer-4.0.0 rapidfuzz-3.13.0


In [None]:
import pandas as pd
from datasets import Dataset, Audio, concatenate_datasets
from transformers import (
    AutoProcessor,
    AutoModelForCTC,
    TrainingArguments,
    Trainer
)
import torchaudio
import numpy as np
import evaluate
from typing import Dict, List, Union
import torch
from transformers import PreTrainedTokenizerBase, BatchEncoding

# Word-error-rate metric; compute_metrics reads this module-level object.
wer_metric = evaluate.load("wer")


# 1. Load and concatenate the parquet shards
#    (adjust the range to however many valid_<i>.parquet files you have).
dfs = [pd.read_parquet(f"valid_{i}.parquet") for i in range(3)]
df = pd.concat(dfs, ignore_index=True)

# 2. Keep only the columns that are needed
df = df[["audio", "text"]]

# 3. Convert to a Hugging Face Dataset and cast the audio column to
#    the Audio feature with a 16 kHz sampling rate
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# 4. Train/validation split.
#    A fixed seed keeps the held-out 10% identical across kernel restarts, so the
#    validation WER reported below is comparable from one run to the next.
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = split_dataset["train"]
val_ds = split_dataset["test"]

# 5. Load the pretrained model and its processor
processor = AutoProcessor.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")
model = AutoModelForCTC.from_pretrained("nguyenvulebinh/wav2vec2-base-vietnamese-250h")

# 6. Preprocessing function
def prepare_batch(batch):
    """Convert one raw dataset row into the fields the CTC model consumes.

    Args:
        batch: A single example holding an "audio" entry (indexed here by
            "array" and "sampling_rate") and a "text" transcript.

    Returns:
        The same mapping with two keys added: "input_values" (the feature
        extractor's output for the waveform) and "labels" (the token ids of
        the transcript).
    """
    audio = batch["audio"]
    # Pass the rate carried by the decoded audio instead of repeating a literal,
    # so the rate given to the processor always matches the actual data.
    batch["input_values"] = processor(
        audio["array"], sampling_rate=audio["sampling_rate"]
    ).input_values[0]
    # Tokenize the transcript with the tokenizer directly; the
    # `as_target_processor()` context manager is deprecated in transformers.
    batch["labels"] = processor.tokenizer(batch["text"]).input_ids
    return batch

def _featurize(split):
    """Run prepare_batch over a split, dropping every original column."""
    return split.map(prepare_batch, remove_columns=split.column_names)


# Replace both splits with their preprocessed versions.
train_ds = _featurize(train_ds)
val_ds = _featurize(val_ds)



def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (logits; the arg-max over the last
            axis is taken as the predicted token id) and `label_ids`
            (reference token ids in which -100 marks positions to ignore).

    Returns:
        A dict with a single key, "wer".
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Decode the predicted token ids into text
    pred_str = processor.batch_decode(pred_ids)

    # -100 is the ignore marker put on padded label positions; swap it for the
    # pad token id so the tokenizer can decode. np.where builds a new array, so
    # the caller's `pred.label_ids` is no longer overwritten in place.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

class DataCollatorCTCWithPadding:
    """Pad a list of preprocessed examples into one batch for CTC training.

    Audio inputs and label sequences are padded separately (they have
    different lengths and use different padding routines), and padded label
    positions are set to -100.

    Args:
        processor: Object exposing `feature_extractor` and `tokenizer`.
        padding: Padding strategy forwarded to both `pad` calls.
        max_length: Forwarded to the feature extractor's `pad`.
        max_length_labels: Forwarded to the tokenizer's `pad` as `max_length`.
        pad_to_multiple_of: Forwarded to the feature extractor's `pad`.
        pad_to_multiple_of_labels: Forwarded to the tokenizer's `pad` as
            `pad_to_multiple_of`.
    """

    def __init__(
        self,
        processor,
        padding: Union[bool, str] = True,
        max_length: Union[int, None] = None,
        max_length_labels: Union[int, None] = None,
        pad_to_multiple_of: Union[int, None] = None,
        pad_to_multiple_of_labels: Union[int, None] = None,
    ):
        self.processor = processor
        self.feature_extractor = processor.feature_extractor
        self.tokenizer = processor.tokenizer
        self.padding = padding
        self.max_length = max_length
        self.max_length_labels = max_length_labels
        self.pad_to_multiple_of = pad_to_multiple_of
        self.pad_to_multiple_of_labels = pad_to_multiple_of_labels

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate `features` (dicts with "input_values" and "labels") into a batch.

        Returns the padded output of the feature extractor with a "labels"
        tensor added.
        """
        # Split each example into its audio part and its label part.
        input_features = [{"input_values": f["input_values"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        # input_values (audio features)
        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )

        # labels (text tokens): the tokenizer is called directly, so the
        # deprecated `as_target_processor()` context manager is not needed.
        labels_batch = self.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt"
        )

        # Replace padding with -100 so those positions are ignored in the loss
        label_ids = labels_batch["input_ids"]
        batch["labels"] = label_ids.masked_fill(label_ids == self.tokenizer.pad_token_id, -100)

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor)


# 9. Training configuration
training_args = TrainingArguments(
    output_dir="./wav2vec2-vi-asr",
    per_device_train_batch_size=4, # Reduced batch size
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=10,
    fp16=True,
    learning_rate=3e-4,
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False,
    report_to="tensorboard",  # avoids wandb when you do not need it
    logging_dir="./logs"
)

# 10. Trainer
# `processing_class` replaces the deprecated `tokenizer` argument (needs a
# transformers release that accepts it). Passing the whole processor rather than
# only its feature extractor means each saved checkpoint also gets the tokenizer
# files, so a checkpoint directory can be reloaded on its own.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# 11. Train
trainer.train()

# 12. Evaluate
metrics = trainer.evaluate()
print("WER trên tập validation:", metrics["eval_wer"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]



vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/378M [00:00<?, ?B/s]

Map:   0%|          | 0/527 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Wer
1,0.7749,0.544041,0.345757
2,0.6157,0.497701,0.339942
3,0.5955,0.465218,0.293682
4,0.394,0.375527,0.213333
5,0.3215,0.36789,0.209892
6,0.2215,0.324601,0.184605
7,0.2136,0.280576,0.15456
8,0.2045,0.2296,0.163722
9,0.2011,0.218428,0.157951
10,0.1815,0.189625,0.154251


WER trên tập validation: 0.15425059476605868


In [None]:
# Recursively (-r) pack the training output directory into /content/file.zip.
!zip -r /content/file.zip /content/wav2vec2-vi-asr

  adding: content/wav2vec2-vi-asr/ (stored 0%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/ (stored 0%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/rng_state.pth (deflated 25%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/trainer_state.json (deflated 81%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/scheduler.pt (deflated 55%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/training_args.bin (deflated 52%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/optimizer.pt (deflated 7%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/model.safetensors (deflated 7%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/config.json (deflated 66%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/preprocessor_config.json (deflated 33%)
  adding: content/wav2vec2-vi-asr/checkpoint-1320/scaler.pt (deflated 60%)
  adding: content/wav2vec2-vi-asr/checkpoint-1188/ (stored 0%)
  adding: content/wav2vec2-vi-asr/checkpoint-1188/rng_state.pth (deflated 25%)
  adding: content/wav2v

In [None]:
import evaluate
import google.generativeai as genai
from tqdm import tqdm
import numpy as np

import os

# 1. Configure the Gemini API
# The key is read from the environment so it is never stored in the notebook.
# SECURITY: a real key used to be hard-coded on this line; treat that key as
# leaked and revoke/rotate it.
_gemini_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _gemini_api_key:
    raise RuntimeError(
        "Gemini API key not found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before running this cell."
    )
genai.configure(api_key=_gemini_api_key)

# NOTE(review): this rebinds the global `model`, shadowing the CTC model loaded in
# the training cell; post_process_with_gemini reads this global.
# NOTE(review): confirm "gemini-pro" is still a model name the API serves.
model = genai.GenerativeModel("gemini-pro")

# 2. Load the WER metric
wer_metric = evaluate.load("wer")

# 3. Post-processing helper backed by Gemini
def post_process_with_gemini(text):
    """Ask Gemini to correct an ASR transcript, falling back to the input.

    Args:
        text: Transcript produced by the speech-recognition model.

    Returns:
        The stripped text of Gemini's reply. The original `text` is returned
        unchanged when it is empty/whitespace-only, when the call raises, or
        when the reply is empty after stripping.
    """
    # Nothing to correct: skip the API round-trip rather than let the LLM
    # invent content for an empty transcript.
    if not text or not text.strip():
        return text

    prompt = f"""
    Văn bản sau được tạo bởi mô hình nhận dạng giọng nói, có thể có lỗi. Hãy sửa lại chính tả, dấu câu và ngữ pháp cho đúng tiếng Việt chuẩn:
    ---
    {text}
    ---
    Văn bản đã chỉnh sửa:"""
    try:
        response = model.generate_content(prompt)
        corrected = response.text.strip()
    except Exception as e:
        # Best-effort by design: report the failure and keep the raw transcript.
        print(f"[!] Lỗi khi gọi Gemini: {e}")
        return text  # fallback

    # An empty reply must not wipe out the transcript.
    return corrected or text

# 4. Get predictions and labels from the trainer (or from previously saved results)
# Two lists are needed: pred_texts and label_texts.
# If you do not have them yet, build them from the trainer as follows:

def get_pred_and_label_from_trainer(trainer, processor, val_ds):
    """Run prediction on `val_ds` and decode both predictions and references.

    Args:
        trainer: Object whose `predict(val_ds)` returns something exposing
            `predictions` (logits) and `label_ids` (-100 marks ignored positions).
        processor: Object exposing `batch_decode` and `tokenizer.pad_token_id`.
        val_ds: Dataset handed to `trainer.predict`.

    Returns:
        A `(pred_texts, label_texts)` tuple of decoded strings.
    """
    pred = trainer.predict(val_ds)
    pred_ids = np.argmax(pred.predictions, axis=-1)
    pred_texts = processor.batch_decode(pred_ids)

    # Swap the -100 ignore marker for the pad token id so the labels can be
    # decoded. np.where returns a new array, leaving `pred.label_ids` untouched.
    label_ids = np.where(
        pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids
    )
    label_texts = processor.batch_decode(label_ids, group_tokens=False)
    return pred_texts, label_texts

# Example usage:
# pred_texts, label_texts = get_pred_and_label_from_trainer(trainer, processor, val_ds)

# 5. Post-process every entry of pred_texts
def post_process_predictions(pred_texts):
    """Echo each transcript, pass it through Gemini, and collect the results.

    Returns a list with one corrected string per input, in input order.
    """
    def _echo_then_fix(transcript):
        # Print the raw transcript before sending it off for correction.
        print(transcript)
        return post_process_with_gemini(transcript)

    progress = tqdm(pred_texts, desc="Gemini post-processing")
    return [_echo_then_fix(transcript) for transcript in progress]

# 6. Score before and after post-processing
def evaluate_wer(pre_preds, post_preds, labels):
    """Return `(pre_wer, post_wer)`: the WER of each prediction list against `labels`."""
    def _score(candidates):
        return wer_metric.compute(predictions=candidates, references=labels)

    # Raw predictions are scored first, then the post-processed ones.
    return _score(pre_preds), _score(post_preds)

# ----------- RUN -----------
# Decode predictions and references for the validation split
# (if you already have them, load them from a file or a saved variable instead).
pred_texts, label_texts = get_pred_and_label_from_trainer(trainer, processor, val_ds)

# Correct every transcript with Gemini
post_pred_texts = post_process_predictions(pred_texts)

# Score the raw and the corrected transcripts against the references
pre_wer, post_wer = evaluate_wer(pred_texts, post_pred_texts, label_texts)

for report_line in (
    f"📊 WER trước xử lý:  {pre_wer:.4f}",
    f"✅ WER sau Gemini:  {post_wer:.4f}",
):
    print(report_line)


Gemini post-processing:   100%|||||||||||||| 59/59 [00:00<?, ?it/s]


 📊 WER trước xử lý: 0.1543 
 ✅ WER sau Gemini: 0.0000