In [1]:
# pip install openai-whisper jiwer

In [2]:
# pip install ffmpeg

In [3]:
import os
import torch
import librosa
import datasets
import jiwer
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Input data locations and the output directory for the fine-tuned model
AUDIO_FOLDER = "./train/data"
TRANSCRIPT_FILE = "./transcriptAll.txt"
FINETUNED_MODEL_PATH = "./whisper_finetuned"

# Hugging Face checkpoint used for both the processor and the model
BASE_MODEL_ID = "openai/whisper-large-v3"

# Load the pretrained processor and model from the same checkpoint
processor = WhisperProcessor.from_pretrained(BASE_MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(BASE_MODEL_ID)

# Put the model on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Read the reference transcripts: each usable line is "<audio filename>|<transcript>"
data = []
with open(TRANSCRIPT_FILE, "r", encoding="utf-8") as transcript_fh:
    for line in transcript_fh:
        fields = line.strip().split("|")
        # Ignore lines that do not split into exactly two fields
        if len(fields) != 2:
            continue
        filename, transcript = fields
        file_path = os.path.join(AUDIO_FOLDER, filename)
        # Ignore records whose audio file does not exist under AUDIO_FOLDER
        if not os.path.exists(file_path):
            continue
        # Transcripts are lower-cased before being used as training text
        data.append({"file": file_path, "text": transcript.lower()})

# Per-example preprocessing used to turn the transcript records into a dataset
def load_audio(example):
    """Attach Whisper inputs and label token ids to a single example.

    Loads the audio file at ``example["file"]`` with librosa at 16 kHz, runs
    the waveform through ``processor`` to produce ``input_features``, and
    tokenizes ``example["text"]`` with the processor's tokenizer to produce
    ``labels``.

    Args:
        example: mapping with a ``"file"`` path and a ``"text"`` transcript.

    Returns:
        The same ``example`` object, with ``"input_features"`` and
        ``"labels"`` added.
    """
    waveform, _sample_rate = librosa.load(example["file"], sr=16000)
    encoded = processor(waveform, sampling_rate=16000, return_tensors="pt")
    # squeeze(0) drops the leading dimension when it has size 1
    # (presumably the batch axis for a single clip)
    example["input_features"] = encoded.input_features.squeeze(0)
    tokenized = processor.tokenizer(example["text"])
    example["labels"] = tokenized.input_ids
    return example

# Build a Dataset from the transcript records, then preprocess every example
raw_dataset = datasets.Dataset.from_list(data)
dataset = raw_dataset.map(load_audio)

# Training arguments.
# No evaluation strategy is requested here, so evaluation during training stays
# at its default (disabled). The Seq2SeqTrainer built from these arguments only
# receives a train_dataset; asking for per-epoch evaluation without an
# eval_dataset makes the Trainer raise an error. Leaving the argument out also
# avoids the `evaluation_strategy` keyword, which recent transformers releases
# deprecate in favour of `eval_strategy`.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper_checkpoints",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch of 16 examples per device per optimizer step
    save_strategy="epoch",          # write a checkpoint at the end of every epoch
    learning_rate=1e-5,
    num_train_epochs=3,
    predict_with_generate=True,     # only takes effect if evaluate()/predict() is called
    save_total_limit=2,             # keep at most two checkpoints on disk
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
)

# Batch collation: tokenized transcripts have different lengths, so the default
# collator cannot stack them into one tensor. Pad explicitly instead.
def collate_speech_batch(features):
    """Collate preprocessed examples into one padded training batch.

    Args:
        features: list of dataset examples, each providing "input_features"
            and "labels" (token ids whose length varies per example).

    Returns:
        The padded batch with "input_features" and "labels" tensors. Label
        positions that are padding are set to -100 so the loss ignores them.
    """
    audio_inputs = [{"input_features": f["input_features"]} for f in features]
    batch = processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

    label_inputs = [{"input_ids": f["labels"]} for f in features]
    padded_labels = processor.tokenizer.pad(label_inputs, return_tensors="pt")
    # Mask via the attention mask rather than the pad token id, so real
    # end-of-text tokens are kept even if they share the pad token's id.
    labels = padded_labels["input_ids"].masked_fill(
        padded_labels["attention_mask"].ne(1), -100
    )

    # When decoder inputs are derived from the labels, the decoder start token
    # is prepended by the model; drop it here if every label already begins
    # with it, so it is not duplicated.
    if (labels[:, 0] == model.config.decoder_start_token_id).all().cpu().item():
        labels = labels[:, 1:]

    batch["labels"] = labels
    return batch


# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=collate_speech_batch,
)

# Run fine-tuning
trainer.train()

# Save the fine-tuned model and its processor into the same directory
for artifact in (model, processor):
    artifact.save_pretrained(FINETUNED_MODEL_PATH)
print("✅ Fine-tuning hoàn tất! Mô hình đã lưu tại:", FINETUNED_MODEL_PATH)


import
