# In [None]:
import os
import torch
import librosa
import jiwer
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Locations of the evaluation inputs, the prediction output file and the
# fine-tuned checkpoint directory.
AUDIO_FOLDER = "./test/data"
TRANSCRIPT_FILE = "./transcriptAll.txt"
RESULT_FILE = "./transcript_result.txt"
FINETUNED_MODEL_PATH = "./whisper_finetuned"

# Restore the processor and the fine-tuned model from the same checkpoint.
processor = WhisperProcessor.from_pretrained(FINETUNED_MODEL_PATH)
model = WhisperForConditionalGeneration.from_pretrained(FINETUNED_MODEL_PATH)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    model.to("cuda")
else:
    model.to("cpu")

# Inference only: switch the model to evaluation mode.
model.eval()

# Load the reference transcripts into a dict keyed by lowercased file name.
# Each usable line has the form "<audio file name> | <transcript>"; lines
# without the " | " separator (blank or malformed) are skipped.
ground_truths = {}
# "utf-8-sig" also reads plain UTF-8 but drops a leading BOM, which would
# otherwise stay attached to the first file name and make its lookup fail.
with open(TRANSCRIPT_FILE, "r", encoding="utf-8-sig") as f:
    for line in f:
        # Split on the first separator only, so a transcript that itself
        # contains " | " is kept instead of being silently dropped.
        filename, separator, transcript = line.strip().partition(" | ")
        if not separator:
            continue
        # Keys and transcripts are lowercased so that matching and WER
        # scoring are case-insensitive.
        ground_truths[filename.strip().lower()] = transcript.strip().lower()

# Collect the MP3 files in name-sorted order so runs are reproducible.
# The extension is matched case-insensitively because transcript lookup
# lowercases file names; a case-sensitive filter would silently skip a file
# such as "clip.MP3" even though it has a matching reference.
audio_files = sorted(
    name for name in os.listdir(AUDIO_FOLDER) if name.lower().endswith(".mp3")
)

# Parallel lists used for WER scoring (references[i] pairs with
# hypotheses[i]) and the "<file> | <prediction>" lines for the result file.
hypotheses = []
references = []
results = []

# Transcribe every audio file; score it when a reference transcript exists.
for audio_file in audio_files:
    file_path = os.path.join(AUDIO_FOLDER, audio_file)

    # Load the clip resampled to 16 kHz and compute the model input features.
    # NOTE(review): the Whisper feature extractor presumably pads/truncates to
    # a fixed 30 s window, so longer clips would be cut off — confirm that the
    # test clips are short enough.
    audio, sr = librosa.load(file_path, sr=16000)
    input_features = processor(
        audio, sampling_rate=sr, return_tensors="pt"
    ).input_features.to(model.device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    # The batch holds a single clip: take its decoded text and normalise it
    # the same way the reference transcripts are normalised.
    predicted_text = processor.batch_decode(
        predicted_ids, skip_special_tokens=True
    )[0].strip().lower()

    reference_text = ground_truths.get(audio_file.lower())
    if reference_text is None:
        print(f"‚ö†Ô∏è Warning: No reference transcript for {audio_file}")
    elif reference_text:
        # An empty prediction is scored on purpose: every reference word then
        # counts as a deletion. Dropping such files would bias the WER
        # downward. Only empty references are excluded from scoring.
        references.append(reference_text)
        hypotheses.append(predicted_text)

    # Every prediction is recorded, whether or not it could be scored.
    results.append(f"{audio_file} | {predicted_text}")

# Save all predictions to RESULT_FILE, one "<file> | <prediction>" entry per
# line, with no newline after the last entry.
with open(RESULT_FILE, mode="w", encoding="utf-8") as f:
    print(*results, sep="\n", end="", file=f)

# Report the word error rate over all collected reference/hypothesis pairs,
# or say so when there is nothing to score.
if not (references and hypotheses):
    print("‚ö†Ô∏è Kh√¥ng c√≥ d·ªØ li·ªáu h·ª£p l·ªá ƒë·ªÉ t√≠nh WER.")
else:
    print("üîπ T√≠nh to√°n WER...")
    wer_score = jiwer.wer(references, hypotheses)
    print(f"üî• Word Error Rate (WER): {wer_score:.2%}")
