In [None]:
import io
import itertools
import json
import traceback
from datetime import datetime

import evaluate
import numpy as np
import soundfile as sf
import torch
import torchaudio
from datasets import load_dataset, Audio
from tqdm import tqdm
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Sampling rate, in Hz, that audio is resampled to before feature extraction.
TARGET_SR = 16000  # Whisper expects 16kHz

class WhisperEvaluator:
    """Evaluate a Hugging Face Whisper checkpoint on the ViMD dataset.

    The evaluator loads the model and processor once, decodes each audio
    sample with soundfile, transcribes it in Vietnamese, and reports word and
    character error rates together with the individual predictions.
    """

    # Columns probed, in order, for the reference transcription of a sample.
    _TEXT_KEYS = ("text", "transcription", "sentence")

    def __init__(self, model_name="vinai/PhoWhisper-medium", device=None):
        """
        Initialize the Whisper evaluator

        Args:
            model_name: Hugging Face model identifier for Whisper
            device: Device to run evaluation on (cuda/cpu). When omitted,
                CUDA is used if available, otherwise the CPU.
        """
        self.model_name = model_name
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")

        print(f"Loading model: {model_name}")
        print(f"Using device: {self.device}")

        # Load processor and model; eval() switches the model to inference mode
        self.processor = WhisperProcessor.from_pretrained(model_name)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()

        # Load metrics
        self.wer_metric = evaluate.load("wer")
        self.cer_metric = evaluate.load("cer")

    @classmethod
    def _reference_text(cls, sample):
        """Return the first non-empty transcription in *sample*, or "" if none."""
        for key in cls._TEXT_KEYS:
            value = sample.get(key)
            if value:
                return value
        return ""

    @staticmethod
    def _format_number(value, spec, suffix=""):
        """Format *value* with *spec* plus *suffix*, or "n/a" when it is None."""
        if value is None:
            return "n/a"
        return f"{format(value, spec)}{suffix}"

    def load_dataset(self, split=None, streaming=False, data_files=None):
        """
        Load ViMD_Dataset from Hugging Face

        Args:
            split: Dataset split to load. Ignored when data_files is given.
            streaming: Whether to use streaming mode
            data_files: Specific parquet files to download

        Returns:
            The dataset with its "audio" column left undecoded.
        """
        if data_files:
            print(f"Loading ViMD_Dataset from specific files: {data_files}")
            # Explicitly listed files are requested under the "train" split,
            # whatever `split` was passed.
            dataset = load_dataset(
                "nguyendv02/ViMD_Dataset",
                data_files=data_files,
                split="train",
                streaming=streaming,
            )
        else:
            print(f"Loading ViMD_Dataset (split: {split})...")
            dataset = load_dataset(
                "nguyendv02/ViMD_Dataset",
                split=split,
                streaming=streaming,
            )
        # Keep the audio undecoded so prepare_dataset can decode it with
        # soundfile instead of the datasets library's own decoder.
        return dataset.cast_column("audio", Audio(decode=False))

    def prepare_dataset(self, batch):
        """
        Prepare audio data from bytes without using torchcodec

        Args:
            batch: Single sample from dataset. Its "audio" entry must be the
                undecoded dictionary holding "bytes" and/or "path".

        Returns:
            The same sample, with "input_features" and "labels" added.

        Raises:
            ValueError: If the sample has neither audio bytes nor a path.
        """
        audio = batch["audio"]
        audio_bytes = audio.get("bytes")
        if audio_bytes is not None:
            with io.BytesIO(audio_bytes) as buffer:
                waveform, sr = sf.read(buffer, dtype="float32")
        elif audio.get("path"):
            # Undecoded audio may carry only a file path instead of bytes.
            waveform, sr = sf.read(audio["path"], dtype="float32")
        else:
            raise ValueError("Audio sample has neither bytes nor a path")

        waveform = torch.from_numpy(waveform)

        # Average the channels to get a mono signal
        if waveform.ndim > 1:
            waveform = waveform.mean(dim=1)

        # Resample to target sampling rate if needed
        if sr != TARGET_SR:
            waveform = torchaudio.functional.resample(waveform, sr, TARGET_SR)

        # The feature extractor is documented for NumPy arrays and lists, so
        # hand it an array rather than a torch tensor.
        batch["input_features"] = self.processor.feature_extractor(
            waveform.numpy(),
            sampling_rate=TARGET_SR,
        ).input_features[0]

        # Tokenize the reference transcription as labels
        batch["labels"] = self.processor.tokenizer(
            self._reference_text(batch),
            max_length=448,
            truncation=True,
        ).input_ids

        return batch

    def transcribe(self, input_features):
        """
        Transcribe audio using Whisper

        Args:
            input_features: Preprocessed audio features, with or without a
                leading batch dimension

        Returns:
            Transcription text of the first item in the batch
        """
        features = torch.as_tensor(input_features)
        if features.ndim == 2:
            features = features.unsqueeze(0)  # Add batch dimension

        # Match the model's device and dtype so reduced-precision models work
        features = features.to(self.device, dtype=self.model.dtype)

        with torch.no_grad():
            predicted_ids = self.model.generate(
                features,
                language="vi",
                task="transcribe",
            )

        return self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True,
        )[0]

    def _build_metrics(self, predictions, references, split_label, num_skipped):
        """Aggregate predictions into the metrics dictionary.

        Error rates and average lengths are None when nothing was transcribed,
        so a run in which every sample failed still yields a report.
        """
        if predictions:
            wer = self.wer_metric.compute(predictions=predictions, references=references) * 100
            cer = self.cer_metric.compute(predictions=predictions, references=references) * 100
            # float() turns NumPy scalars into plain Python floats
            avg_pred_length = float(np.mean([len(p) for p in predictions]))
            avg_ref_length = float(np.mean([len(r) for r in references]))
        else:
            wer = cer = avg_pred_length = avg_ref_length = None

        return {
            "model_name": self.model_name,
            "dataset": "ViMD_Dataset",
            "split": split_label,
            "num_samples": len(predictions),
            "num_skipped": num_skipped,
            "wer": wer,  # percentage
            "cer": cer,  # percentage
            "avg_prediction_length": avg_pred_length,
            "avg_reference_length": avg_ref_length,
            "timestamp": datetime.now().isoformat(),
        }

    def evaluate_dataset(self, dataset=None, split="test", max_samples=None, data_files=None):
        """
        Evaluate Whisper on the dataset

        Samples that raise during preparation or transcription, and samples
        without a reference transcription, are reported, skipped and counted
        in the "num_skipped" metric.

        Args:
            dataset: Pre-loaded dataset (optional)
            split: Split to evaluate if dataset not provided
            max_samples: Maximum number of samples to evaluate; a falsy value
                means no limit
            data_files: Specific data files to load

        Returns:
            A (metrics, results) tuple: the aggregate metrics dictionary and a
            list with the index, prediction and reference of each scored
            sample.
        """
        if dataset is None:
            dataset = self.load_dataset(split=split, data_files=data_files)

        # Map-style datasets can be cut down up front
        if max_samples and hasattr(dataset, "select"):
            dataset = dataset.select(range(min(max_samples, len(dataset))))

        print(f"\nEvaluating on {len(dataset) if hasattr(dataset, '__len__') else 'streaming'} samples...")

        # islice stops streaming datasets at the limit without fetching an
        # extra sample; `total` keeps the progress bar informative.
        total = len(dataset) if hasattr(dataset, "__len__") else None
        samples = dataset
        if max_samples:
            samples = itertools.islice(dataset, max_samples)
            total = max_samples if total is None else min(total, max_samples)

        predictions = []
        references = []
        results = []
        num_skipped = 0

        for idx, sample in enumerate(tqdm(samples, total=total)):
            reference = self._reference_text(sample)
            if not reference:
                # Without a reference there is nothing to score against
                print(f"\nSkipping sample {idx}: no reference transcription")
                num_skipped += 1
                continue

            try:
                prepared = self.prepare_dataset(sample)
                prediction = self.transcribe(prepared["input_features"])
            except Exception as e:
                # Best effort: one bad sample must not abort the whole run
                print(f"\nError processing sample {idx}: {str(e)}")
                traceback.print_exc()
                num_skipped += 1
                continue

            predictions.append(prediction)
            references.append(reference)
            results.append({
                "index": idx,
                "prediction": prediction,
                "reference": reference,
            })

        print("\nCalculating metrics...")
        # load_dataset ignores `split` when data_files is given, so reporting
        # it as the evaluated split would be misleading.
        split_label = "custom" if (data_files or not split) else split
        metrics = self._build_metrics(predictions, references, split_label, num_skipped)

        return metrics, results

    def print_results(self, metrics, sample_results=None, num_examples=5):
        """
        Print evaluation results

        Args:
            metrics: Metrics dictionary as returned by evaluate_dataset
            sample_results: Per-sample results to show examples from
            num_examples: How many per-sample results to print
        """
        fmt = self._format_number
        rule = "=" * 80

        print("\n" + rule)
        print("EVALUATION RESULTS")
        print(rule)
        print(f"Model: {metrics['model_name']}")
        print(f"Dataset: {metrics['dataset']} ({metrics['split']} split)")
        print(f"Number of samples: {metrics['num_samples']}")
        skipped = metrics.get("num_skipped", 0)
        if skipped:
            print(f"Skipped samples: {skipped}")
        print(f"\nWord Error Rate (WER): {fmt(metrics['wer'], '.2f', '%')}")
        print(f"Character Error Rate (CER): {fmt(metrics['cer'], '.2f', '%')}")
        print(f"\nAverage prediction length: {fmt(metrics['avg_prediction_length'], '.1f', ' characters')}")
        print(f"Average reference length: {fmt(metrics['avg_reference_length'], '.1f', ' characters')}")

        if sample_results and num_examples > 0:
            print("\n" + rule)
            print(f"SAMPLE PREDICTIONS (first {num_examples})")
            print(rule)

            for result in sample_results[:num_examples]:
                print(f"\n--- Sample {result['index'] + 1} ---")
                print(f"Reference:  {result['reference']}")
                print(f"Prediction: {result['prediction']}")

    def save_results(self, metrics, results, output_file="phowhisper_medium_evaluation_results.json"):
        """
        Save evaluation results to JSON file

        Args:
            metrics: Metrics dictionary as returned by evaluate_dataset
            results: Per-sample results as returned by evaluate_dataset
            output_file: Path of the JSON file to write
        """
        output_data = {
            "metrics": metrics,
            "detailed_results": results,
        }

        # ensure_ascii=False keeps non-ASCII characters readable in the file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, ensure_ascii=False, indent=2)

        print(f"\nResults saved to: {output_file}")


def main():
    """
    Run the full evaluation: load the model, score it, then report and save.

    The data files selected below are the dataset's "valid" parquet shards.
    """
    # Run settings
    model_name = "vinai/PhoWhisper-medium"
    sample_limit = 100  # None evaluates every sample
    report_path = "PhoWhisperMed_vimd_evaluation.json"

    evaluator = WhisperEvaluator(model_name=model_name)

    # Evaluate only the parquet files matching this pattern
    metrics, results = evaluator.evaluate_dataset(
        max_samples=sample_limit,
        data_files="data/valid-*.parquet",
    )

    # Show the summary with a few example transcriptions, then persist it
    evaluator.print_results(metrics, results, num_examples=5)
    evaluator.save_results(metrics, results, output_file=report_path)

# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Looking for parquet files in: D:\SPL\Code\PhanBietVungMien\DAT301m\data\test_edited
Folder exists: True
Found 3 parquet files


Device set to use cuda:0


Processing ViMD: 0 samples [00:00, ? samples/s]

Starting processing at 08:19:07
[08:49:27] Loaded 10 samples (1820.0s elapsed)
[09:16:21] Loaded 20 samples (3433.6s elapsed)
[09:43:09] Loaded 30 samples (5042.1s elapsed)
[09:48:42] Processing batch 1 (32 samples)...


RuntimeError: Could not load libtorchcodec. Likely causes:
          1. FFmpeg is not properly installed in your environment. We support
             versions 4, 5, 6, 7, and 8. On Windows, ensure you've installed
             the "full-shared" version which ships DLLs.
          2. The PyTorch version (2.9.1+cu128) is not compatible with
             this version of TorchCodec. Refer to the version compatibility
             table:
             https://github.com/pytorch/torchcodec?tab=readme-ov-file#installing-torchcodec.
          3. Another runtime dependency; see exceptions below.
        The following exceptions were raised as we tried to load libtorchcodec:
        
[start of libtorchcodec loading traceback]
FFmpeg version 8: Could not load this library: C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\site-packages\torchcodec\libtorchcodec_core8.dll
FFmpeg version 7: Could not load this library: C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\site-packages\torchcodec\libtorchcodec_core7.dll
FFmpeg version 6: Could not load this library: C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\site-packages\torchcodec\libtorchcodec_core6.dll
FFmpeg version 5: Could not load this library: C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\site-packages\torchcodec\libtorchcodec_core5.dll
FFmpeg version 4: Could not load this library: C:\Users\ADMIN\AppData\Local\Programs\Python\Python311\Lib\site-packages\torchcodec\libtorchcodec_core4.dll
[end of libtorchcodec loading traceback].