In [26]:
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor
import torch
from evaluate import load
import numpy as np

# Load the Korean FLEURS dataset
dataset = load_dataset("google/fleurs", "ko_kr")


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 2307
    })
    validation: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 226
    })
    test: Dataset({
        features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
        num_rows: 382
    })
})

In [29]:
# combine the training and validation sets
from datasets import concatenate_datasets

dataset = concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])
dataset

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 2915
})

In [6]:
dataset['path'][0]

'/home/sabina/.cache/huggingface/datasets/downloads/extracted/55112befab2532b530aa8c033dbfc7a9eeac91525105b351d78d86ef342718dd/10003190170698054799.wav'

In [7]:
# calculate the total duration of the dataset
import numpy as np
import librosa
from tqdm import tqdm

def calculate_total_duration(dataset):
    total_duration = 0
    for sample in tqdm(dataset, desc="Calculating duration"):
        audio = sample['audio']
        
        # If the audio is already loaded as a numpy array
        if isinstance(audio, np.ndarray):
            duration = len(audio) / 16000  # Assuming 16kHz sample rate
        
        # If the audio is a dict with 'array' and 'sampling_rate' keys
        elif isinstance(audio, dict) and 'array' in audio and 'sampling_rate' in audio:
            duration = len(audio['array']) / audio['sampling_rate']
        
        # If the audio is a file path
        elif isinstance(audio, str):
            duration = librosa.get_duration(filename=audio)
        
        else:
            raise ValueError(f"Unexpected audio format: {type(audio)}")
        
        total_duration += duration
    
    return total_duration

# Calculate total duration
total_duration = calculate_total_duration(dataset)

Calculating duration: 100%|██████████| 2915/2915 [00:06<00:00, 439.80it/s]


In [9]:
total_duration/3600

10.039933333333332

In [10]:
total_duration / 2915

12.399231560891936

In [27]:

# Load the Whisper model and processor
model_name = "openai/whisper-small"  # You can change this to other variants
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [33]:
# Function to compute WER for a batch
def compute_metrics(pred_ids, labels):
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = labels
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# Function to pad input features
def pad_input_features(input_features, target_length=3000):
    current_length = input_features.shape[-1]
    if current_length < target_length:
        padding_length = target_length - current_length
        padding = torch.zeros((input_features.shape[0], input_features.shape[1], padding_length))
        return torch.cat((input_features, padding), dim=-1)
    return input_features


#  Evaluation loop
model.eval()
all_metrics = []

batch_size = 32
for i in range(0, len(dataset), batch_size):
    batch = dataset.select(range(i, min(i + batch_size, len(dataset))))
    
    # Prepare inputs
    audio_samples = [sample['audio']['array'] for sample in batch]
    sampling_rate = batch[0]['audio']['sampling_rate']  # Assuming all samples have the same sampling rate
    
    # Extract features
    inputs = feature_extractor(audio_samples, sampling_rate=sampling_rate, return_tensors="pt")
    
    # Pad input features
    inputs.input_features = pad_input_features(inputs.input_features)
    
    with torch.no_grad():
        generated_ids = model.generate(inputs=inputs.input_features)
    
    metrics = compute_metrics(generated_ids, [sample['transcription'] for sample in batch])
    all_metrics.append(metrics["wer"])
    
    print(f"Batch {i//batch_size + 1} WER: {metrics['wer']:.4f}")

# Compute average WER
average_wer = np.mean(all_metrics)
print(f"Average WER: {average_wer:.4f}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


KeyboardInterrupt: 