In [None]:
!pip install transformers==4.46.0 peft==0.6.2 datasets[audio] accelerate evaluate jiwer tensorboard torchaudio sentence-transformers matplotlib seaborn
!pip install git+https://github.com/huggingface/transformers.git

Collecting transformers==4.46.0
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.6.2
  Downloading peft-0.6.2-py3-none-any.whl.metadata (23 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting datasets[audio]
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.46.0)
  Downloading tokenizers-0.20.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets[audio])
  

In [None]:
##Import libraries
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback
from peft import LoraConfig, get_peft_model
from datasets import Dataset
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import os
import glob
import pandas as pd
import tarfile
import torchaudio
import time
from jiwer import wer, cer
from sentence_transformers import SentenceTransformer, util
from google.colab import drive



#Mount google drive
drive.mount('/content/drive')

drive_path='/content/drive/MyDrive/librispeech_raw'
train_tar=os.path.join(drive_path, 'train-clean-100.tar.gz')
test_tar=os.path.join(drive_path, 'test-clean.tar.gz')
extract_path='/content/librispeech_subset'

#Create directory
os.makedirs(extract_path, exist_ok=True)

# Function to extract subset
def extract_subset(tar_path, extract_path, max_files=20000, max_speakers=20, prefix='train'):
    with tarfile.open(tar_path, 'r:gz') as tar:
        members = [m for m in tar.getmembers() if m.name.endswith('.flac') or m.name.endswith('.txt')]
        selected = []
        speaker_dirs = set()
        for m in members:
            if len(selected) >= max_files or len(speaker_dirs) >= max_speakers:
                break
            if m.name.endswith('.flac') or m.name.endswith('.txt'):
                speaker_dir = '/'.join(m.name.split('/')[:3])
                if speaker_dir not in speaker_dirs and len(speaker_dirs) < max_speakers:
                    speaker_dirs.add(speaker_dir)
                    selected.append(m)
                elif speaker_dir in speaker_dirs:
                    selected.append(m)
        tar.extractall(path=extract_path, members=selected)
    print(f"Extracted {len(selected)} files for {prefix} to {extract_path}")

#Extract subset
extract_subset(train_tar, extract_path, max_files=20000, max_speakers=20, prefix='train')
extract_subset(test_tar, extract_path, max_files=1000, max_speakers=5, prefix='test')

#Load dataset
train_flacs=glob.glob(f"{extract_path}/LibriSpeech/train-clean-100/**/*.flac", recursive=True)
test_flacs=glob.glob(f"{extract_path}/LibriSpeech/test-clean/**/*.flac", recursive=True)
train_txts=glob.glob(f"{extract_path}/LibriSpeech/train-clean-100/**/*.trans.txt", recursive=True)
test_txts=glob.glob(f"{extract_path}/LibriSpeech/test-clean/**/*.trans.txt", recursive=True)

print(f"Train FLAC files: {len(train_flacs)}")
print(f"Test FLAC files: {len(test_flacs)}")
print(f"Train TXT files: {len(train_txts)}")
print(f"Test TXT files: {len(test_txts)}")

def load_transcripts(flac_files, prefix='train'):
    data = []
    unpaired_flacs = []
    for flac in flac_files:
        parts = flac.split('/')
        speaker, book = parts[-3], parts[-2]
        utterance_id = os.path.basename(flac).replace('.flac', '')
        txt_file = os.path.join(os.path.dirname(flac), f"{speaker}-{book}.trans.txt")
        if os.path.exists(txt_file):
            with open(txt_file, 'r') as f:
                for line in f:
                    parts = line.strip().split(' ', 1)
                    if len(parts) == 2 and parts[0] == utterance_id:
                        data.append({'audio': flac, 'text': parts[1]})
                        break
                else:
                    unpaired_flacs.append(flac)
        else:
            unpaired_flacs.append(flac)
    print(f"{prefix.capitalize()} - Paired: {len(data)}, Unpaired FLACs: {len(unpaired_flacs)}")
    if unpaired_flacs:
        print(f"Sample unpaired FLACs: {unpaired_flacs[:2]}")
    return data

train_data=load_transcripts(train_flacs, prefix='train')
test_data=load_transcripts(test_flacs, prefix='test')

if not train_data:
    raise ValueError("No training data loaded. Check dataset path and structure.")
if not test_data:
    raise ValueError("No test data loaded. Check dataset path and structure.")

train_dataset=Dataset.from_pandas(pd.DataFrame(train_data))
test_dataset=Dataset.from_pandas(pd.DataFrame(test_data))

#Load processor
processor=WhisperProcessor.from_pretrained("openai/whisper-tiny", language="English", task="transcribe")

#Preprocess audio
def preprocess_audio(batch):
    waveform, orig_sr=torchaudio.load(batch['audio'])
    if waveform.shape[0] > 1:
        waveform=waveform.mean(dim=0, keepdim=True)
    resampler=torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=16000)
    batch['audio']={'array': resampler(waveform).squeeze().numpy(), 'sampling_rate': 16000}
    return batch

train_dataset=train_dataset.map(preprocess_audio)
test_dataset=test_dataset.map(preprocess_audio)

#Prepare data
def prepare_dataset(batch):
    audio=batch["audio"]
    batch["input_features"]=processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    batch["labels"]=processor.tokenizer(batch["text"]).input_ids
    return batch

train_dataset_with_features=train_dataset.map(prepare_dataset, remove_columns=["audio", "text"])
test_dataset_with_features=test_dataset.map(prepare_dataset, remove_columns=["audio", "text"])

#Load model
model=WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
model.to("cuda")
model.generation_config.language="english"
model.generation_config.task="transcribe"
model.generation_config.forced_decoder_ids = None

sentence_model=SentenceTransformer('all-MiniLM-L6-v2')

#Define collator
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features=[{"input_features": feature["input_features"]} for feature in features]
        batch=self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        label_features=[{"input_ids": feature["labels"]} for feature in features]
        labels_batch=self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels=labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        if (labels[:, 0]==self.decoder_start_token_id).all().cpu().item():
            labels=labels[:, 1:]
        batch["labels"]=labels
        return batch

data_collator=DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

wer_metric=evaluate.load("wer")
cer_metric=evaluate.load("cer")

def compute_metrics(pred):
    try:
        pred_ids=pred.predictions
        label_ids=pred.label_ids
        print(f"pred_ids shape: {pred_ids.shape}, type: {type(pred_ids)}")
        print(f"label_ids shape: {label_ids.shape}, type: {type(label_ids)}")
        label_ids[label_ids == -100]=processor.tokenizer.pad_token_id
        pred_str=processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        label_str=processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
        pred_str=[s.lower() for s in pred_str if s.strip()]
        label_str=[s.lower() for s in label_str if s.strip()]
        print(f"Sample pred_str: {pred_str[:2]}")
        print(f"Sample label_str: {label_str[:2]}")
        if not pred_str or not label_str:
            print("Empty predictions or labels detected.")
            return {"eval_wer": -1, "eval_cer": -1, "eval_semascore": -1}

        # WER
        wer_score = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
        print(f"Computed WER: {wer_score}")

        # CER
        cer_score = 100 * cer_metric.compute(predictions=pred_str, references=label_str)
        print(f"Computed CER: {cer_score}")

        # SeMaScore
        pred_embeddings = sentence_model.encode(pred_str, convert_to_tensor=True)
        label_embeddings = sentence_model.encode(label_str, convert_to_tensor=True)
        semascore = util.cos_sim(pred_embeddings, label_embeddings).diag().mean().item()
        print(f"Computed SeMaScore: {semascore}")

        return {"eval_wer": wer_score, "eval_cer": cer_score, "eval_semascore": semascore}
    except Exception as e:
        print(f"Error in compute_metrics: {str(e)}")
        return {"eval_wer": -1, "eval_cer": -1, "eval_semascore": -1}

#Function: RTF and Latency
def compute_rtf_latency(model, dataset, test_data, num_samples=10):
    model.eval()
    total_processing_time=0
    total_audio_duration=0
    latencies = []
    with torch.no_grad():
        for i, sample in enumerate(dataset.select(range(min(num_samples, len(dataset))))):
            input_features=torch.tensor(sample["input_features"]).unsqueeze(0).to("cuda")
            start_time=time.time()
            generated_ids=model.generate(input_features)
            end_time=time.time()
            processing_time=end_time - start_time
            total_processing_time+=processing_time
            waveform, _ =torchaudio.load(test_data[i]['audio'])
            audio_duration=waveform.shape[1] / 16000
            total_audio_duration+=audio_duration
            latencies.append(processing_time)
    rtf=total_processing_time / total_audio_duration if total_audio_duration > 0 else -1
    avg_latency=np.mean(latencies) if latencies else -1
    return rtf, avg_latency

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, patience=3, threshold=0.01):
        self.patience=patience
        self.threshold=threshold
        self.best_wer=float('inf')
        self.steps_without_improvement=0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        wer=metrics.get('eval_wer', float('inf'))
        if wer < self.best_wer - self.threshold:
            self.best_wer=wer
            self.steps_without_improvement=0
        else:
            self.steps_without_improvement+=1
        if self.steps_without_improvement >= self.patience:
            control.should_training_stop=True
        return control

#Baseline eval
baseline_model=WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny").to("cuda")
baseline_trainer=Seq2SeqTrainer(
    model=baseline_model,
    args=Seq2SeqTrainingArguments(
        output_dir="/content/whisper-tiny-baseline",
        per_device_eval_batch_size=2,
        fp16=True,
        predict_with_generate=True,
        generation_max_length=225,
        report_to=["tensorboard"],
    ),
    eval_dataset=test_dataset_with_features,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
)
baseline_results=baseline_trainer.evaluate()
baseline_wer=baseline_results.get('eval_wer', -1)
baseline_cer=baseline_results.get('eval_cer', -1)
baseline_semascore=baseline_results.get('eval_semascore', -1)
print(f"Baseline WER: {baseline_wer:.3f}%")
print(f"Baseline CER: {baseline_cer:.3f}%")
print(f"Baseline SeMaScore: {baseline_semascore:.3f}")

# Baseline OOD eval
def add_noise(batch):
    waveform=torch.tensor(batch["audio"]["array"])
    noise=0.01 * torch.randn_like(waveform)
    noisy_waveform=waveform + noise
    batch["audio"]["array"]=noisy_waveform.numpy()
    return batch

ood_test_dataset=test_dataset.map(add_noise)
ood_test_dataset_with_features=ood_test_dataset.map(prepare_dataset, remove_columns=["audio", "text"])
ood_results=baseline_trainer.evaluate(ood_test_dataset_with_features)
ood_wer=ood_results.get('eval_wer', -1)
ood_cer=ood_results.get('eval_cer', -1)
ood_semascore=ood_results.get('eval_semascore', -1)
print(f"Baseline OOD WER: {ood_wer:.3f}%")
print(f"Baseline OOD CER: {ood_cer:.3f}%")
print(f"Baseline OOD SeMaScore: {ood_semascore:.3f}")

# Baseline RTF & Latency
baseline_rtf, baseline_latency=compute_rtf_latency(baseline_model, test_dataset_with_features, test_data)
print(f"Baseline RTF: {baseline_rtf:.3f}")
print(f"Baseline Latency: {baseline_latency:.3f} seconds")

# Baseline Per-Speaker Metrics
def compute_per_speaker_metrics(model, dataset, test_data):
    model.eval()
    transcriptions=[]
    references=[]
    with torch.no_grad():
        for sample in dataset:
            input_features=torch.tensor(sample["input_features"]).unsqueeze(0).to("cuda")
            generated_ids=model.generate(input_features)
            transcription=processor.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(transcription.lower())
            label=processor.tokenizer.batch_decode([sample['labels']], skip_special_tokens=True)[0].lower()
            references.append(label)
    speaker_trans={}
    speaker_labels={}
    for i, (trans, ref) in enumerate(zip(transcriptions, references)):
        flac_path=test_data[i]['audio']
        speaker=flac_path.split('/')[-3]
        if speaker not in speaker_trans:
            speaker_trans[speaker]=[]
            speaker_labels[speaker]=[]
        speaker_trans[speaker].append(trans)
        speaker_labels[speaker].append(ref)
    speaker_metrics = {}
    for speaker in speaker_trans:
        wer_score = 100 * wer_metric.compute(predictions=speaker_trans[speaker], references=speaker_labels[speaker])
        cer_score = 100 * cer_metric.compute(predictions=speaker_trans[speaker], references=speaker_labels[speaker])
        pred_embeddings = sentence_model.encode(speaker_trans[speaker], convert_to_tensor=True)
        label_embeddings = sentence_model.encode(speaker_labels[speaker], convert_to_tensor=True)
        semascore = util.cos_sim(pred_embeddings, label_embeddings).diag().mean().item()
        speaker_metrics[speaker] = (wer_score, cer_score, semascore, len(speaker_trans[speaker]))
    return speaker_metrics

baseline_speaker_metrics = compute_per_speaker_metrics(baseline_model, test_dataset_with_features, test_data)
print("Baseline Per-Speaker Metrics:")
for speaker, (wer_score, cer_score, semascore, count) in baseline_speaker_metrics.items():
    print(f"Speaker {speaker}: WER = {wer_score:.3f}%, CER = {cer_score:.3f}%, SeMaScore = {semascore:.3f} (Samples: {count})")

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1
)
model = get_peft_model(model, lora_config)

class CustomSeq2SeqTrainer(Seq2SeqTrainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        valid_inputs = {k: v for k, v in inputs.items() if k in ['input_features', 'labels']}
        outputs = model(**valid_inputs)
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/whisper-tiny-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=50,
    num_train_epochs=1,
    max_steps=400,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=2,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="eval_wer",
    greater_is_better=False,
    remove_unused_columns=False,
)

trainer = CustomSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_with_features,
    eval_dataset=test_dataset_with_features,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=processor,
    callbacks=[EarlyStoppingCallback(patience=3, threshold=0.01)],
)

trainer.train()

#Save
trainer.save_model("/content/whisper-tiny-finetuned-final")
processor.save_pretrained("/content/whisper-tiny-finetuned-final")

#Finetuned eval
finetuned_results = trainer.evaluate()
finetuned_wer = finetuned_results.get('eval_wer', -1)
finetuned_cer = finetuned_results.get('eval_cer', -1)
finetuned_semascore = finetuned_results.get('eval_semascore', -1)
print(f"Fine-Tuned WER: {finetuned_wer:.3f}%")
print(f"Fine-Tuned CER: {finetuned_cer:.3f}%")
print(f"Fine-Tuned SeMaScore: {finetuned_semascore:.3f}")

#Finetuned OOD eval
finetuned_ood_results = trainer.evaluate(ood_test_dataset_with_features)
finetuned_ood_wer = finetuned_ood_results.get('eval_wer', -1)
finetuned_ood_cer = finetuned_ood_results.get('eval_cer', -1)
finetuned_ood_semascore = finetuned_ood_results.get('eval_semascore', -1)
print(f"Fine-Tuned OOD WER: {finetuned_ood_wer:.3f}%")
print(f"Fine-Tuned OOD CER: {finetuned_ood_cer:.3f}%")
print(f"Fine-Tuned OOD SeMaScore: {finetuned_ood_semascore:.3f}")

#Finetuned RTF & Latency
finetuned_rtf, finetuned_latency = compute_rtf_latency(model, test_dataset_with_features, test_data)
print(f"Fine-Tuned RTF: {finetuned_rtf:.3f}")
print(f"Fine-Tuned Latency: {finetuned_latency:.3f} seconds")

#Finetuned metrics
finetuned_speaker_metrics=compute_per_speaker_metrics(model, test_dataset_with_features, test_data)
print("Fine-Tuned Per-Speaker Metrics:")
for speaker, (wer_score, cer_score, semascore, count) in finetuned_speaker_metrics.items():
    print(f"Speaker {speaker}: WER = {wer_score:.3f}%, CER = {cer_score:.3f}%, SeMaScore = {semascore:.3f} (Samples: {count})")

print("\nComparison of Baseline and Fine-Tuned Metrics:")
print(f"{'Metric':<30} {'Baseline':<15} {'Fine-Tuned':<15}")
print(f"{'-'*60}")
print(f"{'WER (%)':<30} {baseline_wer:<15.3f} {finetuned_wer:<15.3f}")
print(f"{'CER (%)':<30} {baseline_cer:<15.3f} {finetuned_cer:<15.3f}")
print(f"{'SeMaScore':<30} {baseline_semascore:<15.3f} {finetuned_semascore:<15.3f}")
print(f"{'OOD WER (%)':<30} {ood_wer:<15.3f} {finetuned_ood_wer:<15.3f}")
print(f"{'OOD CER (%)':<30} {ood_cer:<15.3f} {finetuned_ood_cer:<15.3f}")
print(f"{'OOD SeMaScore':<30} {ood_semascore:<15.3f} {finetuned_ood_semascore:<15.3f}")
print(f"{'RTF':<30} {baseline_rtf:<15.3f} {finetuned_rtf:<15.3f}")
print(f"{'Latency (s)':<30} {baseline_latency:<15.3f} {finetuned_latency:<15.3f}")
baseline_var_wer = np.var([wer for wer, _, _, _ in baseline_speaker_metrics.values()])
finetuned_var_wer = np.var([wer for wer, _, _, _ in finetuned_speaker_metrics.values()])
print(f"{'Per-Speaker WER Variance (%)':<30} {baseline_var_wer:<15.3f} {finetuned_var_wer:<15.3f}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Extracted 2205 files for train to /content/librispeech_subset
Extracted 271 files for test to /content/librispeech_subset
Train FLAC files: 2156
Test FLAC files: 262
Train TXT files: 49
Test TXT files: 9
Train - Paired: 2155, Unpaired FLACs: 1
Sample unpaired FLACs: ['/content/librispeech_subset/LibriSpeech/train-clean-100/7447/91186/7447-91186-0003.flac']
Test - Paired: 261, Unpaired FLACs: 1
Sample unpaired FLACs: ['/content/librispeech_subset/LibriSpeech/test-clean/7729/102255/7729-102255-0045.flac']


Map:   0%|          | 0/2155 [00:00<?, ? examples/s]

Map:   0%|          | 0/261 [00:00<?, ? examples/s]

Map:   0%|          | 0/2155 [00:00<?, ? examples/s]

Map:   0%|          | 0/261 [00:00<?, ? examples/s]

pred_ids shape: (261, 448), type: <class 'numpy.ndarray'>
label_ids shape: (261, 448), type: <class 'numpy.ndarray'>
Sample pred_str: [' i had scarcely known what i had been saying or doing up to this time, but as he spoke i looked at my hand.', ' i know he had it this very evening.']
Sample label_str: ['i had scarcely known what i had been saying or doing up to this time but as he spoke i looked at my hand', 'i know he had it this very evening']
Computed WER: 19.097662620039863
Computed CER: 5.676676264911559
Computed SeMaScore: 0.879367470741272
Baseline WER: 19.098%
Baseline CER: 5.677%
Baseline SeMaScore: 0.879


Map:   0%|          | 0/261 [00:00<?, ? examples/s]

Map:   0%|          | 0/261 [00:00<?, ? examples/s]

pred_ids shape: (261, 448), type: <class 'numpy.ndarray'>
label_ids shape: (261, 448), type: <class 'numpy.ndarray'>
Sample pred_str: [' i had scarcely known what i had been saying or doing up to this time, but as he spoke i looked at my hand.', ' i know he had it this very evening.']
Sample label_str: ['i had scarcely known what i had been saying or doing up to this time but as he spoke i looked at my hand', 'i know he had it this very evening']
Computed WER: 21.978619315093315
Computed CER: 7.90826820238585
Computed SeMaScore: 0.8516378998756409
Baseline OOD WER: 21.979%
Baseline OOD CER: 7.908%
Baseline OOD SeMaScore: 0.852
Baseline RTF: 0.028
Baseline Latency: 0.154 seconds


No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Baseline Per-Speaker Metrics:
Speaker 6930: WER = 19.195%, CER = 5.952%, SeMaScore = 0.867 (Samples: 78)
Speaker 260: WER = 21.362%, CER = 7.153%, SeMaScore = 0.882 (Samples: 82)
Speaker 5639: WER = 19.714%, CER = 5.462%, SeMaScore = 0.885 (Samples: 42)
Speaker 1320: WER = 16.453%, CER = 4.409%, SeMaScore = 0.888 (Samples: 59)


Step,Training Loss,Validation Loss,Wer,Cer,Semascore
100,1.318,1.42128,18.952709,5.68696,0.879041
200,1.189,1.385862,18.753397,5.632113,0.879731
300,1.1644,1.364108,18.499728,5.58755,0.879773
400,1.1857,1.35709,18.409132,5.566982,0.880185


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.


pred_ids shape: (261, 448), type: <class 'numpy.ndarray'>
label_ids shape: (261, 448), type: <class 'numpy.ndarray'>
Sample pred_str: [' i had scarcely known what i had been saying or doing up to this time, but as he spoke i looked at my hand.', ' i know he had it this very evening.']
Sample label_str: ['i had scarcely known what i had been saying or doing up to this time but as he spoke i looked at my hand', 'i know he had it this very evening']
Computed WER: 18.95270882406233
Computed CER: 5.686960098724804
Computed SeMaScore: 0.8790414333343506
pred_ids shape: (261, 448), type: <class 'numpy.ndarray'>
label_ids shape: (261, 448), type: <class 'numpy.ndarray'>
Sample pred_str: [' i had scarcely known what i had been saying or doing up to this time, but as he spoke i looked at my hand.', ' i know he had it this very evening.']
Sample label_str: ['i had scarcely known what i had been saying or doing up to this time but as he spoke i looked at my hand', 'i know he had it this very eveni

pred_ids shape: (261, 448), type: <class 'numpy.ndarray'>
label_ids shape: (261, 448), type: <class 'numpy.ndarray'>
Sample pred_str: [' i had scarcely known what i had been saying or doing up to this time, but as he spoke i looked at my hand.', ' i know he had it this very evening.']
Sample label_str: ['i had scarcely known what i had been saying or doing up to this time but as he spoke i looked at my hand', 'i know he had it this very evening']
Computed WER: 18.409132089146585
Computed CER: 5.5669820375702725
Computed SeMaScore: 0.8801850080490112
Fine-Tuned WER: 18.409%
Fine-Tuned CER: 5.567%
Fine-Tuned SeMaScore: 0.880
pred_ids shape: (261, 448), type: <class 'numpy.ndarray'>
label_ids shape: (261, 448), type: <class 'numpy.ndarray'>
Sample pred_str: [' i had scarcely known what i had been saying or doing up to this time, but as he spoke i looked at my hand.', ' i know he had it this very evening.']
Sample label_str: ['i had scarcely known what i had been saying or doing up to this