In [1]:
## 2024-09-15 update using code recommended by ChatGPT o1-mini when provided with the original code 
## and Gradio documentation

In [2]:
# Import necessary libraries
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline,
)
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
import gradio as gr
import matplotlib.pyplot as plt
import io
from PIL import Image  # Import PIL for image handling
import time
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ================================
# 1. Device Configuration
# ================================

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
# ================================
# 2. Load and Prepare Data
# ================================

# Load training and testing data
data_train = pd.read_csv('train.tsv', delimiter='\t')
data_test = pd.read_csv('test.tsv', delimiter='\t')

cv = DatasetDict()
cv['train'] = Dataset.from_pandas(data_train)
cv['test'] = Dataset.from_pandas(data_test)

# Sanity check
print(cv)

# Set the sampling rate to 16kHz
cv = cv.cast_column("path", Audio(sampling_rate=16000))

DatasetDict({
    train: Dataset({
        features: ['path', 'script'],
        num_rows: 52
    })
    test: Dataset({
        features: ['path', 'script'],
        num_rows: 12
    })
})


In [5]:
# ================================
# 3. Load Feature Extractor, Tokenizer, Processor
# ================================

feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', language='Russian', task='transcribe')
processor = WhisperProcessor.from_pretrained('openai/whisper-small', language='Russian', task='transcribe')

# Sanity check for tokenizer
input_str = cv['train'][0]['script']
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

print(f"Input:                 {input_str}")
print(f"Decoded w/ special:    {decoded_with_special}")
print(f"Decoded w/out special: {decoded_str}")
print(f"Are equal:             {input_str == decoded_str}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?


Input:                 Слышим, Круг, разрешён следует прямо OSMIK. Курс OSMIK, Свердловск-три-восемь-восемь, спасибо. 
Decoded w/ special:    <|startoftranscript|><|ru|><|transcribe|><|notimestamps|>Слышим, Круг, разрешён следует прямо OSMIK. Курс OSMIK, Свердловск-три-восемь-восемь, спасибо. <|endoftext|>
Decoded w/out special: Слышим, Круг, разрешён следует прямо OSMIK. Курс OSMIK, Свердловск-три-восемь-восемь, спасибо. 
Are equal:             True


In [6]:
# ================================
# 4. Prepare the Data
# ================================

def prep_dataset(batch):
    # Load and resample audio data to 16kHz
    audio = batch['path']

    # Compute log-mel spectrograms
    batch['input_features'] = feature_extractor(audio['array'], sampling_rate=16000).input_features[0]
   
    # Encode target text to label ids
    batch['labels'] = tokenizer(batch['script']).input_ids

    return batch

# Apply the preprocessing to the datasets
cv = cv.map(prep_dataset, remove_columns=cv['train'].column_names, num_proc=4)

Map (num_proc=4):   0%|          | 0/52 [00:00<?, ? examples/s][src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
Map (num_proc=4):   2%|▏         | 1/52 [00:00<00:17,  2.85 examples/s][src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?
[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid

In [7]:
# ================================
# 5. Define Data Collator
# ================================

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have to be of different lengths and need different padding methods
        # First treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # Pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If BOS token is appended in previous tokenization step, cut BOS token here as it's appended later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Initialize data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [8]:
# ================================
# 6. Define Evaluation Metrics
# ================================

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # Decode the predictions and labels
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [9]:
# ================================
# 7. Load and Fine-Tune the Whisper Model
# ================================

# Path to save or load the fine-tuned model
fine_tuned_model_path = "./wspr-smll-ru-04"

# Metrics storage
fine_tuning_metrics = {}

# Function to save metrics to a JSON file
def save_metrics(metrics, path):
    with open(os.path.join(path, 'metrics.json'), 'w') as f:
        json.dump(metrics, f)

# Function to load metrics from a JSON file
def load_metrics(path):
    with open(os.path.join(path, 'metrics.json'), 'r') as f:
        return json.load(f)

# Check if the fine-tuned model already exists
if not os.path.exists(fine_tuned_model_path):
    # Load the pretrained Whisper model
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.to(device)

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=fine_tuned_model_path,  # Directory to save the model
        per_device_train_batch_size=8,     # Reduced batch size to save memory
        gradient_accumulation_steps=2,     # To simulate larger batch size
        learning_rate=1e-5,
        warmup_steps=50,
        max_steps=500,
        gradient_checkpointing=True,
        fp16=True,                         # Use half-precision
        evaluation_strategy="steps",
        per_device_eval_batch_size=4,
        predict_with_generate=True,
        generation_max_length=225,
        save_steps=25,
        eval_steps=25,
        logging_steps=25,
        report_to=["tensorboard"],
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        push_to_hub=False,
    )

    # Initialize the trainer
    trainer = Seq2SeqTrainer(
        args=training_args,
        model=model,
        train_dataset=cv["train"],
        eval_dataset=cv["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=processor.feature_extractor,
    )

    # Save the processor
    processor.save_pretrained(fine_tuned_model_path)

    # Record the start time
    start_time = time.time()

    # Launch training
    trainer.train()

    # Record the end time
    end_time = time.time()    

    # Save the fine-tuned model
    trainer.save_model(fine_tuned_model_path)
    
    # Extract fine-tuning metrics
    fine_tuning_metrics['fine_tuned_model_path'] = fine_tuned_model_path
    fine_tuning_metrics['max_steps'] = training_args.max_steps
    fine_tuning_metrics['batch_size'] = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
    fine_tuning_metrics['total_training_time'] = end_time - start_time  # in seconds

    # Extract training loss from log_history
    loss_history = []
    eval_loss_history = []
    eval_wer_history = []
    for log in trainer.state.log_history:
        if 'step' in log and 'loss' in log:
            loss_history.append({'step': log['step'], 'loss': log['loss']})
        if 'step' in log and 'eval_loss' in log:
            eval_loss_history.append({'step': log['step'], 'eval_loss': log['eval_loss']})
        if 'step' in log and 'eval_wer' in log:
            eval_wer_history.append({'step': log['step'], 'eval_wer': log['eval_wer']})

    # Store loss history for plotting
    fine_tuning_metrics['train_loss_history'] = loss_history
    fine_tuning_metrics['eval_loss_history'] = eval_loss_history
    fine_tuning_metrics['eval_wer_history'] = eval_wer_history

    # Save metrics to a JSON file
    save_metrics(fine_tuning_metrics, fine_tuned_model_path)
else:
    print(f"Fine-tuned model already exists at {fine_tuned_model_path}")
    metrics_file = os.path.join(fine_tuned_model_path, 'metrics.json')
    if os.path.exists(metrics_file):
        fine_tuning_metrics = load_metrics(fine_tuned_model_path)
    else:
        # Set placeholders if metrics are not available
        fine_tuning_metrics['fine_tuned_model_path'] = fine_tuned_model_path
        fine_tuning_metrics['max_steps'] = "N/A (Metrics not available)"
        fine_tuning_metrics['batch_size'] = "N/A (Metrics not available)"
        fine_tuning_metrics['total_training_time'] = "N/A (Metrics not available)"
        fine_tuning_metrics['train_loss_history'] = []
        fine_tuning_metrics['eval_loss_history'] = []
        fine_tuning_metrics['eval_wer_history'] = []


max_steps is given, it will override any value given in num_train_epochs
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  5%|▌         | 25/500 [01:14<23:12,  2.93s/it]

{'loss': 1.5004, 'grad_norm': 8.717329978942871, 'learning_rate': 4.600000000000001e-06, 'epoch': 7.14}


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
                                                
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 1.2244247198104858, 'eval_wer': 102.79720279720279, 'eval_runtime': 11.0648, 'eval_samples_per_second': 1.085, 'eval_steps_per_second': 0.271, 'epoch': 7.14}


 10%|█         | 50/500 [02:43<23:00,  3.07s/it]

{'loss': 0.4019, 'grad_norm': 2.3732190132141113, 'learning_rate': 9.600000000000001e-06, 'epoch': 14.29}


                                                
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.8245256543159485, 'eval_wer': 100.0, 'eval_runtime': 12.7529, 'eval_samples_per_second': 0.941, 'eval_steps_per_second': 0.235, 'epoch': 14.29}


 15%|█▌        | 75/500 [04:14<21:54,  3.09s/it]

{'loss': 0.0649, 'grad_norm': 1.8990089893341064, 'learning_rate': 9.48888888888889e-06, 'epoch': 21.43}


                                                
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.8177196979522705, 'eval_wer': 92.65734265734265, 'eval_runtime': 12.5577, 'eval_samples_per_second': 0.956, 'eval_steps_per_second': 0.239, 'epoch': 21.43}


 20%|██        | 100/500 [05:45<20:40,  3.10s/it]

{'loss': 0.0145, 'grad_norm': 0.34284093976020813, 'learning_rate': 8.933333333333333e-06, 'epoch': 28.57}


                                                 
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.8171883225440979, 'eval_wer': 77.27272727272727, 'eval_runtime': 11.7395, 'eval_samples_per_second': 1.022, 'eval_steps_per_second': 0.256, 'epoch': 28.57}


 25%|██▌       | 125/500 [07:15<19:49,  3.17s/it]

{'loss': 0.0018, 'grad_norm': 0.03412243351340294, 'learning_rate': 8.377777777777779e-06, 'epoch': 35.71}


                                                 
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.8253594040870667, 'eval_wer': 64.33566433566433, 'eval_runtime': 12.6022, 'eval_samples_per_second': 0.952, 'eval_steps_per_second': 0.238, 'epoch': 35.71}


 30%|███       | 150/500 [08:46<18:16,  3.13s/it]

{'loss': 0.0008, 'grad_norm': 0.015993455424904823, 'learning_rate': 7.822222222222224e-06, 'epoch': 42.86}


                                                 
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.8315367102622986, 'eval_wer': 65.38461538461539, 'eval_runtime': 12.4924, 'eval_samples_per_second': 0.961, 'eval_steps_per_second': 0.24, 'epoch': 42.86}


 35%|███▌      | 175/500 [10:17<15:23,  2.84s/it]

{'loss': 0.0006, 'grad_norm': 0.011768111027777195, 'learning_rate': 7.266666666666668e-06, 'epoch': 50.0}


                                                 
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.83879154920578, 'eval_wer': 81.46853146853147, 'eval_runtime': 12.7162, 'eval_samples_per_second': 0.944, 'eval_steps_per_second': 0.236, 'epoch': 50.0}


 39%|███▉      | 197/500 [11:39<15:35,  3.09s/it]

In [None]:

# ================================
# 8. Load Both Base and Fine-Tuned Models for Inference
# ================================

# Load the base Whisper model
base_tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language='Russian', task='transcribe')
base_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    tokenizer=base_tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
)

# Load the fine-tuned Whisper model
fine_tuned_tokenizer = WhisperTokenizer.from_pretrained(fine_tuned_model_path, language='Russian', task='transcribe')
fine_tuned_pipe = pipeline(
    task="automatic-speech-recognition",
    model=os.path.join(fine_tuned_model_path, "checkpoint-250"),  # Adjust checkpoint path as needed
    tokenizer=fine_tuned_tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
)


In [11]:
# ================================
# 9. Define Inference Function
# ================================

def transcribe_both(audio):
    """
    Transcribe the given audio file using both the base and fine-tuned Whisper models.

    Args:
        audio (str): Path to the audio file.

    Returns:
        list: List containing transcriptions from both models.
    """
    if audio is None:
        return ["No audio provided.", "No audio provided."]

    try:
        # Transcribe with base model
        base_result = base_pipe(audio)
        base_text = base_result.get("text", "Error in base model transcription.")

        # Transcribe with fine-tuned model
        fine_tuned_result = fine_tuned_pipe(audio)
        fine_tuned_text = fine_tuned_result.get("text", "Error in fine-tuned model transcription.")

        return [base_text, fine_tuned_text]
    except Exception as e:
        error_message = f"Error during transcription: {str(e)}"
        return [error_message, error_message]

In [12]:
# ================================
# 10. Generate Training Loss Plot
# ================================

def generate_loss_plot(train_loss_history, eval_loss_history):
    """
    Generates a matplotlib plot of training and evaluation loss over steps.

    Args:
        train_loss_history (list): List of dictionaries with 'step' and 'loss'.
        eval_loss_history (list): List of dictionaries with 'step' and 'eval_loss'.

    Returns:
        PIL.Image or None: Image data in PIL format or None if no data.
    """
    if not train_loss_history and not eval_loss_history:
        return None  # No data to plot

    plt.figure(figsize=(10, 6))

    if train_loss_history:
        steps = [entry['step'] for entry in train_loss_history]
        losses = [entry['loss'] for entry in train_loss_history]
        plt.plot(steps, losses, marker='o', linestyle='-', color='b', label='Training Loss')

    if eval_loss_history:
        eval_steps = [entry['step'] for entry in eval_loss_history]
        eval_losses = [entry['eval_loss'] for entry in eval_loss_history]
        plt.plot(eval_steps, eval_losses, marker='x', linestyle='--', color='r', label='Evaluation Loss')

    plt.title('Training and Evaluation Loss over Steps')
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    # Save plot to a bytes buffer
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)

    # Open image with PIL and return
    image = Image.open(buf)
    return image

def generate_wer_plot(eval_wer_history):
    """
    Generates a matplotlib plot of evaluation Word Error Rate (WER) over steps.

    Args:
        eval_wer_history (list): List of dictionaries with 'step' and 'eval_wer'.

    Returns:
        PIL.Image or None: Image data in PIL format or None if no data.
    """
    if not eval_wer_history:
        return None  # No data to plot

    steps = [entry['step'] for entry in eval_wer_history]
    wers = [entry['eval_wer'] for entry in eval_wer_history]

    plt.figure(figsize=(10, 6))
    plt.plot(steps, wers, marker='s', linestyle='-', color='g', label='Evaluation WER (%)')
    plt.title('Evaluation Word Error Rate (WER) over Steps')
    plt.xlabel('Step')
    plt.ylabel('WER (%)')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()

    # Save plot to a bytes buffer
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)

    # Open image with PIL and return
    image = Image.open(buf)
    return image

# Generate the plots
loss_plot = generate_loss_plot(
    fine_tuning_metrics.get('train_loss_history', []),
    fine_tuning_metrics.get('eval_loss_history', [])
)

wer_plot = generate_wer_plot(
    fine_tuning_metrics.get('eval_wer_history', [])
)



In [13]:
# ================================
# 11. Create Gradio Interface
# ================================

with gr.Blocks() as demo:
    gr.Markdown("# Whisper ASR Comparison")
    gr.Markdown(
        """
        Upload an audio file to see transcriptions from both the **Base Whisper Model** and the **Fine-Tuned Whisper Model**.
        """
    )
    
    with gr.Row():
        audio_input = gr.Audio(label="Upload Audio", type="filepath")
    
    with gr.Row():
        transcribe_button = gr.Button("Transcribe")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Base Whisper Model Transcription")
            base_output = gr.Textbox(lines=10, interactive=False)
        with gr.Column():
            gr.Markdown("### Fine-Tuned Whisper Model Transcription")
            fine_tuned_output = gr.Textbox(lines=10, interactive=False)
    
    transcribe_button.click(
        fn=transcribe_both,
        inputs=audio_input,
        outputs=[base_output, fine_tuned_output],
    )
    
    gr.Markdown("---")
    
    gr.Markdown("## Fine-Tuning Metrics")
    
    with gr.Row():
        with gr.Column(scale=1):
            # Display Metrics without labels
            gr.Markdown("**Fine-Tuned Model Path:**")
            model_path_display = gr.Textbox(value=fine_tuning_metrics.get('fine_tuned_model_path', 'N/A'), lines=1, interactive=False)
            
            gr.Markdown("**Max Steps:**")
            max_steps_display = gr.Textbox(value=fine_tuning_metrics.get('max_steps', 'N/A'), lines=1, interactive=False)
            
            gr.Markdown("**Batch Size:**")
            batch_size_display = gr.Textbox(value=fine_tuning_metrics.get('batch_size', 'N/A'), lines=1, interactive=False)
            
            gr.Markdown("**Total Training Time (seconds):**")
            training_time_display = gr.Textbox(value=round(fine_tuning_metrics.get('total_training_time', 0), 2), lines=1, interactive=False)
            
            gr.Markdown("**Latest Evaluation Loss:**")
            latest_eval_loss = "N/A"
            if fine_tuning_metrics.get('eval_loss_history'):
                latest_eval_loss = fine_tuning_metrics['eval_loss_history'][-1]['eval_loss']
            gr.Textbox(value=latest_eval_loss, lines=1, interactive=False)
            
            gr.Markdown("**Latest Evaluation WER:**")
            latest_eval_wer = "N/A"
            if fine_tuning_metrics.get('eval_wer_history'):
                latest_eval_wer = fine_tuning_metrics['eval_wer_history'][-1]['eval_wer']
            gr.Textbox(value=latest_eval_wer, lines=1, interactive=False)
        with gr.Column(scale=1):
            # Display Plots
            gr.Markdown("### Training and Evaluation Loss Plot")
            if loss_plot:
                loss_plot_display = gr.Image(
                    label=None,
                    value=loss_plot,
                    interactive=False
                )
            else:
                loss_plot_display = gr.Markdown("No loss data available.")
            
            gr.Markdown("### Evaluation Word Error Rate (WER) Plot")
            if wer_plot:
                wer_plot_display = gr.Image(
                    label=None,
                    value=wer_plot,
                    interactive=False
                )
            else:
                wer_plot_display = gr.Markdown("No WER data available.")
    
    gr.Markdown(
        """
        ---
        Built with [Gradio](https://gradio.app) and [Hugging Face Transformers](https://huggingface.co/transformers/)
        """
    )



In [None]:
# Launch the Gradio interface
demo.launch(share=False)