In [1]:
import json
import pandas as pd
import torch
import librosa
from datasets import load_dataset, Audio, Features, Value

from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    pipeline
)


# Point this to your folder containing model.safetensors, config.json, etc.
model_path = "whisper-finetuned/checkpoint-1611"



In [2]:
processor = WhisperProcessor.from_pretrained(model_path)
model = WhisperForConditionalGeneration.from_pretrained(model_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)



WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 1280)
      (layers): ModuleList(
        (0-31): 32 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (fc2): Linear(in_features=5120, out_features=1280, bia

In [19]:
forced_decoder_ids = processor.get_decoder_prompt_ids(language="az", task="transcribe")

In [21]:
whisper_asr = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=0 if device == "cuda" else -1,
    chunk_length_s=30,
    generate_kwargs={'forced_decoder_ids': forced_decoder_ids}
)

Device set to use cuda:0


In [22]:
features = Features({
    "audio": Value("string"),            # Force IDs to string
    "audio_filepath": Value("string"),    # Force audio_path to string (or use Audio(...) if needed)
    "speaker": Value("string"),
    "transcript": Value("string")# Force description to string
    # Add other fields as needed...
})


dataset = load_dataset(
    "json",
    data_files="../Data/dataset/combined_dataset.json",
    features = features, 
    split = 'train'
)

In [23]:
dataset = dataset.cast_column("audio_filepath", Audio(sampling_rate=16000))

In [24]:
dataset[1]

{'audio': '1150481435_413589_413590_25232',
 'audio_filepath': {'path': '../Data/audio/bcd/voices/1150481435_413589_413590_25232.ogg',
  'array': array([1.63709046e-11, 1.45519152e-11, 1.81898940e-11, ...,
         1.24341566e-02, 1.14010815e-02, 1.16392896e-02]),
  'sampling_rate': 16000},
 'speaker': '1150481435',
 'transcript': '1233482219165815'}

In [25]:
def transcribe_batch(batch):
    # batch["audio_path"] is a list of dicts, each with "array" and "sampling_rate"
    # We'll collect them into a single list of audio inputs
    audio_inputs = []
    for audio_info in batch["audio_filepath"]:
        audio_inputs.append({
            "array": audio_info["array"],
            "sampling_rate": audio_info["sampling_rate"]
        })
    
    # Pass the list to the pipeline in one go
    results = whisper_asr(audio_inputs, batch_size=16)  
    # results is a list of dicts, each with "text"

    # Return them in a list that matches the input length
    transcriptions = [r["text"] for r in results]
    return {"transcription": transcriptions}

# Apply the batched function
dataset = dataset.map(transcribe_batch, batched=True, batch_size=16)

Map:   0%|          | 0/11478 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [27]:
dataset.select_columns(['audi, 'transcript', 'transcription'])

Dataset({
    features: ['audio', 'audio_filepath', 'speaker', 'transcript', 'transcription'],
    num_rows: 11478
})