In [13]:
import torchaudio
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path to the fine-tuned model checkpoint
checkpoint_path = "/opt/jupyter/whisper-tiny-az/checkpoint-4000"  # Your checkpoint directory
processor_checkpoint_path ="/opt/jupyter/whisper-tiny-az" 

In [14]:
processor = WhisperProcessor.from_pretrained(processor_checkpoint_path)

In [15]:
model = WhisperForConditionalGeneration.from_pretrained(checkpoint_path).to(device)

In [16]:
def transcribe_audio(audio_path, target_sampling_rate=16000):
    # Validate audio file
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")
    
    # Load audio
    audio, sr = torchaudio.load(audio_path)
    
    # Resample audio to 16000 Hz if necessary
    if sr != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sampling_rate)
        audio = resampler(audio)
    
    # Ensure audio is mono (Whisper expects single-channel audio)
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
    
    # Process audio for Whisper model
    input_features = processor(audio[0].numpy(), sampling_rate=target_sampling_rate, return_tensors='pt').input_features.to(device)
    
    # Generate transcription
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    
    return transcription

# Example usage
file = "/home/user/.cache/huggingface/datasets/downloads/extracted/683c6f874760e64ca0a4107d56d97ef171e5dd8fdbcaf9ac15b5f3df0e085d89/tr_train_0/common_voice_tr_28856093.mp3"
try:
    transcription = transcribe_audio(file)
    print('Transcription:', transcription)
except Exception as e:
    print(f"Error during transcription: {str(e)}")

`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.ge

Transcription: Sadece birkaç gün.


In [39]:
#transcription with attention mask
audio_path = "/home/user/.cache/huggingface/datasets/downloads/extracted/683c6f874760e64ca0a4107d56d97ef171e5dd8fdbcaf9ac15b5f3df0e085d89/tr_train_0/common_voice_tr_28856093.mp3"
target_sampling_rate = 16000
# Load audio
audio, sr = torchaudio.load(audio_path)

# Resample audio to 16000 Hz if necessary
if sr != target_sampling_rate:
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sampling_rate)
    audio = resampler(audio)
    # Ensure audio is mono (Whisper expects single-channel audio)
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

# Process audio with attention mask
processed_audio = processor(
    audio[0].numpy(),
    sampling_rate=16000,
    return_tensors='pt',
    return_attention_mask=True  # Include attention mask
)
input_features = processed_audio.input_features.to(device)
attention_mask = processed_audio.attention_mask.to(device)  # Get attention mask
    
# Generate transcription with attention mask
with torch.no_grad():  # Disable gradient computation for inference
    predicted_ids = model.generate(
        input_features,
        attention_mask=attention_mask,  # Pass attention mask to model
        max_length=225,
        num_beams=4,  # Beam search for better results
        return_dict_in_generate=False  # Ensure simple tensor output
    )

# Decode transcription
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
transcription

'Sadece birkaç gün.'