## import the required packages

In [51]:
import os
import pandas as pd
import torch

from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import Dataset, DatasetDict, Audio

## set any required constants

In [52]:
# MODEL='thesven/whisper-tiny-bn-thesven'
MODEL='thesven/whisper-tiny-bn-thesven'
LANGUAGE='bengali'
TASK='transcribe'

## Load in the test data

In [53]:
# Step 1: Get the file paths from the directory
directory_path = './bengaliai-speech/test_mp3s/'
file_paths = [os.path.join(directory_path, file_name) for file_name in os.listdir(directory_path) if file_name.endswith('.mp3')]

# Step 2: Create a Hugging Face dataset with the file paths
dataset = Dataset.from_dict({'audio': file_paths})
# cast the column so that it contains the audio file path and the array of audio data
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Print the dataset
print(dataset[0])

{'audio': {'path': './bengaliai-speech/test_mp3s/0f3dac00655e.mp3', 'array': array([ 1.77635684e-15, -1.42108547e-14,  2.48689958e-14, ...,
       -8.83544737e-04, -2.18719291e-03, -2.56951153e-03]), 'sampling_rate': 16000}}


## Create a processor and load the model

In [54]:
processor = WhisperProcessor.from_pretrained(MODEL)
model = WhisperForConditionalGeneration.from_pretrained(MODEL)
forced_decoder_ids = processor.get_decoder_prompt_ids(language=LANGUAGE, task=TASK)

## Prepare the test data for prediction

In [55]:
import pandas as pd

# List to store data
data = []

for audio in dataset:
    audio = audio['audio']
    input_features = processor(audio['array'], sampling_rate=audio['sampling_rate'], return_tensors="pt")
    data.append([audio['path'], input_features])

# Convert data to a pandas DataFrame
df_test_samples = pd.DataFrame(data, columns=['Audio_Path', 'Input_Features'])

df_test_samples.head()

Unnamed: 0,Audio_Path,Input_Features
0,./bengaliai-speech/test_mp3s/0f3dac00655e.mp3,[input_features]
1,./bengaliai-speech/test_mp3s/a9395e01ad21.mp3,[input_features]
2,./bengaliai-speech/test_mp3s/bf36ea8b718d.mp3,[input_features]


## Perform the transcription

In [56]:
def get_transcription(row):
    # Extract input features from the row
    input_features_tensor = row['Input_Features']['input_features']
    print(input_features_tensor)
    
    # Generate token ids
    predicted_ids = model.generate(input_features_tensor, forced_decoder_ids=forced_decoder_ids)
    
    # Decode token ids to text
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    
    return transcription[0]  # Assuming batch_decode returns a list, get the first item

# Apply the function to each row and save the result in a new column 'Transcription'
df_test_samples['Transcription'] = df_test_samples.apply(get_transcription, axis=1)

df_test_samples.head()

tensor([[[-0.8013, -0.8013, -0.8013,  ..., -0.8013, -0.8013, -0.8013],
         [-0.8013, -0.8013, -0.8013,  ..., -0.8013, -0.8013, -0.8013],
         [-0.8013, -0.8013, -0.8013,  ..., -0.8013, -0.8013, -0.8013],
         ...,
         [-0.8013, -0.8013, -0.8013,  ..., -0.8013, -0.8013, -0.8013],
         [-0.8013, -0.8013, -0.8013,  ..., -0.8013, -0.8013, -0.8013],
         [-0.8013, -0.8013, -0.8013,  ..., -0.8013, -0.8013, -0.8013]]])




tensor([[[-0.6312, -0.6312, -0.6312,  ..., -0.6312, -0.6312, -0.6312],
         [-0.6312, -0.6312, -0.6312,  ..., -0.6312, -0.6312, -0.6312],
         [-0.6312, -0.6312, -0.6312,  ..., -0.6312, -0.6312, -0.6312],
         ...,
         [-0.6312, -0.6312, -0.6312,  ..., -0.6312, -0.6312, -0.6312],
         [-0.6312, -0.6312, -0.6312,  ..., -0.6312, -0.6312, -0.6312],
         [-0.6312, -0.6312, -0.6312,  ..., -0.6312, -0.6312, -0.6312]]])
tensor([[[-0.5800, -0.5800, -0.5800,  ..., -0.5800, -0.5800, -0.5800],
         [-0.5800, -0.5800, -0.5800,  ..., -0.5800, -0.5800, -0.5800],
         [-0.5800, -0.5800, -0.5800,  ..., -0.5800, -0.5800, -0.5800],
         ...,
         [-0.5800, -0.5800, -0.5800,  ..., -0.5800, -0.5800, -0.5800],
         [-0.5800, -0.5800, -0.5800,  ..., -0.5800, -0.5800, -0.5800],
         [-0.5800, -0.5800, -0.5800,  ..., -0.5800, -0.5800, -0.5800]]])


Unnamed: 0,Audio_Path,Input_Features,Transcription
0,./bengaliai-speech/test_mp3s/0f3dac00655e.mp3,[input_features],এক্তু বয়ের শুলে একটি বিদেশি।
1,./bengaliai-speech/test_mp3s/a9395e01ad21.mp3,[input_features],কি কারণে তুমি এতাবত কাল পর্রণ্ত এ দারল দৈবো দু...
2,./bengaliai-speech/test_mp3s/bf36ea8b718d.mp3,[input_features],এ কারণে সর্কান্নে ধরিত হারে পরিবহন্জনীত খোতি অ...


In [57]:
print(df_test_samples['Transcription'][0])
print(df_test_samples['Transcription'][1])
print(df_test_samples['Transcription'][2])

এক্তু বয়ের শুলে একটি বিদেশি।
কি কারণে তুমি এতাবত কাল পর্রণ্ত এ দারল দৈবো দুর্বি পাকে প্রতিত ছিলে করা।
এ কারণে সর্কান্নে ধরিত হারে পরিবহন্জনীত খোতি অনুমাদন করে।
