# Evaluate Hugging Face's "bangla-speech-processing/BanglaASR" model</br> on the test set of the  OpenSLR Bangla dataset



### Step 1: Set Up Your Environment

In [3]:
!pip install --quiet transformers torchaudio jiwer


In [4]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '{path to your google credentials json file}'


In [5]:
PATH_TO_TSV_FILE = "{path to your tsv OpenSLR file}"
BUCKET_NAME = "{your bucket name}"
DESTINATION_FOLDER = "{your folder in the bucket containing the .wav files}"


In [6]:
from transformers import AutoProcessor, AutoModelForCTC
import torch, torchaudio
from google.cloud import storage
import librosa
import numpy as np
import pandas as pd
import random
import io
from jiwer import wer
import os


In [1]:
from transformers import file_utils
print(file_utils.default_cache_path)


  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]

/Users/tanushrinayak/.cache/huggingface/hub





In [8]:
# Delete the  cache folder
import shutil
shutil.rmtree(file_utils.default_cache_path)


In [10]:
# Connect to the Google Cloud Storage bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)


### Step 2: Load the Pre-Trained Model

In [12]:
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration


In [13]:
# GPU or CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "bangla-speech-processing/BanglaASR"

# Load the model and tokenizer
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)


preprocessor_config.json: 100%|██████████| 339/339 [00:00<00:00, 1.01MB/s]
tokenizer_config.json: 100%|██████████| 805/805 [00:00<00:00, 1.63MB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 2.55MB/s]
merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 11.6MB/s]
normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 22.8MB/s]
added_tokens.json: 100%|██████████| 2.08k/2.08k [00:00<00:00, 5.94MB/s]
special_tokens_map.json: 100%|██████████| 2.08k/2.08k [00:00<00:00, 7.18MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
model.safetensors: 100%|██████████| 967M/967M [00:32<00:00, 29.5MB/s] 
generation_config.json: 100%|██████████| 3.50k/3.50k [00:00<00:00, 9.27MB/s]


### Step 3: Prepare a Test Data to pre-evaluate the model

- #### Step 1: Fetch Audio File Paths from GCS

In [None]:
def list_blobs_with_prefix(prefix, delimiter=None):
    """Lists all the blobs in the bucket with a given prefix."""

    blobs = bucket.list_blobs(prefix=prefix, delimiter=delimiter)

    list_blobs = []
    # Just print the first 10 blobs returned for demonstration purposes.
    for blob in blobs:
        list_blobs.append(blob.name)

    # Print 1st 5 elements
    print(list_blobs[:5])

    return list_blobs


# Replace with your bucket name
bucket_name = BUCKET_NAME
audio_files_prefix = DESTINATION_FOLDER

# List all audio files
audio_file_paths = list_blobs_with_prefix(audio_files_prefix)

# Number of audio files
print(
    f"Found {len(audio_file_paths)} audio files in {bucket_name}/{audio_files_prefix}."
)


- #### Step 2: Fetch Transcriptions from the TSV File

In [16]:
def download_blob_to_dataframe(source_blob_name):
    """Downloads a blob from the bucket and load it as a pandas dataframe."""
    blob = bucket.blob(source_blob_name)
    data = blob.download_as_text()

    df = pd.read_csv(io.StringIO(data), sep="\t", header=None)
    return df

# Replace with your bucket name and TSV file path in the bucket
tsv_file_path = PATH_TO_TSV_FILE
transcriptions_df = download_blob_to_dataframe(tsv_file_path)

# Print the first 5 rows
transcriptions_df.head()


Unnamed: 0,0,1,2
0,000020a912,16cfb,বাংলাদেশে দায়িত্ব নেবে
1,000039928e,976b1,এ ধরণের কার্ড নিয়ে
2,00005debc7,f83df,হতে উপার্জিত অর্থ
3,00009e687c,9813c,হাসির বিষয় হয়েই আছে
4,00012843bc,7ec1c,সার্ক দেশগুলোতে


- #### Step 3: Split Full Dataset

In [18]:
from sklearn.model_selection import train_test_split

# First column of the TSV file has the filenames without extensions
ground_truths = dict(zip(transcriptions_df[0], transcriptions_df[2]))

# all_data has:  tuples (audio_file_path, transcription)
# If the ground truth is not available, don't include the audio file
# in all_data
all_data = [
    (path, ground_truths.get(path.split("/")[-1].replace(".wav", ""), ""))
    for path in audio_file_paths
    if ground_truths.get(path.split("/")[-1].replace(".wav", ""), "") != ""
]

# Split the data into training + validation and test sets
trainval_data, test_data = train_test_split(all_data, test_size=0.1)  # 10% for testing

# Further split the training + validation into separate training and validation sets
train_data, val_data = train_test_split(
    trainval_data, test_size=0.1
)  # 10% of 90% for validation

# Print the number of samples in each set
print(f"Number of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(val_data)}")
print(f"Number of test samples: {len(test_data)}")

# Print a sample from each set
print(f"Sample from training set: {train_data[0]}")
print(f"Sample from validation set: {val_data[0]}")
print(f"Sample from test set: {test_data[0]}")


Number of training samples: 84271
Number of validation samples: 9364
Number of test samples: 10404
Sample from training set: ('flac_to_wav_bn/1317792315.wav', 'অস্ট্রিয়াকে তিনটি')
Sample from validation set: ('flac_to_wav_bn/33e2890916.wav', 'সকাল ৭:৩০টায় গ্রন্থমেলার মূলমঞ্চে')
Sample from test set: ('flac_to_wav_bn/61ef997f05.wav', 'তাই পার্বত্যাঞ্চলে')


- #### Step 4: Sampling a Subset of Data

In [19]:
# Randomly sample a subset from the test set if it's very large
subset_size = 100

test_data_subset = random.sample(test_data, subset_size)


### Step 4: Transcription with the Model


In [20]:
# Streams a blob from the specified GCS bucket
def stream_blob(source_blob_name):
    # Print the file name
    #print(f"Transcribing {source_blob_name}...")
    blob = bucket.blob(source_blob_name)

    # Load the audio file into memory
    audio_data = blob.download_as_bytes()

    return audio_data


In [26]:
# Transcribe the audio file using the specified model for Bengali
def transcribe_bn(audio_file_path):

    # Load the audio file into memory from the GCS bucket
    audio_data = stream_blob(audio_file_path)

    # Use torchaudio to load the audio data
    speech_array, sampling_rate = torchaudio.load(io.BytesIO(audio_data), format="wav")

    #speech_array, sampling_rate = torchaudio.load(audio_file_path, format="wav")
    speech_array = speech_array[0].numpy()
    speech_array = librosa.resample(
        np.asarray(speech_array), orig_sr=sampling_rate, target_sr=16000
    )
    input_features = feature_extractor(
        speech_array, sampling_rate=16000, return_tensors="pt"
    ).input_features

    # batch = processor.feature_extractor.pad(input_features, return_tensors="pt")
    predicted_ids = model.generate(inputs=input_features.to(device))[0]

    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    #print(transcription)

    return transcription


In [21]:
# List all audio files from test_data
audio_file_paths = [item[0] for item in test_data_subset]


In [None]:
# Use the existing trained model to transcribe the test_data audio files
transcriptions = {}

for audio_file_path in audio_file_paths:
    transcription = transcribe_bn(audio_file_path)
    print(f'Transcription for file {audio_file_path} is : {transcription}')
    transcriptions[audio_file_path] = transcription


### Step 5: Calculate WER

In [28]:
# Initialize variables for WER calculation
total_wer = 0
num_files = 0

# Iterate over the transcriptions dictionary
for audio_path, transcription in transcriptions.items():
    # Extract the filename from the audio path
    filename = audio_path.split('/')[-1]
    print(f'Filename: {filename}')
    print(f'Transcription: {transcription}')

    # Get the corresponding ground truth transcription
    ground_truth = ground_truths.get(filename.replace('.wav', ''), None)
    print(f'Ground truth: {ground_truth}')
    # If ground_truth is None, skip the file
    if ground_truth is None:
        continue
    else:
        # Calculate the WER
        wer_value = wer(ground_truth, transcription)
        print(f'WER: {wer_value}')
        # Add to the total WER
        total_wer += wer_value
        # Increment the number of files
        num_files += 1


# Calculate the average WER
average_wer = total_wer / num_files if num_files > 0 else float('inf')

print(f"Average WER: {average_wer}")


Filename: 2d44a3a97e.wav
Transcription: স্কুল শিক্ষার্থী ঐশীয়।
Ground truth: স্কুলশিক্ষার্থী ঐশী
WER: 1.5
Filename: 5a356e1c2c.wav
Transcription: মদুন
Ground truth: মদন
WER: 1.0
Filename: 4452afe6c1.wav
Transcription: গাড়িতে আগন দিয়েছে।
Ground truth: গাড়ীতে আগুন দিয়েছে
WER: 1.0
Filename: 12ac7110b4.wav
Transcription: তার সঙ্গের ওই লোকটা।
Ground truth: তার সঙ্গের ঐ লোকটা
WER: 0.5
Filename: 98a3ae5f53.wav
Transcription: সেই নিল লিপ্ত।
Ground truth: সেই নির্লিপ্ত
WER: 1.0
Filename: 320cd7f47b.wav
Transcription: আমার মরণ।
Ground truth: আমার মরণ
WER: 0.5
Filename: a4c8265353.wav
Transcription: নির্বাচনে আইনসিং খোলারক্ষায়।
Ground truth: নির্বাচনে আইন-শৃঙ্খলা রক্ষায়
WER: 0.6666666666666666
Filename: 11e47bb287.wav
Transcription: পরবর্তী সময়ে সুলতান গঞ্জকে।
Ground truth: পরবর্তী সময়ে সুলতানগঞ্জকে
WER: 0.6666666666666666
Filename: 4efc757f3f.wav
Transcription: কবি ফয়েছে।
Ground truth: কবি ফয়েজ
WER: 0.5
Filename: 46b7ebdc1c.wav
Transcription: বড়ো হওয়া।
Ground truth: বড় হওয়া
WER: 1

# The transcribed text produced by the model matches with the actual audio file better than the ground truth transcription. This is because the model is trained on a large amount of data and is able to generalize better than the ground truth transcription.</br>

## Ignore the WER because of the reason above.