# Train Hugging Face's "bangla-speech-processing/BanglaASR" model</br> on the  OpenSLR Bangla dataset



### Step 1: Set Up Your Environment

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)


Tue Dec  5 10:29:02 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    44W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install --quiet datasets>=2.6.1 librosa evaluate>=0.30 jiwer gradio


In [3]:
!pip install --quiet git+https://github.com/huggingface/transformers


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [4]:
!pip install --quiet  --upgrade accelerate


In [5]:
!pip install --quiet tensorboardX


In [6]:
!pip install --quiet torch==2.1.0 torchvision==0.16.0+cu118 torchaudio==2.1.0 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html


In [7]:
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '<your Google Cloud service account key- JSON file>'


In [8]:
PATH_TO_TSV_FILE = "tsv/utt_spk_text_processed.tsv"
BUCKET_NAME = "voice_translation"
DESTINATION_FOLDER = "flac_to_wav_bn/"


In [9]:
from transformers import AutoProcessor, AutoModelForCTC
import torch, torchaudio
from google.cloud import storage
import librosa
import numpy as np
import pandas as pd
import random
import io
from jiwer import wer
import os


In [10]:
from transformers import file_utils
print(file_utils.default_cache_path)


/root/.cache/huggingface/hub


In [11]:
# List all the folders in the cache
os.listdir(file_utils.default_cache_path)


['.locks', 'version.txt', 'models--bangla-speech-processing--BanglaASR']

In [12]:
# Delete the  cache folder
#import shutil
#shutil.rmtree(file_utils.default_cache_path)


In [13]:
# Check if the cache folder is deleted
#os.listdir(file_utils.default_cache_path)


In [14]:
# Connect to the Google Cloud Storage bucket
storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)


### Step 2: Load the Pre-Trained Model

In [15]:
from transformers import WhisperTokenizer
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor
from transformers import WhisperForConditionalGeneration


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "bangla-speech-processing/BanglaASR"


feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(model_name)
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)


### Step 3: Preprocess the data to train the model

- #### Step 1: Fetch Audio File Paths from GCS

In [17]:
def list_blobs_with_prefix(prefix, delimiter=None):
    """Lists all the blobs in the bucket with a given prefix."""

    blobs = bucket.list_blobs(prefix=prefix, delimiter=delimiter)

    list_blobs = []
    # Just print the first 10 blobs returned for demonstration purposes.
    for blob in blobs:
        list_blobs.append(blob.name)

    # Print 1st 5 elements
    print(list_blobs[:2])

    return list_blobs


# Replace with your bucket name
bucket_name = BUCKET_NAME
audio_files_prefix = DESTINATION_FOLDER

# List all audio files
audio_file_paths = list_blobs_with_prefix(audio_files_prefix)

# Number of audio files
print(
    f"Found {len(audio_file_paths)} audio files in {bucket_name}/{audio_files_prefix}."
)


['flac_to_wav_bn/', 'flac_to_wav_bn/000020a912.wav']
Found 168911 audio files in voice_translation/flac_to_wav_bn/.


In [18]:
# Check if GPU is available
torch.cuda.is_available()


True

- #### Step 2: Fetch Transcriptions from the TSV File

In [19]:
def download_blob_to_dataframe(source_blob_name):
    """Downloads a blob from the bucket and load it as a pandas dataframe."""
    blob = bucket.blob(source_blob_name)
    data = blob.download_as_text()

    df = pd.read_csv(io.StringIO(data), sep="\t", header=None)
    return df

# Replace with your bucket name and TSV file path in the bucket
tsv_file_path = PATH_TO_TSV_FILE
transcriptions_df = download_blob_to_dataframe(tsv_file_path)

# Print the first 5 rows
transcriptions_df.head()


Unnamed: 0,0,1,2
0,000020a912,16cfb,বাংলাদেশে দায়িত্ব নেবে
1,000039928e,976b1,এ ধরণের কার্ড নিয়ে
2,00005debc7,f83df,হতে উপার্জিত অর্থ
3,00009e687c,9813c,হাসির বিষয় হয়েই আছে
4,00012843bc,7ec1c,সার্ক দেশগুলোতে


In [20]:
# Print the shape of the dataframe
transcriptions_df.shape


(127565, 3)

- Drop duplicates if any

In [21]:
transcriptions_df_unique = transcriptions_df.drop_duplicates(subset=[2])
transcriptions_df_unique.shape


(83120, 3)

- Since the dataset is huge, we use 30% of the dataset for our ASR model.

In [22]:
transcriptions_df_unique = transcriptions_df_unique.sample(frac=0.3, random_state=42)
transcriptions_df_unique.shape


(24936, 3)

- #### Step 3: Split Full Dataset

In [23]:
from sklearn.model_selection import train_test_split

# First column of the TSV file has the filenames without extensions
ground_truths = dict(zip(transcriptions_df_unique[0], transcriptions_df_unique[2]))

# all_data has:  tuples (audio_file_path, transcription)
# If the ground truth is not available, don't include the audio file
# in all_data
all_data = [
    (path, ground_truths.get(path.split("/")[-1].replace(".wav", ""), ""))
    for path in audio_file_paths
    if ground_truths.get(path.split("/")[-1].replace(".wav", ""), "") != ""
]

# Split the data into training + validation and test sets
trainval_data, test_data = train_test_split(all_data, test_size=0.1)  # 10% for testing

# Further split the training + validation into separate training and validation sets
train_data, val_data = train_test_split(
    trainval_data, test_size=0.1
)  # 10% of 90% for validation

# Print the number of samples in each set
print(f"Number of training samples: {len(train_data)}")
print(f"Number of validation samples: {len(val_data)}")
print(f"Number of test samples: {len(test_data)}")

# Print a sample from each set
print(f"Sample from training set: {train_data[0]}")
print(f"Sample from validation set: {val_data[0]}")
print(f"Sample from test set: {test_data[0]}")


Number of training samples: 17361
Number of validation samples: 1930
Number of test samples: 2144
Sample from training set: ('flac_to_wav_bn/052abf9f08.wav', 'কারখানাগুলোতে তারা অগ্নি নিরাপত্তাজনিত')
Sample from validation set: ('flac_to_wav_bn/9ceba5425e.wav', 'অম্বিকাপুর, ভারতের')
Sample from test set: ('flac_to_wav_bn/a11c1c68fc.wav', 'ঈদের আগে')


In [25]:
# Store the train_data and val_data in a dataframe
train_df = pd.DataFrame(train_data, columns=["path", "sentence"])
val_df = pd.DataFrame(val_data, columns=["path", "sentence"])
test_df = pd.DataFrame(test_data, columns=["path", "sentence"])


- Add the folder name of your bucket to the path where the audio files are stored.

In [26]:
train_df.head()


Unnamed: 0,path,sentence
0,flac_to_wav_bn/052abf9f08.wav,কারখানাগুলোতে তারা অগ্নি নিরাপত্তাজনিত
1,flac_to_wav_bn/9f5ade8fc8.wav,জানা গেছে মারসেল ফ্রিজে
2,flac_to_wav_bn/4ab7b6dafd.wav,নির্বাচনের প্রস্তুতি
3,flac_to_wav_bn/2a8ca24cb2.wav,অবিচ্ছিন্ন চতুর্থ উইকেট জুটিতে
4,flac_to_wav_bn/45d640fdb1.wav,অশোকনগর সড়ক পথে


In [28]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Step 4: Transcription with the Model


- Get the audio file from the GC Storage 

In [29]:
# Streams a blob from the specified GCS bucket
def stream_blob(source_blob_name):
    blob = bucket.blob(source_blob_name)
    # Print the blob name
    #print(f"Blob name: {blob.name}")
    # Load the audio file into memory
    audio_data = blob.download_as_bytes()

    return audio_data


- Preprocess the audio file for the speech recognition task.
</br>
1. `Loading Audio Data`: loads the audio file from a given path (in our case from Google Cloud Storage)
2. `Processing with torchaudio`: takes from the first input channel and convert into numpy array
3. `Resampling`: resamples the audio: Use sampling_rate of 16kHz
4. `Feature Extraction`: extracts features from the audio data, the log-Mel spectrograms, suitable for a speech recognition model.
5. `Tokenization`: tokenizes the target sentence (transcription) and converts it into input IDs as per the tokenizer's vocabulary.
6. `Label length`: calculates the length of the tokenized labels.




In [30]:
def prepare_dataset(batch):
    # Load the audio file into memory from the GCS bucket
    audio_data = stream_blob(batch["path"])


    speech_array, sampling_rate = torchaudio.load(io.BytesIO(audio_data),
                                                  format="wav")
    array = speech_array[0].numpy()
    batch["array"] = librosa.resample(
        np.asarray(array), orig_sr=sampling_rate, target_sr=16000
    )
    # compute the length of the audio file 'audio_data' returned by the function stream_blob
    batch["input_length"] = len(audio_data)

    #     compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(
        batch["array"], sampling_rate=16000
    ).input_features[0]
    #     batch["sampling_rate"] = sampling_rate
    batch["sampling_rate"] = 16000
    #     # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    # compute labels length **with** special tokens! -> total label length
    batch["labels_length"] = len(batch["labels"])

    #print(batch)

    return batch


In [32]:
# delete the train_dataset if it already exists
if not os.path.exists("train_dataset"):
    print("train_dataset deleted successfully")
else:
    print("train_dataset is not deleted,,,,Deleting it")
    !rm -r train_dataset

if not os.path.exists("train_dataset_final"):
    print("train_dataset_final deleted successfully")
else:
    print("train_dataset_final is not deleted,,,,Deleting it")
    !rm -r train_dataset_final


train_dataset deleted successfully
train_dataset_final deleted successfully


In [None]:
from datasets import Dataset


- Convert a pandas DataFrame (train_df) into a Dataset object from the datasets library by Hugging Face; suitable for training and evaluation.

In [33]:
# Batch processing of the training dataset
train_dataset = Dataset.from_pandas(train_df)


In [None]:
# Preprocess the training dataset
train_dataset_final = train_dataset.map(prepare_dataset, num_proc=16)


- Convert a pandas DataFrame (val_df) into a Dataset object from the datasets library by Hugging Face; suitable for training and evaluation.

In [35]:
# Validation dataset
val_dataset = Dataset.from_pandas(val_df)


In [None]:
val_data_final = val_dataset.map(prepare_dataset, num_proc=4)


- Play one of the training audio files and listen to it. (to ensure it works)

In [38]:
import IPython.display as ipd
import numpy as np
import random


rand_int = random.randint(0, len(train_dataset_final))
print(train_dataset_final[rand_int]["sentence"])
ipd.Audio(data=np.asarray(train_dataset_final[rand_int]["array"]), autoplay=True, rate=16000)


যেদিন হইতে ডাইনামাইট সৃষ্টি হইয়াছে


- Create a custom `data collator`` for handling speech-to-text data in a sequence-to-sequence learning setup. 
</br>
This class is particularly useful when working with models that require input data to be batched and padded correctly.

In [39]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(
        self, features: List[Dict[str, Union[List[int], torch.Tensor]]]
    ) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [
            {"input_features": feature["input_features"]} for feature in features
        ]
        # This ensures that all audio sequences in the batch have the same length, which is necessary for processing
        # them as a batch in a neural network.
        batch = self.processor.feature_extractor.pad(
            input_features, return_tensors="pt"
        )

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly.
        # It ensures that padding tokens do not contribute to the loss.
        labels = labels_batch["input_ids"].masked_fill(
            labels_batch.attention_mask.ne(1), -100
        )

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


- Call the custom data collator to collate the training data.

In [40]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)


- Evaluate 

- Load the `Word Error Rate (WER)` metric using the `evaluate` library from Hugging Face. 
- `WER` calculates the rate of errors in the predicted transcription compared to the actual (ground truth) transcription.

In [None]:
import evaluate

metric = evaluate.load("wer")


- Define `compute_metrics` function for calculating the Word Error Rate (WER) using Hugging Face's Transformers library. 

1. Extract Predictions and Labels
2. Handle Padding
3. Decode Predictions and Labels
4. Calculate WER and return the dictionary

In [42]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


In [43]:
print(model)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

- Configuration of Seq2SeqTrainingArguments from the Hugging Face transformers library; for training a sequence-to-sequence model.

In [70]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-bn-openslr",  # change to a repo name of your choice
    per_device_train_batch_size=26, #16
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=100,
    max_steps=150,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=26,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    lr_scheduler_type="cosine",  # or try 'cosine', 'cosine_with_restarts'
    max_grad_norm=1.0,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    #push_to_hub=False,
    push_to_hub=True,
)


- Instantiate a Seq2SeqTrainer object from the Hugging Face transformers library. </br>
This trainer will be used for training and evaluating a sequence-to-sequence model, while regularly computing the WER to gauge model performance.

In [71]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset_final,
    eval_dataset=val_data_final,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


In [72]:
processor.save_pretrained(training_args.output_dir)


In [73]:
import torch
torch.cuda.empty_cache()


In [None]:
torch.cuda.memory_summary(device=None, abbreviated=False)


In [None]:
#trainer.train()


- Implement a custom stopping condition based on the Word Error Rate (WER), as Seq2SeqTrainer does not directly support early stopping. 

In [59]:
def custom_train(trainer, target_wer=0.4):
    step = 0
    lowest_wer = float('inf')
    while step < training_args.max_steps:
        # Perform a training step
        trainer.train(resume_from_checkpoint=True, trial=None, ignore_keys_for_eval=[])
        step += training_args.eval_steps
        # Evaluate the model
        metrics = trainer.evaluate()
        # Check and print metrics
        print('Evaluation metrics:', metrics)
        current_wer = metrics.get('wer', None)
        # Check if current_wer is valid before comparison
        if current_wer is not None:
            if current_wer < target_wer:
                print(f"Stopping training: WER reached {current_wer*100}% which is below the target of {target_wer*100}%")
                break
            if current_wer < lowest_wer:
                lowest_wer = current_wer
                # Save the best model
                trainer.save_model()
        else:
            print("Current WER is None. Check your compute_metrics function.")


- Called the `custom_train` function to train and evaluate the model.

In [60]:
custom_train(trainer)


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Step,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Evaluation metrics: {'eval_loss': 0.40609219670295715, 'eval_wer': 40.668928744546776, 'eval_runtime': 608.0214, 'eval_samples_per_second': 3.174, 'eval_steps_per_second': 0.123, 'epoch': 0.3}
Current WER is None. Check your compute_metrics function.


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Step,Training Loss,Validation Loss


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


Evaluation metrics: {'eval_loss': 0.40609219670295715, 'eval_wer': 40.668928744546776, 'eval_runtime': 608.0516, 'eval_samples_per_second': 3.174, 'eval_steps_per_second': 0.123, 'epoch': 0.3}
Current WER is None. Check your compute_metrics function.


- Save the model to huggingface hub

In [None]:
trainer.save_model()
trainer.push_to_hub()
tokenizer.push_to_hub("tanushrin/whisper-small-bn-openslr")
