In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch

from datasets import Dataset, DatasetDict, Audio, load_from_disk, concatenate_datasets
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer

In [None]:
# Log in to the Hugging Face Hub from inside the notebook. The final cell calls
# push_to_hub, which presumably relies on these credentials — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Locations of the metadata CSV and the directory holding the mp3 clips,
# both relative to the notebook's working directory.
TRAINING_CSV_PATH = "./bengaliai-speech/train.csv"
TRAINING_AUDIO_LOCATION = "./bengaliai-speech/train_mp3s/"

# Checkpoint the feature extractor and tokenizer are loaded from, and the
# language handed to the tokenizer.
BASE_MODEL = "openai/whisper-tiny"
LANGUAGE = "bengali"


# Import and pre-process the data for training and validation

In [None]:
# Read the metadata CSV into a DataFrame; the path is already a plain string,
# so it is passed to pandas as-is.
raw_data = pd.read_csv(TRAINING_CSV_PATH)

# Report the row count, then show the first rows as the cell output.
print(f"Number of Samples in Set: {len(raw_data)}")
raw_data.head()

**Split the data into training and validation sets**

In [None]:
def create_split(df, col, value, audio_dir=None):
    """Select the rows of one split and attach the path of each audio clip.

    Args:
        df: DataFrame containing at least the columns `col`, 'id' and 'sentence'.
        col: Name of the column that labels which split a row belongs to.
        value: Rows whose `col` entry equals this value are kept.
        audio_dir: Prefix prepended to "<id>.mp3" to build each audio path.
            Defaults to the module-level TRAINING_AUDIO_LOCATION.

    Returns:
        A new DataFrame with the columns 'id', 'sentence' and 'audio', keeping
        the original index labels of the selected rows. `df` is not modified.
    """
    if audio_dir is None:
        audio_dir = TRAINING_AUDIO_LOCATION

    # Select rows and columns in one .loc call, then add the new column with
    # .assign so it lands on a fresh frame. Assigning into the result of a
    # chained selection is what triggers pandas' SettingWithCopyWarning.
    selected = df.loc[df[col] == value, ['id', 'sentence']]
    audio_paths = selected['id'].apply(lambda clip_id: f"{audio_dir}{clip_id}.mp3")
    return selected.assign(audio=audio_paths)

# Build one frame per label found in the 'split' column: train first, then valid.
train_df, validation_df = (
    create_split(raw_data, 'split', split_name) for split_name in ('train', 'valid')
)

# Report the size of each split, then preview the training rows as the cell output.
print(f"{len(train_df)} entries in the training set")
print(f"{len(validation_df)} entries in the validation set")
train_df.head()

**Create a Hugging Face `Dataset` for each split (the `DatasetDict` is assembled after preprocessing)**

In [None]:
def _as_audio_dataset(frame):
    """Wrap a frame's 'audio' and 'sentence' columns in a Dataset.

    The 'audio' column is cast to Audio(sampling_rate=16000).
    """
    columns = {"audio": frame['audio'], "sentence": frame['sentence']}
    return Dataset.from_dict(columns).cast_column("audio", Audio(sampling_rate=16000))


# Keep only the first 10000 training rows and the first 1000 validation rows.
train_df = train_df.head(10000)
validation_df = validation_df.head(1000)

train_ds = _as_audio_dataset(train_df)
validation_ds = _as_audio_dataset(validation_df)

# The DatasetDict is built later, from the prepared shards.

**Create the Whisper tokenizer and feature extractor**
*Check that the tokenizer round-trips a sample sentence*

In [None]:

# Load the feature extractor published with the BASE_MODEL checkpoint;
# prepare_dataset calls it to produce the "input_features" column.
feature_extractor = WhisperFeatureExtractor.from_pretrained(BASE_MODEL)

In [None]:

# Load the tokenizer for the same checkpoint, configured with LANGUAGE and the
# "transcribe" task; prepare_dataset calls it to produce the "labels" column.
tokenizer = WhisperTokenizer.from_pretrained(BASE_MODEL, language=LANGUAGE, task="transcribe")

In [None]:
# Encode the first sentence in the metadata, then decode the ids twice:
# once keeping the special tokens and once stripping them.
input_str = raw_data['sentence'].iloc[0]
labels = tokenizer(input_str).input_ids
decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip) for skip in (False, True)
)

# Print the original next to both decodings and whether the stripped decoding
# reproduces the input exactly.
print(
    f"Input String: {input_str}",
    f"Decoded w/ Special: {decoded_with_special}",
    f"Decoded w/o Special: {decoded_str}",
    f"Are Equal: {input_str == decoded_str}",
    sep="\n",
)

**Prepare the dataset**

In [None]:
def prepare_dataset(batch):
    """Add model inputs and label ids to one example and return it.

    Reads batch["audio"] (indexed by "array" and "sampling_rate") and
    batch["sentence"]; writes batch["input_features"] and batch["labels"].
    The same `batch` object is mutated and returned.
    """
    clip = batch["audio"]

    # Run the feature extractor on the waveform at the clip's own sampling
    # rate and keep the first (only) entry of the returned features.
    extracted = feature_extractor(clip["array"], sampling_rate=clip["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Tokenize the transcript; the token ids become the training labels.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

def _prepare_in_shards(dataset, shard_prefix, num_shards):
    """Preprocess `dataset` shard by shard and return the prepared shards.

    The dataset is first cut into `num_shards` contiguous shards, each saved to
    f"{shard_prefix}{index}". Every saved shard is then reloaded from disk and
    mapped through `prepare_dataset`, dropping the shard's original columns.

    Args:
        dataset: Dataset to preprocess (must support .shard()).
        shard_prefix: Path prefix; the shard index is appended to it.
        num_shards: Number of contiguous shards to split `dataset` into.

    Returns:
        List of prepared shards, in shard-index order.
    """
    # Pass 1: write every raw shard to disk before any preprocessing starts.
    for shard_idx in range(num_shards):
        raw_shard = dataset.shard(num_shards=num_shards, index=shard_idx, contiguous=True)
        raw_shard.save_to_disk(f"{shard_prefix}{shard_idx}")

    # Pass 2: reload each shard and run feature extraction / tokenization on it.
    prepared_shards = []
    for shard_idx in range(num_shards):
        shard = load_from_disk(f"{shard_prefix}{shard_idx}")
        shard = shard.map(
            prepare_dataset,
            remove_columns=shard.column_names,
            writer_batch_size=1000,
            keep_in_memory=False,
        )
        prepared_shards.append(shard)
    return prepared_shards


# Shard the datasets into chunks for preparation (training first, then validation).
num_shards = 10
prepared_training_datasets = _prepare_in_shards(
    train_ds, "./training-shards/bengali-ai-train-set-", num_shards
)
prepared_validation_datasets = _prepare_in_shards(
    validation_ds, "./valid-shards/bengali-ai-valid-set-", num_shards
)

# Concatenate the prepared shards back into one dataset per split.
datasets = DatasetDict({"valid": concatenate_datasets(prepared_validation_datasets), "train": concatenate_datasets(prepared_training_datasets)})

print(datasets)

# Upload the prepared splits to the Hugging Face Hub.
datasets.push_to_hub("bengali-ai-train-set-tiny")