# Importing libraries

In [2]:
try:
    import datasets
except:
    !pip install datasets
try:
    import torch
except:
    !pip install torch
    import torch

try:
    import transformers
except:
    !pip install transformers==4.25
    import transformers
try:
    import accelerate
except:
    !pip install accelerate==0.20.1
    import accelerate

# Setting up config

In [28]:
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor

language = "English"
task = "transcribe"
model_name_or_path = "openai/whisper-small"
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language=language, task=task)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [29]:
def prepare_dataset(batch):
    
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

# Creation of dataset (from IMDA)
We are using the national speech corpus dataset available from IMDA, please refer [here](https://www.imda.gov.sg/how-we-can-help/national-speech-corpus) for how to get the dataset.

In [30]:
from datasets import load_from_disk, concatenate_datasets, DatasetDict, Audio
import os
import glob

def create_audio_dataset(text_file_path, audio_folder_path):
    id_to_text = {}

    with open(text_file_path, 'r') as f:
        lines = f.readlines()  # Read all lines into a list

    # Remove the Unicode Character 'ZERO WIDTH NO-BREAK SPACE' if it exists
    lines = [line.replace('\ufeff', '') for line in lines]

    # Iterate over the lines two by two (step by 2)
    for i in range(0, len(lines), 2):
        id_line = lines[i].strip()
        text_line = lines[i + 1].strip() if (i + 1) < len(lines) else ""

        # Split the ID line by tab to separate the ID and text
        parts = id_line.split('\t')
        if len(parts) == 2:
            id_part, _ = parts
            # Strip any whitespace from the ID and use the lowercase text part
            id_to_text[id_part.strip()] = text_line
        else:
            # Handle error if the line does not contain exactly two parts
            print(f"Error in line format: {id_line}")

    ids = []
    texts = []
    audio_paths = []

    # Iterate over the id_to_text dictionary and populate the lists
    for id, text in id_to_text.items():
        # The ID in the file name is prefixed with four zeroes in your file system
        audio_file_name = f"{id}.WAV"
        audio_file_path = os.path.join(audio_folder_path, audio_file_name)

        if os.path.isfile(audio_file_path):  # Check if the file exists
            ids.append(id)
            texts.append(text)
            audio_paths.append(audio_file_path)
        else:
            print(f"Audio file not found for ID: {id}")

    # Create a Dataset
    dataset = Dataset.from_dict({
        "id": ids,
        "text": texts,
        "audio": audio_paths
    })

    # Cast the 'audio' column to Audio feature type
    dataset = dataset.cast_column("audio", Audio())

    return dataset

# Create dataset

In [18]:
text_file_path = '###' # TO CHANGE
audio_folder_path = '###' # TO CHANGE

audio_dataset = create_audio_dataset(text_file_path, audio_folder_path)
audio_dataset.save_to_disk('dataset13_0') # TO CHANGE

ValueError: No audio data found. Expecting filename, url, or data.

# Split into train-test
Splitting into train-test split for easier evaluation when fine-tuning model.

In [31]:
# Function to process and concatenate datasets
def process_and_concatenate_datasets(dataset_path, feature_extractor, tokenizer):
    # Lists to store train and test sets for concatenation
    train_datasets = []
    test_datasets = []

    # Retrieve all the dataset files from the path
    dataset_files = glob.glob(f'{dataset_path}/*')
    dataset_files.sort()

    for file in dataset_files:
        # Load and process each dataset
        audio_dataset = load_from_disk(file)
        audio_dataset = audio_dataset.remove_columns(["id"])
        audio_dataset = audio_dataset.rename_column('text', 'sentence')
        audio_dataset = audio_dataset.cast_column("audio", Audio(sampling_rate=16000))
        audio_dataset = audio_dataset.map(prepare_dataset, remove_columns=['audio', 'sentence'])

        # Split the dataset
        split_dataset = audio_dataset.train_test_split(test_size=0.2, shuffle=True)
        train_datasets.append(split_dataset['train'])
        test_datasets.append(split_dataset['test'])

    # Concatenate all datasets in the lists
    train_concatenated = concatenate_datasets(train_datasets)
    test_concatenated = concatenate_datasets(test_datasets)

    # Combine into one DatasetDict"
    concatenated_dataset = DatasetDict({
        'train': train_concatenated,
        'test': test_concatenated
    })

    return concatenated_dataset

In [38]:
concatenated_dataset = process_and_concatenate_datasets('databruv', feature_extractor, tokenizer)

Map:   0%|          | 0/431 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Map:   0%|          | 0/421 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/442 [00:00<?, ? examples/s]

Map:   0%|          | 0/403 [00:00<?, ? examples/s]

Map:   0%|          | 0/419 [00:00<?, ? examples/s]

Map:   0%|          | 0/419 [00:00<?, ? examples/s]

Map:   0%|          | 0/419 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


# Saving dataset

In [40]:
concatenated_dataset
concatenated_dataset.save_to_disk('full_dataset_2')

Saving the dataset (0/7 shards):   0%|          | 0/3641 [00:00<?, ? examples/s]

Saving the dataset (0/2 shards):   0%|          | 0/916 [00:00<?, ? examples/s]