In [58]:
#explore the dataset & look at the structure of the data

import datasets
from datasets import load_dataset, load_metric

# We are using the timit dataset : 
# English ASR dataset contains 5hours of training data
timit = load_dataset("timit_asr", data_dir="/home/ix502iv/Documents/Datasets/timit_large")

print(timit)

Using custom data configuration default-43b510b3628aa686
Found cached dataset timit_asr (/home/ix502iv/.cache/huggingface/datasets/timit_asr/default-43b510b3628aa686/0.0.0/43f9448dd5db58e95ee48a277f466481b151f112ea53e27f8173784da9254fb2)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 4620
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 1680
    })
})


In [59]:
# drop most of the params of the file
timit = timit.remove_columns(['phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'])

In [60]:
# randomly display samples of the dataset
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset) #number of elemnts to display should not be more than
                                         # the ones available in the dataset
    picks = [] #initialize an empty list

    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick) #append pick to end of picks

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(timit["train"].remove_columns(['audio','file'])) 

# below is a read speech corpus

Unnamed: 0,text
0,Quince seed gum is the main ingredient in wave-setting lotions.
1,Brush fires are common in the dry underbrush of Nevada.
2,The rose corsage smelled sweet.
3,You're so preoccupied that you've let your faith grow dim.
4,Cut a small corner off each edge.
5,Cliff's display was misplaced on the screen.
6,All too often our language is unduly harsh.
7,They remained lifelong friends and companions.
8,Bagpipes and bongos are musical instruments.
9,She had your dark suit in greasy wash water all year.


In [61]:
# since we are not working with a language model, we eliminate special characters
# we also normalize to lower case

import re #Regular expression -> specifies a set of string that matches it
chars_to_ignore_regex = '[\,?\-\:\;\!\"]'

def rm_special_chars(batch):
    batch['text'] = re.sub(chars_to_ignore_regex, '', batch['text']).lower()
    return batch

timit = timit.map(rm_special_chars)

show_random_elements(timit["train"].remove_columns(['audio','file']))

  0%|          | 0/4620 [00:00<?, ?ex/s]

  0%|          | 0/1680 [00:00<?, ?ex/s]

Unnamed: 0,text
0,don't ask me to carry an oily rag like that.
1,in either case they do not appreciate the private detective's zeal.
2,don't ask me to carry an oily rag like that.
3,her hum became a gurgle of surprise.
4,the local drugstore was charged with illegally dispensing tranquilizers.
5,well then who brought it
6,the courier was a dwarf.
7,these exclusive documents must be locked up at all times.
8,samples he walked in upon her invitation.
9,a monstrous shadow fell across the illuminated wall distorted and indefinable.


In [62]:
# have a mapping function that concatenates all transcriptions into one long one
# transform the string into a set of chars -> all done in batch

def extract_all_chars(batch):
    all_text = " ".join(batch["text"]) #join all items into a string, use space as a separator
    vocab = list(set(all_text)) #create a list of objects, use set() to display in a random order
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=timit.column_names["train"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [63]:
# create a union of all distinct letters in the train/test dataset
# convert the list into a dictionary

vocab_list = list(set(vocabs['train']['vocab'][0]) | set(vocabs["test"]["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'h': 0,
 'o': 1,
 'c': 2,
 'd': 3,
 'j': 4,
 'b': 5,
 'a': 6,
 'z': 7,
 's': 8,
 'q': 9,
 '.': 10,
 'm': 11,
 'u': 12,
 't': 13,
 ' ': 14,
 'r': 15,
 'w': 16,
 'v': 17,
 'g': 18,
 'n': 19,
 'k': 20,
 'i': 21,
 'e': 22,
 'p': 23,
 "'": 24,
 'f': 25,
 'l': 26,
 'y': 27,
 'x': 28}

In [64]:
# give " " a more visible character
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [65]:
# add an unkown token to deal with characters not encoutered in our specific dataset
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
print(len(vocab_dict))

31


In [None]:
# save the vocab in a json file
import json
with open('vocab1.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)