### Let's load dataset

In [214]:
# from datasets import load_dataset, load_metric, Audio, Dataset
from tqdm.auto import tqdm

import os
import numpy as np
import pandas as pd
import torchaudio

os.environ["WANDB_DISABLED"] = "true"

In [215]:
timit_path = './data'

In [216]:
data_path = './data/data/'

In [217]:
df_train = pd.read_csv(timit_path + '/train_data.csv')
df_test = pd.read_csv(timit_path +'/test_data.csv')
df = pd.concat([df_train, df_test])
df = df[df['is_converted_audio'] == False]

In [218]:
df

Unnamed: 0,index,test_or_train,dialect_region,speaker_id,filename,path_from_data_dir,path_from_data_dir_windows,is_converted_audio,is_audio,is_word_file,is_phonetic_file,is_sentence_file
1,2.0,TRAIN,DR4,MMDM0,SI1311.PHN,TRAIN/DR4/MMDM0/SI1311.PHN,TRAIN\\DR4\\MMDM0\\SI1311.PHN,False,False,False,True,False
2,3.0,TRAIN,DR4,MMDM0,SI1311.WRD,TRAIN/DR4/MMDM0/SI1311.WRD,TRAIN\\DR4\\MMDM0\\SI1311.WRD,False,False,True,False,False
3,4.0,TRAIN,DR4,MMDM0,SX321.PHN,TRAIN/DR4/MMDM0/SX321.PHN,TRAIN\\DR4\\MMDM0\\SX321.PHN,False,False,False,True,False
4,5.0,TRAIN,DR4,MMDM0,SX321.WRD,TRAIN/DR4/MMDM0/SX321.WRD,TRAIN\\DR4\\MMDM0\\SX321.WRD,False,False,True,False,False
5,6.0,TRAIN,DR4,MMDM0,SI681.TXT,TRAIN/DR4/MMDM0/SI681.TXT,TRAIN\\DR4\\MMDM0\\SI681.TXT,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8394,8395.0,TEST,DR8,MPAM0,SX19.WAV,TEST/DR8/MPAM0/SX19.WAV,TEST\\DR8\\MPAM0\\SX19.WAV,False,True,False,False,False
8395,8396.0,TEST,DR8,MPAM0,SX109.TXT,TEST/DR8/MPAM0/SX109.TXT,TEST\\DR8\\MPAM0\\SX109.TXT,False,False,False,False,True
8397,8398.0,TEST,DR8,MPAM0,SX289.WRD,TEST/DR8/MPAM0/SX289.WRD,TEST\\DR8\\MPAM0\\SX289.WRD,False,False,True,False,False
8398,8399.0,TEST,DR8,MPAM0,SX109.WAV,TEST/DR8/MPAM0/SX109.WAV,TEST\\DR8\\MPAM0\\SX109.WAV,False,True,False,False,False


### Split Dataset

In [219]:
data = {}

for idx, row in tqdm(df.iterrows()):
    path = row['path_from_data_dir']
    entry_id = path.split('.')[0]

    if entry_id not in data:
        data[entry_id] = {}

    if row['is_audio'] is True:
        data[entry_id]['audio_file'] = os.path.join(data_path, path)
    elif row['is_word_file'] is True:
        data[entry_id]['word_file'] = os.path.join(data_path, path)
    elif row['is_phonetic_file'] is True:
        data[entry_id]['phonetic_file'] = os.path.join(data_path, path)

25200it [00:01, 24727.25it/s]


In [220]:
import random

keys = [key for key in data.keys() if len(data[key]) == 3]
random.Random(101).shuffle(keys)

num_train = int(len(keys) * 0.8)
num_valid = int(len(keys) * 0.1)
num_test = len(keys) - num_train - num_valid

train_keys = keys[:num_train]
valid_keys = keys[num_train:num_train + num_valid]
test_keys = keys[-num_test:]

In [221]:
train = { key:data[key] for key in train_keys }
valid = { key:data[key] for key in valid_keys }
test  = { key:data[key] for key in test_keys }

In [222]:
import librosa

def get_durations(dict_data):
    total_durations = 0

    for entry in dict_data.values():
        audio_data, _ = librosa.load(entry['audio_file'], sr=16_000)
        duration = len(audio_data) / 16_000
        total_durations += duration

    return int(total_durations)

In [223]:
# print(f"Duration of Train: {get_durations(train) // 60} mns")
# print(f"Duration of Valid: {get_durations(valid) // 60} mns")
# print(f"Duration of Test : {get_durations(test) // 60} mns")

In [224]:
import json

with open("./data/custom/custom_train.json", "w") as f:
    json.dump(train, f)
with open("./data/custom/custom_valid.json", "w") as f:
    json.dump(valid, f)
with open("./data/custom/custom_test.json", "w") as f:
    json.dump(test, f)

### Preprocessing


In [225]:
def convert_to_feature_dict(data_dict):
    # convert each feature into an array instead
    audio_files = []
    word_files = []
    phonetic_files = []
    for key, value in data_dict.items():
        audio_files.append(value['audio_file'])
        word_files.append(value['word_file'])
        phonetic_files.append(value['phonetic_file'])
    
    return {
        'audio_file': audio_files,
        'word_file': word_files,
        'phonetic_file': phonetic_files
    }

In [226]:
train = convert_to_feature_dict(train)
valid = convert_to_feature_dict(valid)
test  = convert_to_feature_dict(test)

In [227]:
import pandas as pd

# Chuyển đổi dict thành DataFrame
train_dataset = pd.DataFrame(train)
valid_dataset    = pd.DataFrame(valid)
test_dataset  = pd.DataFrame(test)

# Kiểm tra dữ liệu
print(train_dataset.head())

                               audio_file  \
0      ./data/data/TEST/DR7/FISB0/SA1.WAV   
1  ./data/data/TRAIN/DR4/MAEB0/SI1411.WAV   
2    ./data/data/TRAIN/DR2/FCAJ0/SX39.WAV   
3    ./data/data/TRAIN/DR3/MILB0/SX93.WAV   
4   ./data/data/TEST/DR2/MMDM2/SI2082.WAV   

                                word_file  \
0      ./data/data/TEST/DR7/FISB0/SA1.WRD   
1  ./data/data/TRAIN/DR4/MAEB0/SI1411.WRD   
2    ./data/data/TRAIN/DR2/FCAJ0/SX39.WRD   
3    ./data/data/TRAIN/DR3/MILB0/SX93.WRD   
4   ./data/data/TEST/DR2/MMDM2/SI2082.WRD   

                            phonetic_file  
0      ./data/data/TEST/DR7/FISB0/SA1.PHN  
1  ./data/data/TRAIN/DR4/MAEB0/SI1411.PHN  
2    ./data/data/TRAIN/DR2/FCAJ0/SX39.PHN  
3    ./data/data/TRAIN/DR3/MILB0/SX93.PHN  
4   ./data/data/TEST/DR2/MMDM2/SI2082.PHN  


In [228]:
print(train_dataset)

                                  audio_file  \
0         ./data/data/TEST/DR7/FISB0/SA1.WAV   
1     ./data/data/TRAIN/DR4/MAEB0/SI1411.WAV   
2       ./data/data/TRAIN/DR2/FCAJ0/SX39.WAV   
3       ./data/data/TRAIN/DR3/MILB0/SX93.WAV   
4      ./data/data/TEST/DR2/MMDM2/SI2082.WAV   
...                                      ...   
2683    ./data/data/TEST/DR7/FCAU0/SX317.WAV   
2684    ./data/data/TEST/DR7/MTWH0/SX290.WAV   
2685   ./data/data/TRAIN/DR4/MRFL0/SX436.WAV   
2686     ./data/data/TRAIN/DR3/FSKC0/SA2.WAV   
2687    ./data/data/TEST/DR7/MKJL0/SX110.WAV   

                                   word_file  \
0         ./data/data/TEST/DR7/FISB0/SA1.WRD   
1     ./data/data/TRAIN/DR4/MAEB0/SI1411.WRD   
2       ./data/data/TRAIN/DR2/FCAJ0/SX39.WRD   
3       ./data/data/TRAIN/DR3/MILB0/SX93.WRD   
4      ./data/data/TEST/DR2/MMDM2/SI2082.WRD   
...                                      ...   
2683    ./data/data/TEST/DR7/FCAU0/SX317.WRD   
2684    ./data/data/TEST/DR7/MTWH0/SX29

In [229]:
def read_text_file(filepath):
    with open(filepath) as f:
        tokens = [line.split()[-1] for line in f]
        return " ".join(tokens)
    
def prepare_text_data(item):
    item['text'] = read_text_file(item['word_file'])
    item['phonetic'] = read_text_file(item['phonetic_file'])
    return item

In [230]:
# Áp dụng hàm xử lý `prepare_text_data` lên từng dòng
train_dataset = train_dataset.apply(prepare_text_data, axis=1)
valid_dataset = valid_dataset.apply(prepare_text_data, axis=1)
test_dataset  = test_dataset.apply(prepare_text_data, axis=1)

# Loại bỏ cột không cần thiết
train_dataset = train_dataset.drop(columns=["word_file", "phonetic_file"])
valid_dataset = valid_dataset.drop(columns=["word_file", "phonetic_file"])
test_dataset  = test_dataset.drop(columns=["word_file", "phonetic_file"])

# Kiểm tra kết quả
print(train_dataset.head())

                               audio_file  \
0      ./data/data/TEST/DR7/FISB0/SA1.WAV   
1  ./data/data/TRAIN/DR4/MAEB0/SI1411.WAV   
2    ./data/data/TRAIN/DR2/FCAJ0/SX39.WAV   
3    ./data/data/TRAIN/DR3/MILB0/SX93.WAV   
4   ./data/data/TEST/DR2/MMDM2/SI2082.WAV   

                                                text  \
0  she had your dark suit in greasy wash water al...   
1  resolved that the anti slavery sentiment is be...   
2      barb's gold bracelet was a graduation present   
3                   cut a small corner off each edge   
4                      you're boiling milk ain't you   

                                            phonetic  
0  h# sh iy hv ae dcl d y er dcl d aa r kcl k s u...  
1  h# r ix z aa l v dh eh tcl dh iy ae nx ix s l ...  
2  h# b aa r bcl b z gcl g ow l dcl b r ey s epi ...  
3  h# k ah dx ax s epi m ao l kcl k ao r nx er ao...  
4  h# y axr bcl b oy l ix ng m ih l kcl k q ey n ...  


In [231]:
# Lấy danh sách phonetics từ train_dataset
train_phonetics = [phone for phonetic in train_dataset["phonetic"] for phone in phonetic.split()]

# Đếm số lượng phonemes khác nhau
print("num of train phones:\t", len(set(train_phonetics)))


num of train phones:	 61


In [232]:
# TimitBet 61 phoneme mapping to 39 phonemes
# by Lee, K.-F., & Hon, H.-W. (1989). Speaker-independent phone recognition using hidden Markov models. IEEE Transactions on Acoustics, Speech, and Signal Processing, 37(11), 1641–1648. doi:10.1109/29.46546 
phon61_map39 = {
    'iy':'iy',  'ih':'ih',   'eh':'eh',  'ae':'ae',    'ix':'ih',  'ax':'ah',   'ah':'ah',  'uw':'uw',
    'ux':'uw',  'uh':'uh',   'ao':'aa',  'aa':'aa',    'ey':'ey',  'ay':'ay',   'oy':'oy',  'aw':'aw',
    'ow':'ow',  'l':'l',     'el':'l',  'r':'r',      'y':'y',    'w':'w',     'er':'er',  'axr':'er',
    'm':'m',    'em':'m',     'n':'n',    'nx':'n',     'en':'n',  'ng':'ng',   'eng':'ng', 'ch':'ch',
    'jh':'jh',  'dh':'dh',   'b':'b',    'd':'d',      'dx':'dx',  'g':'g',     'p':'p',    't':'t',
    'k':'k',    'z':'z',     'zh':'sh',  'v':'v',      'f':'f',    'th':'th',   's':'s',    'sh':'sh',
    'hh':'hh',  'hv':'hh',   'pcl':'h#', 'tcl':'h#', 'kcl':'h#', 'qcl':'h#','bcl':'h#','dcl':'h#',
    'gcl':'h#','h#':'h#',  '#h':'h#',  'pau':'h#', 'epi': 'h#','nx':'n',   'ax-h':'ah','q':'h#' 
}

def convert_phon61_to_phon39(sentence):
    tokens = [phon61_map39[x] for x in sentence.split()]
    return " ".join(tokens)

def normalize_phones(item):
    item['phonetic'] = convert_phon61_to_phon39(item['phonetic'])
    return item

In [233]:
def normalize_phones(phonetic_text):
    return convert_phon61_to_phon39(phonetic_text)  # Chỉ nhận vào chuỗi và trả về chuỗi

# Áp dụng cho từng dòng trong DataFrame
train_dataset["phonetic"] = train_dataset["phonetic"].apply(normalize_phones)
valid_dataset["phonetic"] = valid_dataset["phonetic"].apply(normalize_phones)
test_dataset["phonetic"] = test_dataset["phonetic"].apply(normalize_phones)


In [234]:
train_phonetics = [phone for phonetic in train_dataset["phonetic"] for phone in phonetic.split()]
valid_phonetics = [phone for phonetic in valid_dataset["phonetic"] for phone in phonetic.split()]
test_phonetics = [phone for phonetic in test_dataset["phonetic"] for phone in phonetic.split()]

print("num of train phones:\t", len(set(train_phonetics)))
print("num of valid phones:\t", len(set(valid_phonetics)))
print("num of test phones:\t", len(set(test_phonetics)))


num of train phones:	 39
num of valid phones:	 39
num of test phones:	 39


In [235]:
phone_vocabs = set(train_phonetics)
phone_vocabs.remove('h#')
phone_vocabs = sorted(phone_vocabs)

def count_frequency(phonetics):
    phone_counts = {phone: 0 for phone in phone_vocabs}
    for phone in phonetics:
        if phone in phone_vocabs:
            phone_counts[phone] += 1
    # eliminate h# for visualization purposes
    return [phone_counts[phone] for phone in phone_vocabs] 

In [236]:
train_phone_counts = count_frequency(train_phonetics)
valid_phone_counts = count_frequency(valid_phonetics)
test_phone_counts  = count_frequency(test_phonetics)

In [237]:
train_phone_ratio = [count / sum(train_phone_counts) for count in train_phone_counts]
valid_phone_ratio = [count / sum(valid_phone_counts) for count in valid_phone_counts]
test_phone_ratio  = [count / sum(test_phone_counts) for count in test_phone_counts]


In [240]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(25,10))

# plt.bar(phone_vocabs, train_phone_ratio)
# plt.bar(phone_vocabs, valid_phone_ratio, bottom=train_phone_ratio)
# plt.bar(phone_vocabs, test_phone_ratio, bottom=[(x+y) for x,y in zip(train_phone_ratio, valid_phone_ratio)])

# plt.show()

In [246]:
import soundfile as sf
import pandas as pd

# Hàm để tải tệp âm thanh và thay đổi sampling rate
def load_audio(file_path, sampling_rate=16_000):
    # Đọc tệp âm thanh với `soundfile`
    audio, sr = sf.read(file_path)
    
    # Nếu sampling rate không khớp, thay đổi nó (sử dụng resampling)
    if sr != sampling_rate:
        import librosa
        audio = librosa.resample(audio, sr, sampling_rate)
    
    return audio

# Áp dụng hàm lên cột `audio_file`
train_dataset['audio'] = train_dataset['audio_file'].apply(lambda x: load_audio(x, 16000))
valid_dataset['audio'] = valid_dataset['audio_file'].apply(lambda x: load_audio(x, 16000))
test_dataset['audio'] = test_dataset['audio_file'].apply(lambda x: load_audio(x, 16000))

# Xóa cột `audio_file` nếu không còn cần thiết
train_dataset = train_dataset.drop(columns=["audio_file"])
valid_dataset = valid_dataset.drop(columns=["audio_file"])
test_dataset = test_dataset.drop(columns=["audio_file"])

# Kiểm tra kết quả
print(train_dataset.head())


                                                text  \
0  she had your dark suit in greasy wash water al...   
1  resolved that the anti slavery sentiment is be...   
2      barb's gold bracelet was a graduation present   
3                   cut a small corner off each edge   
4                      you're boiling milk ain't you   

                                            phonetic  \
0  h# sh iy hh ae h# d y er h# d aa r h# k s uw h...   
1  h# r ih z aa l v dh eh h# dh iy ae n ih s l ey...   
2  h# b aa r h# b z h# g ow l h# b r ey s h# l ih...   
3  h# k ah dx ah s h# m aa l h# k aa r n er aa f ...   
4  h# y er h# b oy l ih ng m ih l h# k h# ey n h#...   

                                               audio  
0  [-6.103515625e-05, 6.103515625e-05, 0.00012207...  
1  [-9.1552734375e-05, 3.0517578125e-05, 0.000152...  
2  [0.0, -6.103515625e-05, -0.0001220703125, -3.0...  
3  [-9.1552734375e-05, 0.0, -6.103515625e-05, 9.1...  
4  [3.0517578125e-05, 6.103515625e-05, 0.0, 0.0, ..

In [248]:
train_dataset_first_row = train_dataset.iloc[[0]]
print(train_dataset_first_row)


                                                text  \
0  she had your dark suit in greasy wash water al...   

                                            phonetic  \
0  h# sh iy hh ae h# d y er h# d aa r h# k s uw h...   

                                               audio  
0  [-6.103515625e-05, 6.103515625e-05, 0.00012207...  


In [250]:
# import IPython.display as ipd
# import numpy as np
# import random

# rand_int = random.randint(0, len(train_dataset)-1)

# print("Text:", train_dataset[rand_int]["text"])
# print("Phonetics:", train_dataset[rand_int]["phonetic"])
# print("Input array shape:", train_dataset[rand_int]["audio"]["array"].shape)
# print("Sampling rate:", train_dataset[rand_int]["audio"]["sampling_rate"])
# ipd.Audio(data=train_dataset[rand_int]["audio"]["array"], autoplay=False, rate=16000)

In [251]:
vocab_train = list(set(train_phonetics)) + [' ']
vocab_valid = list(set(valid_phonetics)) + [' ']
vocab_test  = list(set(test_phonetics)) + [' ']

In [252]:
vocab_list = list(set(vocab_train + vocab_valid + vocab_test))
vocab_dict = {v: k for k, v in enumerate(sorted(vocab_list))}

print(vocab_dict)

{' ': 0, 'aa': 1, 'ae': 2, 'ah': 3, 'aw': 4, 'ay': 5, 'b': 6, 'ch': 7, 'd': 8, 'dh': 9, 'dx': 10, 'eh': 11, 'er': 12, 'ey': 13, 'f': 14, 'g': 15, 'h#': 16, 'hh': 17, 'ih': 18, 'iy': 19, 'jh': 20, 'k': 21, 'l': 22, 'm': 23, 'n': 24, 'ng': 25, 'ow': 26, 'oy': 27, 'p': 28, 'r': 29, 's': 30, 'sh': 31, 't': 32, 'th': 33, 'uh': 34, 'uw': 35, 'v': 36, 'w': 37, 'y': 38, 'z': 39}


In [254]:
# make the space more intuitive to understand
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

42

In [255]:
# save vocab.json
import json
with open('./data/custom/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [256]:
symbols = {"a": "ə", "ey": "eɪ", "aa": "ɑ", "ae": "æ", "ah": "ə", "ao": "ɔ",
           "aw": "aʊ", "ay": "aɪ", "ch": "ʧ", "dh": "ð", "eh": "ɛ", "er": "ər",
           "hh": "h", "ih": "ɪ", "jh": "ʤ", "ng": "ŋ",  "ow": "oʊ", "oy": "ɔɪ",
           "sh": "ʃ", "th": "θ", "uh": "ʊ", "uw": "u", "zh": "ʒ", "iy": "i", "y": "j"}

In [257]:
from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from tokenizers.processors import TemplateProcessing

In [259]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./data/custom/", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|", )  # './' load vocab.json in the current directory
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)  
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [260]:
# This does not prepare the input for the Transformer model.
# This will resample the data and convert the sentence into indices
# Batch here is just for one entry (row)
def prepare_dataset(batch):
    audio = batch["audio"]
    
    # batched output is "un-batched"
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["phonetic"]).input_ids
    return batch

In [261]:
# train_dataset = train_dataset.map(prepare_dataset, remove_columns=train_dataset.column_names)
# valid_dataset = valid_dataset.map(prepare_dataset, remove_columns=valid_dataset.column_names)
# test_dataset = test_dataset.map(prepare_dataset, remove_columns=test_dataset.column_names)
train_dataset = train_dataset.map(prepare_dataset)
valid_dataset = valid_dataset.map(prepare_dataset)
test_dataset = test_dataset.map(prepare_dataset)

TypeError: string indices must be integers, not 'str'