## Import and Explore Dataset

In [None]:
import os

# Move from the `notebooks/` folder up to the project root so that the
# relative paths used further down (e.g. "my_dataset") resolve from there.
# The move is guarded on the folder name: an unconditional os.chdir("..")
# would climb one more level every time this cell is re-run.
current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")

if os.path.basename(current_directory) == "notebooks":
    os.chdir("..")
current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")


The current working directory is: /home/thun/Documents/python_pj/accent_vn/notebooks
The current working directory is: /home/thun/Documents/python_pj/accent_vn


In [2]:
from datasets import load_from_disk

# Load the dataset stored under "my_dataset" (the path is relative to the
# current working directory) and print its summary.
ds = load_from_disk("my_dataset")

print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
        num_rows: 15023
    })
    test: Dataset({
        features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
        num_rows: 2026
    })
    valid: Dataset({
        features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
        num_rows: 1900
    })
})


In [3]:
# Take the first training example and list the Python type of every field.
train_ds = ds["train"]
example = train_ds[0]

for field_name in example:
    print(f"{field_name}: {type(example[field_name])}")

# Show which keys the "audio" entry of that example carries.
audio_entry = example["audio"]
print("Audio features: ", audio_entry.keys())


region: <class 'str'>
province_code: <class 'int'>
province_name: <class 'str'>
filename: <class 'str'>
text: <class 'str'>
speakerID: <class 'str'>
gender: <class 'int'>
audio: <class 'dict'>
Audio features:  dict_keys(['path', 'array', 'sampling_rate'])


In [4]:
# Display the complete first training example as the cell output.
example

{'region': 'North',
 'province_code': 11,
 'province_name': 'CaoBang',
 'filename': '11_0001.wav',
 'text': 'Nghiên cứu học tập,các ứng dụng các khoa học công nghệ và những dụng tiên tiến để đưa vào trong áp dụng trong công việc của mình. Từ đó giúp chuyển đổi những cái khó khăn trong công việc, đưa ra những thuận lợi và những cái nhanh gọn hơn giải quyết các thủ tục hành chính cho được thuận tiện hơn.',
 'speakerID': 'spk_11_0001',
 'gender': 1,
 'audio': {'path': '11_0001.wav',
  'array': array([ 0.01013184,  0.00985718,  0.00961304, ..., -0.0005188 ,
         -0.0005188 , -0.00033569], shape=(1069160,)),
  'sampling_rate': 44100}}

In [5]:
import torch #torch==2.3.0
# Report the installed PyTorch version, the CUDA version reported by
# torch.version.cuda, and whether CUDA is usable from this kernel.
print(torch.__version__)
print(torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

2.3.0+cu121
12.1
CUDA available: True


In [6]:
from datasets import Audio
from collections import Counter

# Re-cast the audio column with decoding disabled.
ds_no_audio = ds.cast_column("audio", Audio(decode=False))

# Count training rows per region after trimming whitespace and lower-casing.
train_regions = ds_no_audio["train"]["region"]
regions = [value.strip().lower() for value in train_regions]
region_counts = Counter(regions)
print(region_counts)


Counter({'north': 5913, 'central': 4705, 'south': 4405})


In [17]:
# Count training rows per province after trimming whitespace and lower-casing.
raw_province_names = ds_no_audio["train"]["province_name"]
province_names = [value.strip().lower() for value in raw_province_names]

counter = Counter(province_names)

print("Unique provinces: ", len(counter))
# most_common() yields (province, count) pairs from most to least frequent.
for province, occurrences in counter.most_common():
    print(f"{province}: {occurrences}")

Unique provinces:  63
caobang: 287
nghean: 280
quangbinh: 280
danang: 271
thaibinh: 265
binhthuan: 262
hagiang: 260
quangngai: 260
binhphuoc: 260
bariavungtau: 255
gialai: 252
thainguyen: 251
quangninh: 249
binhduong: 249
phutho: 248
hanoi: 248
hochiminh: 246
ninhthuan: 246
haugiang: 245
daklak: 244
daknong: 244
longan: 243
phuyen: 242
kontum: 242
haiphong: 241
laocai: 241
sonla: 241
binhdinh: 241
langson: 240
namdinh: 240
thuathienhue: 240
lamdong: 239
hatinh: 238
tayninh: 237
thanhhoa: 236
camau: 236
laichau: 235
quangtri: 233
yenbai: 232
haiduong: 231
bentre: 231
quangnam: 230
backan: 229
tiengiang: 228
vinhlong: 226
dienbien: 225
khanhhoa: 225
travinh: 225
tuyenquang: 224
baclieu: 224
hungyen: 221
bacninh: 221
hoabinh: 220
angiang: 220
kiengiang: 220
ninhbinh: 218
dongnai: 218
dongthap: 217
bacgiang: 217
soctrang: 216
vinhphuc: 215
hanam: 214
cantho: 209


In [15]:
# Display the summary (columns and row count) of the train split.
ds_no_audio["train"]

Dataset({
    features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
    num_rows: 15023
})

In [16]:
# Display the feature schema of the train split.
ds_no_audio["train"].features

{'region': Value(dtype='string', id=None),
 'province_code': Value(dtype='int64', id=None),
 'province_name': Value(dtype='string', id=None),
 'filename': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'speakerID': Value(dtype='string', id=None),
 'gender': Value(dtype='int64', id=None),
 'audio': Audio(sampling_rate=None, mono=True, decode=False, id=None)}

## Assign Labels

In [7]:
# Hand-picked mapping from a normalised province name (lower-case, no spaces)
# to an integer accent label. Provinces are listed in groups; every province
# in a group receives the label equal to the group's position in the list.
# Only the 21 provinces listed here get a label.
# (Alternative, not used: give every province its own label by enumerating
# the sorted unique province names of the train split.)
accent_groups = [
    ["hanoi", "hanam", "namdinh", "ninhbinh"],             # label 0
    ["nghean", "thanhhoa", "hatinh"],                      # label 1
    ["quangbinh", "quangtri", "thuathienhue"],             # label 2
    ["danang", "quangnam", "quangngai"],                   # label 3
    ["hochiminh", "binhduong", "bariavungtau", "dongnai"], # label 4
    ["cantho", "angiang", "kiengiang", "camau"],           # label 5
]
province2label = {
    province: label
    for label, group in enumerate(accent_groups)
    for province in group
}


In [8]:
def add_label_batch(batch):
    """Add an ``accent_label`` column to a batch of examples.

    Each entry of ``batch["province_name"]`` is stripped, lower-cased and
    looked up in the module-level ``province2label`` mapping. ``None`` names
    and names missing from the mapping get the sentinel label ``-1``.

    The batch is modified in place and also returned.
    """
    raw_labels = [
        -1 if province is None else province2label.get(province.strip().lower(), -1)
        for province in batch["province_name"]
    ]
    batch["accent_label"] = list(map(int, raw_labels))
    return batch

def process_dataset(ds):
    """Attach accent labels, drop metadata columns and discard unlabeled rows.

    Parameters
    ----------
    ds
        A ``datasets`` ``Dataset`` or ``DatasetDict`` that contains the
        columns named in ``remove_columns`` below.

    Returns
    -------
    The mapped and filtered dataset: it has a new ``accent_label`` column,
    the listed metadata columns are removed, and only rows whose label is
    not ``-1`` are kept.
    """
    ds = ds.map(
        add_label_batch,
        batched=True,
        batch_size=128,  # smaller chunk; a large one (1000) made Arrow crash
        num_proc=1,
        remove_columns=["region", "province_name", "province_code", "speakerID", "filename"],
    )

    def _has_label(accent_label):
        """Return True when the row received a real (non-sentinel) label."""
        return accent_label != -1

    # Hand only the label column to the predicate. Without `input_columns`
    # the predicate is given whole examples, so the audio column has to be
    # decoded for every row just to test a single integer.
    ds = ds.filter(_has_label, input_columns=["accent_label"], num_proc=1)
    return ds

In [9]:
# Label and filter the dataset, then inspect the result and the schema of
# its train split.
ds_proccessed = process_dataset(ds)
print(ds_proccessed)
print(ds_proccessed["train"].features)

# Persist the processed dataset under "processed_dataset" (relative to the
# current working directory).
ds_proccessed.save_to_disk("processed_dataset")

Filter: 100%|██████████| 15023/15023 [04:49<00:00, 51.83 examples/s]
Filter: 100%|██████████| 2026/2026 [00:34<00:00, 58.43 examples/s]
Filter: 100%|██████████| 1900/1900 [00:32<00:00, 58.39 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'gender', 'audio', 'accent_label'],
        num_rows: 5041
    })
    test: Dataset({
        features: ['text', 'gender', 'audio', 'accent_label'],
        num_rows: 674
    })
    valid: Dataset({
        features: ['text', 'gender', 'audio', 'accent_label'],
        num_rows: 661
    })
})
{'text': Value(dtype='string', id=None), 'gender': Value(dtype='int64', id=None), 'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'accent_label': Value(dtype='int64', id=None)}


Saving the dataset (35/35 shards): 100%|██████████| 5041/5041 [02:57<00:00, 28.42 examples/s]
Saving the dataset (5/5 shards): 100%|██████████| 674/674 [00:21<00:00, 31.73 examples/s]
Saving the dataset (5/5 shards): 100%|██████████| 661/661 [00:29<00:00, 22.24 examples/s]


In [None]:
# Re-cast the audio column with a 16 kHz target rate (the example inspected
# earlier reports 44.1 kHz) and save the result under "processed_dataset16k".
# NOTE(review): cast_column only changes the declared feature; presumably the
# resampling itself happens when audio is decoded. Confirm that the saved
# copy really holds 16 kHz audio.
ds_proccessed16k = ds_proccessed.cast_column("audio", Audio(sampling_rate=16000))
ds_proccessed16k.save_to_disk("processed_dataset16k")


Saving the dataset (35/35 shards): 100%|██████████| 5041/5041 [03:35<00:00, 23.43 examples/s]
Saving the dataset (5/5 shards): 100%|██████████| 674/674 [00:29<00:00, 22.71 examples/s]
Saving the dataset (5/5 shards): 100%|██████████| 661/661 [00:32<00:00, 20.09 examples/s]


In [None]:
import pickle
# Additionally dump both processed datasets as pickle files.
# NOTE: only load these files back from a trusted location, because
# unpickling can execute arbitrary code.
processed_ds_file = "data/processed_ds/vimd_ds.pkl"
processed_ds16k_file = "data/processed_ds/vimd_ds16kHz.pkl"

for pickle_path, dataset in (
    (processed_ds_file, ds_proccessed),
    (processed_ds16k_file, ds_proccessed16k),
):
    # open(..., "wb") raises FileNotFoundError when the target directory is
    # missing, so create it first.
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    with open(pickle_path, "wb") as file:
        pickle.dump(dataset, file)

: 