## Import and Explore Dataset

In [1]:
import os

# Move the kernel from the notebook's folder to its parent (the project root)
# so that relative paths such as "data/..." resolve.
# The target is computed once per kernel and remembered: a bare os.chdir("..")
# would climb one more level every time this cell is re-run.
current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")

if "PROJECT_ROOT" not in globals():
    # First run in this kernel: the root is the parent of the starting directory.
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)
current_directory = os.getcwd()
print(f"The current working directory is: {current_directory}")


The current working directory is: /home/thun/Documents/python_pj/accent_vn/notebooks
The current working directory is: /home/thun/Documents/python_pj/accent_vn


In [2]:
import pickle
# Pickled input dataset; the path is relative to the working directory set by the first cell.
ds_file = "data/vimd_dataset.pkl"


def load_dataset(filename):
    """Return the object stored in the pickle file ``filename``.

    Unpickling can execute arbitrary code, so only pass files that were
    produced by a trusted source.
    """
    with open(filename, mode="rb") as handle:
        return pickle.load(handle)

# Load the pickled dataset once; the cells below reuse `ds`.
ds = load_dataset(ds_file)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Environment check: print the installed torchcodec version.
import torchcodec
print(torchcodec.__version__)

0.7.0


In [4]:
# Environment check: PyTorch version, the CUDA version reported by the build,
# and whether CUDA is usable in this kernel.
import torch
print(torch.__version__)
print(torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())

2.8.0+cu128
12.8
CUDA available: True


In [6]:
# Show the splits of the loaded dataset with their columns and row counts.
print(ds)

DatasetDict({
    train: Dataset({
        features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
        num_rows: 15023
    })
    test: Dataset({
        features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
        num_rows: 2026
    })
    valid: Dataset({
        features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
        num_rows: 1900
    })
})


In [5]:
from datasets import Audio
from collections import Counter

# Re-cast the audio column with decoding disabled and keep the result under a
# separate name; `ds` itself is not modified by cast_column.
# NOTE(review): presumably done so that exploring the metadata never decodes
# audio — confirm.
ds_no_audio = ds.cast_column("audio", Audio(decode=False))

# Number of training examples per region (names stripped and lower-cased).
regions = [r.strip().lower() for r in ds_no_audio["train"]["region"]]
print(Counter(regions))


Counter({'north': 5913, 'central': 4705, 'south': 4405})


In [8]:
# Number of training examples per province (names stripped and lower-cased).
province_names = [r.strip().lower() for r in ds_no_audio["train"]["province_name"]]

counter = Counter(province_names)

# Print the number of distinct provinces, then one line per province from the
# most to the least frequent.
print("Unique provinces: ", len(counter))
for name, count in counter.most_common():
    print(f"{name}: {count}")

Unique provinces:  63
caobang: 287
nghean: 280
quangbinh: 280
danang: 271
thaibinh: 265
binhthuan: 262
hagiang: 260
quangngai: 260
binhphuoc: 260
bariavungtau: 255
gialai: 252
thainguyen: 251
quangninh: 249
binhduong: 249
phutho: 248
hanoi: 248
hochiminh: 246
ninhthuan: 246
haugiang: 245
daklak: 244
daknong: 244
longan: 243
phuyen: 242
kontum: 242
haiphong: 241
laocai: 241
sonla: 241
binhdinh: 241
langson: 240
namdinh: 240
thuathienhue: 240
lamdong: 239
hatinh: 238
tayninh: 237
thanhhoa: 236
camau: 236
laichau: 235
quangtri: 233
yenbai: 232
haiduong: 231
bentre: 231
quangnam: 230
backan: 229
tiengiang: 228
vinhlong: 226
dienbien: 225
khanhhoa: 225
travinh: 225
tuyenquang: 224
baclieu: 224
hungyen: 221
bacninh: 221
hoabinh: 220
angiang: 220
kiengiang: 220
ninhbinh: 218
dongnai: 218
dongthap: 217
bacgiang: 217
soctrang: 216
vinhphuc: 215
hanam: 214
cantho: 209


In [15]:
# Inspect the train split on its own (columns and row count).
ds_no_audio["train"]

Dataset({
    features: ['region', 'province_code', 'province_name', 'filename', 'text', 'speakerID', 'gender', 'audio'],
    num_rows: 15023
})

In [16]:
# Column types of the train split after the audio column was re-cast with decode=False.
ds_no_audio["train"].features

{'region': Value(dtype='string', id=None),
 'province_code': Value(dtype='int64', id=None),
 'province_name': Value(dtype='string', id=None),
 'filename': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'speakerID': Value(dtype='string', id=None),
 'gender': Value(dtype='int64', id=None),
 'audio': Audio(sampling_rate=None, mono=True, decode=False, id=None)}

## Assign Labels

In [6]:
# Accent classes: each tuple lists the provinces that share one label, and the
# label is the tuple's position in this list. Names are spelled the way
# add_label_batch normalises them (stripped, lower-case). Provinces that are
# not listed get label -1 there and are removed by process_dataset.
# An earlier variant gave every province its own label (sorted unique names
# from the train split); it was replaced by this coarser grouping.
_ACCENT_GROUPS = [
    ("hanoi", "hanam", "namdinh", "ninhbinh"),
    ("nghean", "thanhhoa", "hatinh"),
    ("quangbinh", "quangtri", "thuathienhue"),
    ("danang", "quangnam", "quangngai"),
    ("hochiminh", "binhduong", "bariavungtau", "dongnai"),
    ("cantho", "angiang", "kiengiang", "camau"),
]

province2label = {
    province: label
    for label, group in enumerate(_ACCENT_GROUPS)
    for province in group
}


In [7]:
def add_label_batch(batch):
    """Add an integer ``accent_label`` column to a batch of examples.

    Each entry of ``batch["province_name"]`` is stripped, lower-cased and
    looked up in ``province2label``. Missing names (``None``) and names that
    are not in the mapping get the label -1.

    The batch is modified in place and also returned.
    """
    batch["accent_label"] = [
        -1 if raw_name is None else int(province2label.get(raw_name.strip().lower(), -1))
        for raw_name in batch["province_name"]
    ]
    return batch

def process_dataset(ds, batch_size=128, num_proc=1):
    """Label every example with its accent class and drop unlabelled ones.

    Args:
        ds: Dataset (or dict of splits) that has the columns ``region``,
            ``province_name``, ``province_code`` and ``speakerID``.
        batch_size: Number of examples passed to ``add_label_batch`` at once.
            The default is deliberately small: a batch of 1000 made Arrow crash.
        num_proc: Number of worker processes used by ``map`` and ``filter``.

    Returns:
        The dataset with an added ``accent_label`` column, without the four
        metadata columns listed above, and containing only the rows whose
        label is not -1.
    """
    labelled = ds.map(
        add_label_batch,
        batched=True,
        batch_size=batch_size,
        num_proc=num_proc,
        remove_columns=["region", "province_name", "province_code", "speakerID"],
    )
    # Hand the predicate only the label column, so the other columns (notably
    # the audio) are not read or decoded just to test one integer per row.
    return labelled.filter(
        lambda label: label != -1,
        input_columns=["accent_label"],
        num_proc=num_proc,
    )

In [8]:
# Label and filter every split, then print the resulting splits and the
# feature types of the train split.
# NOTE: `ds_proccessed` is a misspelling of "processed"; the name is kept
# because the save cell further down refers to it.
ds_proccessed = process_dataset(ds)
print(ds_proccessed)
print(ds_proccessed["train"].features)


DatasetDict({
    train: Dataset({
        features: ['filename', 'text', 'gender', 'audio', 'accent_label'],
        num_rows: 5041
    })
    test: Dataset({
        features: ['filename', 'text', 'gender', 'audio', 'accent_label'],
        num_rows: 674
    })
    valid: Dataset({
        features: ['filename', 'text', 'gender', 'audio', 'accent_label'],
        num_rows: 661
    })
})
{'filename': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'gender': Value(dtype='int64', id=None), 'audio': Audio(sampling_rate=None, mono=True, decode=True, id=None), 'accent_label': Value(dtype='int64', id=None)}


In [9]:
# Write the labelled dataset to disk as a pickle (path relative to the working directory).
# NOTE(review): a pickle must only ever be loaded from a trusted source, and
# the datasets library offers save_to_disk / load_from_disk as its own storage
# format — consider switching if the code that reads this file can change too.
processed_ds_file = "data/processed_dataset.pkl"

with open(processed_ds_file, "wb") as file:
    pickle.dump(ds_proccessed, file)