In [1]:
from datasets import Dataset, DatasetDict, Audio, concatenate_datasets
from IPython.display import Audio as DisplayAudio
import random

In [20]:
# Single root for every corpus used in this notebook; edit this one line when
# the data lives somewhere else.
DATA_ROOT = "/mnt/md0/synvoices/data"

# NaijaVoices manifests (.jsonl), one per split.
naijavoices_train_manifest = f"{DATA_ROOT}/naijavoices/train_manifest.jsonl"
naijavoices_dev_manifest = f"{DATA_ROOT}/naijavoices/dev_manifest.jsonl"
naijavoices_test_manifest = f"{DATA_ROOT}/naijavoices/test_manifest.jsonl"

# Synthetic training manifest (the file name suggests a 450-hour subset — TODO confirm).
synthetic_train_manifest = f"{DATA_ROOT}/hausa_asr_filtered_augmented/manifest_450h.jsonl"

In [21]:
# Load each manifest into its own Hugging Face Dataset, tagging it with the
# split name it belongs to. Both training corpora use the "train" split.
_manifest_splits = (
    (naijavoices_train_manifest, "train"),
    (naijavoices_dev_manifest, "validation"),
    (naijavoices_test_manifest, "test"),
    (synthetic_train_manifest, "train"),
)
naijavoices_train, dev, test, synthetic_train = (
    Dataset.from_json(manifest_path, split=split_name)
    for manifest_path, split_name in _manifest_splits
)

In [22]:
# Rename 'audio_filepath' to 'audio' on every split.
def _rename_audio_column(ds):
    """Return `ds` with its 'audio_filepath' column renamed to 'audio'.

    Safe to run more than once: when 'audio' already exists and
    'audio_filepath' is gone (i.e. this cell has already been executed on
    `ds`), the dataset is returned unchanged. In every other case
    `rename_column` is called, so a manifest that lacks the source column
    still fails loudly here rather than further down the notebook.
    """
    columns = ds.column_names
    if 'audio' in columns and 'audio_filepath' not in columns:
        return ds
    return ds.rename_column('audio_filepath', 'audio')


naijavoices_train = _rename_audio_column(naijavoices_train)
synthetic_train = _rename_audio_column(synthetic_train)
dev = _rename_audio_column(dev)
test = _rename_audio_column(test)

In [24]:
# Peek at the first test record; after the rename, 'audio' holds the file path string.
test[0]

{'speaker_id': 'SUYB7',
 'text': 'ni da khadijah mun hada kuɗi don siyan motar lemu na tsawon wannan lokacin',
 'age_bracket': '18-29',
 'gender': 'male',
 'audio': '/mnt/md0/synvoices/data/naijavoices/audio-unconverted/20240117060723-54-2285-486267-ni-da-khadijah-mun-hada-ku-i-d.wav',
 'duration': 2.82}

In [5]:
# Persist the test split on its own, wrapped in a DatasetDict under the "test"
# key, for convenience. As the cells are ordered here, this happens before the
# metadata columns are dropped and before 'audio' is cast to an Audio feature.
test_split_only = DatasetDict({"test": test})
test_split_only.save_to_disk("/mnt/md0/synvoices/data/hausa_naijavoices_test")

Saving the dataset (0/1 shards):   0%|          | 0/4524 [00:00<?, ? examples/s]

In [6]:
# Drop speaker metadata columns from the NaijaVoices splits.
cols = ['speaker_id', 'age_bracket', 'gender']


def _drop_present_columns(ds, names):
    """Return `ds` without whichever of `names` it still contains.

    Only columns that are actually present are passed to `remove_columns`,
    so re-running this cell after the columns are already gone is a no-op
    instead of an error.
    """
    present = [name for name in names if name in ds.column_names]
    return ds.remove_columns(present) if present else ds


naijavoices_train = _drop_present_columns(naijavoices_train, cols)
dev = _drop_present_columns(dev, cols)
test = _drop_present_columns(test, cols)

In [7]:
# Row counts of the two training sources: (NaijaVoices, synthetic).
naijavoices_train.num_rows, synthetic_train.num_rows

(663333, 383182)

In [8]:
# Merge the NaijaVoices and synthetic training sets, then shuffle the result
# with a fixed seed.
SHUFFLE_SEED = 42
combined_train = concatenate_datasets([naijavoices_train, synthetic_train])

# Alternative sources — swap one of these in for the line above:
# combined_train = naijavoices_train   # NaijaVoices only
# combined_train = synthetic_train     # synthetic only

train = combined_train.shuffle(seed=SHUFFLE_SEED)

In [9]:
# Inspect the first record of the shuffled training set; 'audio' is still a path string here.
train[0]

{'text': 'shin ka san hanyoyin da za a inganta lafiyar kwakwalwa',
 'audio': '/mnt/md0/synvoices/data/hausa_asr_filtered_augmented/clips/9e5b08921671df62e22f84122d24abbb.wav',
 'duration': 4.405375}

In [10]:
# Cast the 'audio' column of every split to an Audio feature with a 16 kHz
# sampling rate. Each split gets its own Audio instance.
TARGET_SAMPLING_RATE = 16000
train, dev, test = (
    split.cast_column('audio', Audio(sampling_rate=TARGET_SAMPLING_RATE))
    for split in (train, dev, test)
)

In [11]:
# Same record after the cast: 'audio' is now a dict with 'path', 'array' and 'sampling_rate'.
train[0]

{'text': 'shin ka san hanyoyin da za a inganta lafiyar kwakwalwa',
 'audio': {'path': '/mnt/md0/synvoices/data/hausa_asr_filtered_augmented/clips/9e5b08921671df62e22f84122d24abbb.wav',
  'array': array([0.00308228, 0.00305176, 0.00244141, ..., 0.0038147 , 0.00369263,
         0.00375366]),
  'sampling_rate': 16000},
 'duration': 4.405375}

In [12]:
# Spot-check: draw one training example at random, print its transcript and
# render an inline player for its waveform. No seed is set in this cell, so
# the pick is not pinned between runs.
sample = random.choice(train)
transcript, clip = sample['text'], sample['audio']
print(transcript)
DisplayAudio(clip['array'], rate=clip['sampling_rate'])

piramid din sun tsayu a tsaye tsawon shekaru masu yawa


In [13]:
# Bundle the three splits under their conventional names.
dataset_dict = DatasetDict(dict(train=train, validation=dev, test=test))

In [14]:
# Summary of the training split (feature names and row count).
train

Dataset({
    features: ['text', 'audio', 'duration'],
    num_rows: 1046515
})

In [15]:
# Summary of the validation split (feature names and row count).
dev

Dataset({
    features: ['text', 'audio', 'duration'],
    num_rows: 4538
})

In [16]:
# save to disk
# dataset_dict.save_to_disk("/mnt/md0/synvoices/data/hausa_579_450h", num_proc=12)

In [17]:
# from huggingface_hub import notebook_login
# notebook_login()

In [18]:
# upload to the hub
# dataset_dict.push_to_hub("CLEAR-Global/hausa_579_450h")