In [1]:
from datasets import Dataset, load_dataset, concatenate_datasets, Audio, DatasetDict
import random
import json
import pandas as pd
import re
import os
from IPython.display import Audio as DisplayAudio

# Common Voice

In [2]:
cv_19_train = load_dataset("fsicoli/common_voice_19_0", "luo", split="train+validation")
cv_19_test = load_dataset("fsicoli/common_voice_19_0", "luo", split="test")

In [3]:
cv_19_train

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 7559
})

In [4]:
cv_19_test

Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 2996
})

In [5]:
# rename the column "sentence" to "text"
cv_19_train = cv_19_train.rename_column("sentence", "text")
cv_19_test = cv_19_test.rename_column("sentence", "text")

In [6]:
# calculate the duration of the audio
def calculate_duration(batch):
    batch['duration'] = len(batch['audio']['array']) / batch['audio']['sampling_rate']
    return batch

In [7]:
cv_19_train = cv_19_train.map(calculate_duration, num_proc=4)

In [8]:
cv_19_test = cv_19_test.map(calculate_duration, num_proc=4)

In [9]:
cv_19_train[0]

{'client_id': '1b10420335a732ab849262c82a8a250848e8e9748dc9fd8816a9732f4749862f967cad4123286fc9273d19eccbde31f65d94c779309992eed172641d68b71607',
 'path': '/home/aymen/.cache/huggingface/datasets/downloads/extracted/79e80adb59e2ace9adc859487aae30d38730f7d97ddb0989b8ff760ca3c5ccc1/luo_train_0/common_voice_luo_40202958.mp3',
 'audio': {'path': None,
  'array': array([0., 0., 0., ..., 0., 0., 0.]),
  'sampling_rate': 48000},
 'text': 'Kaka osewachi no.',
 'up_votes': 2,
 'down_votes': 0,
 'age': '',
 'gender': '',
 'accent': '',
 'locale': 'luo',
 'segment': '',
 'variant': '',
 'duration': 3.348}

In [10]:
# listen to a random sample
sample = random.choice(cv_19_train)
print(sample['text'])
print(sample['duration'])
DisplayAudio(sample['audio']['array'], rate=sample['audio']['sampling_rate'])

ber ka ji bedo gi lit diere kuyo kata midhiero moro to ok ber
5.4


In [11]:
# convert to a NeMo manifest

def write_manifest_file(dataset, manifest_file):
    with open(manifest_file, 'w') as fout:
        for sample in dataset:
            manifest_line = {
                'audio_filepath': sample['path'],
                'duration': sample['duration'],
                'text': sample['text'],
            }
            fout.write(json.dumps(manifest_line) + '\n')

write_manifest_file(cv_19_train, 'cv_19_train_manifest.jsonl')
write_manifest_file(cv_19_test, 'cv_19_test_manifest.jsonl')

In [14]:
# calculate the total duration
print("Total duration of the CV19 train set: ", sum(cv_19_train['duration']) / 3600, "hours")

Total duration of the CV19 train set:  10.263009999999882 hours


# FLEURS

In [15]:
fleurs_train = load_dataset("google/fleurs", "luo_ke", split="train")
fleurs_dev = load_dataset("google/fleurs", "luo_ke", split="validation")
fleurs_test = load_dataset("google/fleurs", "luo_ke", split="test")

In [16]:
# fix the audio paths
def fix_audio_paths(batch):
    batch['path'] = os.path.join(os.path.dirname(batch['path']), batch['audio']['path'])
    return batch

fleurs_train = fleurs_train.map(fix_audio_paths, num_proc=4)
fleurs_dev = fleurs_dev.map(fix_audio_paths, num_proc=4)
fleurs_test = fleurs_test.map(fix_audio_paths, num_proc=4)

In [17]:
fleurs_train[0]

{'id': 802,
 'num_samples': 397440,
 'path': '/home/aymen/.cache/huggingface/datasets/downloads/extracted/249b89f2d9e7efe509d68a605ade1aa835a53207124cf2044caff7a42ea3fd8e/train/10004177096163272519.wav',
 'audio': {'path': None,
  'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         -9.15527344e-05,  3.05175781e-05, -3.05175781e-05]),
  'sampling_rate': 16000},
 'transcription': "madh math mag kong'o ma ok okalo tong' kong'o tiyo mopogore e dend ng'ato ka ng'ato kendo ka ing'eyo kar romb math minyalo madho to mano ber moloyo",
 'raw_transcription': "Madh math mag kong'o ma ok okalo tong'. Kong'o tiyo mopogore e dend ng'ato ka ng'ato, kendo ka ing'eyo kar romb math minyalo madho to mano ber moloyo.",
 'gender': 0,
 'lang_id': 55,
 'language': 'Luo',
 'lang_group_id': 3}

In [18]:
# calculate the duration of the audio
def calculate_duration_fleurs(batch):
    batch['duration'] = batch['num_samples'] / batch['audio']['sampling_rate']
    return batch

fleurs_train = fleurs_train.map(calculate_duration_fleurs, num_proc=4)
fleurs_dev = fleurs_dev.map(calculate_duration_fleurs, num_proc=4)
fleurs_test = fleurs_test.map(calculate_duration_fleurs, num_proc=4)

In [19]:
# rename the column "transcription" to "text"
fleurs_train = fleurs_train.rename_column("transcription", "text")
fleurs_dev = fleurs_dev.rename_column("transcription", "text")
fleurs_test = fleurs_test.rename_column("transcription", "text")

In [20]:
# calculate the total duration of the training set
print("Total duration of the Fleurs train set: ", sum(fleurs_train['duration']) / 3600, "hours")

Total duration of the Fleurs train set:  10.122416666666659 hours


# Prepare Real Data

In [17]:
# remove all columns except "path", "audio", "duration", and "text"
cv_19_train = cv_19_train.remove_columns([col for col in cv_19_train.column_names if col not in ['path', 'audio', 'duration', 'text']])
fleurs_train = fleurs_train.remove_columns([col for col in fleurs_train.column_names if col not in ['path', 'audio', 'duration', 'text']])
fleurs_dev = fleurs_dev.remove_columns([col for col in fleurs_dev.column_names if col not in ['path', 'audio', 'duration', 'text']])
# keep "gender" for the test sets
cv_19_test = cv_19_test.remove_columns([col for col in cv_19_test.column_names if col not in ['path', 'audio', 'duration', 'text', 'gender']])
fleurs_test = fleurs_test.remove_columns([col for col in fleurs_test.column_names if col not in ['path', 'audio', 'duration', 'text', 'gender']])

In [18]:
# resample the audio to 16kHz
cv_19_train = cv_19_train.cast_column('audio', Audio(sampling_rate=16000))
cv_19_test = cv_19_test.cast_column('audio', Audio(sampling_rate=16000))
fleurs_train = fleurs_train.cast_column('audio', Audio(sampling_rate=16000))
fleurs_dev = fleurs_dev.cast_column('audio', Audio(sampling_rate=16000))
fleurs_test = fleurs_test.cast_column('audio', Audio(sampling_rate=16000))

In [19]:
# concatenate the datasets
train_dataset = concatenate_datasets([cv_19_train, fleurs_train])

In [20]:
# shuffle the training set
train_dataset = train_dataset.shuffle(seed=42)

In [21]:
train_dataset

Dataset({
    features: ['path', 'audio', 'text', 'duration'],
    num_rows: 9943
})

In [22]:
# remove special characters
chars_to_ignore = '!,.:;?…”‘[]\\"' + '̇'
chars_to_remove_regex = f"[{re.escape(chars_to_ignore)}]"

def remove_special_characters(batch):
    batch['text'] = re.sub(chars_to_remove_regex, "", batch['text'])
    return batch

train_dataset = train_dataset.map(remove_special_characters, num_proc=4)
fleurs_dev = fleurs_dev.map(remove_special_characters, num_proc=4)
fleurs_test = fleurs_test.map(remove_special_characters, num_proc=4)
cv_19_test = cv_19_test.map(remove_special_characters, num_proc=4)

In [23]:
# replace whitespace characters with a single space
whitespace_chars = "-_/"

def replace_whitespace(text, whitespace_chars):
    whitespace_chars_regex = f"[{re.escape(whitespace_chars)}]"
    text = re.sub(whitespace_chars_regex, " ", text)
    # remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return {'text': text}

train_dataset = train_dataset.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])
fleurs_dev = fleurs_dev.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])
fleurs_test = fleurs_test.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])
cv_19_test = cv_19_test.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])

In [24]:
# lower case, remove leading and trailing spaces
def lower_case_and_strip(batch):
    batch['text'] = batch['text'].lower().strip()
    return batch

train_dataset = train_dataset.map(lower_case_and_strip, num_proc=4)
fleurs_dev = fleurs_dev.map(lower_case_and_strip, num_proc=4)
fleurs_test = fleurs_test.map(lower_case_and_strip, num_proc=4)
cv_19_test = cv_19_test.map(lower_case_and_strip, num_proc=4)

In [25]:
# drop samples longer than 30 seconds from the train and dev sets
def drop_long_samples(batch):
    if batch['duration'] > 30:
        return False
    return True

train_dataset = train_dataset.filter(drop_long_samples, num_proc=4)
fleurs_dev = fleurs_dev.filter(drop_long_samples, num_proc=4)

In [26]:
# write the train and test manifest files
write_manifest_file(train_dataset, 'luo_train_manifest.jsonl')
write_manifest_file(fleurs_dev, 'luo_dev_manifest.jsonl')
write_manifest_file(fleurs_test, 'luo_test_manifest.jsonl')
write_manifest_file(cv_19_test, 'cv_19_test_manifest_preprocessed.jsonl')

In [27]:
train_dataset[0]

{'path': '/home/aymen/.cache/huggingface/datasets/downloads/extracted/79e80adb59e2ace9adc859487aae30d38730f7d97ddb0989b8ff760ca3c5ccc1/luo_train_0/common_voice_luo_40242801.mp3',
 'audio': {'path': None,
  'array': array([ 3.18323146e-12, -2.72848411e-12, -1.36424205e-12, ...,
          1.07734888e-06, -8.43780526e-07,  6.15047611e-07]),
  'sampling_rate': 16000},
 'text': 'oyudo ng’ato kanyo',
 'duration': 2.736}

In [28]:
# listen to a random sample
sample = random.choice(train_dataset)
print(sample['text'])
print(sample['duration'])
DisplayAudio(sample['audio']['array'], rate=sample['audio']['sampling_rate'])

sama wang'ado yien waketho kar dak mar le
3.816


In [29]:
# calculate the total duration of the train set
total_duration_real = sum(train_dataset['duration'])
total_duration_real

69957.39599999822

In [31]:
# save the common voice 19 test set for later evaluation
# DatasetDict({"test": cv_19_test}).save_to_disk("/mnt/md0/synvoices/data/luo_cv_19_test")

In [32]:
# save the FLEURS test set for later evaluation
# DatasetDict({"test": fleurs_test}).save_to_disk("/mnt/md0/synvoices/data/luo_fleurs_test")

In [33]:
# remove the "gender" column from the test sets
cv_19_test = cv_19_test.remove_columns("gender")
fleurs_test = fleurs_test.remove_columns("gender")

# Prepare Synthetic Data

In [30]:
synth_path = "/mnt/md0/synvoices/data/luo_asr_filtered_augmented/manifest.jsonl"

with open(synth_path, "r") as f:
    lines = f.readlines()
    data = [json.loads(line) for line in lines]

len(data)

657877

In [31]:
df = pd.DataFrame(data)
df.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,657877,657877,657877,657877,657877.0,657877.0
unique,657877,657841,657853,657877,,
top,/mnt/md0/synvoices/data/luo_asr_filtered_augme...,in japuonjre maber,kaka chieng' rieny kawuono,waparo ni wanego rech mathoth moloyo mana kaka...,,
freq,1,2,2,1,,
mean,,,,,4.245807,0.998449
std,,,,,0.948613,0.041809
min,,,,,1.323375,0.5625
25%,,,,,3.574,0.976744
50%,,,,,4.160687,1.0
75%,,,,,4.822,1.018868


In [32]:
# strip leading and trailing whitespace from the text column
df["text"] = df["text"].str.strip()

# drop duplicates
df = df.drop_duplicates(subset=["text"])
df.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,657841,657841,657841,657841,657841.0,657841.0
unique,657841,657841,657819,657841,,
top,/mnt/md0/synvoices/data/luo_asr_filtered_augme...,waparo ni wanego rech mathoth moloyo mana kaka...,tweche mang'eny nyalo nonore gi remo,waparo ni wanego rech mathoth moloyo mana kaka...,,
freq,1,1,2,1,,
mean,,,,,4.245846,0.998448
std,,,,,0.948601,0.041808
min,,,,,1.323375,0.5625
25%,,,,,3.574,0.976744
50%,,,,,4.160687,1.0
75%,,,,,4.822,1.018868


In [33]:
# remove samples longer than 20 seconds
df = df[df["duration"] <= 20]
df["duration"].describe()

count    657824.000000
mean          4.245258
std           0.940951
min           1.323375
25%           3.574000
50%           4.160687
75%           4.822000
max          19.542000
Name: duration, dtype: float64

In [34]:
# total duration
total_duration = df['duration'].sum()
total_duration / 3600

np.float64(775.7311908506944)

In [35]:
cols = ['audio_filepath', 'text', 'duration']

# save to new manifest
new_manifest_path = os.path.join(
    os.path.dirname(synth_path),
    f"manifest_{int(total_duration / 3600)}h.jsonl"
)
df[cols].to_json(new_manifest_path, orient="records", lines=True)

In [36]:
def sample_dataset(df, duration, path, manifest_name, random_state=1):
    """
    Sample a dataset to a specific duration.
    """
    total_duration = float(df['duration'].sum())
    ratio = (duration * 3600) / total_duration
    n = int(len(df) * ratio)
    sample = df.sample(n=n, random_state=random_state)
    sample = sample.sample(frac=1, random_state=random_state).reset_index(drop=True)
    sample_duration = sample['duration'].sum()
    print(f"Sample duration: {sample_duration / 3600:.2f} hours")
    
    # save to new manifest
    sample_manifest_path = os.path.join(
        os.path.dirname(path),
        f"{manifest_name}_{int(duration)}h.jsonl"
    )
    sample.to_json(sample_manifest_path, orient="records", lines=True)
    print(f"Manifest saved to {sample_manifest_path}")

In [43]:
sample_dataset(df, total_duration_real / 3600, synth_path, "manifest", random_state=4)

Sample duration: 19.43 hours
Manifest saved to /mnt/md0/synvoices/data/luo_asr_filtered_augmented/manifest_19h.jsonl


In [37]:
sample_dataset(df, (total_duration_real / 3600) * 2, synth_path, "manifest")

Sample duration: 38.87 hours
Manifest saved to /mnt/md0/synvoices/data/luo_asr_filtered_augmented/manifest_38h.jsonl


In [38]:
sample_dataset(df, (total_duration_real / 3600) * 4, synth_path, "manifest", random_state=5)

Sample duration: 77.72 hours
Manifest saved to /mnt/md0/synvoices/data/luo_asr_filtered_augmented/manifest_77h.jsonl


In [44]:
synthetic_manifest_path = "/mnt/md0/synvoices/data/luo_asr_filtered_augmented/manifest_19h.jsonl"

# convert to a Hugging Face dataset
synthetic_train = Dataset.from_json(synthetic_manifest_path, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [45]:
# cast to audio
synthetic_train = synthetic_train.cast_column('audio_filepath', Audio(sampling_rate=16000))

In [46]:
# rename 'audio_filepath' to 'audio'
synthetic_train = synthetic_train.rename_column("audio_filepath", "audio")

In [47]:
# remove unnecessary columns
cols = ['audio', 'text', 'duration']
synthetic_train = synthetic_train.remove_columns([col for col in synthetic_train.column_names if col not in cols])

In [48]:
synthetic_train[0]

{'audio': {'path': '/mnt/md0/synvoices/data/luo_asr_filtered_augmented/clips/500d049d8dc23f9ca04009fdda9a5dda.wav',
  'array': array([0.0005188 , 0.00048828, 0.00036621, ..., 0.00045776, 0.00045776,
         0.00048828]),
  'sampling_rate': 16000},
 'text': 'kue ok en mana bedo mos to en bedo gi chuny mokuwe',
 'duration': 5.7286875}

# Save Final Dataset

In [49]:
len(train_dataset), len(synthetic_train)

(9858, 16478)

In [50]:
# concatenate the real and synthetic datasets
train_dataset = concatenate_datasets([train_dataset, synthetic_train])

# shuffle the training set
train_dataset = train_dataset.shuffle(seed=42)

In [51]:
# remove unnecessary columns
cols = ['audio', 'text', 'duration']
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in cols])
fleurs_dev = fleurs_dev.remove_columns([col for col in fleurs_dev.column_names if col not in cols])
fleurs_test = fleurs_test.remove_columns([col for col in fleurs_test.column_names if col not in cols])

In [52]:
# listen to a random sample from the training set
sample = random.choice(train_dataset)
print(sample['text'])
print(sample['duration'])
DisplayAudio(sample['audio']['array'], rate=sample['audio']['sampling_rate'])

jopiny mar gweng'e dwaro weche mag siasa
3.3926875


In [53]:
# create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': fleurs_dev,
    'test': fleurs_test
})

# save the dataset
save_path = "/mnt/md0/synvoices/data/luo_19_19h"
dataset_dict.save_to_disk(save_path, num_proc=4)

Saving the dataset (0/14 shards):   0%|          | 0/26336 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/256 [00:00<?, ? examples/s]

In [54]:
# load the dataset
# dataset_dict = DatasetDict.load_from_disk(save_path)

In [55]:
# upload to the hub
dataset_dict.push_to_hub(f"CLEAR-Global/{save_path.split('/')[-1]}")

Uploading the dataset shards:   0%|          | 0/14 [00:00<?, ?it/s]

Map:   0%|          | 0/1882 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1882 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Map:   0%|          | 0/1881 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CLEAR-Global/luo_19_19h/commit/7209e84fdcf9d67146c904ccb97d7bf501b127e1', commit_message='Upload dataset', commit_description='', oid='7209e84fdcf9d67146c904ccb97d7bf501b127e1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CLEAR-Global/luo_19_19h', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CLEAR-Global/luo_19_19h'), pr_revision=None, pr_num=None)