In [1]:
from datasets import Dataset, load_dataset, concatenate_datasets, Audio, DatasetDict
import random
import json
import pandas as pd
import re
import os
from IPython.display import Audio as DisplayAudio

# Utils

In [2]:
# convert to a NeMo manifest

def write_manifest_file(dataset, manifest_file):
    with open(manifest_file, 'w') as fout:
        for sample in dataset:
            # Get the audio file path
            path = sample['audio']['path']
            if path is None:
                path = sample['path']
            manifest_line = {
                'audio_filepath': path,
                'duration': sample['duration'],
                'text': sample['text'],
            }
            fout.write(json.dumps(manifest_line) + '\n')

In [3]:
# listen to a random sample
def listen_to_random_sample(dataset):
    random_index = random.randint(0, len(dataset) - 1)
    sample = dataset[random_index]
    data = sample['audio']['array']
    text = sample['text']
    duration = sample['duration']
    rate = sample['audio']['sampling_rate']
    
    print(f"Text: {text}")
    print(f"Duration: {duration} seconds")
    
    return DisplayAudio(data, rate=rate, autoplay=True)

In [4]:
def sample_dataset(df, duration, path, manifest_name, random_state=1):
    """
    Sample a dataset to a specific duration.
    """
    total_duration = float(df['duration'].sum())
    ratio = (duration * 3600) / total_duration
    n = int(len(df) * ratio)
    sample = df.sample(n=n, random_state=random_state)
    sample = sample.sample(frac=1, random_state=random_state).reset_index(drop=True)
    sample_duration = sample['duration'].sum()
    print(f"Sample duration: {sample_duration / 3600:.2f} hours")
    
    # save to new manifest
    sample_manifest_path = os.path.join(
        os.path.dirname(path),
        f"{manifest_name}_{int(duration)}h.jsonl"
    )
    sample.to_json(sample_manifest_path, orient="records", lines=True)
    print(f"Manifest saved to {sample_manifest_path}")

# FLEURS

In [5]:
fleurs_train = load_dataset("google/fleurs", "ny_mw", split="train")
fleurs_dev = load_dataset("google/fleurs", "ny_mw", split="validation")
fleurs_test = load_dataset("google/fleurs", "ny_mw", split="test")

In [6]:
# fix the audio paths
def fix_audio_paths(batch):
    batch['path'] = os.path.join(os.path.dirname(batch['path']), batch['audio']['path'])
    return batch

fleurs_train = fleurs_train.map(fix_audio_paths, num_proc=4)
fleurs_dev = fleurs_dev.map(fix_audio_paths, num_proc=4)
fleurs_test = fleurs_test.map(fix_audio_paths, num_proc=4)

In [7]:
fleurs_train[0]

{'id': 1299,
 'num_samples': 222720,
 'path': '/home/aymen/.cache/huggingface/datasets/downloads/extracted/c6472e2668127b67f34f39c6344ba4b7daced48d5bbf426b8a576d85e82b887b/train/10021477015352456347.wav',
 'audio': {'path': None,
  'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         3.96728516e-04, 9.15527344e-05, 3.35693359e-04]),
  'sampling_rate': 16000},
 'transcription': "akazi amaphikira panja masitolo anali otsegula moyang'ana mumsewu miyala inkagwiritsidwa ntchito pomanga manyumba",
 'raw_transcription': "Akazi amaphikira panja; masitolo anali otsegula moyang'ana mumsewu. Miyala inkagwiritsidwa ntchito pomanga manyumba.",
 'gender': 0,
 'lang_id': 69,
 'language': 'Nyanja',
 'lang_group_id': 3}

In [8]:
# calculate the duration of the audio
def calculate_duration_fleurs(batch):
    batch['duration'] = batch['num_samples'] / batch['audio']['sampling_rate']
    return batch

fleurs_train = fleurs_train.map(calculate_duration_fleurs, num_proc=4)
fleurs_dev = fleurs_dev.map(calculate_duration_fleurs, num_proc=4)
fleurs_test = fleurs_test.map(calculate_duration_fleurs, num_proc=4)

In [9]:
# rename the column "transcription" to "text"
fleurs_train = fleurs_train.rename_column("transcription", "text")
fleurs_dev = fleurs_dev.rename_column("transcription", "text")
fleurs_test = fleurs_test.rename_column("transcription", "text")

In [10]:
# save to NeMo manifest files
write_manifest_file(fleurs_train, "chichewa_fleurs_train.jsonl")
write_manifest_file(fleurs_dev, "chichewa_fleurs_dev.jsonl")
write_manifest_file(fleurs_test, "chichewa_fleurs_test.jsonl")

In [11]:
# calculate the total duration of the training set
print("Total duration of the Fleurs train set: ", sum(fleurs_train['duration']) / 3600, "hours")

Total duration of the Fleurs train set:  10.872649999999982 hours


# Zambezi Voice

In [12]:
# download the Nyanja subset of the Zambezi Voice dataset
# !git clone https://github.com/unza-speech-lab/zambezi-voice-nyanja /mnt/md0/synvoices/data/zambezi-voice-nyanja

In [13]:
train_df = pd.read_csv('/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/train.tsv', sep='\t', index_col=0)
dev_df = pd.read_csv('/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/dev.tsv', sep='\t', index_col=0)
test_df = pd.read_csv('/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/test.tsv', sep='\t', index_col=0)

In [14]:
# merge the train and dev sets
train_df = pd.concat([train_df, dev_df], axis=0)

In [15]:
# drop the "BitsPerSample" and "sampleRate" columns
cols = ['BitsPerSample', 'sampleRate']
train_df = train_df.drop(columns=cols)
test_df = test_df.drop(columns=cols)

In [16]:
# rename "sentence" to "text", and "audio_id" to "audio_filepath"
renamer = {
    "sentence": "text",
    "audio_id": "audio_filepath"
}
train_df = train_df.rename(columns=renamer)
test_df = test_df.rename(columns=renamer)

In [17]:
# convert the audio file paths to absolute paths
def convert_to_absolute_path(filepath):
    return os.path.join('/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/audio', filepath)

train_df['audio_filepath'] = train_df['audio_filepath'].apply(convert_to_absolute_path)
test_df['audio_filepath'] = test_df['audio_filepath'].apply(convert_to_absolute_path)

In [18]:
# convert duration to seconds
train_df['duration'] = train_df.pop('durationMsec') / 1000
test_df['duration'] = test_df.pop('durationMsec') / 1000

In [19]:
train_df.describe(include='all')

Unnamed: 0,audio_filepath,text,duration
count,8739,8739,8739.0
unique,8739,8724,
top,/mnt/md0/synvoices/data/zambezi-voice-nyanja/n...,khamu la anthu lomwe amuna awiri avala zipewa ...,
freq,1,2,
mean,,,9.902439
std,,,3.959854
min,,,0.544
25%,,,7.305
50%,,,9.143
75%,,,11.6045


In [20]:
test_df.describe(include='all')

Unnamed: 0,audio_filepath,text,duration
count,428,428,428.0
unique,428,428,
top,/mnt/md0/synvoices/data/zambezi-voice-nyanja/n...,mwamuna wina wovala zakuda akuwomba m manja mu...,
freq,1,1,
mean,,,11.01171
std,,,4.617315
min,,,0.657
25%,,,8.144
50%,,,9.937
75%,,,13.204


In [21]:
# save to NeMo manifest files
train_df.to_json('/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/train.jsonl', orient='records', lines=True)
test_df.to_json('/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/test.jsonl', orient='records', lines=True)

In [22]:
# convert to HF dataset
def convert_to_hf_dataset(df):
    dataset = Dataset.from_pandas(df)
    dataset = dataset.cast_column("audio_filepath", Audio())
    return dataset

zambezi_train = convert_to_hf_dataset(train_df)
zambezi_test = convert_to_hf_dataset(test_df)

zambezi_train = zambezi_train.rename_column("audio_filepath", "audio")
zambezi_test = zambezi_test.rename_column("audio_filepath", "audio")

In [23]:
zambezi_train[0]

{'audio': {'path': '/mnt/md0/synvoices/data/zambezi-voice-nyanja/nya/audio/221102-102320_nya_510_elicit_0.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00338745,
          0.00234985,  0.00933838]),
  'sampling_rate': 16000},
 'text': 'wosewera mpira wina akungoyang ana mosowa chochita wina amalumpha ndikuwombera mpirawo kupita kudengu ',
 'duration': 10.595,
 '__index_level_0__': 0}

In [24]:
listen_to_random_sample(zambezi_train)

Text: bambo wina waima kutsogolo kwa msika wa nyama uku akulankhula pa foni 
Duration: 6.08 seconds


In [25]:
# calculate the total duration of the training set
print("Total duration of the Zambezi Voice train set: ", sum(zambezi_train['duration']) / 3600, "hours")

Total duration of the Zambezi Voice train set:  24.03817138888891 hours


# Prepare Real Data

In [26]:
# remove all columns except "path", "audio", "duration", and "text"
COLS = ['path', 'audio', 'duration', 'text']
fleurs_train = fleurs_train.remove_columns([col for col in fleurs_train.column_names if col not in COLS])
fleurs_dev = fleurs_dev.remove_columns([col for col in fleurs_dev.column_names if col not in COLS])
zambezi_train = zambezi_train.remove_columns([col for col in zambezi_train.column_names if col not in COLS])
zambezi_test = zambezi_test.remove_columns([col for col in zambezi_test.column_names if col not in COLS])
# keep "gender" in the fleurs test set
fleurs_test = fleurs_test.remove_columns([col for col in fleurs_test.column_names if col not in COLS + ["gender"]])

In [27]:
# resample the audio to 16kHz
fleurs_train = fleurs_train.cast_column('audio', Audio(sampling_rate=16000))
fleurs_dev = fleurs_dev.cast_column('audio', Audio(sampling_rate=16000))
fleurs_test = fleurs_test.cast_column('audio', Audio(sampling_rate=16000))
zambezi_train = zambezi_train.cast_column('audio', Audio(sampling_rate=16000))
zambezi_test = zambezi_test.cast_column('audio', Audio(sampling_rate=16000))

In [28]:
len(fleurs_train), len(zambezi_train)

(2694, 8739)

In [29]:
# concatenate the datasets
train_dataset = concatenate_datasets([fleurs_train, zambezi_train])

In [30]:
# shuffle the training set
train_dataset = train_dataset.shuffle(seed=42)

In [31]:
train_dataset

Dataset({
    features: ['path', 'audio', 'text', 'duration'],
    num_rows: 11433
})

In [32]:
# remove special characters
chars_to_ignore = '!,.:;[]‘’”'
chars_to_remove_regex = f"[{re.escape(chars_to_ignore)}]"

def remove_special_characters(batch):
    batch['text'] = re.sub(chars_to_remove_regex, "", batch['text'])
    return batch

train_dataset = train_dataset.map(remove_special_characters, num_proc=4)
fleurs_dev = fleurs_dev.map(remove_special_characters, num_proc=4)
fleurs_test = fleurs_test.map(remove_special_characters, num_proc=4)
zambezi_test = zambezi_test.map(remove_special_characters, num_proc=4)

Map (num_proc=4):   0%|          | 0/428 [00:00<?, ? examples/s]

In [33]:
# replace whitespace characters with a single space
whitespace_chars = "-/"

def replace_whitespace(text, whitespace_chars):
    whitespace_chars_regex = f"[{re.escape(whitespace_chars)}]"
    text = re.sub(whitespace_chars_regex, " ", text)
    # remove multiple spaces
    text = re.sub(r"\s+", " ", text).strip()
    return {'text': text}

train_dataset = train_dataset.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])
fleurs_dev = fleurs_dev.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])
fleurs_test = fleurs_test.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])
zambezi_test = zambezi_test.map(replace_whitespace, fn_kwargs={'whitespace_chars': whitespace_chars}, input_columns=['text'])

Map:   0%|          | 0/428 [00:00<?, ? examples/s]

In [34]:
# lower case, remove leading and trailing spaces
def lower_case_and_strip(batch):
    batch['text'] = batch['text'].lower().strip()
    return batch

train_dataset = train_dataset.map(lower_case_and_strip, num_proc=4)
fleurs_dev = fleurs_dev.map(lower_case_and_strip, num_proc=4)
fleurs_test = fleurs_test.map(lower_case_and_strip, num_proc=4)
zambezi_test = zambezi_test.map(lower_case_and_strip, num_proc=4)

Map (num_proc=4):   0%|          | 0/428 [00:00<?, ? examples/s]

In [35]:
# drop samples longer than 30 seconds from the train and dev sets
def drop_long_samples(batch):
    if batch['duration'] >= 30:
        return False
    return True

train_dataset = train_dataset.filter(drop_long_samples, num_proc=4)
fleurs_dev = fleurs_dev.filter(drop_long_samples, num_proc=4)

In [36]:
# calculate character rate
def calculate_character_rate(batch):
    batch['character_rate'] = len(batch['text']) / batch['duration']
    return batch

train_dataset = train_dataset.map(calculate_character_rate, num_proc=4)
fleurs_dev = fleurs_dev.map(calculate_character_rate, num_proc=4)

In [37]:
MAX_CR = 18

# drop samples with character rate greater than MAX_CR
def drop_high_character_rate(batch):
    if batch['character_rate'] >= MAX_CR:
        return False
    return True

train_dataset = train_dataset.filter(drop_high_character_rate, num_proc=4)
fleurs_dev = fleurs_dev.filter(drop_high_character_rate, num_proc=4)

In [38]:
# write the train and test manifest files
write_manifest_file(train_dataset, 'chichewa_train_manifest.jsonl')
write_manifest_file(fleurs_dev, 'chichewa_dev_manifest.jsonl')
write_manifest_file(fleurs_test, 'chichewa_test_manifest.jsonl')
write_manifest_file(zambezi_test, 'chichewa_zambezi_test_manifest.jsonl')

In [39]:
# write all the transcripts to txt files
def dump_transcripts(dataset, filename):
    texts = dataset['text']
    with open(filename, 'w') as f:
        for text in texts:
            f.write(f"{text}\n")
    print(f"Transcripts saved to {filename}")

dump_transcripts(train_dataset, 'chichewa_train_real_34h.txt')
dump_transcripts(fleurs_dev, 'chichewa_fleurs_dev.txt')
dump_transcripts(fleurs_test, 'chichewa_fleurs_test.txt')
dump_transcripts(zambezi_test, 'chichewa_zambezi_test.txt')

Transcripts saved to chichewa_train_real_34h.txt
Transcripts saved to chichewa_fleurs_dev.txt
Transcripts saved to chichewa_fleurs_test.txt
Transcripts saved to chichewa_zambezi_test.txt


In [40]:
# listen to a random sample
listen_to_random_sample(train_dataset)

Text: galu oyera ndi kuda amuika mu madzi mu ziwe labuluu losewereramo
Duration: 6.897 seconds


In [41]:
# calculate the total duration of the train set
total_duration_real = sum(train_dataset['duration'])
total_duration_real

123139.12700000037

In [42]:
# save the Zambezi Voice test set for later evaluation
# DatasetDict({"test": zambezi_test}).save_to_disk("/mnt/md0/synvoices/data/chichewa_zambezi_test")

In [43]:
# upload the Zambezi Voice test set to Hugging Face
# DatasetDict({"test": zambezi_test}).push_to_hub("CLEAR-Global/chichewa-zambezi-voice-test")

In [44]:
# save the FLEURS test set for later evaluation
# DatasetDict({"test": fleurs_test}).save_to_disk("/mnt/md0/synvoices/data/chichewa_fleurs_test")

In [45]:
# remove the "gender" column from the FLEURS test set
fleurs_test = fleurs_test.remove_columns("gender")

# Prepare Synthetic Data

In [46]:
synth_path = "/mnt/md0/synvoices/data/chichewa_asr_augmented/manifest.jsonl"

with open(synth_path, "r") as f:
    lines = f.readlines()
    data = [json.loads(line) for line in lines]

len(data)

374612

In [47]:
df = pd.DataFrame(data)
df.describe(include="all")

Unnamed: 0,audio_filepath,text,original_text,duration
count,374612,374612,374612,374612.0
unique,374612,374545,374612,
top,/mnt/md0/synvoices/data/chichewa_asr_augmented...,aliyense avulala mu ngozi ya galimoto,tidzafunika madzi ambiri pa nthawi ya masewera.,
freq,1,2,1,
mean,,,,5.292552
std,,,,1.170319
min,,,,2.102
25%,,,,4.587375
50%,,,,5.206
75%,,,,5.899375


In [48]:
# strip leading and trailing whitespace from the text column
df["text"] = df["text"].str.strip()

# drop duplicates
df = df.drop_duplicates(subset=["text"])
df.describe(include="all")

Unnamed: 0,audio_filepath,text,original_text,duration
count,374545,374545,374545,374545.0
unique,374545,374545,374545,
top,/mnt/md0/synvoices/data/chichewa_asr_augmented...,tidzafunika madzi ambiri pa nthawi ya masewera,tidzafunika madzi ambiri pa nthawi ya masewera.,
freq,1,1,1,
mean,,,,5.292626
std,,,,1.170347
min,,,,2.102
25%,,,,4.587375
50%,,,,5.206
75%,,,,5.899375


In [49]:
# remove samples longer than 20 seconds
df = df[df["duration"] <= 20]
df["duration"].describe()

count    374501.000000
mean          5.286822
std           0.986455
min           2.102000
25%           4.587375
50%           5.206000
75%           5.899375
max          17.462000
Name: duration, dtype: float64

In [50]:
# total duration
total_duration = df['duration'].sum()
total_duration / 3600

np.float64(549.9777953125001)

In [51]:
cols = ['audio_filepath', 'text', 'duration']

# save to new manifest
new_manifest_path = os.path.join(
    os.path.dirname(synth_path),
    f"manifest_{int(total_duration / 3600)}h.jsonl"
)
df[cols].to_json(new_manifest_path, orient="records", lines=True)

In [52]:
sample_dataset(df, total_duration_real / 3600, synth_path, "manifest", random_state=7)

Sample duration: 34.21 hours
Manifest saved to /mnt/md0/synvoices/data/chichewa_asr_augmented/manifest_34h.jsonl


In [53]:
sample_dataset(df, (total_duration_real / 3600) * 2, synth_path, "manifest", random_state=16)

Sample duration: 68.40 hours
Manifest saved to /mnt/md0/synvoices/data/chichewa_asr_augmented/manifest_68h.jsonl


In [54]:
sample_dataset(df, (total_duration_real / 3600) * 3, synth_path, "manifest", random_state=15)

Sample duration: 102.62 hours
Manifest saved to /mnt/md0/synvoices/data/chichewa_asr_augmented/manifest_102h.jsonl


In [55]:
sample_dataset(df, (total_duration_real / 3600) * 4, synth_path, "manifest", random_state=8)

Sample duration: 136.80 hours
Manifest saved to /mnt/md0/synvoices/data/chichewa_asr_augmented/manifest_136h.jsonl


In [56]:
sample_dataset(df, (total_duration_real / 3600) * 9, synth_path, "manifest", random_state=13)

Sample duration: 307.85 hours
Manifest saved to /mnt/md0/synvoices/data/chichewa_asr_augmented/manifest_307h.jsonl


In [93]:
synthetic_manifest_path = "/mnt/md0/synvoices/data/chichewa_asr_augmented/manifest_307h.jsonl"

# convert to a Hugging Face dataset
synthetic_train = Dataset.from_json(synthetic_manifest_path, split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [94]:
# cast to audio
synthetic_train = synthetic_train.cast_column('audio_filepath', Audio(sampling_rate=16000))

In [95]:
# rename 'audio_filepath' to 'audio'
synthetic_train = synthetic_train.rename_column("audio_filepath", "audio")

In [96]:
# remove unnecessary columns
cols = ['audio', 'text', 'duration']
synthetic_train = synthetic_train.remove_columns([col for col in synthetic_train.column_names if col not in cols])

In [97]:
synthetic_train[0]

{'audio': {'path': '/mnt/md0/synvoices/data/chichewa_asr_augmented/clips/39755ab7575cedc18c576e3d3018264d.wav',
  'array': array([-1.00708008e-03,  1.12915039e-03, -9.15527344e-04, ...,
         -3.05175781e-05,  0.00000000e+00, -3.05175781e-05]),
  'sampling_rate': 16000},
 'text': 'chikalata chimenecho chinalembedwa ndi mfumu yamakedzana',
 'duration': 6.4326875}

In [98]:
# write the transcripts to a txt file
dump_transcripts(synthetic_train, 'chichewa_train_synth_307h.txt')

Transcripts saved to chichewa_train_synth_307h.txt


# Save Final Dataset

In [99]:
len(train_dataset), len(synthetic_train)

(11352, 209625)

In [73]:
# concatenate the real and synthetic datasets, or skip this cell if you want to use only the real dataset
train_dataset = concatenate_datasets([train_dataset, synthetic_train])

# shuffle the training set
train_dataset = train_dataset.shuffle(seed=42)

In [74]:
# remove unnecessary columns
cols = ['audio', 'text', 'duration']
train_dataset = train_dataset.remove_columns([col for col in train_dataset.column_names if col not in cols])
fleurs_dev = fleurs_dev.remove_columns([col for col in fleurs_dev.column_names if col not in cols])
fleurs_test = fleurs_test.remove_columns([col for col in fleurs_test.column_names if col not in cols])

In [75]:
# listen to a random sample from the training set
sample = random.choice(train_dataset)
print(sample['text'])
print(sample['duration'])
DisplayAudio(sample['audio']['array'], rate=sample['audio']['sampling_rate'])

tili ndi mayeso a kompyuta sabata yamawa
5.323375


In [76]:
# create a DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': fleurs_dev,
    'test': fleurs_test
})

# save the dataset
save_path = "/mnt/md0/synvoices/data/chichewa_34_102h"
dataset_dict.save_to_disk(save_path, num_proc=4)

Saving the dataset (0/32 shards):   0%|          | 0/81227 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/304 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/761 [00:00<?, ? examples/s]

In [77]:
# load the dataset
# dataset_dict = DatasetDict.load_from_disk(save_path)

In [78]:
# upload to the hub
dataset_dict.push_to_hub(f"CLEAR-Global/{save_path.split('/')[-1]}")

Uploading the dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2539 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Map:   0%|          | 0/2538 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/26 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/304 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/761 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/CLEAR-Global/chichewa_34_102h/commit/2e679a45bfe98a5cb8b195b0c9105b29136cfff3', commit_message='Upload dataset', commit_description='', oid='2e679a45bfe98a5cb8b195b0c9105b29136cfff3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CLEAR-Global/chichewa_34_102h', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CLEAR-Global/chichewa_34_102h'), pr_revision=None, pr_num=None)