# Imports

In [1]:
import os
import pandas as pd
from collections import Counter
from sklearn.model_selection import GroupShuffleSplit

# Prepare NaijaVoices dataset

In [2]:
path = "/mnt/md0/synvoices/data/naijavoices/metadata.csv"

df = pd.read_csv(path, sep="~")
df.describe(include="all")

Unnamed: 0,speaker_id,audio,text,language,age_bracket,gender,phase
count,1917686,1917686,1917686,1917686,1917686,1917686,1917686
unique,5454,1887787,644654,3,3,2,2
top,E4J10,20240115210237-157-1281-272473-a-cikin-rayuwar...,Ban sani ba.,hausa,18-29,female,phase 1
freq,8483,48,80,708599,1263398,1097357,1294450


In [3]:
# check for missing values
df.isnull().sum()

speaker_id     0
audio          0
text           0
language       0
age_bracket    0
gender         0
phase          0
dtype: int64

In [4]:
# get Hausa subset
df = df[df['language'] == 'hausa']
df.describe(include="all")

Unnamed: 0,speaker_id,audio,text,language,age_bracket,gender,phase
count,708599,708599,708599,708599,708599,708599,708599
unique,1879,694375,218041,1,3,2,2
top,E4J10,20240115210237-157-1281-272473-a-cikin-rayuwar...,Ban sani ba.,hausa,18-29,female,phase 1
freq,8483,48,80,708599,439895,359529,452974


In [5]:
# drop unnecessary columns
drop_cols = ['phase', 'language']
df.drop(columns=drop_cols, inplace=True)
df.describe(include="all")

Unnamed: 0,speaker_id,audio,text,age_bracket,gender
count,708599,708599,708599,708599,708599
unique,1879,694375,218041,3,2
top,E4J10,20240115210237-157-1281-272473-a-cikin-rayuwar...,Ban sani ba.,18-29,female
freq,8483,48,80,439895,359529


In [6]:
# get the audio file paths
df['audio_filepath'] = df['audio'].apply(lambda x: os.path.join(os.path.dirname(path), "audio-unconverted", x))
# drop the `audio` column
df.drop(columns=['audio'], inplace=True)
df.head()

Unnamed: 0,speaker_id,text,age_bracket,gender,audio_filepath
551655,3XVX7,Baban mu yana koya mana hudan doya,30-over,male,/mnt/md0/synvoices/data/naijavoices/audio-unco...
551656,3XVX7,Ina tsananain son doya,30-over,male,/mnt/md0/synvoices/data/naijavoices/audio-unco...
551657,3XVX7,Ana noman masara sosai a arewacin Najeriya,30-over,male,/mnt/md0/synvoices/data/naijavoices/audio-unco...
551658,3XVX7,"Noma tushen arziki ne, misali kasar Sin tafi k...",30-over,male,/mnt/md0/synvoices/data/naijavoices/audio-unco...
551659,3XVX7,Kayan amfanin gona irin su masara suna taimako...,30-over,male,/mnt/md0/synvoices/data/naijavoices/audio-unco...


In [7]:
# save hausa subset to csv
df.to_csv("hausa_subset.csv", index=False)

In [8]:
# get the duration of the audio files
durations = pd.read_csv("durations.csv")
durations.describe(include='all')

Unnamed: 0,audio_filepath,duration
count,708599,708599.0
unique,694375,
top,/mnt/md0/synvoices/data/naijavoices/audio-unco...,
freq,48,
mean,,3.13947
std,,2.099536
min,,0.06
25%,,2.16
50%,,2.82
75%,,3.78


In [9]:
# merge the two dataframes on `audio_filepath`
df = df.merge(durations, on='audio_filepath', how='left')
df.describe(include='all')

Unnamed: 0,speaker_id,text,age_bracket,gender,audio_filepath,duration
count,806989,806989,806989,806989,806989,806989.0
unique,1879,218041,3,2,694375,
top,BHQAS,"A cikin rayuwar titi, titunan birni suna cunku...",18-29,female,/mnt/md0/synvoices/data/naijavoices/audio-unco...,
freq,37152,2321,502937,433931,2304,
mean,,,,,,3.171088
std,,,,,,2.03666
min,,,,,,0.06
25%,,,,,,2.16
50%,,,,,,2.88
75%,,,,,,3.84


In [10]:
# check for missing values
df.isnull().sum()

speaker_id        0
text              0
age_bracket       0
gender            0
audio_filepath    0
duration          0
dtype: int64

In [11]:
# drop duplicates
df.drop_duplicates(subset=['audio_filepath'], inplace=True)
df.describe(include='all')

Unnamed: 0,speaker_id,text,age_bracket,gender,audio_filepath,duration
count,694375,694375,694375,694375,694375,694375.0
unique,1879,218041,3,2,694375,
top,EX3HO,Ban sani ba.,18-29,female,/mnt/md0/synvoices/data/naijavoices/audio-unco...,
freq,8400,80,429250,351495,1,
mean,,,,,,3.138125
std,,,,,,2.109653
min,,,,,,0.06
25%,,,,,,2.16
50%,,,,,,2.82
75%,,,,,,3.78


In [12]:
# remove long audio files
df = df[df['duration'] <= 15]
df['duration'].sum() / 3600

np.float64(604.7786333333333)

In [13]:
# lowercase the text
df['text'] = df['text'].str.lower()

# get character frequency
all_text = "".join(df['text'].tolist())

# get the frequency of each character
char_freq = Counter(all_text)
# sort the characters by frequency
char_freq

Counter({'a': 6282572,
         ' ': 4758994,
         'n': 2282807,
         'i': 2156317,
         'k': 1130173,
         'u': 1105938,
         's': 1006139,
         'r': 955514,
         'y': 887596,
         'm': 800640,
         'd': 775487,
         'e': 709391,
         't': 707356,
         'b': 603050,
         'o': 546908,
         'w': 537547,
         'h': 431544,
         'g': 414722,
         'l': 378615,
         '.': 376669,
         'f': 298775,
         'c': 288003,
         'j': 259659,
         'z': 259485,
         'ɗ': 116812,
         'ƙ': 101459,
         "'": 87830,
         '?': 83164,
         ',': 64927,
         'ɓ': 21503,
         '-': 19653,
         '!': 19437,
         'p': 15828,
         'ƴ': 11601,
         '"': 6831,
         '’': 3357,
         'v': 2856,
         'q': 1795,
         '2': 1050,
         'x': 1039,
         '\u200b': 976,
         '0': 887,
         '‘': 735,
         '1': 675,
         'ʼ': 614,
         'ã': 566,
         ')': 

In [14]:
# remove punctuation and special characters
CHARS_TO_REMOVE = ['!', '"', '(', ')', ',', '.', ':', ';', '?', '[', ']', '“', '”', '☕', '️', '\u200b']

for char in CHARS_TO_REMOVE:
    df['text'] = df['text'].str.replace(char, '', regex=False)

df['text'] = df['text'].str.replace('\xa0', ' ', regex=False)

# strip leading and trailing whitespace from the text column
df['text'] = df['text'].str.strip()

In [15]:
# remove rows with anomalous character rates
df['character_rate'] = df['text'].str.len() / df['duration']
df['character_rate'].describe()

count    694347.000000
mean         13.083885
std           3.467767
min           0.900901
25%          11.212121
50%          12.916667
75%          14.728682
max         883.333333
Name: character_rate, dtype: float64

In [16]:
MAX = 30
df = df[df['character_rate'] <= MAX]
df.drop(columns=['character_rate'], inplace=True)

In [17]:
# save to NeMo jsonl format
manifest_path = os.path.join(os.path.dirname(path), "manifest.jsonl")

df.to_json(manifest_path, orient='records', lines=True)
print(f"Manifest saved to {manifest_path}")

Manifest saved to /mnt/md0/synvoices/data/naijavoices/manifest.jsonl


# Split the dataset into train, validation, and test sets

In [18]:
df.describe(include='all')

Unnamed: 0,speaker_id,text,age_bracket,gender,audio_filepath,duration
count,694051,694051,694051,694051,694051,694051.0
unique,1879,217080,3,2,694051,
top,PJ07Y,ban sani ba,18-29,female,/mnt/md0/synvoices/data/naijavoices/audio-unco...,
freq,8400,83,429041,351343,1,
mean,,,,,,3.136315
std,,,,,,1.391822
min,,,,,,0.42
25%,,,,,,2.16
50%,,,,,,2.82
75%,,,,,,3.78


In [19]:
def balanced_simple_split(df, train_size=0.8, dev_size=0.1, test_size=0.1, random_state=None):
    """
    Balanced split ensuring:
    - No speaker/text overlaps between any splits
    - Maintains relative size between dev/test
    """
    # Validate inputs
    assert abs(train_size + dev_size + test_size - 1.0) < 1e-6
    assert all(col in df.columns for col in ['text', 'speaker_id', 'audio_filepath'])

    # First split: train vs temp (dev+test)
    gss = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_state)
    train_idx, temp_idx = next(gss.split(df, groups=df['speaker_id']))
    train = df.iloc[train_idx]
    temp = df.iloc[temp_idx]

    # Second split: dev vs test (with speaker isolation)
    split_ratio = dev_size / (dev_size + test_size)
    gss2 = GroupShuffleSplit(n_splits=1, train_size=split_ratio, random_state=random_state)
    dev_idx, test_idx = next(gss2.split(temp, groups=temp['speaker_id']))
    dev = temp.iloc[dev_idx]
    test = temp.iloc[test_idx]

    # Remove overlapping texts between dev and test from BOTH sets
    common_texts = set(dev['text']).intersection(test['text'])
    dev = dev[~dev['text'].isin(common_texts)]
    test = test[~test['text'].isin(common_texts)]

    # Remove train texts overlapping with cleaned dev/test
    dev_test_texts = set(dev['text']).union(test['text'])
    train = train[~train['text'].isin(dev_test_texts)]

    return (
        train.reset_index(drop=True),
        dev.reset_index(drop=True),
        test.reset_index(drop=True)
    )

In [20]:
# Split with balanced dev/test
train_df, dev_df, test_df = balanced_simple_split(
    df,
    train_size=0.98,
    dev_size=0.01,
    test_size=0.01,
    random_state=42
)

print(f"Split sizes: {len(train_df):,} | {len(dev_df):,} | {len(test_df):,}")

Split sizes: 663,333 | 4,538 | 4,524


In [21]:
len(train_df) + len(dev_df) + len(test_df), len(df)

(672395, 694051)

In [22]:
# get unique speakers for each split
train_speakers = train_df['speaker_id'].unique()
dev_speakers = dev_df['speaker_id'].unique()
test_speakers = test_df['speaker_id'].unique()
print(f"Train speakers: {len(train_speakers)}")
print(f"Dev speakers: {len(dev_speakers)}")
print(f"Test speakers: {len(test_speakers)}")

# check for overlapping speakers
overlap_train_dev = set(train_speakers) & set(dev_speakers)
overlap_train_test = set(train_speakers) & set(test_speakers)
overlap_dev_test = set(dev_speakers) & set(test_speakers)
print(f"Overlap train-dev: {len(overlap_train_dev)}")
print(f"Overlap train-test: {len(overlap_train_test)}")
print(f"Overlap dev-test: {len(overlap_dev_test)}")

Train speakers: 1809
Dev speakers: 19
Test speakers: 19
Overlap train-dev: 0
Overlap train-test: 0
Overlap dev-test: 0


In [23]:
# get unique texts for each split
train_texts = train_df['text'].unique()
dev_texts = dev_df['text'].unique()
test_texts = test_df['text'].unique()
print(f"Train texts: {len(train_texts)}")
print(f"Dev texts: {len(dev_texts)}")
print(f"Test texts: {len(test_texts)}")

# check for overlapping texts
overlap_train_dev = set(train_texts) & set(dev_texts)
overlap_train_test = set(train_texts) & set(test_texts)
overlap_dev_test = set(dev_texts) & set(test_texts)
print(f"Overlap train-dev: {len(overlap_train_dev)}")
print(f"Overlap train-test: {len(overlap_train_test)}")
print(f"Overlap dev-test: {len(overlap_dev_test)}")

Train texts: 208103
Dev texts: 4513
Test texts: 4464
Overlap train-dev: 0
Overlap train-test: 0
Overlap dev-test: 0


In [24]:
# get the total duration of each split
train_duration = train_df['duration'].sum()
dev_duration = dev_df['duration'].sum()
test_duration = test_df['duration'].sum()
print(f"Train duration: {train_duration / 3600:.2f} hours")
print(f"Dev duration: {dev_duration / 3600:.2f} hours")
print(f"Test duration: {test_duration / 3600:.2f} hours")

Train duration: 579.08 hours
Dev duration: 3.61 hours
Test duration: 3.41 hours


In [25]:
# save the splits to NeMo jsonl format
train_manifest_path = os.path.join(os.path.dirname(path), "train_manifest.jsonl")
dev_manifest_path = os.path.join(os.path.dirname(path), "dev_manifest.jsonl")
test_manifest_path = os.path.join(os.path.dirname(path), "test_manifest.jsonl")
train_df.to_json(train_manifest_path, orient='records', lines=True)
dev_df.to_json(dev_manifest_path, orient='records', lines=True)
test_df.to_json(test_manifest_path, orient='records', lines=True)
print(f"Train manifest saved to {train_manifest_path}")
print(f"Dev manifest saved to {dev_manifest_path}")
print(f"Test manifest saved to {test_manifest_path}")

Train manifest saved to /mnt/md0/synvoices/data/naijavoices/train_manifest.jsonl
Dev manifest saved to /mnt/md0/synvoices/data/naijavoices/dev_manifest.jsonl
Test manifest saved to /mnt/md0/synvoices/data/naijavoices/test_manifest.jsonl


In [26]:
def sample_dataset(df, duration, manifest_name, random_state=1):
    """
    Sample a dataset to a specific duration.
    """
    total_duration = float(df['duration'].sum())
    ratio = (duration * 3600) / total_duration
    n = int(len(df) * ratio)
    sample = df.sample(n=n, random_state=random_state)
    sample = sample.sample(frac=1, random_state=random_state).reset_index(drop=True)
    sample_duration = sample['duration'].sum()
    print(f"Sample duration: {sample_duration / 3600:.2f} hours")
    
    # save to new manifest
    sample_manifest_path = os.path.join(
        os.path.dirname(path),
        f"{manifest_name}_{int(duration)}h.jsonl"
    )
    sample.to_json(sample_manifest_path, orient="records", lines=True)
    print(f"Manifest saved to {sample_manifest_path}")

In [27]:
sample_dataset(train_df, 50, "train_manifest", random_state=3)

Sample duration: 49.99 hours


Manifest saved to /mnt/md0/synvoices/data/naijavoices/train_manifest_50h.jsonl


In [28]:
sample_dataset(train_df, 100, "train_manifest", random_state=19)

Sample duration: 100.00 hours
Manifest saved to /mnt/md0/synvoices/data/naijavoices/train_manifest_100h.jsonl


In [29]:
sample_dataset(train_df, 250, "train_manifest", random_state=20)


Sample duration: 249.99 hours
Manifest saved to /mnt/md0/synvoices/data/naijavoices/train_manifest_250h.jsonl


In [30]:
sample_dataset(train_df, 500, "train_manifest", random_state=13)

Sample duration: 500.00 hours
Manifest saved to /mnt/md0/synvoices/data/naijavoices/train_manifest_500h.jsonl
