# Imports

In [1]:
import json
import pandas as pd
import os

# Prepare synthetic dataset

In [2]:
# Source manifest in JSON Lines format: one JSON record per line.
path = "/mnt/md0/synvoices/data/hausa_asr_filtered_augmented/manifest.jsonl"

# Read as UTF-8 explicitly (the default encoding depends on the locale), parse
# the file line by line instead of materialising it twice, and skip blank
# lines (e.g. a trailing newline), which json.loads would reject.
with open(path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

len(data)

490475

In [3]:
# Peek at the first parsed record to see which fields a manifest entry has.
data[0]

{'audio_filepath': '/mnt/md0/synvoices/data/hausa_asr_filtered_augmented/clips/c0a35934830037866c2681950ad7197f.wav',
 'text': 'yawancin mutane suna koyi daga manyan su',
 'pred_text': 'yawancin mutane suna koyi daga manyanso',
 'original_text': 'Yawancin mutane suna koyi daga manyan su.',
 'duration': 3.6586875,
 'ratio': 0.975}

In [4]:
# Load the parsed records into a DataFrame (one row per manifest entry).
df = pd.DataFrame(data)
# Summarise every column, including the non-numeric ones.
df.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,490475,490475,490475,490475,490475.0,490475.0
unique,490475,489383,489041,490475,,
top,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,me ya sa aka rubuta takardar 'yancin kai,me ya sa ilimi mai kyau yake da muhimmanci,Yaya za mu iya tabbatar da samar da kaya mai t...,,
freq,1,5,5,1,,
mean,,,,,4.215951,1.001487
std,,,,,0.750225,0.024981
min,,,,,1.578688,0.85
25%,,,,,3.712,1.0
50%,,,,,4.170687,1.0
75%,,,,,4.682688,1.019231


In [5]:
# Trim surrounding whitespace from the transcripts, then keep only the first
# row for each distinct transcript.
df = df.assign(text=df["text"].str.strip()).drop_duplicates(subset=["text"])
df.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,489382,489382,489382,489382,489382.0,489382.0
unique,489382,489382,488123,489382,,
top,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,yaya za mu iya tabbatar da samar da kaya mai t...,me ya sa ilimi mai kyau yake da muhimmanci,Yaya za mu iya tabbatar da samar da kaya mai t...,,
freq,1,1,5,1,,
mean,,,,,4.216348,1.001489
std,,,,,0.750284,0.02498
min,,,,,1.578688,0.85
25%,,,,,3.712,1.0
50%,,,,,4.170687,1.0
75%,,,,,4.682688,1.019231


In [6]:
# total audio duration after deduplication, in hours (the summed durations are divided by 3600)
df['duration'].sum() / 3600

np.float64(573.168066440972)

In [7]:
# Keep only the rows whose `original_text` ends with a question mark.
questions = df.loc[lambda frame: frame['original_text'].str.endswith('?')]
questions.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,203643,203643,203643,203643,203643.0,203643.0
unique,203643,203643,203038,203643,,
top,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,yaya za mu iya tabbatar da samar da kaya mai t...,me ya sa ilimi yake da muhimmanci ga matasa,Yaya za mu iya tabbatar da samar da kaya mai t...,,
freq,1,1,5,1,,
mean,,,,,4.176179,0.999971
std,,,,,0.73643,0.025995
min,,,,,1.578688,0.85
25%,,,,,3.658688,0.982759
50%,,,,,4.128,1.0
75%,,,,,4.64,1.019231


In [8]:
# Show the first few question rows as a sanity check of the filter.
questions.head()

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
3,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,me yasa talauci ke ci gaba da damun al'umma,meyasa tan nauci ke cigaba da damun al'umma,Me yasa talauci ke ci gaba da damun al'umma?,4.362687,1.0
5,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,shin hadin kai zai iya taimakawa wajen kare mu...,shin hadin kai zai iya taimakawa wajen kare mu...,Shin hadin kai zai iya taimakawa wajen kare mu...,4.170687,1.019608
10,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,me yasa ake kawo kyaututtuka a bukukuwa,meye sa ake kawo caututtuka abu kuwa,Me yasa ake kawo kyaututtuka a bukukuwa?,3.434687,0.923077
12,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,shin akwai hanyoyi na musamman don mayar da un...,shin akwai hanyoyi na musamman to mayar da ung...,Shin akwai hanyoyi na musamman don mayar da un...,4.874688,0.983607
13,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,shin fasaha tana taimaka maka wajen karatu,shin fasaha tana taimaka maka wajen karatum,Shin fasaha tana taimaka maka wajen karatu?,3.754688,1.02381


In [9]:
# Fraction of all deduplicated samples that are questions.
questions_ratio = questions.shape[0] / df.shape[0]
questions_ratio

0.416122783428896

In [10]:
# total duration of the question samples, in hours
questions['duration'].sum() / 3600

np.float64(236.23598380208333)

In [11]:
# Downsample the questions so that they make up `target_pct` of the final set.
target_pct = 0.2543

# Count the non-question rows straight from `df` rather than via
# len(questions): this cell overwrites `questions`, so deriving the count from
# it would give a different (too large) target if the cell were run again.
n = int((~df['original_text'].str.endswith('?')).sum())

# Solve q / (n + q) = target_pct for q, the number of questions to keep.
new_length = int(n / (1 - target_pct) * target_pct)

# Always draw from the full question pool in `df` so that re-running the cell
# reproduces the same sample instead of resampling an already-reduced frame.
questions = df[df['original_text'].str.endswith('?')].sample(n=new_length, random_state=1)
questions.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,97443,97443,97443,97443,97443.0,97443.0
unique,97443,97443,97279,97443,,
top,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,ta yaya za mu tabbatar da daidaiton kowa a cik...,me ya sa ilimi mai kyau yake da muhimmanci,Ta yaya za mu tabbatar da daidaiton kowa a cik...,,
freq,1,1,4,1,,
mean,,,,,4.17748,0.999907
std,,,,,0.735115,0.026071
min,,,,,1.664,0.85
25%,,,,,3.658688,0.982759
50%,,,,,4.128,1.0
75%,,,,,4.64,1.019231


In [12]:
# Combine every non-question row with the downsampled questions, then shuffle
# with a fixed seed and renumber the rows from zero.
new_df = (
    pd.concat([df.loc[~df['original_text'].str.endswith('?')], questions])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
new_df.describe(include="all")

Unnamed: 0,audio_filepath,text,pred_text,original_text,duration,ratio
count,383182,383182,383182,383182,383182.0,383182.0
unique,383182,383182,382366,383182,,
top,/mnt/md0/synvoices/data/hausa_asr_filtered_aug...,goron giwa yana da dogon wuya sosai,me ya sa ilimi mai kyau yake da muhimmanci,Goron giwa yana da dogon wuya sosai.,,
freq,1,1,4,1,,
mean,,,,,4.227813,1.001893
std,,,,,0.753351,0.024697
min,,,,,1.664,0.85
25%,,,,,3.712,1.0
50%,,,,,4.170687,1.0
75%,,,,,4.682688,1.019231


In [13]:
# Share of question rows in the rebalanced set.
new_df[new_df['original_text'].str.endswith('?')].shape[0] / new_df.shape[0]

0.25429952346404583

In [14]:
# total duration of the rebalanced set: kept in seconds in `total_duration`,
# displayed here in hours
total_duration = float(new_df['duration'].sum())
total_duration / 3600

450.0060280729167

In [15]:
# Keep only the fields that are written to the output manifest.
cols = ['audio_filepath', 'text', 'duration']
new_df = new_df.loc[:, cols]

# Write the manifest next to the source one; the file name carries the total
# duration truncated to whole hours.
new_manifest_path = os.path.join(os.path.dirname(path), f"manifest_{int(total_duration / 3600)}h.jsonl")
new_df.to_json(new_manifest_path, orient="records", lines=True)

In [16]:
def sample_dataset(df, duration, manifest_name, random_state=1, output_dir=None):
    """
    Randomly sample a dataset down to a target duration and save it as a manifest.

    The number of rows to keep is ``int(len(df) * target_seconds / total_seconds)``,
    so the sampled duration only approximates the target: it is as close as the
    sample's mean clip length is to that of ``df``. The achieved duration is printed.

    Args:
        df: DataFrame with a ``duration`` column (summed and compared against
            ``duration * 3600``, i.e. expected to be in seconds).
        duration: Target duration in hours.
        manifest_name: File-name prefix; the manifest is written as
            ``{manifest_name}_{int(duration)}h.jsonl``.
        random_state: Seed used for both the sampling and the shuffle.
        output_dir: Directory to write the manifest to. When ``None`` (default),
            the directory of the notebook-level ``path`` variable is used.

    Raises:
        ValueError: If ``duration`` is negative, ``df`` has no audio, or the
            target exceeds the total duration available in ``df``.
    """
    total_duration = float(df['duration'].sum())
    target_seconds = duration * 3600

    # Fail early with a readable message instead of a ZeroDivisionError or
    # pandas' "cannot take a larger sample than population" error.
    if duration < 0:
        raise ValueError(f"duration must be non-negative, got {duration}")
    if total_duration <= 0:
        raise ValueError("cannot sample from a dataset with no audio duration")
    if target_seconds > total_duration:
        raise ValueError(
            f"requested {duration} hours exceeds the {total_duration / 3600:.2f} "
            "hours available in the dataset"
        )

    ratio = target_seconds / total_duration
    n = int(len(df) * ratio)
    sample = df.sample(n=n, random_state=random_state)
    # The extra shuffle is kept so that the row order of manifests generated
    # with a given seed stays the same as before.
    sample = sample.sample(frac=1, random_state=random_state).reset_index(drop=True)
    sample_duration = sample['duration'].sum()
    print(f"Sample duration: {sample_duration / 3600:.2f} hours")

    # save to new manifest
    if output_dir is None:
        # Backward-compatible default: write next to the source manifest.
        output_dir = os.path.dirname(path)
    sample_manifest_path = os.path.join(
        output_dir,
        f"{manifest_name}_{int(duration)}h.jsonl"
    )
    sample.to_json(sample_manifest_path, orient="records", lines=True)
    print(f"Manifest saved to {sample_manifest_path}")

In [17]:
# Write a roughly 400-hour random subset of the rebalanced set (seed 4).
sample_dataset(new_df, 400, "manifest", random_state=4)

Sample duration: 400.00 hours
Manifest saved to /mnt/md0/synvoices/data/hausa_asr_filtered_augmented/manifest_400h.jsonl


In [18]:
# Write a roughly 250-hour random subset of the rebalanced set (seed 3).
sample_dataset(new_df, 250, "manifest", random_state=3)

Sample duration: 250.00 hours
Manifest saved to /mnt/md0/synvoices/data/hausa_asr_filtered_augmented/manifest_250h.jsonl
