In [None]:
import os

import pandas as pd
from phonemizer import phonemize
from tqdm.auto import tqdm

# Register tqdm's pandas integration so Series/DataFrame gain `progress_apply`,
# which the metadata helpers in this notebook rely on.
tqdm.pandas()

In [None]:
# Location of the espeak shared library (a Homebrew install path; adjust it for
# your machine). It is exported through PHONEMIZER_ESPEAK_LIBRARY for the
# phonemizer espeak backend used later in this notebook.
espeak_path = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.dylib'
os.environ.update(PHONEMIZER_ESPEAK_LIBRARY=espeak_path)

In [None]:
# REQUIREMENTS
# - download the source files into the directory named by `source_path`
#   (../_audio_data/ by default)
# - install espeak and set the variable espeak_path

# Dataset label -> sub-directory of that dataset, relative to `source_path`.
datasets = {
    '28spk': 'Valentini-Botinhao/train/28spk/',
    '56spk': 'Valentini-Botinhao/train/56spk/',
    'test': 'Valentini-Botinhao/test/'
}

# NOTE(review): `is_test` is not referenced in the visible cells — confirm it is still needed.
is_test = False
source_path = "../_audio_data/"
output_dir = "target/"
os.makedirs(output_dir, exist_ok=True)
# Joined with os.path.join so the result stays valid if `output_dir` is edited
# to a value without a trailing separator.
output_file_name = os.path.join(output_dir, "valentini_metadata.tsv")

# Maximum number of log rows kept from each dataset.
limit_per_dataset = 50000

In [None]:
def get_source_metadata(full_path):
    """Load a dataset's ``log.txt`` index into a DataFrame.

    Args:
        full_path: Directory that contains the ``log.txt`` file.

    Returns:
        pandas.DataFrame with the columns ``filename``, ``environment`` and
        ``volume``, one row per line of the space-separated log.
    """
    column_names = ['filename', 'environment', 'volume']
    log_file = os.path.join(full_path, "log.txt")
    # The log has no header row, so the column names are supplied explicitly.
    return pd.read_csv(log_file, sep=' ', header=None, names=column_names)


def handle_source_metadata(meta, set_name, set_path, full_path):
    """Add set label, audio paths, transcript and phoneme count to ``meta``.

    The frame is modified in place and the same object is returned.

    Args:
        meta: DataFrame with a ``filename`` column (file name without extension).
        set_name: Value written to the ``set`` column of every row.
        set_path: Prefix used to build the ``clean``/``noisy`` wav paths.
        full_path: Directory whose ``txt`` sub-folder holds the transcripts.

    Returns:
        ``meta`` with the added columns ``set``, ``clean_file_path``,
        ``noisy_file_path``, ``sentence`` and ``phoneme_count``.
    """
    def announce(column):
        print("Processing '{}' for set: {}".format(column, set_name))

    def path_builder(base, folder, extension):
        return lambda stem: os.path.join(base, folder, f'{stem}.{extension}')

    meta['set'] = set_name

    # (target column, base directory, sub-folder, file extension)
    path_columns = (
        ('clean_file_path', set_path, 'clean', 'wav'),
        ('noisy_file_path', set_path, 'noisy', 'wav'),
        ('txt_file_path', full_path, 'txt', 'txt'),
    )
    for column, base, folder, extension in path_columns:
        announce(column)
        meta[column] = meta['filename'].apply(path_builder(base, folder, extension))

    announce('sentence')
    meta['sentence'] = meta['txt_file_path'].progress_apply(read_text_file)

    # The transcript path is only needed to load the sentence text.
    del meta['txt_file_path']

    announce('phoneme_count')
    meta['phoneme_count'] = meta['sentence'].progress_apply(count_phonemes)

    return meta


def read_text_file(file_path):
    """Return the stripped UTF-8 contents of ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file's text with leading/trailing whitespace removed, or ``None``
        when the file does not exist.
    """
    try:
        handle = open(file_path, 'r', encoding='utf-8')
    except FileNotFoundError:
        # A missing file is reported to the caller as None.
        return None
    with handle:
        return handle.read().strip()


def count_phonemes(text):
    """Count the characters of the espeak phonemization of ``text``.

    Args:
        text: Sentence to phonemize. ``None``, non-string values (e.g. NaN) and
            blank strings are treated as "no text".

    Returns:
        int: Number of non-whitespace characters in the phonemized string after
        the "ʲ" modifier has been removed; 0 when there is no text.
    """
    # read_text_file returns None for a missing transcript; handle that (and
    # blank input) here instead of handing a non-string to phonemize.
    if not isinstance(text, str) or not text.strip():
        return 0

    phoneme_sequence = phonemize(text, language='en', backend="espeak").replace("ʲ", "")
    # Whitespace separates words in the phonemized output and is not counted.
    return sum(len(word) for word in phoneme_sequence.split())

In [None]:
# Build one metadata frame per configured dataset, then combine them.
dataframes = []
for name, sub_path in datasets.items():
    # os.path.join keeps the path correct even when `source_path` has no
    # trailing separator (plain string concatenation would fuse the two parts).
    path = os.path.abspath(os.path.join(source_path, sub_path))
    df = get_source_metadata(path).head(limit_per_dataset)
    # `sub_path` (relative) is used for the audio paths, `path` (absolute) for the transcripts.
    df = handle_source_metadata(df, name, sub_path, path)
    dataframes.append(df)

# Shuffle all rows with a fixed seed so the output order is reproducible,
# then renumber the index.
result = (
    pd.concat(dataframes, ignore_index=False)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

result.to_csv(output_file_name, sep='\t', index=False)