In [1]:
import os

import pandas as pd
from phonemizer import phonemize
from tqdm.auto import tqdm

# Register tqdm's pandas integration so that Series/DataFrame objects gain
# `progress_apply`, which is used further down when reading the transcripts.
tqdm.pandas()

In [2]:
# Location of the espeak shared library that phonemizer should load.
# A PHONEMIZER_ESPEAK_LIBRARY value that is already exported in the environment
# takes precedence, so the notebook is not tied to this Homebrew install;
# otherwise edit the fallback path below to match the local espeak install.
espeak_path = os.environ.get(
    'PHONEMIZER_ESPEAK_LIBRARY',
    '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.dylib',
)
os.environ['PHONEMIZER_ESPEAK_LIBRARY'] = espeak_path

In [6]:
# REQUIREMENTS
# - download the Valentini-Botinhao source files into the directory named by `source_path`
# - install espeak and set the variable espeak_path

# Maps the label written to the 'set' column of the metadata to the
# sub-directory (relative to `source_path`) that holds that split.
datasets = {
    '28spk': 'train/28spk/',
    '56spk': 'train/56spk/',  # label matches its directory (was mistyped as '58spk')
    'test': 'test/'
}

is_test = False  # not referenced by the cells visible here; kept for compatibility
source_path = "../_audio_data/Valentini-Botinhao/"
output_dir = "target/"
os.makedirs(output_dir, exist_ok=True)  # safe to re-run: no error if it already exists
output_file_name = output_dir + "valentini_metadata.tsv"

In [7]:
def get_source_metadata(set_path):
    """Load the ``log.txt`` found directly inside ``set_path`` as a DataFrame.

    The file is parsed as a space-separated table without a header row; the
    columns are named ``filename``, ``environment`` and ``volume``.
    """
    column_names = ['filename', 'environment', 'volume']
    return pd.read_csv(
        os.path.join(set_path, "log.txt"),
        sep=' ',
        header=None,
        names=column_names,
    )


def handle_source_metadata(meta, set_name, set_path):
    """Enrich a split's metadata frame with its set label, file paths and sentence.

    ``meta`` is modified in place and also returned. Added columns, in order:
    ``set``, ``clean_file_path``, ``noisy_file_path`` and ``sentence``. The
    sentence is read from ``<set_path>/txt/<filename>.txt`` via
    ``read_text_file``; the intermediate ``txt_file_path`` column is removed
    again before returning. Uses ``progress_apply``, so tqdm's pandas
    integration must already be registered.
    """
    progress_message = "Processing '{}' for set: {}"

    def build_paths(subdir, extension):
        # One path per row: <set_path>/<subdir>/<filename>.<extension>
        return meta['filename'].apply(
            lambda stem: os.path.join(set_path, subdir, f'{stem}.{extension}')
        )

    meta['set'] = set_name

    path_columns = (
        ('clean_file_path', 'clean', 'wav'),
        ('noisy_file_path', 'noisy', 'wav'),
        ('txt_file_path', 'txt', 'txt'),
    )
    for column, subdir, extension in path_columns:
        print(progress_message.format(column, set_name))
        meta[column] = build_paths(subdir, extension)

    print(progress_message.format('sentence', set_name))
    meta['sentence'] = meta['txt_file_path'].progress_apply(read_text_file)

    # The transcript path was only needed to load the sentence text.
    meta.drop(columns=['txt_file_path'], inplace=True)

    # Phoneme counting is currently switched off; to enable it, add:
    # meta['phoneme_count'] = meta['sentence'].progress_apply(count_phonemes)

    return meta


def read_text_file(file_path):
    """Return the UTF-8 text of ``file_path`` with surrounding whitespace stripped.

    Returns ``None`` when the file does not exist; other errors propagate.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        return None
    return contents.strip()


def count_phonemes(text):
    """Count the phoneme symbols in the espeak transcription of ``text``.

    The text is phonemized with the espeak backend (language ``'en'``), every
    ``"ʲ"`` character is removed, and the lengths of the whitespace-separated
    words of the result are summed.

    Args:
        text: Sentence to transcribe, or ``None``.

    Returns:
        int: Total number of characters across all phonemized words; ``0``
        when ``text`` is ``None`` or contains only whitespace.
    """
    # read_text_file yields None for a missing transcript; treat that (and
    # blank text) as "no phonemes" instead of failing inside phonemize.
    if text is None or not text.strip():
        return 0

    phoneme_sequence = phonemize(text, language='en', backend="espeak").replace("ʲ", "")
    # Whitespace separates words and must not be counted.
    return sum(len(word) for word in phoneme_sequence.split())

In [8]:
# Build one enriched metadata frame per configured split.
dataframes = []
for set_name, relative_dir in datasets.items():
    set_dir = os.path.abspath(source_path + relative_dir)
    dataframes.append(
        handle_source_metadata(get_source_metadata(set_dir), set_name, set_dir)
    )

# Stack the splits, shuffle all rows with a fixed seed so the order is
# reproducible, and renumber the index from zero.
result = (
    pd.concat(dataframes, ignore_index=False)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Export as a tab-separated file without the index column.
result.to_csv(output_file_name, sep='\t', index=False)

Processing 'clean_file_path' for set: 28spk
Processing 'noisy_file_path' for set: 28spk
Processing 'txt_file_path' for set: 28spk
Processing 'sentence' for set: 28spk


  0%|          | 0/11572 [00:00<?, ?it/s]

Processing 'clean_file_path' for set: 58spk
Processing 'noisy_file_path' for set: 58spk
Processing 'txt_file_path' for set: 58spk
Processing 'sentence' for set: 58spk


  0%|          | 0/23075 [00:00<?, ?it/s]

Processing 'clean_file_path' for set: test
Processing 'noisy_file_path' for set: test
Processing 'txt_file_path' for set: test
Processing 'sentence' for set: test


  0%|          | 0/824 [00:00<?, ?it/s]