In [None]:
import os

import pandas as pd
from phonemizer import phonemize
from tqdm.auto import tqdm

tqdm.pandas()

In [None]:
# REQUIREMENTS
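# download the source files into ../_audio_data/ (see source_path below), one cv-<lang>/ folder per language
# install espeak and set the variable espeak_path to the location of the espeak shared library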

# Languages to process; each one has its own cv-<lang>/ folder under source_path.
languages = ["en", "de", "ru"]

# Input folder with the downloaded data and output location of the merged TSV.
source_path = "../_audio_data/"
output_dir = "target/"
output_file_name = os.path.join(output_dir, "cv_metadata.tsv")
os.makedirs(output_dir, exist_ok=True)

# Row selection: keep rows with up_votes >= min_up_votes and
# down_votes <= max_down_votes, at most limit_per_language rows per language.
min_up_votes = 2
max_down_votes = 0
limit_per_language = 2000

# Machine-specific location of the espeak shared library; adjust for your install.
espeak_path = '/opt/homebrew/Cellar/espeak/1.48.04_1/lib/libespeak.dylib'

In [None]:
# Export the configured espeak library location through the environment
# variable PHONEMIZER_ESPEAK_LIBRARY before any phonemize() call is made.
os.environ.update({'PHONEMIZER_ESPEAK_LIBRARY': espeak_path})

In [None]:
def get_source_metadata(lang, base_path=None):
    """Load the validated-clips metadata table for one language.

    Reads ``<base_path>/cv-<lang>/validated.tsv`` as a tab-separated file.

    Args:
        lang: Language code used in the folder name (e.g. ``"en"``).
        base_path: Folder containing the ``cv-<lang>`` directories. Defaults
            to the notebook-level ``source_path`` when omitted.

    Returns:
        A pandas DataFrame with one row per line of ``validated.tsv``.

    Raises:
        FileNotFoundError: If the metadata file does not exist.
    """
    if base_path is None:
        base_path = source_path

    # os.path.join works whether or not base_path ends with a separator,
    # unlike plain string concatenation.
    source_metadata_path = os.path.abspath(
        os.path.join(base_path, "cv-{}".format(lang), "validated.tsv")
    )

    # NOTE(review): default CSV quote handling is used here; if sentences can
    # contain unbalanced double quotes, confirm rows are not merged and
    # consider quoting=csv.QUOTE_NONE.
    return pd.read_csv(source_metadata_path, sep='\t')


def handle_source_metadata(lang, meta):
    """Select the best-voted rows for one language and derive output columns.

    Rows are kept when ``down_votes <= max_down_votes`` and
    ``up_votes >= min_up_votes``, sorted by ``up_votes`` (descending) and
    truncated to ``limit_per_language`` rows.

    Args:
        lang: Language code; stored in the ``lang`` column and passed to
            ``prepare_path`` / ``count_phonemes``.
        meta: DataFrame with at least the columns ``path``, ``sentence``,
            ``up_votes`` and ``down_votes``.

    Returns:
        A new DataFrame with the columns ``path`` (rewritten by
        ``prepare_path``), ``sentence``, ``lang`` and ``phoneme_count``.
        The input frame is not modified.
    """
    print("Processing lang: {}".format(lang))

    vote_filter = (meta['down_votes'] <= max_down_votes) & (meta['up_votes'] >= min_up_votes)
    rows = (
        meta[vote_filter]
        .sort_values(by='up_votes', ascending=False)
        .head(limit_per_language)[["path", "sentence"]]
        # Explicit copy: the columns added below must not be written onto a
        # slice of `meta` (avoids SettingWithCopyWarning).
        .copy()
    )
    rows['lang'] = lang

    # Series-level apply (instead of row-wise DataFrame.apply with axis=1)
    # is cheaper per row and still yields a Series when `rows` is empty, so
    # the column assignment below also works when no row passes the filter.
    print("Processing 'path' for lang: {}".format(lang))
    rows['path'] = rows['path'].progress_apply(lambda file_name: prepare_path(lang, file_name))

    print("Processing 'phoneme_count' for lang: {}".format(lang))
    rows['phoneme_count'] = rows['sentence'].progress_apply(lambda text: count_phonemes(lang, text))

    return rows


def prepare_path(lang, file_name):
    """Build the relative clip path ``cv-<lang>/clips/<file_name>``.

    A ``.mp3`` extension is appended when ``file_name`` does not already end
    with it.
    """
    clip_name = file_name if file_name.endswith(".mp3") else file_name + ".mp3"
    return "cv-{}/clips/{}".format(lang, clip_name)


def count_phonemes(lang, text):
    """Count the phoneme symbols espeak produces for ``text``.

    The text is phonemized with the espeak backend, the palatalization
    modifier is dropped, and the remaining characters of all
    whitespace-separated words are counted.

    Args:
        lang: Language code passed to ``phonemize``.
        text: Sentence to phonemize.

    Returns:
        Total number of characters across all phonemized words.
    """
    phoneme_sequence = phonemize(text, language=lang, backend="espeak")

    # U+02B2 (MODIFIER LETTER SMALL J) marks palatalization and is attached to
    # the preceding consonant, so it is not counted on its own. The previous
    # literal was the mojibake form of this character (its UTF-8 bytes read as
    # Latin-1), so it never matched; the escape keeps this encoding-proof.
    phoneme_sequence = phoneme_sequence.replace("\u02b2", "")

    return sum(len(word) for word in phoneme_sequence.split())

In [None]:
# Build one processed frame per configured language.
dataframes = []
for language in languages:
    raw_metadata = get_source_metadata(language)
    df = handle_source_metadata(language, raw_metadata)
    dataframes.append(df)

# Merge all languages, shuffle with a fixed seed so the order is reproducible,
# and renumber the rows from zero.
result = (
    pd.concat(dataframes, ignore_index=False)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the combined metadata as a tab-separated file without the index column.
result.to_csv(output_file_name, sep='\t', index=False)