In [None]:
import os
import re

import pandas as pd
import pyphen
from tqdm.auto import tqdm

# Register tqdm's `progress_apply` on pandas objects; the processing
# functions in this notebook call it to show a progress bar.
tqdm.pandas()

In [None]:
# REQUIREMENTS
# The source files must already be downloaded into `source_path`
# (one `cv-<lang>` directory per entry in `languages`).

# Input: languages to process and the directory holding their source data.
languages = ["en", "de", "ru"]
source_path = "../_audio_data/"

# Output: the target directory is created up front so the final write succeeds.
output_dir = "target/"
os.makedirs(output_dir, exist_ok=True)
output_file_name = os.path.join(output_dir, "cv_metadata.tsv")

# Selection thresholds applied to each language's metadata.
min_up_votes, max_down_votes = 2, 0
limit_per_language = 5000

In [None]:
def get_source_metadata(lang):
    """Load the ``validated.tsv`` metadata table for one language.

    Args:
        lang: Language code; the table is read from
            ``<source_path>cv-<lang>/validated.tsv``.

    Returns:
        pandas.DataFrame: The parsed tab-separated table.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    source_metadata_path = os.path.abspath(source_path + "cv-{}/validated.tsv".format(lang))
    # The table holds free-text sentences. With pandas' default quote handling
    # a sentence containing an unbalanced double quote makes the parser treat
    # the following tabs/newlines as part of one field, merging rows. Disable
    # quote processing so every tab is a field separator and quotes are data.
    # NOTE(review): assumes the producer never quote-wraps fields — confirm.
    return pd.read_csv(source_metadata_path, delimiter='\t', quoting=csv.QUOTE_NONE)


def handle_source_metadata(lang, meta):
    """Filter one language's metadata and derive the output columns.

    Rows are kept when they have at most ``max_down_votes`` down-votes and at
    least ``min_up_votes`` up-votes; of those, the ``limit_per_language`` rows
    with the most up-votes are retained.

    Args:
        lang: Language code. Stored in the ``lang`` column, used to build the
            clip paths, and passed to ``pyphen.Pyphen`` as the language.
        meta: DataFrame with at least the columns ``path``, ``sentence``,
            ``up_votes`` and ``down_votes``. It is not modified.

    Returns:
        pandas.DataFrame: A new frame with the columns ``path``, ``sentence``,
        ``lang`` and ``syllable_count``.
    """
    print("Processing lang: {}".format(lang))

    meta = meta[(meta['down_votes'] <= max_down_votes) & (meta['up_votes'] >= min_up_votes)]
    meta = meta.sort_values(by='up_votes', ascending=False).head(limit_per_language)
    # Take an explicit copy: the column assignments below must write to an
    # independent frame, not to a slice (avoids SettingWithCopyWarning).
    meta = meta[["path", "sentence"]].copy()
    meta['lang'] = lang

    print("Processing 'path' for lang: {}".format(lang))
    # Only the 'path' value is needed, so map over that Series instead of
    # building a row object for every record with axis=1.
    meta['path'] = meta['path'].progress_apply(lambda file_name: prepare_path(lang, file_name))

    print("Processing 'syllable_count' for lang: {}".format(lang))
    dic = pyphen.Pyphen(lang=lang)
    meta['syllable_count'] = meta['sentence'].progress_apply(lambda text: count_syllables(text, dic))

    return meta


def prepare_path(lang, file_name):
    """Build the relative clip path for a file of the given language.

    Args:
        lang: Language code used in the ``cv-<lang>`` directory name.
        file_name: Clip file name, with or without the ``.mp3`` extension.

    Returns:
        str: ``cv-<lang>/clips/<file_name>``, with ``.mp3`` appended when the
        name did not already end with it.
    """
    clip_name = file_name if file_name.endswith(".mp3") else file_name + ".mp3"
    return "".join(("cv-{}".format(lang), "/clips/", clip_name))


def count_syllables(line, dic):
    """Estimate the number of syllables in a line of text.

    The line is hyphenated with ``dic``, punctuation is removed, and every
    hyphen, underscore or run of whitespace is treated as a boundary. The
    number of resulting non-empty fragments is returned.

    Args:
        line: Text to analyse. Non-string values (for example the NaN that
            pandas uses for an empty cell) count as zero syllables.
        dic: Object exposing ``inserted(text)``, which returns ``text`` with
            hyphens inserted at the hyphenation points.

    Returns:
        int: Number of fragments found in the line.
    """
    if not isinstance(line, str):
        # Missing sentence: nothing to hyphenate.
        return 0

    # NOTE(review): the whole line is hyphenated in a single call — confirm
    # the hyphenation dictionary handles multi-word input as intended.
    hyphenated = dic.inserted(line)
    # Strip punctuation but keep word characters, whitespace and the hyphens
    # that mark fragment boundaries.
    without_punctuation = re.sub(r'[^\w\s-]', '', hyphenated, flags=re.UNICODE)
    separated = re.sub(r'[-_]', ' ', without_punctuation)
    # split() without an argument splits on any run of whitespace (tabs,
    # newlines, non-breaking spaces) and never yields empty strings, whereas
    # split(' ') would keep fragments joined by non-space whitespace together.
    return len(separated.split())

In [None]:
# Load and process each language's metadata, collecting one frame per language.
dataframes = []
for language in languages:
    df = handle_source_metadata(language, get_source_metadata(language))
    dataframes.append(df)

# Merge the languages, shuffle the rows with a fixed seed so the order is
# reproducible, and renumber the index from zero.
result = (
    pd.concat(dataframes, ignore_index=False)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the combined table as a tab-separated file without the index column.
result.to_csv(output_file_name, sep='\t', index=False)