In [None]:
from tqdm import tqdm
import pandas as pd
import torchaudio
import librosa
import shutil
import json
import os

from pandarallel import pandarallel
# Configure pandarallel with 8 workers and progress bars enabled; the
# `parallel_apply` call later in this notebook relies on this initialization.
pandarallel.initialize(nb_workers=8, progress_bar=True)

In [None]:
# Directory holding one `<id>.wav` file per metadata record.
audio_dir = "/data/audio_data/prep_submission_audio/12/"

# Raw metadata: one JSON object per line (JSONL).
metadata_path = "../data/metadata/raw-jsonl/testset-short-sentence-type-12.jsonl"
with open(metadata_path, "r", encoding="utf-8") as f:
    # Stream the file line by line and skip blank lines (e.g. an extra empty
    # line at the end of the file), which json.loads would otherwise reject
    # with a JSONDecodeError.
    lines = [json.loads(line) for line in f if line.strip()]

data = pd.DataFrame(lines)

# Derive the expected audio location for every record from its id.
data["audio_path"] = data["id"].apply(lambda x: os.path.join(audio_dir, f'{x}.wav'))
print(data.shape)
data.head(2)

In [None]:
def check_audio(path, min_duration=1.0):
    """Check that an audio file can be loaded and is long enough.

    Args:
        path: Location of the audio file to validate.
        min_duration: Minimum acceptable duration in seconds (default 1.0).

    Returns:
        bool: True when the file loads and lasts at least `min_duration`
        seconds; False when it is shorter or when loading fails for any
        reason (missing file, undecodable data, ...).
    """
    try:
        # Load at a fixed 16 kHz rate; duration is then samples / sample rate.
        # NOTE(review): assumes librosa returns a 1-D (mono) array so that
        # shape[0] is the sample count -- confirm if `mono=False` is ever used.
        wav, sr = librosa.load(path, sr=16000)
        return bool(wav.shape[0] / sr >= min_duration)
    except Exception:
        # Best-effort validation: any failure marks the file as unusable.
        # Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt / SystemExit propagate so the run can be stopped.
        return False
    
# Validate every row's audio file across the pandarallel workers; the result
# is a boolean Series indexed like `data`.
is_success = data.parallel_apply(lambda row: check_audio(row["audio_path"]), axis=1)

# Report the number of failed files first, then the number of valid ones
# (each printed as the Series' shape tuple).
for outcome in (False, True):
    print(is_success[is_success.eq(outcome)].shape)

In [None]:
# Keep only the rows whose audio file passed `check_audio`.
data = data[is_success.eq(True)]

In [None]:
# Distribution of the `utterance_score` column of `data`, drawn with 100 bins.
data["utterance_score"].hist(bins=100)

In [None]:
# Export only the id and text columns as a pipe-delimited file, with neither
# a header row nor the DataFrame index.
csv_path = "../data/metadata/csv/testset-short-sentence-type-12.csv"
id_and_text = data[["id", "text"]]
id_and_text.to_csv(csv_path, sep="|", index=False, header=False)

In [None]:
# `data` may already have been narrowed by an earlier cell, in which case its
# index is only a subset of `is_success`'s index. Align the mask to `data`
# before selecting so the filter works on either the filtered or unfiltered
# frame without pandas' "Boolean Series key will be reindexed" warning.
valid_mask = is_success.reindex(data.index, fill_value=False).astype(bool)
extracted_data = data[valid_mask]

path = "../data/metadata/jsonl/testset-short-sentence-type-12.jsonl"
with open(path, "w", encoding="utf-8") as f:
    # Convert all rows to plain dicts in one pass rather than doing a per-row
    # `.loc` lookup, then write one JSON object per line.
    # json.dumps keeps its default settings, so non-ASCII characters are
    # emitted as \uXXXX escapes.
    for sample in tqdm(extracted_data.to_dict(orient="records")):
        f.write(f'{json.dumps(sample)}\n')