In [None]:
from tqdm import tqdm
import pandas as pd
import torchaudio
import librosa
import shutil
import json
import os

from pandarallel import pandarallel
# Start pandarallel with 8 workers and its progress bar disabled; this must run
# before the `parallel_apply` call used for audio validation later in the notebook.
pandarallel.initialize(nb_workers=8, progress_bar=False)

In [None]:
def load_data(path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line of the file is parsed as one JSON document and
    becomes one row of the returned frame. Blank or whitespace-only lines
    (e.g. a stray empty line at the end of the file) are skipped instead of
    being handed to the JSON parser.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file lazily rather than materialising every raw line
        # with readlines() before parsing.
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # json.loads("") raises, so blank lines are not records.
                continue
            records.append(json.loads(line))

    return pd.DataFrame(records)

In [None]:
# Alternative configuration for data type 9 (currently disabled):
# audio_dir = "/data/codes/apa/train/data/wav/9"

# in_jsonl_path = "../data/metadata/filtered-jsonl/train-data-type-9.jsonl"
# out_jsonl_path = "../data/metadata/jsonl/train-data-type-9.jsonl"
# out_csv_path = "../data/metadata/csv/train-data-type-9.csv"

# Active configuration: data type 12.
# Directory containing the audio files, looked up as "<id>.wav".
audio_dir = "/data/audio_data/prep_submission_audio/12"

# Input metadata (JSONL) and the JSONL / CSV outputs written further down.
in_jsonl_path = "../data/metadata/filtered-jsonl/train-data-type-12.jsonl"
out_jsonl_path = "../data/metadata/jsonl/train-data-type-12.jsonl"
out_csv_path = "../data/metadata/csv/train-data-type-12.csv"

# Load the metadata, then show its shape and the first two rows.
data = load_data(in_jsonl_path)
print(data.shape)
data.head(2)

In [None]:
# Derive each sample's audio location as <audio_dir>/<id>.wav.
data["audio_path"] = data["id"].apply(
    lambda sample_id: os.path.join(audio_dir, f'{sample_id}.wav')
)

In [None]:
def check_audio(path, min_duration=1.0):
    """Return whether the audio file at ``path`` loads and is long enough.

    The file is loaded with ``librosa.load(path, sr=16000)`` and its duration
    is computed as number of samples divided by the returned sample rate.

    Args:
        path: Path of the audio file to validate.
        min_duration: Minimum accepted duration in seconds (default 1.0).

    Returns:
        True if the file loaded and its duration is at least ``min_duration``;
        False if it is shorter or if loading/measuring raised an ``Exception``.
    """
    try:
        wav, sr = librosa.load(path, sr=16000)

        if wav.shape[0] / sr < min_duration:
            return False
    except Exception:
        # Missing or undecodable files count as invalid. Catch Exception
        # rather than using a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate and a long run can be interrupted.
        return False

    return True
    
# Run check_audio on every row's audio file in parallel, print the shapes of
# the passing and failing subsets, then keep only the rows that passed.
is_success = data.parallel_apply(
    lambda row: check_audio(row["audio_path"]),
    axis=1,
)
print(data.loc[is_success].shape)
print(data.loc[~is_success].shape)
data = data.loc[is_success]

In [None]:
# Histogram (100 bins) of the utterance_score column for the remaining rows.
data["utterance_score"].hist(bins=100)

In [None]:
# Export only the id and text columns as a pipe-separated file with no header
# row and no index column.
data.loc[:, ["id", "text"]].to_csv(
    out_csv_path,
    sep="|",
    header=None,
    index=None,
)

In [None]:
extracted_data = data

# Write the remaining rows as JSON Lines: one JSON object per row, one row
# per line, with a tqdm progress bar over the frame's index.
with open(out_jsonl_path, mode="w", encoding="utf-8") as f:
    for index in tqdm(extracted_data.index):
        sample = extracted_data.loc[index].to_dict()
        json_obj = json.dumps(sample)
        f.write(json_obj + "\n")