In [None]:
from pandarallel import pandarallel
from glob import glob
import soundfile as sf
import pandas as pd
import librosa
import random
import json
import re
import os

# Start pandarallel with 10 worker processes and a progress bar; the
# `parallel_apply` call used later in this notebook relies on this setup.
pandarallel.initialize(nb_workers=10, progress_bar=True)

In [None]:
# Directory holding the submission recordings; files are addressed below as
# `<audio_dir>/<id>.wav`.
audio_dir = "/data/audio_data/prep_submission_audio/10"
# NOTE(review): presumably the directory of per-submission marking JSON files;
# it is not referenced in the visible cells -- confirm it is still needed.
json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/10"
# CSV that is loaded into `metadata` (one row per submission, judging by the
# columns used later: id, question_id, question_content, total_time, ...).
path = "/data/audio_data/pronunciation_scoring_result/merged_info/info_question_type-10_01082022_18092023.csv"
metadata = pd.read_csv(path)
# Preview the first rows of the loaded table.
metadata.head()

In [None]:
import torchaudio

def check_audio_is_exist(audio_id):
    """Tell whether the recording for ``audio_id`` is usable.

    The recording is looked up at ``<audio_dir>/<audio_id>.wav`` and is
    considered usable when the file exists, ``torchaudio.load`` can decode
    it, and its sample rate is exactly 16000 Hz.

    Args:
        audio_id: Identifier used to build the ``.wav`` file name.

    Returns:
        bool: True when all three conditions hold, False otherwise.
    """
    abs_path = os.path.join(audio_dir, f'{audio_id}.wav')
    if not os.path.exists(abs_path):
        return False

    try:
        # Fully decoding the file doubles as a corruption check; only the
        # sample rate is needed afterwards.
        _, sr = torchaudio.load(abs_path)
    except Exception:
        # An undecodable file counts as invalid. Catching `Exception` (not a
        # bare `except:`) lets KeyboardInterrupt / SystemExit propagate so a
        # long validation run can still be interrupted.
        return False

    return sr == 16000

# Check every recording referenced by the metadata in parallel and keep only
# the rows whose audio passed `check_audio_is_exist`.
is_valid = metadata.id.parallel_apply(check_audio_is_exist)
print(metadata.shape)
# `drop=True` renumbers the rows without inserting the old index as a new
# column, so this cell can be re-run without accumulating `index` /
# `level_0` columns (which eventually raises a ValueError).
metadata = metadata[is_valid.astype(bool)].reset_index(drop=True)
print(metadata.shape)

In [None]:
# Cap the number of recordings kept per question so that frequently answered
# questions do not dominate the exported set.
MAX_SAMPLES_PER_QUESTION = 500
# Fixed seed: without it the down-sampled subset changes on every run.
SAMPLING_SEED = 42

sampled_groups = []
for _, group in metadata.groupby("question_id"):
    if len(group) > MAX_SAMPLES_PER_QUESTION:
        group = group.sample(MAX_SAMPLES_PER_QUESTION, random_state=SAMPLING_SEED)
    # Groups at or below the cap are kept whole, in their original order.
    sampled_groups.append(group)

filtered_data = pd.concat(sampled_groups)
filtered_data.shape

In [None]:
# Sum of `total_time` over the selected rows divided by 3600
# (presumably seconds -> hours; confirm the unit of `total_time`).
filtered_data["total_time"].sum() / 3600

In [None]:
def normalize(text):
    """Collapse whitespace and upper-case a transcript.

    Every run of whitespace becomes a single space, the result is
    upper-cased, and leading/trailing whitespace is removed. Punctuation is
    left untouched (a punctuation-stripping substitution used to sit here
    but was disabled).

    Args:
        text (str): Raw transcript text.

    Returns:
        str: The normalized text.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    text = re.sub(r'\s+', ' ', text)
    return text.upper().strip()

In [None]:
# Columns retained in `filtered_data` from here on.
kept_columns = [
    'id', 'is_deleted', 'user_id', 'question_id', 'question_type',
    'question_content', 'url', 'score', 'fidelity_class', 'created_at',
    'total_time', 'word_count',
]
# Renumber the rows 0..n-1; the old index becomes an extra column that the
# column selection right after discards again.
filtered_data = filtered_data.reset_index()[kept_columns]

In [None]:
def check(text):
    """Tell whether ``text`` consists only of alphabetic words.

    A word is accepted when it is made of letters once its apostrophes are
    removed, so contractions such as ``don't`` pass while tokens carrying
    digits or other punctuation (``it's,``, ``2'``) or a lone apostrophe are
    rejected. Previously any token containing an apostrophe was accepted
    regardless of its other characters.

    Args:
        text: Candidate transcript. Non-string values (e.g. NaN for a missing
            cell) are treated as invalid instead of raising.

    Returns:
        bool: True when every whitespace-separated word is acceptable. An
        empty or whitespace-only string has no words and therefore yields
        True.
    """
    if not isinstance(text, str):
        return False

    # str.split() without arguments already splits on any whitespace run, so
    # no prior whitespace normalization is needed.
    return all(word.replace("'", "").isalpha() for word in text.split())

In [None]:
from tqdm import tqdm

data = []
# Iterate over plain row dicts instead of `filtered_data.iloc[label]`:
# feeding index *labels* to the positional `iloc` is only correct when the
# index happens to be exactly 0..n-1, and building one Series per row is slow.
rows = filtered_data[["id", "question_content", "total_time"]].to_dict("records")
for row in tqdm(rows):
    text = row["question_content"]
    # Skip transcripts containing anything `check` rejects.
    if not check(text):
        continue

    # Named `audio_id` so the builtin `id` is not shadowed for later cells.
    audio_id = row["id"]
    data.append({
        "id": audio_id,
        "text": normalize(text),
        "total_time": row["total_time"],
        "audio_path": f'{audio_dir}/{audio_id}.wav',
    })

In [None]:
# Export the collected samples as JSON Lines: one JSON object per line.
path = "/data/codes/apa/kaldi/stt/data/stt-data/jsonl/info_question_type-10_01082022_18092023.jsonl"
with open(path, "w", encoding="utf-8") as out_file:
    out_file.writelines(f'{json.dumps(sample)}\n' for sample in data)