In [None]:
from pandarallel import pandarallel
from glob import glob
import soundfile as sf
import pandas as pd
import librosa
import random
import json
import re
import os

# Start pandarallel so that `parallel_apply` can be used on pandas objects below.
# The worker count is hard-coded to 10; lower it on machines with fewer cores.
pandarallel.initialize(nb_workers=10, progress_bar=True)

In [None]:
# Hard-coded absolute input locations.
# Folder of submission recordings; files are looked up as "<id>.wav".
audio_dir = "/data/audio_data/prep_submission_audio/9"
# Folder of marking results; files are looked up as "<id>.json".
json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/9"
# CSV whose `id` column provides the ids used to build the file names above.
metadata_path="/data/audio_data/pronunciation_scoring_result/merged_info/info_question_type-9_19092023_21122023.csv"
metadata = pd.read_csv(metadata_path)
metadata.head(2)

In [None]:
def check_audio_is_exist(audio_id, directory=None):
    """Return True when ``<directory>/<audio_id>.wav`` is an existing regular file.

    Args:
        audio_id: Identifier used as the file stem (formatted with ``str``).
        directory: Folder to look in. When omitted, the notebook-level
            ``audio_dir`` is used.

    Returns:
        bool: True if the WAV file exists; False if it is missing or if the
        path exists but is not a regular file (e.g. a directory).
    """
    if directory is None:
        directory = audio_dir
    # isfile rather than exists: a directory with a ".wav" name is not usable audio.
    return os.path.isfile(os.path.join(directory, f'{audio_id}.wav'))

# Flag each submission by whether its recording is on disk, then keep only those rows.
metadata["is_exist"] = metadata["id"].parallel_apply(check_audio_is_exist)
print(metadata.shape)
# Filter and renumber in a single expression: nothing is mutated in place on a
# slice, and drop=True stops a new "index"/"level_0" column from being inserted
# on every run, so the cell can be re-executed safely.
metadata = metadata[metadata["is_exist"]].reset_index(drop=True)
print(metadata.shape)

In [None]:
# Raw-string patterns: the punctuation set is unchanged, but written without the
# invalid string escapes (e.g. "\!", "\s") that newer Python versions warn about.
_PUNCTUATION_RE = re.compile(r"""[!@#$%^&*()\\.'",?;:+\-_/|~`]""")
_WHITESPACE_RE = re.compile(r'\s+')


def normalize(text):
    """Upper-case a transcript after stripping punctuation.

    Each listed punctuation character is replaced by a space, runs of
    whitespace are collapsed to a single space, and the result is upper-cased
    and trimmed.

    Args:
        text: Input string.

    Returns:
        str: The normalized string (possibly empty).
    """
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.upper().strip()

In [None]:
def preprocess_data(json_path):
    """Extract the scored utterances from one marking-result JSON file.

    Args:
        json_path: Path to a JSON file containing an ``"utterances"`` list.

    Returns:
        list[dict]: One dict per utterance whose ``"result"`` is not None, with
        keys ``text``, ``start_time``, ``end_time``, ``utt_id`` and ``sid``
        (the file name without its extension). An empty list is returned when
        the file cannot be read or parsed, or does not have the expected
        structure.
    """
    try:
        # splitext (not split(".")[0]) so an id that itself contains a dot is
        # recovered intact from "<id>.json".
        sample_id = os.path.splitext(os.path.basename(json_path))[0]

        with open(json_path, "r", encoding="utf-8") as f:
            raw_sample = json.load(f)

        return [
            {
                "text": utterance["text"],
                "start_time": utterance["start_time"],
                "end_time": utterance["end_time"],
                "utt_id": utterance["utterance_id"],
                "sid": sample_id,
            }
            for utterance in raw_sample["utterances"]
            if utterance["result"] is not None
        ]
    except (OSError, ValueError, KeyError, TypeError):
        # Best-effort: unreadable, malformed (JSONDecodeError is a ValueError) or
        # unexpectedly shaped files contribute no utterances. Unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate.
        return []

# Parse every submission's marking JSON on the worker pool; each element of
# `tmp` is that submission's list of utterance dicts. For serial debugging,
# use `.apply` in place of `.parallel_apply`.
tmp = metadata["id"].parallel_apply(
    lambda sample_id: preprocess_data(os.path.join(json_dir, f'{sample_id}.json'))
)


In [None]:
# One row per utterance dict: explode the per-submission lists, drop the NaN
# placeholders that empty lists leave behind, derive the timing columns, and
# keep only utterances longer than 2.0 seconds.
tmp_metadata = (
    pd.DataFrame(tmp.explode().values, columns=['sent'])
    .dropna()
    .assign(
        start_time=lambda frame: frame["sent"].apply(lambda sent: sent["start_time"]),
        end_time=lambda frame: frame["sent"].apply(lambda sent: sent["end_time"]),
        duration=lambda frame: frame["end_time"] - frame["start_time"],
    )
    .loc[lambda frame: frame["duration"] > 2.0]
)
tmp_metadata.head()

In [None]:
# Peek at the first remaining utterance. Positional access is required: the
# row labelled 0 may have been removed by the dropna / duration filter, in
# which case a label lookup with [0] raises KeyError.
tmp_metadata["sent"].iloc[0]

In [None]:
# Total retained duration, converted from seconds to hours.
tmp_metadata["duration"].sum() / 3600

In [None]:
import torchaudio
import numpy as np

In [None]:
# Source recordings, read as "<sid>.wav" (same path as `audio_dir` defined earlier).
in_dir = '/data/audio_data/prep_submission_audio/9'
# Destination for the per-utterance clips, written as "<sid>-<utt_id>.wav".
out_dir = '/data/codes/apa/kaldi/stt/data/stt-data/wav'
def get_audio(sent, target_sr=16000, max_overrun=100):
    """Cut one utterance out of its source recording and save it as a WAV clip.

    Reads ``{in_dir}/{sid}.wav``, slices the first channel between the
    utterance's start and end times, and writes ``{out_dir}/{sid}-{utt_id}.wav``.

    Args:
        sent: Utterance dict with keys ``sid``, ``utt_id``, ``start_time`` and
            ``end_time`` (times in seconds; they are multiplied by the sample
            rate to obtain sample offsets).
        target_sr: Required sample rate of the source file; recordings at any
            other rate are skipped.
        max_overrun: Number of samples by which the end offset may exceed the
            recording length before the utterance is rejected. Smaller
            overruns are clipped to the end of the recording by the slice.

    Returns:
        bool: True if a clip was written, False if the utterance was skipped.

    Raises:
        AssertionError: If the output path points into the source-audio tree.
    """
    utt_id = sent["utt_id"]
    sid = sent["sid"]

    in_path = f'{in_dir}/{sid}.wav'
    out_path = f'{out_dir}/{sid}-{utt_id}.wav'
    # Safety guard against overwriting source audio; checked before any I/O so
    # a misconfigured out_dir fails immediately.
    assert "prep_submission_audio" not in out_path

    waveform, sr = torchaudio.load(in_path)
    if sr != target_sr:
        return False

    num_samples = waveform.shape[1]
    start_sample = int(sent["start_time"] * sr)
    end_sample = int(sent["end_time"] * sr)

    # A negative start would be interpreted by the slice as an offset from the
    # END of the recording (wrong audio); an empty or out-of-range span would
    # produce a zero-length clip.
    if start_sample < 0 or start_sample >= num_samples or end_sample <= start_sample:
        return False
    if end_sample - num_samples > max_overrun:
        return False

    sf.write(out_path, waveform[0].numpy()[start_sample:end_sample], samplerate=sr)

    return True

# Export every retained utterance on the worker pool; the resulting boolean
# Series marks the rows whose clip was written.
is_success = tmp_metadata["sent"].parallel_apply(get_audio)

In [None]:
# Write the utterances whose clips were exported, one JSON object per line.
path = "/data/codes/apa/kaldi/stt/data/stt-data/jsonl/info_question_type-9_19092023_21122023.jsonl"
jsonl_data = tmp_metadata.loc[is_success, "sent"].tolist()

with open(path, "w", encoding="utf-8") as f:
    f.writelines(f'{json.dumps(record)}\n' for record in jsonl_data)