In [None]:
%cd /data/codes/prep_gopt/egs/librispeech/s5/
from pandarallel import pandarallel
from asr import Whisper_STT
from glob import glob
import soundfile as sf
import pandas as pd
import librosa
import random
import json
import re
import os

# Set up pandarallel with 5 workers and progress bars enabled; the
# `parallel_apply` calls in later cells rely on this initialization.
pandarallel.initialize(nb_workers=5, progress_bar=True)

In [None]:
# Directory holding the audio files; they are looked up as <audio_dir>/<id>.wav.
audio_dir = "/data/audio_data/prep_submission_audio/10"
# CSV whose `id` column drives the rest of the notebook
# (presumably one row per submission for question type 10 -- TODO confirm).
metadata_path="/data/audio_data/pronunciation_scoring_result/info_question_type-10_01082022_18092023.csv"
metadata = pd.read_csv(metadata_path)
# Preview the first two rows.
metadata.head(2)

In [None]:
def check_audio_is_exist(audio_id):
    """Tell whether the recording for `audio_id` is present on disk.

    The file is looked up as ``<audio_dir>/<audio_id>.wav``, where ``audio_dir``
    is the notebook-level audio directory.

    Args:
        audio_id: Identifier used as the wav file's base name.

    Returns:
        bool: True when that path exists, False otherwise.
    """
    wav_path = os.path.join(audio_dir, f'{audio_id}.wav')
    return os.path.exists(wav_path)

# Flag every metadata row whose <id>.wav is present in audio_dir.
metadata["is_exist"] = metadata.id.parallel_apply(check_audio_is_exist)
print(metadata.shape)

# Keep only the rows that have audio and renumber them 0..n-1.
# On the first run the previous row labels are preserved in an "index" column,
# exactly as before. When the cell is re-run that column already exists, so the
# labels are dropped instead of piling up extra index columns (which made the
# original in-place reset_index non-idempotent and eventually fail).
metadata = metadata[metadata["is_exist"]].reset_index(
    drop=("index" in metadata.columns)
)
print(metadata.shape)

In [None]:
def normalize(text):
    """Normalize a text fragment for lexicon lookup.

    Every character from a fixed punctuation set
    (``! @ # $ % ^ & * ( ) \\ . ' " , ? ; : + - _ / | ~ `` and the backtick) is
    replaced by a space, runs of whitespace are collapsed to a single space, and
    the result is upper-cased and stripped.

    Args:
        text: Input string.

    Returns:
        str: The normalized, upper-case string.
    """
    # Raw strings: the patterns used to be plain literals full of invalid escape
    # sequences (``\!``, ``\(``, ``\s`` ...), which Python now warns about. The
    # set of matched characters is unchanged.
    text = re.sub(r'[!@#$%^&*()\\.\'",?;:+\-_/|~`]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.upper().strip()

def load_lexicon(path="resources/lexicon.txt"):
    """Read a pronunciation lexicon into a word -> pronunciations mapping.

    Each non-blank line is split on whitespace: the first token is the word and
    the remaining tokens, re-joined with single spaces, form one pronunciation.
    A word that appears on several lines collects all of its pronunciations.

    Args:
        path: Location of the UTF-8 encoded lexicon text file.

    Returns:
        dict: Maps each word (str) to the set of its distinct pronunciation
        strings.
    """
    lexicon = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only line: nothing to index. Without this
                # guard tokens[0] raises IndexError.
                continue
            word, arpabet = tokens[0], " ".join(tokens[1:])
            # Collect straight into a set instead of a list converted later.
            lexicon.setdefault(word, set()).add(arpabet)
    return lexicon

# Notebook-level lexicon (word -> set of pronunciations) consulted by is_valid_word.
lexicon = load_lexicon(path="/data/codes/prep_gopt/egs/librispeech/s5/data/lexicon.txt")

In [None]:
def is_valid_word(word):
    """Check a word entry against the notebook-level lexicon.

    Args:
        word: Mapping with at least a "text" key; "trans_arpabet" is read only
            when the normalized text is found in the lexicon.

    Returns:
        bool: True when the normalized text is a lexicon entry and the word's
        "trans_arpabet" is one of the pronunciations stored for it.
    """
    key = normalize(word["text"])
    # Short-circuit keeps the lookup order: membership first, pronunciation second.
    return key in lexicon and word["trans_arpabet"] in lexicon[key]

In [None]:
def parse_sentence(sentence):
    """Convert a list of raw word entries into one sentence record.

    Args:
        sentence: Sequence of word mappings, each providing "text",
            "trans_arpabet", "start_time", "end_time" and "nativeness_score".

    Returns:
        dict | None: A record with "start_time" (from the first word),
        "end_time" (from the last word), "text" (word texts joined by single
        spaces) and "words" (one dict per word with "text", "arpabet",
        "start_time", "end_time", "score"). None is returned when the sentence
        has no words or when any word fails is_valid_word.
    """
    if not sentence:
        # No words at all: there is nothing to build, and sentence[0] below
        # would raise IndexError.
        return None

    start_time, end_time = sentence[0]["start_time"], sentence[-1]["end_time"]

    words, text = [], []
    for word in sentence:
        # A single invalid word invalidates the whole sentence.
        if not is_valid_word(word):
            return None

        text.append(word["text"])
        words.append(
            {
                'text': word["text"],
                'arpabet': word["trans_arpabet"],
                'start_time': word["start_time"],
                'end_time': word["end_time"],
                'score': word["nativeness_score"]
            }
        )

    # Return a fresh dict rather than rebinding the `sentence` parameter.
    return {
        "start_time": start_time,
        "end_time": end_time,
        "text": " ".join(text),
        "words": words,
    }

In [None]:
def preprocess_data(json_path):
    """Load one JSON result file and return its parsed sentences.

    The file must hold exactly one entry under "utterance"; its "words" list is
    passed to parse_sentence. Each parsed sentence gets "utt_id" and "id" set to
    the file's base name up to the first dot.

    This is deliberately best-effort: an unreadable file, malformed JSON, an
    unexpected structure or any other ordinary error yields an empty list
    instead of propagating.

    Args:
        json_path: Path to the JSON file.

    Returns:
        list: The parsed sentence dicts (possibly empty).
    """
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            raw_sample = json.load(f)

        utterances = raw_sample["utterance"]
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if len(utterances) != 1:
            return []

        sample_id = os.path.basename(json_path).split(".")[0]

        sentences = []
        for utterance in utterances:
            parsed_sent = parse_sentence(utterance["words"])
            if parsed_sent is None:
                continue

            parsed_sent["utt_id"] = sample_id
            parsed_sent["id"] = sample_id
            sentences.append(parsed_sent)

        return sentences

    except Exception:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still stop a long run.
        return []

# Directory searched for one <id>.json file per metadata row.
json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/10"
# Run preprocess_data for every metadata id in parallel. Each element of `tmp`
# is the list it returned (empty when the file could not be parsed or no
# sentence was valid).
tmp = metadata.id.parallel_apply(lambda x: preprocess_data(os.path.join(json_dir, f'{x}.json')))
# Serial variant on the first 10 rows, kept for debugging:
# tmp = metadata.head(10).id.apply(lambda x: preprocess_data(os.path.join(json_dir, f'{x}.json')))


In [None]:
# Number of parsed sentences per row of `tmp`.
count = tmp.apply(len)
# Shape of the subset of rows that produced at least one sentence.
tmp.loc[count != 0].shape

In [None]:
# Per-row sentence counts; a length is never negative, so "> 0" selects the
# same rows as "!= 0".
count = tmp.apply(len)
# Display the rows that yielded at least one parsed sentence.
tmp[count > 0]

In [None]:
# Accumulate the duration (end_time - start_time) of every parsed sentence.
# Iterate the Series values directly: the previous loop took labels from
# `tmp.index` and fed them to the positional `.iloc`, which is only correct
# while the index happens to be exactly 0..n-1.
total_time = 0
for sents in tmp:
    for sent in sents:
        total_time += (sent["end_time"] - sent["start_time"])

# Displayed as hours, assuming the timestamps are in seconds -- TODO confirm.
total_time / 3600

In [None]:
# One row per parsed sentence; rows whose list was empty become NaN.
tmp_metadata = tmp.explode()

# Build the single-column frame, discard the NaN rows, and serialize every
# sentence dict to a JSON string (non-ASCII characters kept as-is).
df = (
    pd.DataFrame(tmp_metadata.values, columns=['sent'])
    .dropna()
    .assign(
        sent=lambda frame: frame["sent"].apply(
            lambda x: json.dumps(x, ensure_ascii=False)
        )
    )
)

In [None]:
# Write the serialized sentences to disk. No `index=` argument is given, so the
# pandas default applies and the DataFrame index is written as the first column.
df.to_csv("/data/codes/prep_gopt/egs/librispeech/s5/data/stt/raw/metadata_type_10.csv")

In [None]:
# from tqdm import tqdm 

# question_id = tmp_metadata.question_id.value_counts()
# filtered_datas = []
# for index in tqdm(question_id.index):
#     if tmp_metadata[tmp_metadata.question_id == index].shape[0] > 1000:
#         tmp = tmp_metadata[tmp_metadata.question_id == index][0:1000]
#     else:
#         tmp = tmp_metadata[tmp_metadata.question_id == index]
#     filtered_datas.append(tmp.sample(frac=1)[0:1000])

# filtered_metadata = pd.concat(filtered_datas)
# filtered_metadata.shape

In [None]:
# import shutil

# in_dir = '/data/audio_data/prep_submission_audio/10'
# out_dir = '/data/codes/prep_gopt/egs/librispeech/s5/data/prep/wav'
# def get_audio(sent):
#     sent = json.loads(sent)
#     utt_id = sent["utt_id"]

#     in_path = f'{in_dir}/{sent["id"]}.wav'
#     out_path = f'{out_dir}/{utt_id}.wav'

#     shutil.copyfile(in_path, out_path)

# df.sent.parallel_apply(get_audio)