In [None]:
%cd /data/codes/prep_ps_pykaldi

import pandas as pd
import os
from glob import glob
import json
from pandarallel import pandarallel
import random
import re

# Set up pandarallel with 10 workers and a progress bar; the `.parallel_apply`
# calls in later cells depend on this having run first.
pandarallel.initialize(nb_workers=10, progress_bar=True)

In [None]:
# Where the .wav files live, and the CSV describing them (one row per audio id).
audio_dir = "/data/audio_data/prep_submission_audio/9"
metadata_path = "/data/audio_data/pronunciation_scoring_result/merged_info/info_question_type-9_01082022_18092023.csv"

metadata = pd.read_csv(metadata_path)
metadata.head(2)

In [None]:
def check_audio_is_exist(audio_id):
    """Return True when ``<audio_dir>/<audio_id>.wav`` exists on disk, else False.

    ``audio_dir`` is the module-level directory defined in the preceding cell.
    """
    wav_path = os.path.join(audio_dir, f'{audio_id}.wav')
    return os.path.exists(wav_path)

# Flag every row whose .wav file is present, then keep only the flagged rows.
metadata["is_exist"] = metadata["id"].parallel_apply(check_audio_is_exist)
metadata = metadata.loc[metadata["is_exist"]]

In [None]:
lexicon_path = "/data/codes/prep_ps_pykaldi/resources/lexicon.txt"
# The file is read as two tab-separated columns: the word and one pronunciation.
# keep_default_na=False + dtype=str: with pandas' defaults, words such as "NULL",
# "NA" or "NAN" are parsed as missing values and groupby then silently drops
# them, so those words would never be found in the lexicon.
vocab = pd.read_csv(
    lexicon_path,
    sep="\t",
    names=["word", "arpa"],
    dtype=str,
    keep_default_na=False,
)
# word -> list of every pronunciation listed for it (a word may appear on several lines).
lexicon = {word: group["arpa"].tolist() for word, group in vocab.groupby("word")}

In [None]:
def is_valid_phoneme(phoneme):
    """Decide whether a phoneme record's error label is acceptable.

    A record is valid when its ``"phoneme_error_arpabet"`` field is ``"normal"``,
    or when the part after the last ``" - "`` in that field is a single
    space-free token. It is invalid when that part holds two or more
    space-separated tokens.
    """
    error_label = phoneme["phoneme_error_arpabet"]
    if error_label == "normal":
        return True

    # Text after the final " - " separator.
    replacement = error_label.split(" - ")[-1]
    return len(replacement.split(" ")) < 2

def norm_text(text):
    """Normalise text for lexicon lookup.

    Replaces each of ``, . ! ? : ;`` with a space, collapses every run of
    whitespace to a single space, strips both ends and upper-cases the result.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalised, upper-cased text.
    """
    # Raw strings: "\s" in a normal string literal is an invalid escape sequence
    # (a warning today, an error in a future Python version).
    text = re.sub(r"[,.!?:;]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.upper()
            
def parse_metadata_data(json_path):
    """Read one marking-result JSON file and flatten it into per-utterance records.

    Parameters
    ----------
    json_path : str
        Path of the JSON file. The file name up to its first "." becomes the
        ``id`` of every record produced from it.

    Returns
    -------
    list[dict]
        One dict per kept utterance with the keys ``id``, ``text``, ``utt_id``,
        ``start_time``, ``end_time``, ``arpas``, ``trans``, ``phone_scores``,
        ``word_scores``, ``word_ids`` and ``utterance_scores``. The per-phoneme
        lists (``arpas``, ``trans``, ``phone_scores``, ``word_ids``) are
        parallel; ``word_ids`` holds the position of the owning word.

        An utterance is dropped entirely when any of its words is missing from
        the module-level ``lexicon`` or has a ``trans_arpabet`` that is not one
        of the pronunciations listed there.

        Best-effort: any ordinary error (unreadable file, invalid JSON, missing
        field, ...) makes the whole file yield ``[]``.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(json_path, "r", encoding="utf-8") as f:
            content = json.load(f)
        sample_id = os.path.basename(json_path).split(".")[0]

        utterances = []
        for raw_utterance in content["utterances"]:
            result = raw_utterance["result"]
            utterance = {
                "id": sample_id,
                "text": norm_text(raw_utterance["text"]),
                "utt_id": raw_utterance["utterance_id"],
                "start_time": raw_utterance["start_time"],
                "end_time": raw_utterance["end_time"],
                "arpas": [],
                "trans": [],
                "phone_scores": [],
                "word_scores": [],
                "word_ids": [],
                "utterance_scores": result["nativeness_score"],
            }

            for word_id, word in enumerate(result["words"]):
                # Skip the whole utterance if the word, or the pronunciation
                # recorded for it, is not in the lexicon.
                pronunciations = lexicon.get(norm_text(word["word"]))
                if pronunciations is None or word["trans_arpabet"] not in pronunciations:
                    break

                for phoneme in word["phonemes"]:
                    # NOTE(review): "arpas" and "trans" are both filled from
                    # phoneme["trans_arpabet"], so the two lists are always
                    # identical — confirm whether one of them was meant to
                    # come from a different field.
                    arpa = phoneme["trans_arpabet"]
                    tran = phoneme["trans_arpabet"]

                    # Clamp non-positive scores to 0.
                    score = phoneme["nativeness_score"]
                    if score <= 0:
                        score = 0

                    utterance["arpas"].append(arpa)
                    utterance["phone_scores"].append(score)
                    utterance["word_ids"].append(word_id)
                    utterance["trans"].append(tran)

                utterance["word_scores"].append(word["nativeness_score"])
            else:
                # Reached only when no word triggered the `break` above.
                utterances.append(utterance)

        return utterances

    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` — that also
        # swallowed KeyboardInterrupt/SystemExit and made the run unstoppable.
        return []

json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/9"

# Parse `<json_dir>/<id>.json` for every remaining metadata row, in parallel.
# Each element of the result is the list returned by parse_metadata_data.
extracted_data = metadata["id"].parallel_apply(
    lambda audio_id: parse_metadata_data(os.path.join(json_dir, f"{audio_id}.json"))
)
extracted_data.head()

In [None]:
# One row per utterance dict: explode the per-file lists, drop the NaN rows left
# by empty lists, and keep only the first utterance for each distinct text.
data = (
    pd.DataFrame({"data": extracted_data.explode().reset_index()["id"]})
    .dropna()
    .assign(text=lambda frame: frame["data"].apply(lambda utt: utt["text"]))
    .drop_duplicates("text")
    .reset_index()[["data"]]
)
data.head()

In [None]:
# Distribution of word-level scores over every utterance.
scores = [
    word_score
    for utterance in data["data"]
    for word_score in utterance["word_scores"]
]

pd.DataFrame(scores, columns=["score"])["score"].hist(bins=100)

In [None]:
# Distribution of phone-level scores over every utterance.
scores = [
    phone_score
    for utterance in data["data"]
    for phone_score in utterance["phone_scores"]
]

pd.DataFrame(scores, columns=["score"])["score"].hist(bins=100)

In [None]:
import numpy as np

def check_valid(score, threshold=0.35, score_threshold=80):
    """Decide whether a sample is kept, favouring samples with many low scores.

    Parameters
    ----------
    score : sequence of numbers
        Scores of one sample.
    threshold : float, default 0.35
        The sample is always kept when the fraction of low scores is strictly
        greater than this value.
    score_threshold : number, default 80
        A score strictly below this value counts as low.

    Returns
    -------
    bool
        ``True`` when the low-score fraction exceeds ``threshold``. Otherwise
        ``True`` with probability 10/101 (``random.randint(0, 100) > 90``) and
        ``False`` the rest of the time. An empty ``score`` returns ``False``.
    """
    score = np.asarray(score)

    # No scores: the fraction below would be 0/0 (previously a ZeroDivisionError).
    if score.size == 0:
        return False

    low_rate = np.mean(score < score_threshold)
    if low_rate > threshold:
        return True

    # Keep a small random share of the remaining, mostly high-scoring samples.
    return random.randint(0, 100) > 90

# check_valid keeps some samples at random; seed the generator so a
# Restart & Run All selects the same samples every time.
random.seed(42)
is_valid = data["data"].apply(lambda x: check_valid(x["phone_scores"]))

In [None]:
# Distribution of phone-level scores, restricted to the selected samples.
scores = [
    phone_score
    for utterance in data.loc[is_valid, "data"]
    for phone_score in utterance["phone_scores"]
]

pd.DataFrame(scores, columns=["score"])["score"].hist(bins=100)

In [None]:
from tqdm import tqdm
 
extracted_data = data[is_valid]
path = "prep_data/jsonl/info_question_type-9_01082022_18092023.jsonl"

# The path is relative to the working directory; create the output directory
# if needed, otherwise open() fails with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(path), exist_ok=True)

# One JSON object per line.
with open(path, "w", encoding="utf-8") as f:
    for sample in tqdm(extracted_data["data"]):
        f.write(f'{json.dumps(sample)}\n')

In [None]:
# Positional access: the index still carries the pre-filter labels, so a
# label lookup such as [5] raises KeyError whenever that row was filtered out.
extracted_data["data"].iloc[0].keys()