In [12]:
import pandas as pd
import os
from glob import glob
import json
from pandarallel import pandarallel
import re

# Start pandarallel once, with progress bars enabled; later cells call
# `parallel_apply` on pandas objects.
PANDARALLEL_WORKERS = 10
pandarallel.initialize(nb_workers=PANDARALLEL_WORKERS, progress_bar=True)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [13]:
# Directory holding the submission recordings and the CSV describing them.
audio_dir = "/data/audio_data/prep_submission_audio"
metadata_path = "/data/audio_data/pronunciation_scoring_result/info_question_type-10_01082023_18092023.csv"

# Read the table and keep only rows that have no missing value in any column.
metadata = pd.read_csv(metadata_path).dropna()
metadata.head()

Unnamed: 0,id,is_deleted,user_id,question_id,question_type,question_content,url,score,fidelity_class,created_at,total_time,word_count
0,5580000,0,52077.0,66902,10,statistics,https://storage.googleapis.com/materials-eleme...,90.0,RELEVANT,2023-09-18 21:17:11,2.63,1.0
1,5580001,0,88226.0,26144,10,Seat,https://storage.googleapis.com/materials-eleme...,53.0,RELEVANT,2023-09-18 21:17:11,2.45,1.0
2,5580002,0,63452.0,107048,10,Keep your body in shape,https://storage.googleapis.com/materials-eleme...,67.46,RELEVANT,2023-09-18 21:17:12,3.05,5.0
3,5580003,0,199169.0,10273,10,Foggy,https://storage.googleapis.com/materials-eleme...,80.0,RELEVANT,2023-09-18 21:17:12,1.86,1.0
4,5580004,0,237799.0,23683,10,School,https://storage.googleapis.com/materials-eleme...,69.0,RELEVANT,2023-09-18 21:17:13,1.8,1.0


In [14]:
def check_audio_is_exist(audio_id):
    """Tell whether ``<audio_dir>/<audio_id>.wav`` is present on disk.

    Args:
        audio_id: Identifier used as the file stem of the recording.

    Returns:
        bool: ``True`` when the path exists, ``False`` otherwise.
    """
    wav_path = os.path.join(audio_dir, f'{audio_id}.wav')
    return os.path.exists(wav_path)

# Record, for every submission id, whether its recording was found (computed in parallel).
metadata["is_exist"] = metadata["id"].parallel_apply(check_audio_is_exist)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=200781), Label(value='0 / 200781')…

In [15]:
# Keep only submissions whose recording was found, then show the remaining size.
has_audio = metadata["is_exist"] == True
metadata = metadata.loc[has_audio]
metadata.shape

(2007666, 13)

In [16]:
def parse_elsa_data(json_path):
    """Extract word-, phoneme- and utterance-level scores from one scoring file.

    The file is decoded twice: it must contain a JSON string whose content is
    itself the JSON document with the scoring result.

    Args:
        json_path: Path of the scoring result file.

    Returns:
        A JSON string with the keys ``"words"`` (one
        ``[text, trans_arpabet, nativeness_score]`` triple per word),
        ``"phonemes"`` (the same triple per phoneme, grouped by word) and
        ``"utterance"`` (the utterance ``nativeness_score``).
        ``None`` when the file cannot be read or decoded, has no
        ``"api_version"`` key, does not hold exactly one utterance, or lacks
        one of the expected fields.
    """
    try:
        # Context manager so the handle is closed even if decoding fails.
        with open(json_path, "r", encoding="utf-8") as handle:
            raw_sample = json.load(handle)
        raw_sample = json.loads(raw_sample)

        if "api_version" not in raw_sample:
            return None

        # Explicit check instead of `assert`, which is removed under `python -O`.
        utterances = raw_sample["utterance"]
        if len(utterances) != 1:
            return None
        utterance = utterances[0]

        words, phonemes = [], []
        for word in utterance["words"]:
            words.append([word["text"], word["trans_arpabet"], word["nativeness_score"]])
            phonemes.append([
                [phoneme["text"], phoneme["trans_arpabet"], phoneme["nativeness_score"]]
                for phoneme in word["phonemes"]
            ])

        parsed = {
            "words": words,
            "phonemes": phonemes,
            "utterance": utterance["nativeness_score"],
        }

        return json.dumps(parsed, ensure_ascii=False)
    except Exception:
        # Best effort: an unreadable or malformed file yields None so the
        # caller can drop the row. `Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit propagate.
        return None

In [17]:
json_dir = "/data/audio_data/pronunciation_scoring_result/marking_data/10"

# NOTE: this replaces the `score` column read from the CSV with the JSON string
# produced by `parse_elsa_data` (None when the file could not be parsed).
metadata["score"] = metadata["id"].parallel_apply(
    lambda audio_id: parse_elsa_data(os.path.join(json_dir, f'{audio_id}.json'))
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=200767), Label(value='0 / 200767')…

In [18]:
# Show the size before and after removing rows whose score could not be parsed.
print(metadata.shape)
score_present = metadata["score"].notna()
metadata = metadata.loc[score_present]
print(metadata.shape)

(2007666, 13)
(2007307, 13)


In [19]:
# Build the word -> pronunciation lookup from a whitespace-separated lexicon
# file: the first field of each line is the word, the remaining fields are
# joined with single spaces to form its pronunciation. If a word appears on
# several lines, the last one wins.
lexicon_path = "resources/lexicon.txt"
lexicon = {}
with open(lexicon_path, "r", encoding="utf-8") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line: nothing to index (indexing fields[0] would raise IndexError).
            continue
        lexicon[fields[0]] = " ".join(fields[1:])
        
def filter_data(text, words):
    """Check that a prompt agrees, word by word, with the scored words and the lexicon.

    Args:
        text: Prompt text; it is upper-cased and split on whitespace.
        words: JSON string holding a ``"words"`` list whose items start with
            ``[word_text, transcription, ...]``.

    Returns:
        bool: ``True`` only when the prompt and the scored list have the same
        number of words and, at every position, the prompt word is a key of
        ``lexicon``, its lexicon pronunciation equals the scored
        transcription, and it equals the scored word text ignoring case.
    """
    scored_words = json.loads(words)["words"]
    prompt_words = text.upper().split()

    # zip() stops at the shorter sequence, so without this check a prompt and
    # a scored list of different lengths would pass on their common prefix.
    if len(prompt_words) != len(scored_words):
        return False

    for prompt_word, scored in zip(prompt_words, scored_words):
        if prompt_word not in lexicon:
            return False
        if lexicon[prompt_word] != scored[1]:
            return False
        # prompt_word is already upper-case.
        if prompt_word != scored[0].upper():
            return False

    return True
# Evaluate `filter_data` on every row (prompt text vs. parsed score JSON), in parallel.
selection_mask = metadata.parallel_apply(
    lambda row: filter_data(row["question_content"], row["score"]), axis=1
)
metadata["is_selected"] = selection_mask

# Keep the selected rows; reset_index() also stores the previous index in an `index` column.
metadata = metadata.loc[metadata["is_selected"] == True].reset_index()
# Keep only rows whose word_count is below 2.
metadata = metadata.loc[metadata["word_count"] < 2]

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=200731), Label(value='0 / 200731')…

In [20]:
def preprocess_text(text):
    """Replace basic punctuation with spaces and upper-case the text.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with each of ``, . ; : ! ?`` replaced by a single space,
        converted to upper case. Existing whitespace is left as is.
    """
    # Raw string: "\,", "\;", "\:", "\!" and "\?" are invalid escape sequences
    # in a normal string literal and raise a DeprecationWarning/SyntaxWarning.
    # Inside a character class none of these characters needs escaping.
    text = re.sub(r"[,.;:!?]", " ", text)
    return text.upper()
# Normalise every prompt with `preprocess_text`.
metadata["question_content"] = metadata["question_content"].apply(preprocess_text)

In [21]:
# Keep only rows whose fidelity_class is "RELEVANT".
is_relevant = metadata["fidelity_class"] == "RELEVANT"
metadata = metadata.loc[is_relevant]

In [22]:
# Write the first 100000 rows of the selected columns to CSV (the index is written too).
export_columns = ["user_id", "id", "question_content", "score"]
metadata[export_columns].iloc[0:100000].to_csv("/data/codes/prep_gopt/data/processed_data/metadata.csv")