In [21]:
import pandas as pd
import os
from glob import glob
import json
from pandarallel import pandarallel
from tqdm import tqdm
import torchaudio
import random
import re

# Set up pandarallel with 8 workers and a progress bar; this enables the
# `.parallel_apply` calls used in later cells.
pandarallel.initialize(nb_workers=8, progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [22]:
# Maps a question-type key to the locations of its inputs: the directory of
# per-sample marking JSON files, the directory of audio files, and the
# metadata CSV.
# NOTE(review): the only key is 12, yet every path below refers to type 10
# ("marking-data/10", "apa-type-10", "question_type-10") — confirm the key.
type2path = {
    12: {
        "json_dir": "/data/metadata/apa-en/marking-data/10",
        "audio_dir": "/data/audio/prep-submission-audio/apa-type-10",
        "metadata_path": "/data/metadata/stt-en/raw/vad-filtered-info_question_type-10_01082022_18092023.csv"
    },
}


In [23]:
# Key into `type2path` selecting which dataset this run processes.
_type_ = 12

# Directory that receives every generated file, and the path of the raw
# JSONL dump of parsed utterances.
data_dir = "/data/codes/sb-apa/data/scoring"
out_raw_json_path = f'{data_dir}/train-data-type-10.jsonl'

In [24]:
# Input locations for the selected question type.
path_dict = type2path[_type_]

# Shared settings read as a global by the helper functions defined in later
# cells ("audio_dir", "json_dir") and by the save step ("out_jsonl_path").
hparams = {
    "json_dir": path_dict["json_dir"],
    "audio_dir": path_dict["audio_dir"],
    "metadata_path": path_dict["metadata_path"],
    "out_jsonl_path": out_raw_json_path
}

# Load the per-sample metadata table and preview its first two rows.
metadata = pd.read_csv(hparams["metadata_path"])
metadata.head(2)

Unnamed: 0,id,is_deleted,user_id,question_id,question_type,question_content,url,score,fidelity_class,created_at,total_time,word_count
0,5580000,0,52077.0,66902,10,statistics,https://storage.googleapis.com/materials-eleme...,90.0,RELEVANT,2023-09-18 21:17:11,2.63,1.0
1,5580001,0,88226.0,26144,10,Seat,https://storage.googleapis.com/materials-eleme...,53.0,RELEVANT,2023-09-18 21:17:11,2.45,1.0


In [25]:
def is_valid_audio(audio_id):
    """Check that the audio for ``audio_id`` exists, can be loaded, and is 16 kHz.

    The file is looked up as ``<hparams["audio_dir"]>/<audio_id>.wav``.

    Args:
        audio_id: Identifier used as the file stem of the wav file.

    Returns:
        bool: True only when the file exists, ``torchaudio.load`` succeeds and
        the reported sample rate is 16000; False otherwise.
    """
    abs_path = os.path.join(hparams["audio_dir"], f'{audio_id}.wav')
    if not os.path.exists(abs_path):
        return False

    try:
        _, sr = torchaudio.load(abs_path)
    except Exception:
        # Unreadable or corrupt audio is treated as invalid. `Exception`
        # (rather than a bare `except`) lets KeyboardInterrupt / SystemExit
        # propagate so a long parallel run can still be interrupted.
        return False

    return sr == 16000

# Validate every sample's audio in parallel, then keep only the rows whose
# audio passed; the shape is printed before and after to show how many rows
# were dropped.
is_exist =  metadata.id.parallel_apply(is_valid_audio)
print(metadata.shape)
metadata = metadata[is_exist]
print(metadata.shape)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=136984), Label(value='0 / 136984')…

(1095866, 12)
(1095866, 12)


In [26]:
def filter_data(data, min_total_time=1.0, max_total_time=6.0, max_word_count=16):
    """Keep rows whose duration and word count fall inside the given bounds.

    All bounds are exclusive. Rows whose ``total_time`` or ``word_count`` is
    NaN fail the comparisons and are therefore dropped.

    Args:
        data: DataFrame with ``total_time`` and ``word_count`` columns.
        min_total_time: Rows must have ``total_time`` strictly above this.
        max_total_time: Rows must have ``total_time`` strictly below this.
        max_word_count: Rows must have ``word_count`` strictly below this.

    Returns:
        The filtered DataFrame; the original index labels are preserved.
    """
    print(f'### shape before filtering: {data.shape}')
    keep = (
        (data.total_time > min_total_time)
        & (data.total_time < max_total_time)
        & (data.word_count < max_word_count)
    )
    data = data[keep]
    print(f'### shape after filtering: {data.shape}')
    return data

# Apply the duration / word-count filter; this rebinds `metadata` to the filtered frame.
metadata = filter_data(metadata)

### shape before filtering: (1095866, 12)


### shape after filtering: (1055250, 12)


In [27]:
def norm_text(text):
    """Normalise a text string for use as a label.

    The punctuation marks ``, . ! ? : ;`` are replaced by spaces, runs of
    whitespace are collapsed to one space, surrounding whitespace is stripped
    and the result is upper-cased.

    Args:
        text: Input string.

    Returns:
        str: The normalised, upper-cased text.
    """
    # Raw strings: "\s" in a plain string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub(r"[,.!?:;]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text.upper()

def is_valid_phoneme(phoneme):
    """Tell whether a phoneme entry's error annotation is simple enough to keep.

    An entry whose ``"phoneme_error_arpabet"`` is ``"normal"`` is always valid.
    Otherwise the annotation is split on ``" - "``; the entry is rejected when
    either the first or the last piece contains a space (i.e. holds two or
    more space-separated tokens).

    Args:
        phoneme: Mapping with a ``"phoneme_error_arpabet"`` string.

    Returns:
        bool: True when the entry is acceptable, False otherwise.
    """
    annotation = phoneme["phoneme_error_arpabet"]
    if annotation == "normal":
        return True

    pieces = annotation.split(" - ")
    first_piece, last_piece = pieces[0], pieces[-1]
    return all(len(piece.split(" ")) < 2 for piece in (first_piece, last_piece))

def is_valid_word(word):
    """Check that a word has exactly one phoneme entry per transcription token.

    Args:
        word: Mapping with a ``"phonemes"`` sequence and a whitespace-separated
            ``"trans_arpabet"`` string.

    Returns:
        bool: True when the number of phoneme entries equals the number of
        tokens in ``"trans_arpabet"``.
    """
    n_phonemes = len(word["phonemes"])
    n_tokens = len(word["trans_arpabet"].split())
    return n_phonemes == n_tokens
            
def parse_json_file(json_path):
    """Parse one marking JSON file into a list of flat utterance records.

    The sample id is the file name up to its first ``"."``; the audio path is
    ``<hparams["audio_dir"]>/<id>.wav``. For every entry of
    ``content["utterance"]`` the word- and phoneme-level fields are flattened
    into parallel lists. An utterance is skipped as soon as one of its words
    fails ``is_valid_word`` or one of its phonemes fails ``is_valid_phoneme``.

    Args:
        json_path: Path of the marking JSON file.

    Returns:
        list[dict]: One record per accepted utterance, or ``[]`` when the file
        cannot be read or parsed (missing file, malformed JSON, missing key,
        unknown decision label, ...).
    """
    # Phoneme decision label -> integer class written to "decisions".
    decision2color = {
        "correct": 2,
        "warning": 1,
        "error": 0
    }

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            content = json.load(f)
        sample_id = os.path.basename(json_path).split(".")[0]
        audio_path = os.path.join(hparams["audio_dir"], f'{sample_id}.wav')

        utterances = []
        for raw_utterance in content["utterance"]:
            utt_raw_text = raw_utterance["sentence"]
            utt_score = raw_utterance["nativeness_score"]

            utt_text = []
            utt_arpas = []
            utt_trans = []
            utt_phone_scores = []
            utt_decisions = []
            utt_word_scores = []
            utt_word_ids = []
            utt_rel_pos = []

            ignore = False
            for word_id, word in enumerate(raw_utterance["words"]):
                # Rewritten in place so is_valid_word sees the same string
                # that is split into `wrd_arpa` below.
                word["trans_arpabet"] = word["trans_arpabet"].replace("AH0", "AX")

                wrd_score = word["nativeness_score"]
                wrd_text = norm_text(word["text"])
                wrd_arpa = word["trans_arpabet"].split()

                if not is_valid_word(word):
                    ignore = True
                    break

                last_index = len(word["phonemes"]) - 1
                for index, phoneme in enumerate(word["phonemes"]):
                    if not is_valid_phoneme(phoneme):
                        ignore = True
                        break

                    # Position of the phoneme inside its word:
                    # 1 = first, 3 = last, 2 = anything in between
                    # (a single-phoneme word gets 1).
                    if index == 0:
                        rel_pos = 1
                    elif index == last_index:
                        rel_pos = 3
                    else:
                        rel_pos = 2

                    decision = decision2color[phoneme["decision"]]
                    # Negative scores are clamped to 0.
                    score = phoneme["nativeness_score"] if phoneme["nativeness_score"] >= 0 else 0
                    tran = phoneme["trans_arpabet"]
                    if tran == "SCHWA":
                        tran = "AX"

                    utt_phone_scores.append(score)
                    utt_word_ids.append(word_id)
                    utt_trans.append(tran)
                    utt_decisions.append(decision)
                    utt_rel_pos.append(rel_pos)

                if ignore:
                    # The utterance is already rejected: do not parse its
                    # remaining words (the inner `break` only left the
                    # phoneme loop).
                    break

                utt_text.append(wrd_text)
                utt_word_scores.append(wrd_score)
                utt_arpas.extend(wrd_arpa)

            if ignore:
                continue

            utterances.append({
                "id": sample_id,
                "raw_text": utt_raw_text,
                "text": " ".join(utt_text),
                # Placeholder fields: always written with these constants.
                "utt_id": None,
                "start_time": None,
                "end_time": None,
                "arpas": utt_arpas,
                "trans": utt_trans,
                "phone_scores": utt_phone_scores,
                "word_scores": utt_word_scores,
                "decisions": utt_decisions,
                "word_ids": utt_word_ids,
                "rel_pos": utt_rel_pos,
                "utterance_score": utt_score,
                "intonation_score": 0,
                "fluency_score": 0,
                "audio_path": audio_path
            })

        return utterances
    except Exception:
        # Best effort: any unreadable or malformed file yields no records.
        # `Exception` (not a bare `except`) keeps KeyboardInterrupt and
        # SystemExit from being swallowed.
        return []

# Parse each remaining sample's marking JSON in parallel. Every element of the
# resulting Series is the list returned by parse_json_file (possibly empty).
extracted_data = metadata.id.parallel_apply(
    lambda x: parse_json_file(os.path.join(hparams["json_dir"], f'{x}.json')))
extracted_data.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=131907), Label(value='0 / 131907')…

0    [{'id': '5580000', 'raw_text': 'statistics', '...
1    [{'id': '5580001', 'raw_text': 'Seat', 'text':...
2                                                   []
3    [{'id': '5580004', 'raw_text': 'School', 'text...
5    [{'id': '5580008', 'raw_text': 'precision', 't...
Name: id, dtype: object

In [28]:
def save_jsonl_data_col_level(data, path):
    """Write the values of ``data`` to ``path`` as JSON Lines (one JSON value per line).

    Args:
        data: A pandas Series of JSON-serialisable values (one line per
            element), or any object with ``to_dict()`` whose dict values are
            written one per line.
        path: Destination file; it is overwritten.
    """
    if isinstance(data, pd.Series):
        # Iterate the values directly. Going through `to_dict()` keys the
        # values by index label, so elements that share a label (as produced
        # by `Series.explode()`) would overwrite each other and be lost.
        samples = data.tolist()
    else:
        samples = list(data.to_dict().values())

    with open(path, "w", encoding="utf-8") as f:
        for sample in tqdm(samples):
            f.write(f'{json.dumps(sample)}\n')
    print('saved data to: ', path)

# Flatten the per-file lists into one utterance record per row. `explode`
# turns an empty list into NaN, which `dropna` then removes. Note that
# exploded rows keep the index label of the row they came from.
data = extracted_data.explode().dropna()
save_jsonl_data_col_level(data=data, path=hparams["out_jsonl_path"])

889735it [00:04, 209991.57it/s]


saved data to:  /data/codes/sb-apa/data/scoring/train-data-type-10.jsonl


#### Prepare scoring data: load the raw JSONL dump, cap samples per text, split, and export train/test/val JSON

In [29]:
import json
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel
import librosa

# Same call as in the first cell of the notebook, repeated here so this
# section can be run on its own.
pandarallel.initialize(nb_workers=8, progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [30]:
def load_jsonl_data(path):
    """Load a JSON Lines file into a DataFrame, one row per non-blank line.

    Args:
        path: Path of a UTF-8 text file holding one JSON object per line.

    Returns:
        pandas.DataFrame built from the decoded objects.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        # Stream the file instead of materialising every line first, and skip
        # blank lines (e.g. a trailing newline) instead of failing on them.
        records = [json.loads(line) for line in f if line.strip()]

    return pd.DataFrame(records)

In [31]:
# Raw utterance dump written by the first half of the notebook (same location
# as `out_raw_json_path`, spelled out again here).
path = "/data/codes/sb-apa/data/scoring/train-data-type-10.jsonl"

# NOTE: rebinds `metadata`; from here on it holds one row per parsed utterance.
metadata = load_jsonl_data(path)
metadata.head(1)

Unnamed: 0,id,raw_text,text,utt_id,start_time,end_time,arpas,trans,phone_scores,word_scores,decisions,word_ids,rel_pos,utterance_score,intonation_score,fluency_score,audio_path
0,5580000,statistics,STATISTICS,,,,"[S, T, AX, T, IH1, S, T, IH0, K, S]","[S, T, AX, T, IH, S, T, IH, K, S]","[100, 100, 14.000000000000002, 94, 98, 96, 98,...",[90],"[2, 2, 0, 2, 2, 2, 2, 2, 2, 2]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[1, 2, 2, 2, 2, 2, 2, 2, 2, 3]",90,0,0,/data/audio/prep-submission-audio/apa-type-10/...


In [32]:
def filter_data_with_text(data, text_label="text", n_sample_per_question_id=268, random_state=None):
    """Cap the number of rows kept for each distinct value of ``text_label``.

    Groups with at least ``n_sample_per_question_id`` rows are randomly
    down-sampled to exactly that many; smaller groups are kept whole. Rows
    whose ``text_label`` is NaN do not belong to any group and are dropped.

    Args:
        data: Input DataFrame.
        text_label: Column to group by.
        n_sample_per_question_id: Maximum number of rows kept per group.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection is reproducible; ``None`` keeps it random.

    Returns:
        DataFrame with the kept rows, grouped by ``text_label`` value
        (original index labels preserved). An empty input yields an empty
        frame with the same columns.
    """
    print(f'### shape before filtering: {data.shape}')
    kept_groups = []
    for _, group in data.groupby(text_label):
        if group.shape[0] >= n_sample_per_question_id:
            group = group.sample(n_sample_per_question_id, random_state=random_state)
        kept_groups.append(group)

    if kept_groups:
        filtered_data = pd.concat(kept_groups)
    else:
        # pd.concat raises on an empty list; return an empty frame instead.
        filtered_data = data.iloc[0:0]
    print(f'### shape after filtering: {filtered_data.shape}')
    return filtered_data

# Keep at most 512 utterances per distinct normalised text (overrides the
# function's default cap of 268).
metadata = filter_data_with_text(
    data=metadata, text_label="text",
    n_sample_per_question_id=512
)

### shape before filtering: (889735, 17)
### shape after filtering: (166801, 17)


In [33]:
import re

def convert_to_phone_pure(arpas):
    """Strip digits from each phone and join them into a lower-case string.

    Args:
        arpas: Iterable of phone strings, e.g. ``["IH1", "K", "S"]``.

    Returns:
        str: The phones with every digit removed, separated by single spaces
        and lower-cased, e.g. ``"ih k s"``.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape sequence.
    return " ".join(re.sub(r"\d", "", phone) for phone in arpas).lower()

In [34]:
def get_duration(path):
    """Return the length in seconds of the audio at ``path``.

    The file is loaded through ``librosa.load`` at a 16 kHz sample rate and
    the duration is the number of loaded samples divided by that rate.
    """
    samples, sample_rate = librosa.load(path, sr=16000)
    n_samples = samples.shape[0]
    return n_samples / sample_rate

# Add a per-utterance duration column (seconds), computed in parallel.
metadata["duration"]= metadata.audio_path.parallel_apply(get_duration)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=20851), Label(value='0 / 20851')))…

In [35]:
# Expand word-level scores to phoneme level: one entry per element of
# `word_ids`, each being the score of the word that phoneme belongs to.
# WARNING: this overwrites `word_scores` using its own previous value, so the
# cell is not idempotent — run it exactly once after loading `metadata`.
metadata["word_scores"] = metadata.apply(lambda row: [row["word_scores"][index] for index in row["word_ids"]], axis=1)

In [46]:
from sklearn.model_selection import train_test_split

# 90% train / 10% held out; `val_metadata` temporarily holds the whole
# held-out part and is then split in half into test and validation.
train_metadata, val_metadata = train_test_split(metadata, test_size=0.1, random_state=42)
test_metadata, val_metadata = train_test_split(val_metadata, test_size=0.5, random_state=42)

In [47]:
# Total audio per split in hours (the `duration` column is in seconds).
print(f'train duration: {train_metadata.duration.sum()/3600}')
print(f'test duration: {test_metadata.duration.sum()/3600}')
print(f'val duration: {val_metadata.duration.sum()/3600}')

train duration: 83.9903020486111
test duration: 4.672772170138889
val duration: 4.68741939236111


In [48]:
# Number of complete groups of 16 rows in the test split (the same quantity
# convert_df_to_dict prints).
test_metadata.shape[0] // 16

521

In [45]:
def convert_df_to_dict(metadata, max_length=32, batch_size=16, random_state=None):
    """Convert utterance rows into a dict of string-encoded samples keyed by id.

    The frame is first randomly reduced to the largest multiple of
    ``batch_size`` rows. Rows with more than ``max_length`` phone scores are
    then skipped, and only the first row seen for a given ``id`` is kept.
    Every score is divided by 50 and stored as a string; list-valued fields
    are stored as space-separated strings.

    NOTE(review): rows skipped after sampling mean the returned dict is not
    guaranteed to hold a multiple of ``batch_size`` samples — confirm whether
    that is required.
    NOTE(review): "phn" and "phn_canonical" are both derived from the
    ``arpas`` column (``trans`` is not used) and "duration" is always written
    as 0.0 even when the frame has a ``duration`` column — confirm intended.

    Args:
        metadata: DataFrame with the columns ``id``, ``audio_path``,
            ``utterance_score``, ``text``, ``arpas``, ``phone_scores``,
            ``word_scores``, ``word_ids`` and ``rel_pos``.
        max_length: Maximum number of phone scores a kept row may have.
        batch_size: The row count is truncated to a multiple of this value.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for a
            reproducible selection/order; ``None`` keeps it random.

    Returns:
        dict: ``{utterance_id: sample_dict}``.
    """
    data = {}
    total_sample = metadata.shape[0] // batch_size
    print(total_sample)
    metadata = metadata.sample(total_sample * batch_size, random_state=random_state)

    # Iterate row records rather than chained `metadata[col][index]` label
    # lookups: those return a Series instead of a scalar when index labels
    # repeat, and perform one lookup per cell.
    for row in tqdm(metadata.to_dict("records")):
        phone_scores = row["phone_scores"]
        if len(phone_scores) > max_length:
            continue

        utterance_id = row["id"]
        if utterance_id in data:
            # First occurrence wins.
            continue

        phn = convert_to_phone_pure(row["arpas"])

        data[utterance_id] = {
            "utterance_id": utterance_id,
            "wav": row["audio_path"],
            "text": row["text"].lower(),
            "spk_id": "",
            "phn": phn,
            "phn_canonical": phn,
            "phn_score": " ".join(str(score / 50) for score in phone_scores),
            "wrd_score": " ".join(str(score / 50) for score in row["word_scores"]),
            "utt_score": str(row["utterance_score"] / 50),
            # Word ids are shifted to start at 1.
            "wrd_id": " ".join(str(word_id + 1) for word_id in row["word_ids"]),
            "rel_pos": " ".join(str(position) for position in row["rel_pos"]),
            # Placeholder fields: always written with these constants.
            "duration": 0.0,
            "phn_ali": "",
            "phn_ali_start": "",
            "phn_ali_duration": ""
        }

    return data

# Convert each split to its dict form, allowing up to 48 phones per utterance.
# NOTE: this rebinds the *_metadata names from DataFrames to dicts, so the
# cell cannot be re-run without re-running the split cell first.
test_metadata = convert_df_to_dict(test_metadata, max_length=48)
train_metadata = convert_df_to_dict(train_metadata, max_length=48)
val_metadata = convert_df_to_dict(val_metadata, max_length=48)

521


100%|██████████| 521/521 [00:00<00:00, 37037.21it/s]


9382


100%|██████████| 9382/9382 [00:00<00:00, 37657.50it/s]


521


100%|██████████| 521/521 [00:00<00:00, 37531.47it/s]


In [39]:
def save_jsonl_data_row_level(data, path):
    """Serialise ``data`` to ``path`` as one indented JSON document.

    The whole object is written as a single JSON value (4-space indent,
    non-ASCII characters kept as-is) followed by a newline; the destination
    file is overwritten.

    Args:
        data: JSON-serialisable object.
        path: Destination file path.
    """
    with open(path, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
        out_file.write("\n")

    print(f'###saved jsonl data to: {path}')

# Write the three converted splits next to the raw dump in `data_dir`
# (defined in the configuration cell near the top of the notebook).
save_jsonl_data_row_level(data=train_metadata, path=f'{data_dir}/train.json')
save_jsonl_data_row_level(data=test_metadata, path=f'{data_dir}/test.json')
save_jsonl_data_row_level(data=val_metadata, path=f'{data_dir}/val.json')

###saved jsonl data to: /data/codes/sb-apa/data/scoring/train.json
###saved jsonl data to: /data/codes/sb-apa/data/scoring/test.json
###saved jsonl data to: /data/codes/sb-apa/data/scoring/val.json


In [42]:
# Sanity check: size of the validation split.
# NOTE(review): at this point in the file `val_metadata` is the dict returned
# by convert_df_to_dict, so this counts converted samples — confirm, given the
# out-of-order execution counts.
len(val_metadata)

521