In [17]:
import json
import os
import random
import re
import shutil

import librosa
import pandas as pd
from tqdm import tqdm

In [18]:
# Load the pronunciation lexicon: tab-separated, word in the first column and
# its space-separated phone string in the second.
lexicon_path = "/data/codes/apa/train/resources/lexicon.txt"
lexicon = pd.read_csv(
    lexicon_path,
    names=["word", "arpa"],
    sep="\t",
    dtype=str,              # keep numeric-looking entries as text
    keep_default_na=False,  # by default pandas reads words like "null", "nan", "NA" as missing
    na_values=[""],         # only genuinely empty fields count as missing
)
lexicon = lexicon.dropna()

# Keep only words that occur on exactly one lexicon line, so every remaining
# word maps to a single pronunciation. Sorting keeps the alphabetical order
# the previous groupby-based filter produced.
lexicon = lexicon[~lexicon["word"].duplicated(keep=False)].sort_values("word")

vocab = lexicon["word"].tolist()
# From here on `lexicon` is a plain dict: word -> phone string.
lexicon = lexicon.set_index("word")["arpa"].to_dict()

In [19]:
# JSON mapping that is_valid() looks phones up in.
path = "/data/codes/apa/train/resources/same_pron_arpa_dict.json"

with open(path, mode="r", encoding="utf-8") as handle:
    same_pron_arpa_dict = json.loads(handle.read())

In [20]:
# Training-set metadata in JSON Lines form: one JSON object per line.
path = "/data/codes/apa/train/prep_data/jsonl/info_qt_10_trainset.jsonl"

with open(path, mode="r", encoding="utf-8") as handle:
    lines = list(map(json.loads, handle))

In [21]:
# Select the recordings that serve as the base for augmentation.
MIN_PHONE_SCORE = 90    # every phone score must be at least this value
MAX_PHONES = 6          # skip samples with more phones than this
MIN_DURATION_SEC = 1.5  # skip recordings shorter than this
MAX_SAMPLES = 10000     # cap on the number of selected samples
SHUFFLE_SEED = 42       # fixed seed so the selection is reproducible across runs

data = []
for line in tqdm(lines):
    # Cheap metadata checks come first so audio is only decoded for samples
    # that could still be accepted.
    if len(line["raw"].split()) > 1:
        continue
    if len(line["arpas"]) > MAX_PHONES:
        continue
    # Deliberately no loop flag named `is_valid`: as a global it would replace
    # the is_valid() helper defined later if this cell were re-run.
    if any(score < MIN_PHONE_SCORE for score in line["phone_scores"]):
        continue

    wav, sr = librosa.load(line["audio_path"], sr=16000)
    if wav.shape[0] / sr < MIN_DURATION_SEC:
        continue

    data.append(line)

# A dedicated generator keeps the shuffle reproducible without touching the
# global `random` state.
random.Random(SHUFFLE_SEED).shuffle(data)
data = data[:MAX_SAMPLES]

100%|██████████| 153491/153491 [00:19<00:00, 7809.83it/s] 


In [22]:
def is_valid(arpa_1, arpa_2):
    """Check that two phone sequences share no identical or look-alike phone.

    Digits are removed from every phone before comparing (e.g. "AH1" -> "AH").

    Args:
        arpa_1: Iterable of phone strings for the reference pronunciation.
        arpa_2: Iterable of phone strings for the candidate pronunciation.

    Returns:
        False if any phone of ``arpa_1`` also occurs in ``arpa_2``, or if the
        entry of ``same_pron_arpa_dict`` for a phone of ``arpa_1`` contains a
        phone of ``arpa_2``; True otherwise.
    """
    # Raw-string pattern: "\d" in a regular string literal is an invalid
    # escape sequence.
    reference = [re.sub(r"\d", "", phone) for phone in arpa_1]
    candidate = {re.sub(r"\d", "", phone) for phone in arpa_2}

    for phone in reference:
        if phone in candidate:
            return False

        if phone not in same_pron_arpa_dict:
            continue
        look_alikes = same_pron_arpa_dict[phone]

        if any(other in look_alikes for other in candidate):
            return False

    return True

def gen_aug_data(sample, text, arpa):
    """Build the augmented record for ``sample`` with a replacement transcript.

    Args:
        sample: Source record (dict) to derive the augmented record from.
        text: Replacement transcript text.
        arpa: Phone list for the replacement transcript; stored under both
            "arpas" and "trans".

    Returns:
        A new dict whose id is the source id plus "-aug". All scores are zero.
        "raw", "utt_id", the timing fields, "decisions", "word_ids" and
        "audio_path" are taken from ``sample`` unchanged (the same objects,
        not copies).

    NOTE(review): "phone_scores" is sized from the source sample, not from
    ``arpa``, and "word_scores" always has one entry, so their lengths differ
    from the transcript when ``arpa`` holds more phones or ``text`` more
    words. Confirm downstream consumers expect this.
    """
    return {
        "id": f'{sample["id"]}-aug',
        "raw": sample["raw"],
        "text": text,
        "utt_id": sample["utt_id"],
        "start_time": sample["start_time"],
        "end_time": sample["end_time"],
        "arpas": arpa,
        "trans": arpa,
        "phone_scores": [0] * len(sample["phone_scores"]),
        "word_scores": [0],
        "utterance_scores": 0,
        "decisions": sample["decisions"],
        "word_ids": sample["word_ids"],
        # The augmented record points at the source recording.
        "audio_path": sample["audio_path"],
    }

In [23]:
# Build one augmented record per selected sample by pairing its audio with a
# transcript of random lexicon words that is_valid() accepts.
max_length = 3                # maximum number of random words per augmented transcript
MAX_DRAWS_PER_SAMPLE = 10000  # bound on random draws so a sample without usable words cannot hang the loop
AUGMENT_SEED = 42             # fixed seed so the generated transcripts are reproducible

rng = random.Random(AUGMENT_SEED)

# Group the vocabulary by phone count once. Drawing uniformly from the
# matching group is equivalent to drawing from the whole vocabulary and
# rejecting words of the wrong length, without the wasted draws.
words_by_phone_count = {}
for word in vocab:
    phones = lexicon[word].strip().split()
    words_by_phone_count.setdefault(len(phones), []).append((word, phones))

augmented_datas = []
skipped = 0

for sample in tqdm(data):
    # Rewrite "AH0" as "AX" (presumably to match the lexicon's phone
    # notation - confirm). The sample is updated in place.
    origin_arpa = ["AX" if arpa == "AH0" else arpa for arpa in sample["arpas"]]
    sample["arpas"] = origin_arpa

    # Only words with the same number of phones as the spoken word qualify.
    candidates = words_by_phone_count.get(len(origin_arpa), [])
    num_word = rng.randint(1, max_length)

    text, arpas = [], []
    draws = 0
    while candidates and len(text) < num_word and draws < MAX_DRAWS_PER_SAMPLE:
        draws += 1
        random_word, augment_arpa = rng.choice(candidates)
        if is_valid(arpa_1=origin_arpa, arpa_2=augment_arpa):
            text.append(random_word)
            arpas.extend(augment_arpa)

    if len(text) < num_word:
        # Not enough acceptable words were found within the draw budget.
        skipped += 1
        continue

    augment_data = gen_aug_data(
        sample=sample,
        text=" ".join(text),
        arpa=arpas,
    )
    augmented_datas.append(augment_data)

if skipped:
    print(f"Skipped {skipped} sample(s): no suitable replacement words found")

  2%|▏         | 190/9894 [00:00<00:05, 1898.58it/s]

100%|██████████| 9894/9894 [00:06<00:00, 1456.39it/s]


In [24]:
# Copy each source recording to a file named after the augmented id.
# NOTE(review): the records' "audio_path" still points at the source file, not
# at the copy made here - confirm this is what downstream steps expect.
AUG_WAV_DIR = "/data/codes/apa/train/prep_data/wav"
os.makedirs(AUG_WAV_DIR, exist_ok=True)  # shutil.copy does not create missing directories

for sample in tqdm(augmented_datas):
    in_path = sample["audio_path"]
    out_path = os.path.join(AUG_WAV_DIR, f'{sample["id"]}.wav')

    shutil.copy(src=in_path, dst=out_path)


100%|██████████| 9894/9894 [00:01<00:00, 6275.03it/s]


In [25]:
# Write the augmented records as JSON Lines: one object per line, with
# non-ASCII characters written as-is rather than escaped.
path = "/data/codes/apa/train/prep_data/raw_jsonl/info_qt_10_trainset-aug.jsonl"
with open(path, mode="w", encoding="utf-8") as out_file:
    for sample in augmented_datas:
        # Keep the name `json_obj`: later cells may read the last serialised
        # record from it.
        json_obj = json.dumps(sample, ensure_ascii=False)
        out_file.write(json_obj + "\n")

In [26]:
# Preview the last augmented record in its serialised form. It is computed
# here rather than read from a loop variable left over from another cell.
json.dumps(augmented_datas[-1], ensure_ascii=False)

'{"id": "3324420-aug", "raw": "however", "text": "MUCKLE CUNY MEANLY", "utt_id": null, "start_time": null, "end_time": null, "arpas": ["M", "AH1", "K", "AX", "L", "K", "Y", "UW1", "N", "IY0", "M", "IY1", "N", "L", "IY0"], "trans": ["M", "AH1", "K", "AX", "L", "K", "Y", "UW1", "N", "IY0", "M", "IY1", "N", "L", "IY0"], "phone_scores": [0, 0, 0, 0, 0], "word_scores": [0], "utterance_scores": 0, "decisions": [2, 2, 2, 2, 2], "word_ids": [0, 0, 0, 0, 0], "audio_path": "/data/audio_data/prep_submission_audio/10/3324420.wav"}'