In [29]:
import pandas as pd
import soundfile as sf
from jiwer import wer
import librosa
import json
import os
import re
from pandarallel import pandarallel
from glob import glob

pandarallel.initialize(nb_workers=8, progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [30]:
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        lines = [json.loads(line.strip()) for line in lines]

    return lines

In [31]:
lexicon_path = "/data/codes/apa/kaldi/g2p/resources/lexicon"

lexicon = pd.read_csv(lexicon_path, names=["word", "arpa"], sep="\t")
vocab = set(lexicon["word"].tolist())
len(vocab)

313973

In [32]:
path = "/data/codes/apa/kaldi/stt/data/stt-data/infer/info_question_type-9_19092023_21122023.jsonl"

metadata = load_jsonl(path)

In [33]:
def normalize(text):
    text = re.sub("[!?,.\"-]", " ", text)
    text = text.upper().strip()

    text = re.sub('\s+', ' ', text)

    # for word in text.split():
    #     if not (word.isalpha() or "'" in word):
    #         print(word)

    return text

def wer_normalize(text):
    text = normalize(text)

    text = text.replace(" UM ", " ")
    text = text.replace(" UH ", " ")
    text = text.replace(" UHM ", " ")
    text = text.replace(" AH ", " ")

    if text.startswith("UM "):
        text = text.replace("UM ", " ")
    if text.startswith("UH "):
        text = text.replace("UH ", " ")
    if text.startswith("UHM "):
        text = text.replace("UHM ", " ")
    if text.startswith("AH "):
        text = text.replace("AH ", " ")

    return text

def calculate_wer(reference, hypothesis):
    # try:
    word_error_rate = wer(
        reference,
        hypothesis)
    # except:
    #     word_error_rate = 1
    return word_error_rate

In [34]:
metadata = pd.DataFrame(metadata)
metadata.head(1)

Unnamed: 0,sid,utt_id,elsa,prep,audio_path,start_time,end_time
0,6392867,2,Absolutely to are among my favorite closing p...,A slowly t shirt are among my favorite closing...,/data/codes/apa/kaldi/stt/data/stt-data/wav/63...,18.75,29.93


In [35]:
metadata["wer_prep"] = metadata.prep.parallel_apply(lambda x: wer_normalize(x))
metadata["wer_elsa"] = metadata.elsa.parallel_apply(lambda x: wer_normalize(x))

metadata.head(3)[["wer_prep", "wer_elsa"]]

Unnamed: 0,wer_prep,wer_elsa
0,A SLOWLY T SHIRT ARE AMONG MY FAVORITE CLOSING...,ABSOLUTELY TO ARE AMONG MY FAVORITE CLOSING PI...
1,NO BECAUSE MY FAMILY DON'T AFFORD WITH,NO BECAUSE MY FAMILY DON'T AFFORD WITH
2,THE FEES OF LEARNING,THE THE PHASE OF LEARNING


In [36]:
metadata["wer"] = metadata.parallel_apply(lambda x: calculate_wer(reference=x["wer_elsa"], hypothesis=x["wer_prep"]), axis=1)

In [37]:
metadata["prep"] = metadata.prep.parallel_apply(lambda x: normalize(x))
metadata["elsa"] = metadata.elsa.parallel_apply(lambda x: normalize(x))

In [38]:
metadata["duration"] = metadata.parallel_apply(lambda x: x["end_time"] - x["start_time"], axis=1)

In [39]:
print(metadata.duration.sum() / 3600)

715.9134638888889


In [40]:
def check_vocab(text):
    for word in text.split():
        if word not in vocab:
            return False
        
    return True

is_valid = metadata.elsa.parallel_apply(check_vocab)
print(metadata[is_valid==False].duration.sum() / 3600)

32.003302777777776


In [41]:
metadata = metadata[is_valid==True]

In [42]:
metadata[metadata.wer<0.1].duration.sum() /  3600

304.25403611111113

In [43]:
filtered_data = metadata[metadata.wer<0.1]

In [44]:
import librosa
import soundfile as sf
from tqdm import tqdm
from pandarallel import pandarallel
import os

pandarallel.initialize(nb_workers=16, progress_bar=True)

out_dir = "/data/codes/apa/kaldi/stt/data/stt-data/audio"
wav_files = os.listdir(out_dir)
wav_files = set(wav_files)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [45]:
def copy_audio(row):
    in_dir = "/data/audio_data/prep_submission_audio/9"

    row = row.to_dict()

    in_path = f'{in_dir}/{row["sid"]}.wav'
    out_path = f'{out_dir}/{row["sid"]}{row["utt_id"]}.wav'
    
    filename = os.path.basename(out_path)
    
    assert filename not in wav_files
    assert "prep_submission_audio" not in out_path

    wav, sr = librosa.load(in_path, sr=16000)

    start_time = int(sr * row["start_time"])
    end_time = int(sr * row["end_time"])

    assert end_time < wav.shape[0]
    sf.write(out_path, wav[start_time:end_time], samplerate=sr)

filtered_data.parallel_apply(lambda x: copy_audio(x), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=7331), Label(value='0 / 7331'))), …

1         None
3         None
5         None
6         None
16        None
          ... 
274306    None
274310    None
274312    None
274316    None
274317    None
Length: 117290, dtype: object

In [46]:
filtered_data.reset_index(inplace=True)

In [47]:
import librosa
import soundfile as sf
from tqdm import tqdm

out_dir = "/data/codes/apa/kaldi/stt/data/stt-data/audio"

filtered_datas = []
for index in tqdm(filtered_data[['sid', 'utt_id', 'elsa', 'prep', 'audio_path', 'start_time', 'end_time', 'wer', 'duration']].index):
    row = filtered_data.iloc[index].to_dict()

    out_path = f'{out_dir}/{row["sid"]}{row["utt_id"]}.wav'
    
    sample = {
        "sid": row["sid"],
        "utt_id": row["utt_id"],
        "elsa": row["elsa"],
        "prep": row["prep"],
        "start_time": row["start_time"],
        "end_time": row["end_time"],
        "audio_path": out_path,
    }

    filtered_datas.append(sample)

100%|██████████| 117290/117290 [00:04<00:00, 23544.19it/s]


In [48]:
path = "/data/codes/apa/kaldi/stt/data/stt-data/final/info_question_type-9_19092023_21122023.jsonl"
with open(path, "w", encoding="utf-8") as f:
    for line in filtered_datas:
        json_obj = json.dumps(line)
        f.write(f'{json_obj}\n')