In [1]:
import pandas as pd
import soundfile as sf
from jiwer import wer
import librosa
import json
import os
import re
from pandarallel import pandarallel
from glob import glob

pandarallel.initialize(nb_workers=8, progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def load_jsonl(path):
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        lines = [json.loads(line.strip()) for line in lines]

    return lines

In [3]:
path = "/data/codes/apa/kaldi/stt/data/stt-data/infer/info_question_type-10_01082022_18092023.jsonl"
metadata = load_jsonl(path)

In [4]:
def normalize(text):
    text = re.sub("[!?,.\"-]", " ", text)
    text = text.upper().strip()

    text = re.sub('\s+', ' ', text)

    return text

def calculate_wer(reference, hypothesis):
    # try:
    word_error_rate = wer(
        reference,
        hypothesis)
    # except:
    #     word_error_rate = 1
    return word_error_rate

In [5]:
metadata = pd.DataFrame(metadata)
metadata["prep"] = metadata.prep.parallel_apply(lambda x: normalize(x))
metadata["elsa"] = metadata.elsa.parallel_apply(lambda x: normalize(x))
metadata.head()

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25806), Label(value='0 / 25806')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25806), Label(value='0 / 25806')))…

Unnamed: 0,sid,start_time,end_time,audio_path,elsa,prep,utt_id
0,1177329,0,7.594625,/data/audio_data/prep_submission_audio/10/1177...,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,AND IT TO ME TOO MANY FAMOUS TOURIST ATTRACTIO...,
1,4580473,0,11.776,/data/audio_data/prep_submission_audio/10/4580...,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,AMMER TOOK ME TO MANY FAMOUS TOURIST ATTRACTIO...,
2,1422856,0,6.656,/data/audio_data/prep_submission_audio/10/1422...,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,AND MY TOOK ME TO MANY FAMOUS TOURIST ATTRACTI...,
3,2150677,0,6.912,/data/audio_data/prep_submission_audio/10/2150...,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,AND TOOK ME TO ANY FAMOUS TOURIST ATTRACTION OF,
4,1465808,0,8.106625,/data/audio_data/prep_submission_audio/10/1465...,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,I TOOK ME TO MANY FAMOUS TOURIST ATTRACTIONS AT,


In [6]:
metadata["wer"] = metadata.parallel_apply(lambda x: calculate_wer(reference=x["elsa"], hypothesis=x["prep"]), axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25806), Label(value='0 / 25806')))…

In [7]:
metadata["duration"] = metadata.parallel_apply(lambda x: x["end_time"] - x["start_time"], axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=25806), Label(value='0 / 25806')))…

In [8]:
metadata.duration.sum() / 3600

146.65267564236115

In [12]:
metadata[metadata.wer==0].duration.sum() /  3600

59.144882326388895

In [13]:
filtered_data = metadata[metadata.wer==0]

In [14]:
filtered_data.reset_index(inplace=True)

In [16]:
import librosa
import soundfile as sf
from tqdm import tqdm

filtered_datas = []
for index in tqdm(filtered_data[['sid', 'utt_id', 'elsa', 'prep', 'audio_path', 'start_time', 'end_time', 'duration']].index):
    row = filtered_data.iloc[index].to_dict()
    
    sample = {
        "sid": row["sid"],
        "utt_id": row["utt_id"],
        "elsa": row["elsa"],
        "prep": row["prep"],
        "start_time": row["start_time"],
        "end_time": row["end_time"],
        "audio_path": row["audio_path"],
    }

    filtered_datas.append(sample)

100%|██████████| 86522/86522 [00:03<00:00, 24294.09it/s]


In [17]:
path = "/data/codes/apa/kaldi/stt/data/stt-data/final/info_question_type-10_01082022_18092023.jsonl"
with open(path, "w", encoding="utf-8") as f:
    for line in filtered_datas:
        json_obj = json.dumps(line)
        f.write(f'{json_obj}\n')