In [17]:
import pandas as pd
import soundfile as sf
from jiwer import wer
import librosa
import json
import os
import re
from pandarallel import pandarallel
from glob import glob

# pandarallel worker-pool settings, kept as named constants so they are easy to tune.
# NOTE(review): none of the cells shown here call `parallel_apply`, so this
# initialisation may be unused — confirm before removing.
NB_WORKERS = 4
SHOW_PROGRESS_BAR = True

pandarallel.initialize(nb_workers=NB_WORKERS, progress_bar=SHOW_PROGRESS_BAR)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [19]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path to a UTF-8 encoded file with one JSON document per line,
            e.g. "/data/codes/apa/kaldi/stt/logs/4133769.jsonl".

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than read into memory in full first.
        for line in f:
            line = line.strip()
            # Blank or whitespace-only lines (e.g. an extra empty line at the
            # end of the file) hold no record; json.loads("") would raise.
            if not line:
                continue
            records.append(json.loads(line))

    return records

# Directory searched for files whose names end in "jsonl".
jsonl_dir = "/data/codes/apa/kaldi/stt/logs"
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the order of the collected records reproducible between runs and machines.
files = sorted(glob(f'{jsonl_dir}/*jsonl'))
# Flat list of every record from every matched file.
metadata = []
for file in files:
    metadata.extend(load_jsonl(file))

In [13]:
def normalize(text):
    """Strip punctuation, collapse whitespace and upper-case a string.

    Each of the characters ``! @ # $ % ^ & * ( ) \\ . " , ? ; : + - _ / | ~ ```
    is replaced by a space. The apostrophe is not in that set, so contractions
    such as "didn't" stay intact. Runs of whitespace are then collapsed to a
    single space, and the result is upper-cased and stripped.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Raw strings keep regex escapes away from Python's own string-escape
    # processing; the non-raw forms ("\s", "\(", ...) are invalid escape
    # sequences that newer interpreters warn about.
    text = re.sub(r'[!@#$%^&*()\\.",?;:+\-_/|~`]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.upper().strip()

def calculate_wer(reference, hypothesis):
    """Compute the word error rate of `hypothesis` against `reference`.

    Args:
        reference: The reference transcript, passed as the first argument to `wer`.
        hypothesis: The transcript being scored, passed as the second argument.

    Returns:
        The value returned by ``wer(reference, hypothesis)``, or 1.0 if that
        call raises an exception.
    """
    try:
        return wer(reference, hypothesis)
    except Exception:
        # Best-effort fallback: a pair that cannot be scored counts as fully
        # wrong. Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate, so a long run can still
        # be interrupted.
        # NOTE(review): presumably this guards against empty references being
        # rejected by jiwer — confirm, and narrow the exception type if so.
        return 1.0


In [14]:
# Build a DataFrame from the loaded records, then normalise both transcript
# columns so they are compared on the same footing.
metadata = pd.DataFrame(metadata)
for column in ("prep", "elsa"):
    metadata[column] = metadata[column].apply(normalize)
metadata.head()

Unnamed: 0,sid,utt_id,elsa,prep,audio_path
0,5581470,6,UH ALSO SOME OF THE MIXTURE THAT WE NEED TO PU...,ALSO SOME OF THE MIXTURE THAT WE NEED TO PUT ON,/data/codes/apa/kaldi/stt/data/stt-data/wav/55...
1,5581471,0,COOKING IN THE PAST IS KIND OF HARD BECAUSE SO...,COOKING IN THE PAST IS KIND OF HARD BECAUSE SO...,/data/codes/apa/kaldi/stt/data/stt-data/wav/55...
2,5581471,1,DIDN'T HAVE ENOUGH UH INGREDIENTS AND UM THE T...,DIDN'T HAVE ENOUGH INGREDIENTS AND THE TOOLS F...,/data/codes/apa/kaldi/stt/data/stt-data/wav/55...
3,5581471,3,AND ANOTHER TOOL FOR COOKING AND ALSO INGREDIE...,AND ANOTHER TUNE FOR COOKING AND ALSO IN RED I...,/data/codes/apa/kaldi/stt/data/stt-data/wav/55...
4,5581471,4,THEY ONLY HAVE LIKE TRADITIONAL NATURAL MIXTUR...,THEY ONLY HAVE LIKE TRADITIONAL NATURAL MIXTUR...,/data/codes/apa/kaldi/stt/data/stt-data/wav/55...


In [15]:
def _row_wer(row):
    """Score one row: `elsa` is the reference, `prep` is the hypothesis."""
    return calculate_wer(reference=row["elsa"], hypothesis=row["prep"])


# One WER value per row.
metadata["wer"] = metadata.apply(_row_wer, axis=1)


In [16]:
# Unweighted mean of the per-row WER values: every row counts equally,
# whatever the length of its transcript.
metadata["wer"].mean()

0.1758958911771171