In [1]:
import pandas as pd
import soundfile as sf
from jiwer import wer
import librosa
import json
import os
import re
from pandarallel import pandarallel
from glob import glob
import torchaudio

# Configure pandarallel with 8 workers and no progress bar; this is what makes
# `parallel_apply` available on pandas objects (used further down to compute
# audio durations).
pandarallel.initialize(nb_workers=8, progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line of the file is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of being handed to ``json.loads``, which would raise on them.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        # Iterate the file lazily rather than materialising every raw line.
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))

    return records

In [3]:
# Directory whose *jsonl files are globbed and loaded as the input records.
log_dir = "/data/codes/apa/kaldi/stt/logs-type-9-v2"
# Destination JSONL file that the export cell at the end of the notebook writes.
out_path = "/data/codes/apa/kaldi/stt/data/stt-data/infer/info_question_type-9_19092023_21122023.jsonl"

In [4]:
# Gather the records of every JSONL log under `log_dir` into one flat list.
# glob() returns paths in arbitrary, filesystem-dependent order, so the list is
# sorted to keep the row order of the resulting DataFrame (and of the exported
# file) reproducible from run to run.
files = sorted(glob(f'{log_dir}/*jsonl'))
metadata = []
for file in files:
    metadata.extend(load_jsonl(file))


In [5]:
# Build the working DataFrame from the records loaded from the JSONL logs.
df = pd.DataFrame(metadata)
# Last expression of the cell: show the first rows as the cell output.
df.head()

Unnamed: 0,sid,utt_id,elsa,prep,audio_path
0,6392867,2,Absolutely to are among my favorite closing p...,A slowly t shirt are among my favorite closing...,/data/codes/apa/kaldi/stt/data/stt-data/wav/63...
1,6392902,0,"No, because my family don't afford with.","No, because my family don't afford with.",/data/codes/apa/kaldi/stt/data/stt-data/wav/63...
2,6392902,1,The the phase of learning.,The fees of learning.,/data/codes/apa/kaldi/stt/data/stt-data/wav/63...
3,6392902,3,", I don't have a chance to learn to play a mus...",I don't have a chance to learn to play a music...,/data/codes/apa/kaldi/stt/data/stt-data/wav/63...
4,6392911,0,"I think it can be, uh, environ.",I think it can be a violent.,/data/codes/apa/kaldi/stt/data/stt-data/wav/63...


In [15]:
# Add the extra columns used by the export: `sid` is a copy of `id`,
# `start_time` is the constant 0 and `utt_id` is None for every row.
df = df.assign(sid=df["id"], start_time=0, utt_id=None)

In [13]:
def get_duration(path, expected_sr=16000):
    """Return the duration, in seconds, of the audio file at ``path``.

    The file is loaded with ``torchaudio.load``; the duration is the size of
    the last axis of the returned waveform divided by the returned sample rate.

    Args:
        path: Location of the audio file to load.
        expected_sr: Sample rate the file is required to have. Defaults to
            16000, the value this check was previously hard-coded to.

    Returns:
        The duration in seconds as a float.

    Raises:
        AssertionError: If the file's sample rate differs from ``expected_sr``.
            The message names the file and the rate that was found.
    """
    wav, sr = torchaudio.load(path)

    # This function is mapped over many paths in parallel; without a message a
    # failing check would not reveal which file was at fault.
    assert sr == expected_sr, (
        f"{path}: expected sample rate {expected_sr}, got {sr}"
    )

    return wav.shape[-1] / sr

# Run get_duration over every audio path with pandarallel's workers and store
# the resulting durations in the `end_time` column.
df["end_time"] = df["audio_path"].parallel_apply(get_duration)

In [16]:
df.head()

Unnamed: 0,id,total_time,elsa,prep,audio_path,sid,start_time,end_time,utt_id
0,1177329,7.57,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,And it to me too many famous tourist attractio...,/data/audio_data/prep_submission_audio/10/1177...,1177329,0,7.594625,
1,4580473,11.76,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,Ammer took me to many famous tourist attractio...,/data/audio_data/prep_submission_audio/10/4580...,4580473,0,11.776,
2,1422856,6.64,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,And my took me to many famous tourist attracti...,/data/audio_data/prep_submission_audio/10/1422...,1422856,0,6.656,
3,2150677,6.89,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,And took me to any famous tourist attraction of.,/data/audio_data/prep_submission_audio/10/2150...,2150677,0,6.912,
4,1465808,8.09,EMMA TOOK ME TO MANY FAMOUS TOURIST ATTRACTION...,I took me to many famous tourist attractions at.,/data/audio_data/prep_submission_audio/10/1465...,1465808,0,8.106625,


In [19]:
# Fields written for each row, in the key order used in every JSON object.
export_fields = [
    "sid",
    "start_time",
    "end_time",
    "audio_path",
    "elsa",
    "prep",
    "utt_id",
]

# Write the selected columns to `out_path` as JSON Lines (one object per row).
with open(out_path, "w", encoding="utf-8") as f:
    # Iterate over row records instead of `df.iloc[label]`: index labels are
    # only valid positions while the frame still has its default 0..n-1 index,
    # so the positional lookup breaks as soon as rows are filtered or
    # concatenated.
    for sample in df[export_fields].to_dict(orient="records"):
        # ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
        json_obj = json.dumps(sample, ensure_ascii=False)
        f.write(f'{json_obj}\n')

In [8]:
# with open(out_path, "w", encoding="utf-8") as f:
#     for line in metadata:
#         json_obj = json.dumps(line)
#         f.write(f'{json_obj}\n')