In [1]:
from matplotlib import pyplot as plt
import soundfile as sf
from tqdm import tqdm
from glob import glob
import pandas as pd
import torchaudio
import librosa
import torch
import json
import os

from jiwer import wer

In [2]:
def load_jsonl_data(metadata_path, load_json_obj=False):
    """Read a JSON Lines file into a list.

    Args:
        metadata_path: Path of the JSONL file to read.
        load_json_obj: When true, every non-blank line is decoded with
            ``json.loads`` and the decoded objects are returned. When false
            (the default), the raw lines are returned exactly as
            ``readlines()`` yields them, trailing newline included.

    Returns:
        list: Decoded JSON objects, or the raw text lines.

    Raises:
        json.JSONDecodeError: If ``load_json_obj`` is true and a non-blank
            line is not valid JSON.
    """
    # The context manager closes the handle deterministically, and the
    # explicit encoding keeps decoding independent of the machine's locale.
    with open(metadata_path, encoding="utf-8") as f:
        lines = f.readlines()

    if not load_json_obj:
        return lines

    # Blank lines (typically a trailing newline at end of file) are not JSON
    # documents; skip them instead of letting json.loads raise.
    records = [line for line in lines if line.strip()]
    return [json.loads(line) for line in tqdm(records, desc="Load data")]

In [3]:
# WER cutoff: later cells compare each row's `wer` against this value to select
# segments, and it is interpolated into the exported manifest's filename.
threshold = 0.05

In [None]:
# Directory containing the JSONL metadata files to merge.
data_dir = "/data/asr-research/data/s4_stt_metadata_nemo_ctc"

# glob() returns paths in arbitrary filesystem order. Sorting fixes the row
# order (and therefore the index labels) of the combined frame, which the
# seeded sample() in the filtering cell depends on to be reproducible.
metadata = []
for filepath in sorted(glob(f'{data_dir}/*.jsonl')):
    metadata += load_jsonl_data(filepath, load_json_obj=True)

# One row per decoded JSON object; rows that are identical in every column
# are collapsed to a single row.
metadata = pd.DataFrame(metadata)
metadata = metadata.drop_duplicates()

In [6]:
# Per-row word error rate: wer() receives the row's "text" as its first
# argument and its "pred_text" as its second.
metadata["wer"] = [
    wer(reference, hypothesis)
    for reference, hypothesis in zip(metadata["text"], metadata["pred_text"])
]
# Sum of `duration` over rows at or below the WER threshold, divided by 3600.
metadata.loc[metadata["wer"] <= threshold, "duration"].sum() / 3600

In [8]:
# Keep only the rows whose WER is at or below the threshold.
tmp = metadata.loc[metadata["wer"] <= threshold]

# Cap every distinct transcript at 32 rows: larger groups are down-sampled
# with a fixed seed, smaller ones pass through untouched.
capped_groups = [
    rows.sample(32, random_state=42) if rows.shape[0] > 32 else rows
    for _, rows in tmp.groupby("text")
]

filtered_df = pd.concat(capped_groups)
filtered_df.head(1)

In [None]:
# Summed `duration` / 3600 for the full metadata vs. the filtered subset
# (presumably seconds -> hours; confirm the unit of `duration`).
(
    metadata["duration"].sum() / 3600,
    filtered_df["duration"].sum() / 3600,
)

In [11]:
# Independent copy of the filtered rows, used for the export.
saved_df = filtered_df.copy()
# NOTE(review): a column rename ("start_time" -> "offset") was previously
# sketched here but left disabled; no columns are renamed.
saved_df["duration"].hist(bins=100)

In [None]:
# Export the selected rows as a JSON Lines manifest, one JSON object per row.
manifest_filepath = f"/data/asr-research/data/metadata/f88_segments_wer_{threshold}_v1.jsonl"
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of relying on the locale default.
with open(manifest_filepath, "w", encoding="utf-8") as f:
    # iterrows() visits each row exactly once by position. A label lookup via
    # .loc[index] is slower and returns a DataFrame if a label is duplicated.
    for index, series in tqdm(saved_df.iterrows(), total=len(saved_df)):
        row = series.to_dict()
        json_obj = json.dumps(row, ensure_ascii=False)
        f.write(json_obj + "\n")

In [None]:
# Summed `duration` / 3600 of the rows passing the WER filter. Uses `<=`, the
# same comparison that builds filtered_df; a strict `<` would leave out rows
# whose WER equals the threshold and under-report the selected audio.
metadata.loc[metadata["wer"] <= threshold, "duration"].sum() / 3600

In [None]:
# Duration histogram of the rows passing the WER filter. Uses `<=` so the
# plotted population matches the comparison that builds filtered_df.
metadata.loc[metadata["wer"] <= threshold, "duration"].hist(bins=100)