In [1]:
from tqdm import tqdm
import pandas as pd
import json
import re
import os

In [2]:
def normalize(text):
    """Return a lower-cased, punctuation-free, single-spaced copy of ``text``.

    Every character that is neither a word character (``\\w``, which keeps
    letters, digits and underscores) nor whitespace is replaced by a space.
    Runs of whitespace are then collapsed to one space, the ends are trimmed,
    and the result is lower-cased.
    """
    without_punctuation = re.sub(r'[^\w\s]', ' ', text)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip().lower()

In [3]:
# Load the tab-separated lexicon (word <TAB> phonemes) and collect its words
# into a set used for out-of-vocabulary checks.
lexicon_path = "/data/asr/kaldi/data/lexicon"
lexicon_df = pd.read_csv(
    lexicon_path,
    sep="\t",
    names=["word", "phonemes"],
    # Read every field as a plain string: with the defaults pandas converts
    # entries such as "nan", "null" or "NA" to float NaN and numeric-looking
    # words to numbers, so those lexicon words would never match the string
    # tokens looked up in `vocab`.
    dtype=str,
    keep_default_na=False,
)
vocab = set(lexicon_df.word.to_list())

In [20]:
data_dir = "/data/asr/kaldi/data/test"
# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(data_dir, exist_ok=True)

# metadata_path = "/data/asr/metadata/test_tele.jsonl"
metadata_path = "/data/asr/metadata/pred-train_gigaspeech2_filtered.jsonl"

# Parse the JSONL metadata (one JSON object per line). The context manager
# closes the file handle, and the explicit UTF-8 encoding keeps the
# transcripts from depending on the locale's default encoding.
with open(metadata_path, encoding="utf-8") as f:
    metadata = [json.loads(line) for line in tqdm(f.readlines())]
metadata_df = pd.DataFrame(metadata)

# Normalize the transcripts and derive the utterance id from the audio file
# name (everything before the first ".wav" in the basename).
metadata_df["text"] = metadata_df.text.apply(normalize)
metadata_df["id"] = metadata_df.audio_filepath.apply(lambda x: os.path.basename(x).split(".wav")[0])

# Keep only the rows at positions 1,000,000 .. 1,499,999 for this run.
metadata_df = metadata_df[1000000: 1500000]
metadata_df.head(1)

100%|██████████| 1875452/1875452 [00:04<00:00, 375982.92it/s]


Unnamed: 0,audio_filepath,text,id,duration,pred,wer
1000000,/data/asr/asr_data/train/126/419/126-419-71.wav,công việc khiến cho ông minh đi công tác nhiều...,126-419-71,2.640062,công việc khiến cho ông minh đi công tác nhiều...,0.0


In [5]:
# metadata_df["text"].to_csv("/data/asr/icefall/egs/librispeech/ASR/manifests/lm_corpus.txt", index=None)

In [21]:
# Sum of the `duration` column divided by 3600, i.e. hours of audio in the
# current slice (presumably `duration` is stored in seconds -- TODO confirm).
metadata_df["duration"].sum() / 3600

488.6643307812499

In [22]:
def check_oov(text):
    """Return True when every whitespace-separated token of ``text`` is in ``vocab``.

    Despite the name, a True result means the text contains no
    out-of-vocabulary word; False means at least one token is missing from
    the module-level ``vocab`` set. An empty text yields True.
    """
    return all(token in vocab for token in text.split())

# Keep only the utterances for which check_oov returns True, i.e. whose
# tokens are all present in the lexicon vocabulary.
in_vocab_mask = metadata_df["text"].apply(check_oov)
metadata_df = metadata_df.loc[in_vocab_mask]

In [23]:
from lhotse import RecordingSet, Recording, SupervisionSegment, SupervisionSet

# Utterances whose `wer` value exceeds this threshold are left out of the
# manifests.
MAX_WER = 0.05

recording_samples, supervision_samples = [], []

# Iterate over the needed columns directly instead of doing a per-row
# `.loc[index].to_dict()` lookup, which builds a new Series for every row.
rows = zip(
    metadata_df["audio_filepath"],
    metadata_df["id"],
    metadata_df["text"],
    metadata_df["wer"],
)
for audio_filepath, utt_id, text, wer in tqdm(rows, total=len(metadata_df)):
    if wer > MAX_WER:
        continue

    # Pass the recording ID explicitly so it always equals the supervision's
    # `recording_id`; otherwise Lhotse derives it from the file name on its
    # own, which can differ from the ".wav"-split id for unusual file names.
    recording = Recording.from_file(audio_filepath, recording_id=utt_id)

    # One supervision per recording, covering the whole file on channel 0.
    supervision = SupervisionSegment(
        id=utt_id, recording_id=utt_id,
        start=0.0, duration=recording.duration, channel=0,
        text=text,
        language='Vietnamese',
    )

    recording_samples.append(recording)
    supervision_samples.append(supervision)

100%|██████████| 499928/499928 [07:05<00:00, 1176.00it/s]


In [24]:
# Wrap the collected per-utterance objects into Lhotse manifest sets.
recs, sups = (
    RecordingSet.from_recordings(recording_samples),
    SupervisionSet.from_segments(supervision_samples),
)

In [25]:
# Write both manifests to disk, supervisions first and then recordings.
for manifest, destination in (
    (sups, '/data/asr/icefall/egs/librispeech/ASR/manifests/gasr_supervisions_train_3.jsonl.gz'),
    (recs, '/data/asr/icefall/egs/librispeech/ASR/manifests/gasr_recordings_train_3.jsonl.gz'),
):
    manifest.to_file(destination)

In [11]:
# output_dir = "/data/asr/icefall/egs/librispeech/ASR/manifests"

# output_path = f'{output_dir}/recordings_train.jsonl'
# with open(output_path, "w") as f:
#     for sample in recording_samples:
#         json_obj = json.dumps(sample, ensure_ascii=False)
#         f.write(json_obj + "\n")
        
# output_path = f'{output_dir}/supervision_train.jsonl'
# with open(output_path, "w") as f:
#     for sample in supervision_samples:
#         json_obj = json.dumps(sample, ensure_ascii=False)
#         f.write(json_obj + "\n")

In [12]:
# metadata_df.text.to_csv("/data/asr/kaldi/exp/lm/llm_corpus.txt", index=None)

In [13]:
# def save(metadata, data_dir):
#     wavscp_path = f'{data_dir}/wav.scp'
#     text_path = f'{data_dir}/text'
#     spk2utt_path = f'{data_dir}/spk2utt'
#     utt2spk_path = f'{data_dir}/utt2spk'

#     def create_text_file(f, contents):
#         line = "\t".join(contents)
#         f.write(line + "\n")

#     with open(wavscp_path, "w", encoding="utf-8") as f:
#         metadata.sort_values("id").apply(lambda x: create_text_file(f, (x["id"], x["audio_filepath"])), axis=1)
#         print(f'###saved to: {wavscp_path}')

#     with open(text_path, "w", encoding="utf-8") as f:
#         metadata.sort_values("id").apply(lambda x: create_text_file(f, (x["id"], x["text"])), axis=1)
#         print(f'###saved to: {text_path}')
        
#     with open(spk2utt_path, "w", encoding="utf-8") as f:
#         metadata.sort_values("id").apply(lambda x: create_text_file(f, (x["id"], x["id"])), axis=1)
#         print(f'###saved to: {spk2utt_path}')
        
#     with open(utt2spk_path, "w", encoding="utf-8") as f:
#         metadata.sort_values("id").apply(lambda x: create_text_file(f, (x["id"], x["id"])), axis=1)
#         print(f'###saved to: {utt2spk_path}')
        
# save(metadata=metadata_df, data_dir=data_dir)