In [None]:
def init_asr_model_with_lm(asr_ckpt_path, lm_ckpt_path):
    from dataclasses import dataclass, field
    from typing import List

    from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
    from nemo.collections.asr.parts.submodules import ctc_beam_decoding
    
    @dataclass
    class BeamSearchNGramConfig:
        decoding_mode: str = "beamsearch_ngram"

        beam_width: List[int] = field(default_factory=lambda: [32])
        beam_alpha: List[float] = field(default_factory=lambda: [1.0])
        beam_beta: List[float] = field(default_factory=lambda: [0.0])

        decoding_strategy: str = "flashlight" #"pyctcdecode"
        decoding: ctc_beam_decoding.BeamCTCInferConfig = field(
            default_factory=lambda: ctc_beam_decoding.BeamCTCInferConfig(beam_size=128)
        )

    # change decoding config
    cfg = BeamSearchNGramConfig()

    cfg.decoding.beam_size = 32
    cfg.decoding.beam_alpha = 0.4
    cfg.decoding.beam_beta = 1.5
    cfg.decoding.return_best_hypothesis = True
    cfg.decoding.kenlm_path = lm_ckpt_path

    cfg.decoding.flashlight_cfg.beam_size_token = 32
    cfg.decoding.flashlight_cfg.beam_threshold = 25.0
    cfg.decoding.flashlight_cfg.lexicon_path=f'/data/asr-research/data/lexicon'
    cfg.decoding.search_type = "flashlight"

    # init asr model
    model = EncDecCTCModelBPE.load_from_checkpoint(asr_ckpt_path)
    model.change_decoding_strategy(None)

    model.cfg.decoding.strategy = cfg.decoding_strategy
    model.cfg.decoding.beam = cfg.decoding

    model.change_decoding_strategy(model.cfg.decoding)

    return model

asr_ckpt_path = "/data/asr/nemo_experiments/FastConformer-CTC-BPE/2025-07-28_15-26-01/checkpoints/FastConformer-CTC-BPE--val_wer--0.1891.ckpt"
lm_ckpt_path = "/data/asr/vi_lm_5grams.bin"
model = init_asr_model_with_lm(asr_ckpt_path, lm_ckpt_path)

In [None]:
def init_asr_model_with_lm(asr_ckpt_path, lm_ckpt_path):
    from dataclasses import dataclass, field
    from typing import List

    from nemo.collections.asr.models.ctc_bpe_models import EncDecCTCModelBPE
    from nemo.collections.asr.parts.submodules import ctc_beam_decoding
    
    @dataclass
    class BeamSearchNGramConfig:
        decoding_mode: str = "beamsearch_ngram"

        beam_width: List[int] = field(default_factory=lambda: [32])
        beam_alpha: List[float] = field(default_factory=lambda: [1.0])
        beam_beta: List[float] = field(default_factory=lambda: [0.0])

        decoding_strategy: str = "pyctcdecode"
        decoding: ctc_beam_decoding.BeamCTCInferConfig = field(
            default_factory=lambda: ctc_beam_decoding.BeamCTCInferConfig(beam_size=128)
        )

    # change decoding config
    cfg = BeamSearchNGramConfig()

    cfg.decoding.beam_size = 32
    cfg.decoding.beam_alpha = 0.4
    cfg.decoding.beam_beta = 1.5
    cfg.decoding.return_best_hypothesis = True
    cfg.decoding.kenlm_path = lm_ckpt_path

    # init asr model
    model = EncDecCTCModelBPE.load_from_checkpoint(asr_ckpt_path)
    model.change_decoding_strategy(None)

    model.cfg.decoding.strategy = cfg.decoding_strategy
    model.cfg.decoding.beam = cfg.decoding

    model.change_decoding_strategy(model.cfg.decoding)

    return model

asr_ckpt_path = "/data/asr/nemo_experiments/FastConformer-CTC-BPE/2025-08-01_04-41-00/checkpoints/FastConformer-CTC-BPE--val_wer-0.1896-epoch-0.ckpt"
lm_ckpt_path = "/data/asr/vi_lm_5grams.bin"
model = init_asr_model_with_lm(asr_ckpt_path, lm_ckpt_path)

In [None]:
from glob import glob
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

data_dir = "/data/ai-nlp-stt-service/exp/audio/log"
audio_filepaths = glob(f'{data_dir}/*.wav')
df = pd.DataFrame({"audio_filepath": audio_filepaths})
df.head(1)

In [None]:
import re

def normalize_veh_name(text):
    veh_spoken_to_norm = {
        'hon đa uây': 'honda wave',
        'hon đa guây': 'honda wave',
        'hon đa rim': 'honda dream',
        'hon đa đờ rim': 'honda dream',
        'hon đa ét hát': 'honda sh',
        'hon đa e bờ lết': 'honda air blade',
        'hon đa a bờ lết': 'honda air blade',
        'hon đa bờ lết': 'honda blade',
        'hon đa vi sừn': 'honda vision',
        'hon đa phiu trờ': 'honda future',
        'hon đa lít': 'honda lead',
        'hon đa ét hát mốt': 'honda sh mode',
        'hon đa guyn nơ': 'honda winner',
        'hon đa guyn nờ': 'honda winner',
        'hon đa uyn nơ': 'honda winner',
        'hon đa uyn nờ': 'honda winner',
        'hon đa uây rờ ét ích': 'honda wave RSX',
        'hon đa va ri ô': 'honda vario',
        'hon đa ét há một sáu mươi i': 'honda sh 125i',
        'y a ma ha rờ ba': 'yamaha R3',
        'y a ma ha si ri ớt một trăm mười': 'yamaha sirius 110',
        'y a ma ha ích sai tơ': 'yamaha exciter',
        'hon đa': 'honda',
        'gia ma ha': 'yamaha',
        'da ma ha': 'yamaha',
        'y a ma': 'yamaha'
    }
    items = veh_spoken_to_norm.items()
    items = sorted(items, key=lambda x: len(x[0]), reverse=True)
    text = f' {text} '
    for key, value in items:
        if key not in text:
            continue
        text = re.sub(rf'\s{key}\s', ' ' + value +' ', text)

    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [39]:
df["text"] = df.audio_filepath.progress_apply(lambda x: model.transcribe(x)[0].text)

Transcribing: 100%|██████████| 1/1 [00:00<00:00, 29.40it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 24.23it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 24.76it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 26.00it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 27.22it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 23.88it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 43.78it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 46.95it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 48.85it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 26.92it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 24.33it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 24.22it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 49.06it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 26.93it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 27.60it/s]
Transcribing: 100%|██████████| 1/1 [00:00<00:00, 27.15it/s]
Transcribing: 100%|██████████| 1/1 [00:0

In [None]:
# model.save_to("/data/ai-nlp-stt-service/exp/ckpts/FastConformer-CTC-BPE-0.1896-veh.nemo")

In [None]:
filepath = "/data/ai-nlp-stt-service/exp/audio/log/358bc7b4-f13c-4d93-bd81-2c51df52699a.wav"


In [None]:
from tqdm import tqdm
import pandas as pd
import json

train_manifest_path=[\
"/data/asr-research/data/metadata/f88_segments_wer_0.05_v1.jsonl",\
"/data/asr-research/data/metadata/f88_wer_0.05_v1.jsonl"\
]
val_manifest_path=["/data/asr/metadata/test_tele.jsonl"]

manifest_paths = train_manifest_path + val_manifest_path
metadata = []
for filepath in manifest_paths:
    lines = open(filepath).readlines()
    for line in tqdm(lines):
        line = json.loads(line)
        
        metadata.append(line)
        
metadata = pd.DataFrame(metadata)
metadata.head(1)

In [57]:
corpus = metadata.text.apply(lambda x: x.split()).explode()
corpus = corpus.value_counts()
corpus = corpus[corpus>= 5]

vocab = list(corpus.index)
vocab_df = pd.DataFrame({"word": vocab})
vocab_df["tokens"] = vocab_df.word.apply(model.tokenizer.text_to_tokens)
vocab_df["tokens"] = vocab_df["tokens"].apply(lambda x: " ".join(x))
vocab_df.sort_values("word").to_csv("data/lexicon", index=None, sep="\t", header=None)

In [None]:
!python scripts/eval_beamsearch_ngram_lexicon_ctc.py \
    nemo_model_file="/data/asr/nemo_experiments/FastConformer-CTC-BPE/2025-08-01_04-41-00/checkpoints/FastConformer-CTC-BPE--val_wer-0.1896-epoch-0.ckpt"\
    input_manifest="/data/asr/metadata/test_tele.jsonl" \
    kenlm_model_file="/data/asr/vi_lm_5grams.bin" \
    beam_width=[32] \
    beam_alpha=[0.4] \
    beam_beta=[1.5] \
    preds_output_folder=output \
    probs_cache_file=null \
    decoding_mode=beamsearch_ngram \
    decoding_strategy="pyctcdecode" 

In [1]:
from tqdm import tqdm
import pandas as pd
import librosa
import json
import re

tqdm.pandas()

filepath = "/data/asr/vixtts-demo/tts_data.csv"
df = pd.read_csv(filepath, index_col=0)

In [None]:
def get_duration(filepath):
    try:
        audio, sr = librosa.load(filepath, sr=None)
        assert sr == 16000
        
        return audio.shape[0] / sr
    except:
        return None

df["duration"] = df.audio_filepath.progress_apply(get_duration)

In [None]:
df = df.dropna()

In [None]:
df.duration.sum() / 3600

In [None]:
filepath = "tts_data.jsonl"
with open(filepath, "w") as f:
    for index in df.index:
        row = df.loc[index].to_dict()
        json_obj = json.dumps(row, ensure_ascii=False)
        
        f.write(json_obj + "\n")
        
print(f"### saved file to {filepath}")