In [1]:
import dataclasses
import transformers
import librosa
import torch
from transformers import (
    pipeline,
    AutoTokenizer,
    LasrFeatureExtractor,
)
import regex as re 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the test recording as a mono waveform at a 16 kHz sampling rate.
audio_path = "../model/medasr/test_audio.wav"
speech, sr = librosa.load(audio_path, sr=16000, mono=True)

# Sanity check: waveform shape and the sampling rate returned by librosa.
print(speech.shape, sr)

(700800,) 16000


In [3]:
# NOTE(review): same value as the `audio_path` assigned in the audio-loading
# cell; this re-assignment looks redundant — confirm before removing.
audio_path = "../model/medasr/test_audio.wav"   

In [4]:
import dataclasses
import pyctcdecode
import transformers
import huggingface_hub 
def _restore_text(text: str) -> str:
  return text.replace(" ", "").replace("#", " ").replace("", "").strip()


class LasrCtcBeamSearchDecoder:
  """Beam-search CTC decoder that adapts a LASR tokenizer to pyctcdecode.

  The tokenizer vocabulary is rewritten so that pyctcdecode sees every token
  as a separate "word"; `decode_beams` reverses that rewrite on the beam text.
  """

  def __init__(
      self,
      tokenizer: transformers.LasrTokenizer,
      kenlm_model_path=None,
      **kwargs,
  ):
    """Builds the pyctcdecode decoder from the tokenizer's vocabulary.

    Args:
      tokenizer: Tokenizer exposing `vocab` (piece -> id) and `vocab_size`.
      kenlm_model_path: Passed to `pyctcdecode.build_ctcdecoder` as its second
        positional argument. Defaults to None.
      **kwargs: Extra keyword arguments forwarded to
        `pyctcdecode.build_ctcdecoder`.

    Raises:
      AssertionError: If some id below `vocab_size` has no piece assigned.
    """
    size = tokenizer.vocab_size
    labels = [None] * size
    # Place each piece at the position given by its id; ids at or beyond
    # `vocab_size` are ignored.
    for piece, index in tokenizer.vocab.items():
      if index < size:
        labels[index] = piece
    assert all(label is not None for label in labels)

    # pyctcdecode also expects the blank label to map to the empty string.
    labels[0] = ""
    # Every piece that neither starts with '<' nor ends with '>' gets its '▁'
    # replaced by '#' and a leading '▁' added, so pyctcdecode treats each
    # token as a "word". Pieces that look like special tokens stay untouched.
    labels[1:] = [
        piece
        if piece.startswith("<") or piece.endswith(">")
        else "▁" + piece.replace("▁", "#")
        for piece in labels[1:]
    ]
    self._decoder = pyctcdecode.build_ctcdecoder(
        labels, kenlm_model_path, **kwargs
    )

  def decode_beams(self, *args, **kwargs):
    """Runs the wrapped decoder and restores readable text on every beam.

    All positional and keyword arguments are forwarded unchanged to the
    wrapped decoder's `decode_beams`.

    Returns:
      A list of 5-tuples: each beam with its first element passed through
      `_restore_text` and the other four elements left as they were.
    """
    # NOTE(review): only the first field is modified. The names of the other
    # four follow pyctcdecode's tuple-style OutputBeam
    # (text, last_lm_state, text_frames, logit_score, lm_score) — confirm
    # against the installed pyctcdecode version.
    return [
        (_restore_text(text), last_lm_state, text_frames, logit_score, lm_score)
        for text, last_lm_state, text_frames, logit_score, lm_score
        in self._decoder.decode_beams(*args, **kwargs)
    ]

def beam_search_pipe(model: str, lm: str):
  """Creates a speech-recognition pipeline that decodes with beam search + LM.

  Args:
    model: Identifier handed to `from_pretrained` for the feature extractor
      and the tokenizer, and to `transformers.pipeline` as the model.
    lm: Passed to `LasrCtcBeamSearchDecoder` as `kenlm_model_path`.

  Returns:
    The object returned by `transformers.pipeline`.

  Raises:
    AssertionError: If the resulting pipeline's `type` is not "ctc_with_lm".
  """
  extractor = transformers.LasrFeatureExtractor.from_pretrained(model)
  # NOTE(review): presumably this is what makes the pipeline pick the
  # LM-decoding path checked by the assertion below — confirm.
  extractor._processor_class = "LasrProcessorWithLM"

  tokenizer = transformers.AutoTokenizer.from_pretrained(model)
  lm_decoder = LasrCtcBeamSearchDecoder(tokenizer, lm)

  asr = transformers.pipeline(
      task="automatic-speech-recognition",
      model=model,
      feature_extractor=extractor,
      decoder=lm_decoder,
  )
  assert asr.type == "ctc_with_lm"
  return asr

# Model identifier and the language-model file handed to `beam_search_pipe`.
model_id = "google/medasr"
lm_path = "../model/medasr/lm_6.kenlm"

pipe_with_lm = beam_search_pipe(model_id, lm_path)

# Transcribe the recording with chunk_length_s=20 and stride_length_s=2,
# using a beam width of 8 for the decoder.
result_with_lm = pipe_with_lm(
    audio_path,
    chunk_length_s=20,
    stride_length_s=2,
    decoder_kwargs={"beam_width": 8},
)

Unigrams not provided and cannot be automatically determined from LM file (only arpa format). Decoding accuracy might be reduced.
No known unigrams provided, decoding results might be a lot worse.
Loading weights: 100%|██████████| 368/368 [00:00<00:00, 1008.16it/s, Materializing param=encoder.subsampler.dense_1.weight]              
  return F.conv1d(


In [5]:
# Display the pipeline result (the cell's last expression is rendered).
result_with_lm

{'text': '[EXAM TYPE] CT chest PE protocol {period} [INDICATION] 54-year-old female, shortness of breath, evaluate for PE {period} [TECHNIQUE] Standard protocol {period} [FINDINGS] {colon} Pulmonary vasculature {colon} The main PA is patent {period} There are filling defects in the segmental branches of the right lower lobe {comma} compatible with acute PE {period} No saddle embolus {period} Lungs {colon} No pneumothorax {period} Small bilateral effusions {comma} right greater than left {period} {new paragraph} [IMPRESSION] {colon} Acute segmental PE, right lower lobe {period}</s>'}