In [4]:
from omegaconf import OmegaConf
from model import AsrLlmCascadeModel

# Cascade configuration: model identifiers for the LLM and for the ASR front end.
args = OmegaConf.create(
    {
        "llm_config": "Qwen/Qwen2-0.5B",
        "asr_config": "openai/whisper-large-v3-turbo",
    }
)

# Build the cascade from the config; the bare name on the last line displays its module tree.
cascade = AsrLlmCascadeModel.build_model(args)
cascade

AsrLlmCascadeModel(
  (asr): HuggingFaceWhisperModel(
    (encoder): DummyEncoder()
    (decoder): DummyDecoder()
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
                (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
                (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
                (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
              )
              (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=Tru

# Тесты: проверяем, что модель работает

In [5]:
import torch
import soundfile as sf
from datasets import load_dataset


# Load the first utterance of the long-form LibriSpeech validation split as the test input.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
sampling_rate = sample['sampling_rate']

# Persist the raw audio so the file-based entry points can be exercised in later cells.
sf.write('audio.wav', sample['array'], sampling_rate)

# Cast the raw samples to float32 and hand the 1-D tensor straight to the ASR processor
# (no batch dimension is added here: the processor call receives the unbatched signal).
raw_audio = torch.tensor(sample['array']).float()
inputs = cascade.asr.processor(raw_audio, sampling_rate=sampling_rate, return_tensors="pt")

# NOTE: despite its name, `waveform` holds the processor's `input_features`, not raw samples.
# The name is kept because later cells pass `waveform` to `cascade.generate_from_asr`.
waveform = inputs['input_features']

2024-12-05 18:59:43 | INFO | datasets | PyTorch version 2.5.1 available.


In [6]:
# 1. Generate tokens from the prepared ASR input
asr_token_ids = cascade.generate_from_asr(waveform)
asr_token_ids

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
          5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
          2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
            13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
           264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
           949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
          3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
         12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
           534, 10281,   934,   439,    11]])

In [7]:
# 2. Generate text from the prepared ASR input
asr_text = cascade.generate_from_asr(waveform, text=True)
asr_text

[" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]

In [8]:
# 3. Generate tokens from an audio file
file_token_ids = cascade.generate_from_asr(file='audio.wav')
file_token_ids

tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
          5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
          2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
            13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
           264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
           949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
          3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
         12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
           534, 10281,   934,   439,    11]])

In [9]:
# 4. Generate text from an audio file
file_text = cascade.generate_from_asr(file='audio.wav', text=True)
file_text

[" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]

In [10]:
# 5. Forward pass
processor = cascade.asr.processor

# Turn the raw audio samples into the feature tensor passed as `src_tokens`.
features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
in_features = features.input_features

# The tokenizer's prefix tokens with a leading dimension of size 1, passed as `tgt_tokens`.
prompt_ids = torch.tensor(processor.tokenizer.prefix_tokens).unsqueeze(0)

cascade(src_tokens=in_features, tgt_tokens=prompt_ids)

(tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
            264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
           5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
           2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
             13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
            264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
            949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
           3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
          12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
            534, 10281,   934,   439,    11]]),
 None,
 None,
 None)

In [12]:
# 6. LLM response conditioned on the audio

# Generation below samples (do_sample=True), so seed torch's global RNG to make reruns
# repeatable (assumes cascade.generate draws from torch's global generator — TODO confirm).
torch.manual_seed(0)

gen_texts = cascade.generate(file='audio.wav', max_new_tokens=150, do_sample=True, top_k=50, top_p=0.95)

# Print each generated text followed by a separator line. A descriptive loop name is used
# instead of `_`, which IPython reserves for the last cell output.
for gen_text in gen_texts:
    print(gen_text)
    print("_________________________________________________")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, though it is certainly French. The fact is that it is not the work of a Frenchman but is the result of the thinking of a man from America, so that it is no more foreign to us than French to anyone, but no less strange to us. It is true that one may take his work as a whole, but, in the light of our observations above, it is impossible to imagine it as being anything but strange to us. He has a strange view of the subject of the work, and the peculiar difficulties of its presentation he has experienced himself. In his opening sentence he says that it is to some extent 'a thing of Gre