In [1]:
from omegaconf import OmegaConf
from model import AsrLlmCascadeModel

# Model identifiers for the two stages of the cascade: the LLM and the ASR
# front end. Both are handed to the project's builder as a single config.
args = OmegaConf.create(
    {
        "llm_config": "Qwen/Qwen2-0.5B",
        "asr_config": "openai/whisper-large-v3-turbo",
    }
)

# Build the ASR -> LLM cascade and display its module tree as the cell output.
cascade = AsrLlmCascadeModel.build_model(args)
cascade

2024-12-30 16:40:29 | INFO | speechgpt_logger | Building ASR-LLM Cascade Model
2024-12-30 16:40:29 | INFO | speechgpt_logger | Loading models: ASR and LLM
2024-12-30 16:40:36 | INFO | speechgpt_logger | Model loaded from openai/whisper-large-v3-turbo
2024-12-30 16:40:36 | INFO | speechgpt_logger | ASR model loaded successfully
2024-12-30 16:40:36 | INFO | speechgpt_logger | Building HuggingFaceQwen2ForCausalLM model.
2024-12-30 16:40:36 | INFO | speechgpt_logger | Loading model from Qwen/Qwen2-0.5B
2024-12-30 16:40:36 | INFO | speechgpt_logger | Initializing Qwen2Decoder with 24 layers.
2024-12-30 16:40:39 | INFO | speechgpt_logger | Qwen2Decoder initialized successfully.
2024-12-30 16:40:41 | INFO | speechgpt_logger | Model initialized successfully.
2024-12-30 16:40:41 | INFO | speechgpt_logger | Loading model weights.
2024-12-30 16:40:44 | INFO | speechgpt_logger | Loaded model weights.
2024-12-30 16:40:45 | INFO | speechgpt_logger | LLM model loaded successfully


AsrLlmCascadeModel(
  (asr): HuggingFaceWhisperModel(
    (encoder): DummyEncoder()
    (decoder): DummyDecoder()
    (model): WhisperForConditionalGeneration(
      (model): WhisperModel(
        (encoder): WhisperEncoder(
          (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
          (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
          (embed_positions): Embedding(1500, 1280)
          (layers): ModuleList(
            (0-31): 32 x WhisperEncoderLayer(
              (self_attn): WhisperSdpaAttention(
                (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
                (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
                (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
                (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
              )
              (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=Tru

# Тесты

In [2]:
import torch
import soundfile as sf
from datasets import load_dataset


# Take the first validation clip of the long-form LibriSpeech set as test input.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
sampling_rate = sample["sampling_rate"]

# Raw audio samples as a float32 tensor. No batch dimension is added: the
# processor is given the un-batched signal, so an unsqueeze/squeeze round trip
# would be a no-op.
audio_tensor = torch.tensor(sample["array"]).float()

# Convert the raw audio into the ASR model's input features.
inputs = cascade.asr.processor(audio_tensor, sampling_rate=sampling_rate, return_tensors="pt")

# NOTE: despite its name, `waveform` holds the processor's `input_features`,
# not raw audio. The name is kept because later cells pass `waveform` to
# `cascade.generate_from_asr`.
waveform = inputs["input_features"]

# Persist the same clip to disk so the file-based cells below can read it.
sf.write('audio.wav', sample['array'], sampling_rate)

2024-12-30 16:40:46 | INFO | datasets | PyTorch version 2.5.1 available.


In [3]:
# 1. Generate tokens from the ASR stage.
# `waveform` is the processor's `input_features` prepared in the data-loading
# cell; the recorded output of this call is a tensor of token IDs.
cascade.generate_from_asr(waveform)

2024-12-30 16:40:58 | INFO | speechgpt_logger | Generating text from ASR
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2024-12-30 16:41:51 | DEBUG | speechgpt_logger | Generated speech output, length of generated token

tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
          5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
          2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
            13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
           264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
           949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
          3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
         12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
           534, 10281,   934,   439,    11]])

In [4]:
# 2. Generate text from the ASR stage.
# Same input as the token cell, but with text=True; the recorded output is a
# list of transcript strings rather than a token tensor.
cascade.generate_from_asr(waveform, text=True)

2024-12-30 16:41:52 | INFO | speechgpt_logger | Generating text from ASR
2024-12-30 16:42:42 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


[" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]

In [5]:
# 3. Generate tokens from an audio file.
# 'audio.wav' is the clip written to disk by the data-loading cell.
cascade.generate_from_asr(file='audio.wav')

2024-12-30 16:42:42 | INFO | speechgpt_logger | Generating text from ASR
2024-12-30 16:42:42 | INFO | speechgpt_logger | Parsed waveform from file audio.wav
2024-12-30 16:43:32 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
          5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
          2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
            13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
           264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
           949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
          3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
         12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
           534, 10281,   934,   439,    11]])

In [6]:
# 4. Generate text from an audio file (file input combined with text=True).

cascade.generate_from_asr(file='audio.wav', text=True)

2024-12-30 16:43:32 | INFO | speechgpt_logger | Generating text from ASR
2024-12-30 16:43:32 | INFO | speechgpt_logger | Parsed waveform from file audio.wav
2024-12-30 16:44:22 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


[" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]

In [7]:
# 5. Forward pass through the cascade.
# The input features are recomputed from the raw sample array, and the target
# prompt is the ASR tokenizer's prefix tokens with a leading batch dimension.
in_features = cascade.asr.processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)["input_features"]
prompt_ids = torch.tensor([cascade.asr.processor.tokenizer.prefix_tokens])
cascade(src_tokens=in_features, tgt_tokens=prompt_ids)

2024-12-30 16:44:22 | DEBUG | speechgpt_logger | Running forward pass
2024-12-30 16:45:12 | DEBUG | speechgpt_logger | Forward pass complete, logits generated
2024-12-30 16:45:12 | DEBUG | speechgpt_logger | ASR output generated


(tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
            264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
           5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
           2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
             13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
            264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
            949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
           3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
          12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
            534, 10281,   934,   439,    11]]),
 None,
 None,
 None)

In [9]:
# 6. LLM response to the audio: transcribe 'audio.wav' with the ASR stage and
# let the LLM continue the transcript.
import torch

from speechgpt.logger import get_logger

logger = get_logger()

# Generation below samples (do_sample=True), so the text differs on every run
# unless the RNG is seeded. Presumably `cascade.generate` delegates to a
# torch-based sampler that draws from torch's global RNG -- confirm against the
# project code.
torch.manual_seed(0)

gen_texts = cascade.generate(logger, file='audio.wav', max_new_tokens=150, do_sample=True, top_k=50, top_p=0.95)

# Use a real loop name: `_` is IPython's "last output" variable, and assigning
# to it in a cell shadows that shortcut for the rest of the session.
for gen_text in gen_texts:
    print(gen_text)
    print("_________________________________________________")

2024-12-30 16:48:15 | INFO | speechgpt_logger | Parsed waveform from file audio.wav
2024-12-30 16:48:55 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1
2024-12-30 16:48:55 | INFO | speechgpt_logger | Generated asr_texts [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
2024-12-30 16:51:44 | DEBUG | speechgpt_logger | Generated text successfully


 Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, or whether the Christian's taste of it is a real one. If the former is true, he is a scholar of genius. But whether it is true that he is a real scholar, we shall have to wait another hundred years to see. On the other hand, a real Greek may have many ways of telling a simile of his own to his audience; and his audience, which is not so much a moral as a religious man, is not so eager to follow him at all. This he might do. For when the Gospel is in the hands of the ignorant he is like a man who is unable to tell a story of a horse but by using the word. No; as soon as we read any C