In [1]:
import os
import sys
import torch
import torchaudio
import soundfile as sf
from datasets import load_dataset
from IPython.display import Audio
from transformers import AutoProcessor

In [2]:
from omegaconf import OmegaConf
from model import HuggingFaceWhisperModel

# Minimal configuration: the only key set here is the checkpoint name
# under `asr_config`.
args = OmegaConf.create({"asr_config": "openai/whisper-large-v3-turbo"})

# Build the model wrapper; the second positional argument is passed as None.
model = HuggingFaceWhisperModel.build_model(args, None)

2024-12-30 16:11:21 | INFO | speechgpt_logger | Model loaded from openai/whisper-large-v3-turbo


In [3]:
# Load the test dataset and prepare the model inputs.

dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
sampling_rate = sample['sampling_rate']

# The processor is given the raw sample array directly (the forward-pass cell
# further down does the same), so no intermediate torch tensor with an extra
# batch dimension is needed.
inputs = model.processor(sample['array'], sampling_rate=sampling_rate, return_tensors="pt")

# NOTE: despite its name, `waveform` holds the processor's `input_features`,
# not raw audio samples. The name is kept because the following cells pass
# `waveform` to model.generate().
# NOTE(review): Whisper's feature extractor presumably pads/truncates the
# audio to a fixed 30 s window by default, which would explain why the
# transcription of this long sample stops mid-sentence -- confirm.
waveform = inputs['input_features']

# Save the raw audio to disk so the file-based generate() cells can read it.
sf.write('audio.wav', sample['array'], sampling_rate)

README.md:   0%|          | 0.00/480 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


(…)-00000-of-00001-913508124a40cb97.parquet:   0%|          | 0.00/1.98M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/1 [00:00<?, ? examples/s]

In [4]:
# Generate tokens.
# `waveform` here is the processor's `input_features` assigned in the
# data-loading cell; the cell output is a tensor of token ids.

model.generate(waveform)

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
2024-12-30 16:12:25 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
          5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
          2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
            13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
           264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
           949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
          3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
         12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
           534, 10281,   934,   439,    11]])

In [5]:
# Generate text.
# Same input as the token-generation cell, but with text=True the cell output
# is a list of strings instead of a tensor of token ids.

model.generate(waveform, text=True)

2024-12-30 16:13:13 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


[" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]

In [6]:
# Forward pass.

# Convert the raw audio sample into the processor's input features.
in_features = model.processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)["input_features"]

# Wrapping the tokenizer's prefix tokens in an outer list yields a batch of one.
prompt_ids = torch.tensor([model.processor.tokenizer.prefix_tokens])

# Calling the model directly; the displayed result is a 4-tuple.
model(src_tokens=in_features, tgt_tokens=prompt_ids)

2024-12-30 16:13:53 | DEBUG | speechgpt_logger | Forward pass complete, logits generated


(tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
            264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
           5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
           2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
             13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
            264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
            949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
           3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
          12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
            534, 10281,   934,   439,    11]]),
 None,
 None,
 None)

In [7]:
# Generate text from an audio file.
# 'audio.wav' is the file written by sf.write(...) in the data-loading cell.

model.generate(file='audio.wav', text=True)

2024-12-30 16:13:54 | INFO | speechgpt_logger | Parsed waveform from file audio.wav
2024-12-30 16:14:34 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


[" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all,"]

In [8]:
# Generate tokens from an audio file.
# 'audio.wav' is the file written by sf.write(...) in the data-loading cell;
# without text=True the cell output is a tensor of token ids.

model.generate(file='audio.wav')

2024-12-30 16:14:34 | INFO | speechgpt_logger | Parsed waveform from file audio.wav
2024-12-30 16:15:15 | DEBUG | speechgpt_logger | Generated speech output, length of generated tokens:1


tensor([[50258, 50259, 50360, 50364,  2221,    13,  2326,   388,   391,   307,
           264, 50244,   295,   264,  2808,  5359,    11,   293,   321,   366,
          5404,   281,  2928,   702, 14943,    13,  6966,   307,  2221,    13,
          2326,   388,   391,   311,  9060,  1570,  1880,   813,   702,  1871,
            13,   634,  5112,   505,   300,   412,   341, 42729,  3196,   295,
           264,  1064,    11,   365,  5272,   293, 12904,  9256,   450, 10539,
           949,   505,    11,  1034,  4680, 10117,   490,  3936,   293,  1080,
          3542,  5160,   881, 26336,   281,   264,  1575,    13,   634,   575,
         12525, 22618,  1968,  6144, 35617, 20084,  1756,   311,   589,   307,
           534, 10281,   934,   439,    11]])