### Whisper

In [5]:
import whisper

def test_whisper(model_name, speech_numbers=range(1, 14),
                 path_template="./data/speech_{}.wav"):
    """Transcribe the numbered sample recordings with one Whisper checkpoint.

    The checkpoint is loaded once and reused; the recognized text of every
    recording is printed, one line per file. Nothing is returned.

    Args:
        model_name: Checkpoint name handed to ``whisper.load_model``.
        speech_numbers: Iterable of sample indices to transcribe. Defaults to
            1..13, the range that used to be hard-coded here.
        path_template: ``str.format`` template that receives one sample index
            and produces the path of the audio file to transcribe.
    """
    model = whisper.load_model(model_name)

    for speech_number in speech_numbers:
        filepath = path_template.format(speech_number)
        # The language is passed explicitly as "ru" and half-precision
        # inference is switched off via fp16=False.
        result = model.transcribe(filepath, language="ru", fp16=False)
        print(result["text"])

In [None]:
# Run the transcription loop with the "base" Whisper checkpoint.
test_whisper("base")

In [None]:
# Run the transcription loop with the "small" Whisper checkpoint.
test_whisper("small")

In [None]:
# Run the transcription loop with the "medium" Whisper checkpoint.
test_whisper("medium")

### Vosk

In [3]:
import wave
import sys

from vosk import Model, KaldiRecognizer, SetLogLevel

# You can set log level to -1 to disable debug messages
SetLogLevel(0)

filepath = "./data/speech_1_16bit.wav"

# Open the recording through a context manager so the file handle is closed
# even when validation exits early or recognition raises part-way through.
with wave.open(filepath, "rb") as wf:
    # Only mono, 2-byte-per-sample, uncompressed WAV data is accepted.
    if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
        print("Audio file must be WAV format mono PCM.")
        sys.exit(1)

    model_name = "vosk-model-small-ru-0.22"
    model = Model(model_name=model_name, lang="ru")

    # The recognizer is created with the sample rate read from the file.
    rec = KaldiRecognizer(model, wf.getframerate())
    rec.SetWords(True)
    rec.SetPartialWords(True)

    # Feed the audio in chunks of 4000 frames until the file is exhausted.
    while True:
        data = wf.readframes(4000)
        if not data:
            break
        # AcceptWaveform's return value selects which of the two result
        # accessors is printed for this chunk.
        if rec.AcceptWaveform(data):
            print(rec.Result())
        else:
            print(rec.PartialResult())

    # Print whatever the recognizer reports once all audio has been fed.
    print(rec.FinalResult())


### Silero models

In [None]:
import torch
import zipfile
import torchaudio
from glob import glob

device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                       model='silero_stt',
                                       language='en', # also available 'de', 'es'
                                       device=device)
(read_batch, split_into_batches,
 read_audio, prepare_model_input) = utils  # see function signature for details

# download a single file in any format compatible with TorchAudio
torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
                               dst='speech_orig.wav', progress=True)
test_files = glob('speech_orig.wav')
batches = split_into_batches(test_files, batch_size=10)

# Walk every batch rather than only batches[0], so no file is silently skipped
# if test_files ever grows beyond a single batch.
for batch in batches:
    # Deliberately not named `input`: rebinding that name at notebook top level
    # would hide the builtin input() for every later cell.
    model_input = prepare_model_input(read_batch(batch), device=device)

    output = model(model_input)
    for example in output:
        # Each output item is moved to the CPU before it is decoded and printed.
        print(decoder(example.cpu()))

### Nvidia

In [None]:
import nemo.collections.asr as nemo_asr
# Recording to transcribe in this cell.
filepath = "./data/speech_1_16bit.wav"

# Load the pretrained checkpoint by its published name.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/stt_ru_conformer_transducer_large"
)

# transcribe() is called with a one-element list; the bare trailing expression
# makes the notebook display the first entry of what it returns.
transcripts = asr_model.transcribe([filepath])
transcripts[0]

### wav2vec2-large-xlsr-53-russian

In [None]:
from huggingsound import SpeechRecognitionModel

# Files to transcribe; transcribe() is called with this list below.
audio_paths = ["./data/speech_5_16bit.wav"]

# Build the recognizer from the checkpoint identifier.
model = SpeechRecognitionModel(
    "jonatasgrosman/wav2vec2-large-xlsr-53-russian"
)

transcriptions = model.transcribe(audio_paths)

# Print the "transcription" field of the first (and here only) result.
first_result = transcriptions[0]
print(first_result["transcription"])

### Silero models (snakers4), Russian

In [None]:
import torch
import zipfile
import torchaudio
from glob import glob

device = torch.device('cpu')  # gpu also works, but our models are fast enough for CPU
# NOTE(review): the inline comment below lists only 'de' and 'es' as alternatives;
# confirm that the hub repo actually publishes a 'ru' STT checkpoint.
model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                       model='silero_stt',
                                       language='ru', # also available 'de', 'es'
                                       device=device)
(read_batch, split_into_batches,
 read_audio, prepare_model_input) = utils  # see function signature for details

# download a single file in any format compatible with TorchAudio
# NOTE(review): nothing in this cell reads speech_orig.wav afterwards; the
# download is kept only to preserve the cell's existing side effect.
torch.hub.download_url_to_file('https://opus-codec.org/static/examples/samples/speech_orig.wav',
                               dst='speech_orig.wav', progress=True)
filename = "./data/speech_1.wav"
test_files = glob(filename)
# glob() returns an empty list for a missing path; fail with a clear message
# instead of an IndexError on an empty batch list further down.
if not test_files:
    raise FileNotFoundError(f"No audio file matches {filename!r}")
batches = split_into_batches(test_files, batch_size=10)

# Walk every batch rather than only batches[0], so no file is silently skipped
# if the glob pattern ever matches more than one batch of files.
for batch in batches:
    # Deliberately not named `input`: rebinding that name at notebook top level
    # would hide the builtin input() for every later cell.
    model_input = prepare_model_input(read_batch(batch), device=device)

    output = model(model_input)
    for example in output:
        # Each output item is moved to the CPU before it is decoded and printed.
        print(decoder(example.cpu()))

### NVIDIA NeMo vs. Whisper small

In [None]:
import whisper

# Load the "small" checkpoint once and reuse it for every recording.
model = whisper.load_model("small")

# Samples are numbered 1 through 13; this cell reads the *_pcm16.wav variants.
sample_paths = [f"data/speech_{speech_number}_pcm16.wav" for speech_number in range(1, 14)]

for filepath in sample_paths:
    result = model.transcribe(filepath, language="ru", fp16=False)
    # Only the "text" field of the result is shown.
    recognized_text = result["text"]
    print(recognized_text)

In [None]:
import nemo.collections.asr as nemo_asr
# Load the pretrained checkpoint by its published name.
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
    "nvidia/stt_ru_conformer_transducer_large"
)

# Samples are numbered 1 through 13; this cell reads the *_mono.wav variants.
sample_paths = [f"data/speech_{speech_number}_mono.wav" for speech_number in range(1, 14)]

for filepath in sample_paths:
    # One file per call; the full return value of transcribe() is printed as-is.
    result = asr_model.transcribe([filepath])
    print(result)