# Speech Recognition with Silero

This notebook uses the Silero speech-to-text models from the project [snakers4/silero-models](https://github.com/snakers4/silero-models/) to transcribe English, German, or Spanish audio, either recorded from the microphone or uploaded from a file.

For other deep-learning Colab notebooks, visit [tugstugi/dl-colab-notebooks](https://github.com/tugstugi/dl-colab-notebooks).

## Install snakers4/silero-models

In [None]:
#@title

import os
import sys
from os.path import exists, join, basename, splitext

# checkout the project
if not exists('silero-models'):
  !git clone -q --depth 1 https://github.com/snakers4/silero-models

  !pip install -q omegaconf torchaudio soundfile
  !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip
%cd silero-models

# silero imports
import torch
import random
from glob import glob
from omegaconf import OmegaConf
from utils import (init_jit_model, 
                   split_into_batches,
                   read_batch,
                   prepare_model_input)

# Model catalogue, loaded from 'models.yml' in the current working directory.
models = OmegaConf.load('models.yml')
# Device used for model loading and inference; any PyTorch device can be used here.
device = torch.device('cpu')

# imports for uploading/recording
import ipywidgets as widgets
from IPython.display import Audio, display, clear_output
from dl_colab_notebooks.audio import record_audio, upload_audio
import numpy as np
from scipy.io import wavfile

# wav to text method
def wav_to_text(f='test.wav'):
  """Run the currently selected model on the audio file `f`.

  Relies on the module-level `model`, `decoder` and `device`; `model` and
  `decoder` must already have been assigned before this is called.

  Args:
    f: path of the audio file to transcribe (defaults to 'test.wav').

  Returns:
    The result of `decoder` applied to the first model output.
  """
  wav_batch = read_batch([f])
  model_input = prepare_model_input(wav_batch, device=device)
  predictions = model(model_input)
  # Exactly one file was submitted, so only the first output is decoded.
  first_prediction = predictions[0]
  return decoder(first_prediction.cpu())

## Select Language

In [None]:
#@markdown { run: "auto" }

language = "English" #@param ["English", "German", "Spanish"]
print(language)

# Dropdown label -> key under models.stt_models. Any label not listed here
# falls back to the English model.
_LANGUAGE_KEYS = {'German': 'de', 'Spanish': 'es'}
_stt_entry = models.stt_models[_LANGUAGE_KEYS.get(language, 'en')]
model, decoder = init_jit_model(_stt_entry.latest.jit, device=device)

## Transcribe

In [None]:
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) { run: "auto" }

# Sample rate shared by recording/upload, the playback widget and the WAV file
# written for transcription.
SAMPLE_RATE = 16000
# Colab form fields: input mode, and the duration handed to record_audio
# (presumably seconds, per the name -- confirm against record_audio).
record_or_upload = "Record" #@param ["Record", "Upload (.mp3 or .wav)"]
record_seconds =   10#@param {type:"number", min:1, max:10, step:1}

def _recognize(audio):
  """Play back `audio`, save it as 'test.wav' and print its transcription.

  Args:
    audio: samples at SAMPLE_RATE. They are scaled by 32767 into 16-bit PCM,
      so this assumes a float NumPy array nominally in [-1, 1] -- TODO confirm
      against record_audio / upload_audio.
  """
  display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))

  # Clip before scaling: any sample outside [-1, 1] would otherwise exceed the
  # int16 range and be corrupted by the cast, producing loud artifacts in the
  # file that is transcribed.
  pcm = (32767 * np.clip(audio, -1.0, 1.0)).astype(np.int16)
  wavfile.write('test.wav', SAMPLE_RATE, pcm)

  # wav_to_text() reads 'test.wav' by default, i.e. the file written just above.
  transcription = wav_to_text()
  print('\n\nTRANSCRIPTION:\n')
  print(transcription)


def _record_audio(b):
  """Button callback: clear the cell output, record audio and transcribe it.

  `b` is the argument supplied by the click handler; it is not used.
  """
  clear_output()
  recorded = record_audio(record_seconds, sample_rate=SAMPLE_RATE)
  _recognize(recorded)
def _upload_audio(b):
  """Clear the cell output, obtain audio via upload_audio and transcribe it.

  `b` is accepted so the signature mirrors `_record_audio`; it is not used.
  """
  clear_output()
  uploaded = upload_audio(sample_rate=SAMPLE_RATE)
  _recognize(uploaded)

if record_or_upload != "Record":
  # Upload mode runs immediately; the callback ignores its argument, so a
  # placeholder value is passed.
  _upload_audio("")
else:
  # Record mode: show a button and start recording only when it is clicked.
  button = widgets.Button(description="Record Speech")
  button.on_click(_record_audio)
  display(button)