<a href="https://colab.research.google.com/github/visiont3lab/DeepLearning/blob/main/colab/DeepSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DeepSpeech Speech To Text

* [DeepSpeech italian model](https://github.com/MozillaItalia/DeepSpeech-Italian-Model)
* [DeepSpeech](https://github.com/mozilla/DeepSpeech)
* [Italian model releases](https://github.com/MozillaItalia/DeepSpeech-Italian-Model/releases)
* [DeepSpeech Docs](https://deepspeech.readthedocs.io/en/r0.9/)
* [Android Speech App](https://github.com/mozilla/androidspeech)

```
# English Model
!wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.pbmm
!wget https://github.com/mozilla/DeepSpeech/releases/download/v0.9.3/deepspeech-0.9.3-models.scorer
```




## DeepSpeech Italian Model

In [None]:
# Installa DeepSpeech
!pip install deepspeech==0.9.3 # deepspeech-gpu deepspeech-tflite

In [54]:
from IPython.display import clear_output
from deepspeech import Model
import numpy as np
import os
import wave
from IPython.display import Audio

In [None]:
# Setup and download utils
!mkdir -p italian/only italian/transfer italian/only_tflite italian/transfer_tflite

# Allenato con dizionario solo italiano
!curl -LO https://github.com/MozillaItalia/DeepSpeech-Italian-Model/releases/download/2020.08.07/model_tensorflow_it.tar.xz
!tar xvf transfer_model_tensorflow_it.tar.xz -C italian/only

# TFLite solo italiano
!curl -LO https://github.com/MozillaItalia/DeepSpeech-Italian-Model/releases/download/2020.08.07/model_tflite_it.tar.xz
!tar xvf model_tflite_it.tar.xz -C italian/only_tflite

# Transfer learning
!curl -LO https://github.com/MozillaItalia/DeepSpeech-Italian-Model/releases/download/2020.08.07/transfer_model_tensorflow_it.tar.xz
!tar xvf transfer_model_tensorflow_it.tar.xz -C italian/transfer

# TFLite Transfer learning 
!curl -LO https://github.com/MozillaItalia/DeepSpeech-Italian-Model/releases/download/2020.08.07/transfer_model_tflite_it.tar.xz
!tar xvf transfer_model_tflite_it.tar.xz -C italian/transfer_tflite

# estrai un sample a caso dal dataset cv_tiny
!wget -c https://github.com/MozillaItalia/DeepSpeech-Italian-Model/files/4610711/cv-it_tiny.tar.gz -O - | tar -xz common_voice_it_19574474.wav
!mv common_voice_it_19574474.wav italian/

In [120]:
# Directory holding the extracted model files; swap the two lines below to use
# the Italian-only model instead of the transfer-learning one.
path = "italian/transfer" 
#path = "italian/only"
model_file_path = f'{path}/output_graph.pbmm'
lm_file_path = f'{path}/scorer'
# Weights handed to setScorerAlphaBeta for the external scorer.
lm_alpha = 0.93
lm_beta = 1.18
# beam_width was passed to setBeamWidth without being defined anywhere in this notebook,
# so the cell failed with NameError on a fresh kernel.
# NOTE(review): 500 is assumed to match the value the model was exported with — confirm.
beam_width = 500

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)

model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)

0

In [121]:
def read_wav_file(filename):
    """Read all audio frames from a WAV file.

    Prints the frame rate, the frame count and the byte length of the data
    that was read.

    Args:
        filename: path (or file object) accepted by ``wave.open``.

    Returns:
        A ``(buffer, rate)`` tuple: the raw frame bytes and the frame rate.
    """
    with wave.open(filename, 'rb') as wav_in:
        sample_rate = wav_in.getframerate()
        n_frames = wav_in.getnframes()
        pcm_bytes = wav_in.readframes(n_frames)
        report = (
            ("Rate:", sample_rate),
            ("Frames:", n_frames),
            ("Buffer Len:", len(pcm_bytes)),
        )
        for label, value in report:
            print(label, value)

    return pcm_bytes, sample_rate

def transcribe_batch(model,audio_file):
    """Transcribe a whole WAV file with a single ``model.stt`` call.

    Args:
        model: object exposing ``stt(samples)``.
        audio_file: path of the WAV file, read through ``read_wav_file``.

    Returns:
        Whatever ``model.stt`` returns for the decoded samples.
    """
    pcm_bytes, _rate = read_wav_file(audio_file)
    # Reinterpret the raw bytes as 16-bit integer samples.
    samples = np.frombuffer(pcm_bytes, dtype=np.int16)
    transcript = model.stt(samples)
    return transcript

def transcribe_streaming(model,audio_file):
    """Transcribe a WAV file chunk by chunk, showing the transcript as it grows.

    The file is fed to a streaming session in fixed-size chunks. After each
    chunk the cell output is cleared and the current intermediate transcript is
    printed. Once all audio has been fed, the stream is finalised and the final
    transcript is printed in place of the last intermediate one.

    Args:
        model: object exposing ``createStream()``.
        audio_file: path of the WAV file, read through ``read_wav_file``.

    Returns:
        True once the whole file has been processed.
    """
    # Read the audio first so that no stream is left open if the file is unreadable.
    buffer, _rate = read_wav_file(audio_file)
    stream = model.createStream()
    # Chunk size in bytes; it is even, so every chunk cut from an even-length
    # buffer holds whole int16 samples.
    batch_size = 8196

    for offset in range(0, len(buffer), batch_size):
        chunk = buffer[offset:offset + batch_size]
        data16 = np.frombuffer(chunk, dtype=np.int16)

        stream.feedAudioContent(data16)
        text = stream.intermediateDecode()
        clear_output(wait=True)
        print(text)

    # Finalise the session: this yields the final decoding and ends the stream,
    # which the original loop never did.
    final_text = stream.finishStream()
    clear_output(wait=True)
    print(final_text)
    return True

In [122]:
# Transcribe (audio must be MONO, WAV format, sampled at 16000 Hz).
# NOTE: italian/test.wav is written by the record(...) call in the recording cell
# further down in this notebook, so that cell has to be run before this one.
# The commented lines are alternative invocations (bundled sample clip, or the
# Python helpers transcribe_streaming / transcribe_batch).
#!deepspeech --model italian/transfer/output_graph.pbmm --scorer italian/transfer/scorer --audio italian/common_voice_it_19574474.wav
!deepspeech --model italian/transfer/output_graph.pbmm --scorer italian/transfer/scorer --audio italian/test.wav
#transcribe_streaming(model,"italian/common_voice_it_19574474.wav")
#transcribe_streaming(model,"italian/test.wav")
#transcribe_batch(model,"italian/common_voice_it_19574474.wav")
#transcribe_batch(model,"italian/test.wav")

Loading model from file italian/transfer/output_graph.pbmm
TensorFlow: v2.3.0-6-g23ad988
DeepSpeech: v0.9.3-0-gf2e9c85
2021-05-07 22:36:58.285303: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
Loaded model in 0.0139s.
Loading scorer from files italian/transfer/scorer
Loaded scorer in 0.000352s.
Running inference.
alze il volume spegni a luce on ducati o parti alze il volume ancora ancora alze il volume a basse il volume ma fattura città mappa moto
Inference took 15.641s for 24.060s audio file.


## Record Audio from Google Colab

In [110]:
# https://gist.github.com/korakot/c21c3476c024ad6d56d5f48b0bca92be
# all imports
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
from io import BytesIO
import soundfile as sf
import librosa

# JavaScript source injected into the notebook output by record() below.
# It defines three browser-side helpers:
#   * sleep(time)   - promise that resolves after `time` milliseconds.
#   * b2text(blob)  - promise that resolves with the blob read as a data URL.
#   * record(time)  - asks for microphone access, starts a MediaRecorder, shows a
#                     per-second countdown inside the "#output-area" element for
#                     (time/1000)-1 iterations, then stops the recorder and
#                     resolves with the recorded data as a data-URL string.
# The text is executed as-is in the browser, so it must not be edited casually.
RECORD = """
const sleep = time => new Promise(resolve => {
setTimeout(resolve, time)
}, )
const b2text = blob => new Promise(resolve => {
const reader = new FileReader()
reader.onloadend = e => resolve(e.srcElement.result)
reader.readAsDataURL(blob)
})
var espacio = document.querySelector("#output-area")
var record = time => new Promise(async resolve => {
stream = await navigator.mediaDevices.getUserMedia({ audio: true })
recorder = new MediaRecorder(stream)
chunks = []
recorder.ondataavailable = e => chunks.push(e.data)
recorder.start()
var numerillo = (time/1000)-1
for (var i = 0; i < numerillo; i++) {
espacio.appendChild(document.createTextNode(numerillo-i))
await sleep(1000)
espacio.removeChild(espacio.lastChild)
}
recorder.onstop = async ()=>{
blob = new Blob(chunks)
text = await b2text(blob)
resolve(text)
}
recorder.stop()
})
"""


def record(path,sec=5):
  """Record audio in the browser and save it to ``path`` as 16 kHz 16-bit PCM.

  The RECORD JavaScript is injected into the cell output and its ``record``
  function is called with ``sec * 1000`` milliseconds. The returned data URL is
  base64-decoded and written to ``path``. The file is then reloaded at
  16000 Hz and rewritten in place.

  Args:
      path: destination file; it is written twice (raw browser recording first,
          then the resampled audio).
      sec: value passed to the JavaScript recorder, in seconds.
  """
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  # The data URL has the form "<header>,<base64 payload>"; keep the payload only.
  b = b64decode(s.split(',')[1])
  with open(path,'wb') as f:
    f.write(b)
  # Resample while loading: this avoids librosa.resample, whose sample-rate
  # arguments are keyword-only in newer librosa releases, so the former
  # positional call librosa.resample(ad, sr, 16000) fails there.
  ad,sr = librosa.load(path, sr=16000)
  # Write 16-bit PCM explicitly, since the transcription helpers read int16 samples.
  sf.write(path, ad, sr, subtype='PCM_16')

# Record from the microphone into italian/test.wav (the file the transcription cell reads).
# record() is called with sec=25; the countdown loop in RECORD runs for sec-1 one-second
# sleeps before the recorder is stopped.
path = "italian/test.wav"
record(path,25)
# Last expression of the cell: displays the recorded file via IPython's Audio object.
Audio(path)

<IPython.core.display.Javascript object>

