# Kalmyk Speech Recognition with Wav2Vec2-XLSR

This notebook uses a [Wav2Vec2 XLSR](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) model, fine-tuned with HuggingFace Transformers on a Kalmyk Bible dataset, to transcribe Kalmyk speech.

Click on "Runtime -> Run All". Once the cells have finished running, you can either record your voice or upload an audio file (.mp3 or .wav) to transcribe.

For other deep-learning Colab notebooks, visit [tugstugi/dl-colab-notebooks](https://github.com/tugstugi/dl-colab-notebooks).

## Installation
* install HuggingFace Transformers and the other dependencies
* download a trained model

In [None]:
#@title

import os
import sys
from os.path import exists, join, basename, splitext

# download model and install dependencies
if not exists('wav2vec2-large-xlsr-53-kalmyk'):
  !touch wav2vec2-large-xlsr-53-kalmyk
  !pip install -q git+https://github.com/huggingface/transformers.git
  !pip install -q gdown librosa
  !pip install -q https://github.com/tugstugi/dl-colab-notebooks/archive/colab_utils.zip

  from transformers import Wav2Vec2Processor
  from transformers import Wav2Vec2ForCTC
  # load the model
  model = Wav2Vec2ForCTC.from_pretrained("tugstugi/wav2vec2-large-xlsr-53-kalmyk").to("cuda")
  processor = Wav2Vec2Processor.from_pretrained("tugstugi/wav2vec2-large-xlsr-53-kalmyk")

import torch
import librosa

# imports for uploading/recording
import ipywidgets as widgets
from IPython.display import Audio, display, clear_output
from dl_colab_notebooks.audio import record_audio, upload_audio
import numpy as np
from scipy.io import wavfile

SAMPLE_RATE = 16000  # Hz; every audio file is resampled to this rate on load


def wav_to_text(f='test.wav'):
  """Transcribe an audio file with the loaded Wav2Vec2 model.

  Args:
    f: path of the audio file to transcribe. librosa resamples it to
      SAMPLE_RATE while loading.

  Returns:
    The transcription produced by `processor.decode`.
  """
  # Honour the caller's path (it used to be hard-coded to 'test.wav').
  audio, _ = librosa.load(f, sr=SAMPLE_RATE)
  # Tell the processor which rate the samples are at, so that a mismatch with
  # the model's expected rate is reported instead of silently mis-decoded.
  input_dict = processor(torch.tensor(audio), sampling_rate=SAMPLE_RATE,
                         return_tensors="pt", padding=True)
  # Inference only: no need to record gradients.
  with torch.no_grad():
    # Move the input to whichever device the model lives on.
    logits = model(input_dict.input_values.to(model.device)).logits
  # Highest-scoring token id along the last dimension; [0] takes the first
  # item of the batch (only one file is processed per call).
  pred_ids = torch.argmax(logits, dim=-1)[0]
  return processor.decode(pred_ids)

## Transcribe

In [None]:
#@markdown * Either record audio from microphone or upload audio from file (.mp3 or .wav) { run: "auto" }

# Form fields (the trailing `#@param` annotations are read by Colab to render
# the inputs). `record_or_upload` selects which branch runs at the bottom of
# this cell; `record_seconds` is passed to `record_audio` as the duration.
record_or_upload = "Record" #@param ["Record", "Upload (.mp3 or .wav)"]
record_seconds =   10#@param {type:"number", min:1, max:10, step:1}

def _recognize(audio):
  """Play back `audio`, save it as test.wav and print its transcription.

  Args:
    audio: samples at SAMPLE_RATE. The int16 conversion below treats them as
      floats in [-1.0, 1.0] (inferred from the 32767 scaling; confirm against
      record_audio/upload_audio).
  """
  display(Audio(audio, rate=SAMPLE_RATE, autoplay=True))
  # Clip before scaling: a sample outside [-1, 1] would exceed the int16 range
  # and come out as a garbage value after the cast.
  pcm = (32767 * np.clip(audio, -1.0, 1.0)).astype(np.int16)
  # wavfile.write overwrites an existing file, so no prior `rm` is needed.
  wavfile.write('test.wav', rate=SAMPLE_RATE, data=pcm)

  transcription = wav_to_text('test.wav')
  print('\n\nTRANSCRIPTION:\n')
  print(transcription)


def _record_audio(b):
  """Button callback: clear the output, record from the microphone, transcribe.

  Args:
    b: the value handed over by the click handler; not used.
  """
  clear_output()
  _recognize(record_audio(record_seconds, sample_rate=SAMPLE_RATE))
def _upload_audio(b):
  """Clear the output, ask for an audio file upload and transcribe it.

  Args:
    b: placeholder so the signature matches `_record_audio`; not used.
  """
  clear_output()
  _recognize(upload_audio(sample_rate=SAMPLE_RATE))

# Upload mode starts right away; record mode shows a button and waits for a
# click before recording.
if record_or_upload != "Record":
  _upload_audio("")
else:
  button = widgets.Button(description="Record Speech")
  button.on_click(_record_audio)
  display(button)