# Cloning SOUNDOFAIOSR speech to text

In [1]:
!git clone https://github.com/TheSoundOfAIOSR/rg_speech_to_text

Cloning into 'rg_speech_to_text'...
remote: Enumerating objects: 793, done.[K
remote: Counting objects: 100% (325/325), done.[K
remote: Compressing objects: 100% (231/231), done.[K
remote: Total 793 (delta 154), reused 244 (delta 87), pack-reused 468[K
Receiving objects: 100% (793/793), 14.98 MiB | 31.62 MiB/s, done.
Resolving deltas: 100% (399/399), done.


In Google Colab we don't have sox pre-installed, so we have to install it first; next, we install torchaudio and WavAugment.

In [2]:
!apt-get install libsox-fmt-all libsox-dev sox > /dev/null
!python -m pip install torchaudio > /dev/null
!python -m pip install git+https://github.com/facebookresearch/WavAugment.git > /dev/null

  Running command git clone -q https://github.com/facebookresearch/WavAugment.git /tmp/pip-req-build-0kxy4vjb


# Installing Dependencies

In [3]:
%%capture
!pip install transformers
!pip install torchaudio
!pip install librosa
!pip install jiwer

In [4]:
import re
import json
import os
import random

import pandas as pd
import numpy as np

import torch
import torchaudio

import augment
import librosa

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import get_scheduler
from transformers import AdamW

import IPython.display as ipd
from IPython.core.display import display
from tqdm.auto import tqdm

from functools import partial

from jiwer import wer

### Using Device

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("USING DEVICE: ", device)

USING DEVICE:  cuda


# Loading and preprocessing data

In [6]:
csv_path = "rg_speech_to_text/data/finetuning-dataset/finetuning-dataset.csv"
audio_path = "rg_speech_to_text/data/finetuning-dataset/audiofiles"

In [7]:
train_data = pd.read_csv(csv_path, sep="|", header=None)
train_data.head()

Unnamed: 0,0,1
0,0,I would like a sharp cello
1,1,Give me a dry acoustic guitar
2,2,Give me a metal harp
3,3,Give me a dirty organ
4,4,Give me a hollow piano


converting digits into words

In [10]:
def change_digit_to_word(x):
  """Spell out every ASCII digit in ``x`` as an English word.

  Each digit is replaced individually ("909" -> "nine zero nine"). The word
  is padded with a space on both sides so that a digit glued to a letter is
  still separated ("TR808" -> "TR eight zero eight"); runs of spaces are then
  collapsed to a single space and the result is stripped.

  Args:
    x: input text.

  Returns:
    The text with digits spelled out and no leading/trailing whitespace.
  """
  digit_words = ("zero", "one", "two", "three", "four",
                 "five", "six", "seven", "eight", "nine")
  # [0-9] rather than \d: only the ten ASCII digits have a word in the table.
  spelled = re.sub(r"[0-9]", lambda m: f" {digit_words[int(m.group())]} ", x)
  # Collapse every run of spaces, not just one pair per position.
  return re.sub(r" {2,}", " ", spelled).strip()

In [11]:
change_digit_to_word("Give me 909")

'Give me nine zero nine'

In [12]:
train_data.iloc[:, 1] = train_data.iloc[:, 1].map(change_digit_to_word)

In [13]:
train_data.head()

Unnamed: 0,0,1
0,0,I would like a sharp cello
1,1,Give me a dry acoustic guitar
2,2,Give me a metal harp
3,3,Give me a dirty organ
4,4,Give me a hollow piano


Removing any special characters from transcriptions.

No language model is used to account for these special symbols, and they have no sound for speakers to pronounce, so we remove them. For uniformity, we also convert the texts to uppercase.

In [14]:
# Character class of punctuation / quote characters stripped from transcriptions.
# Raw string: the backslashes are regex escapes, not (invalid) Python string escapes.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(text):
    """Normalize a transcription for character-level training.

    Removes every character matched by ``chars_to_ignore_regex``, upper-cases
    the remainder and appends a single trailing space.

    Args:
        text: raw transcription string.

    Returns:
        The cleaned, upper-cased transcription followed by one space.
    """
    return re.sub(chars_to_ignore_regex, '', text).upper() + " "

In [15]:
train_data.iloc[:, 1] = train_data.iloc[:, 1].map(remove_special_characters)

In [16]:
train_data.head()

Unnamed: 0,0,1
0,0,I WOULD LIKE A SHARP CELLO
1,1,GIVE ME A DRY ACOUSTIC GUITAR
2,2,GIVE ME A METAL HARP
3,3,GIVE ME A DIRTY ORGAN
4,4,GIVE ME A HOLLOW PIANO


### Creating vocab list from data

- Added `["<pad>", "<s>", "</s>", "<unk>"]` as the padding, start-of-sequence, end-of-sequence and unknown tokens.
- We compare it with the pretrained model's vocab list.
- We create a new vocab list that contains the union of both lists.
- `"|"` stands for `" "` (space), i.e. the word delimiter.

In [17]:
def extract_all_chars(texts):
  """Build a character-to-id vocabulary from a collection of transcriptions.

  Ids 0-3 are reserved for the special tokens ``<pad>``, ``<s>``, ``</s>``
  and ``<unk>``. The distinct characters of the space-joined texts follow in
  sorted order; the space character is stored under the key ``"|"``.

  Args:
    texts: iterable of transcription strings.

  Returns:
    dict mapping each token/character to its integer id.
  """
  special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
  first_char_id = len(special_tokens)
  vocab = dict(zip(special_tokens, range(first_char_id)))
  distinct_chars = sorted(set(" ".join(texts)))
  for char_id, ch in enumerate(distinct_chars, start=first_char_id):
    token = "|" if ch == " " else ch
    vocab[token] = char_id
  return vocab

In [18]:
vocab_train = extract_all_chars(train_data.iloc[:, 1].values)
print(vocab_train)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'A': 5, 'B': 6, 'C': 7, 'D': 8, 'E': 9, 'F': 10, 'G': 11, 'H': 12, 'I': 13, 'K': 14, 'L': 15, 'M': 16, 'N': 17, 'O': 18, 'P': 19, 'Q': 20, 'R': 21, 'S': 22, 'T': 23, 'U': 24, 'V': 25, 'W': 26, 'Y': 27, 'Z': 28}


#### pretrained vocab list

In [19]:
pre_vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}

In [20]:
print(len(vocab_train))
print(len(pre_vocab))

29
32


creating sets from these vocab letters

In [21]:
set_vocab = set([k for k, v in vocab_train.items()])
set_pre_vocab = set([k for k, v in pre_vocab.items()])

elements in vocab set but not in pretrained model vocab list

In [22]:
set_vocab.difference(set_pre_vocab)

set()

these characters are not currently in our dataset

In [23]:
set_pre_vocab.difference(set_vocab)

{"'", 'J', 'X'}

creating a new vocab list by adding elements not in pretrained model vocab list

In [24]:
# Append every dataset character missing from the pretrained vocabulary.
# len(pre_vocab) grows by one per insertion, so it is already the next free id
# (assuming the existing ids are 0..len-1); adding the loop index on top of it
# would skip ids. sorted() makes the assignment order deterministic.
for elem in sorted(set_vocab.difference(set_pre_vocab)):
  pre_vocab[elem] = len(pre_vocab)

In [25]:
pre_vocab

{"'": 27,
 '</s>': 2,
 '<pad>': 0,
 '<s>': 1,
 '<unk>': 3,
 'A': 7,
 'B': 24,
 'C': 19,
 'D': 14,
 'E': 5,
 'F': 20,
 'G': 21,
 'H': 11,
 'I': 10,
 'J': 29,
 'K': 26,
 'L': 15,
 'M': 17,
 'N': 9,
 'O': 8,
 'P': 23,
 'Q': 30,
 'R': 13,
 'S': 12,
 'T': 6,
 'U': 16,
 'V': 25,
 'W': 18,
 'X': 28,
 'Y': 22,
 'Z': 31,
 '|': 4}

In [26]:
len(pre_vocab)

32

Let's now save the vocabulary as a json file.

In [27]:
with open('vocab.json', 'w') as vocab_file:
    json.dump(pre_vocab, vocab_file)

Creating CSV file with audio path with their respective transcription.

In [28]:
audio_files = os.listdir(audio_path)

In [29]:
len(audio_files)

116

In [30]:
def create_dataset_csv(texts, audio_files, audio_dir=None):
  """Pair each audio file with its transcription in a DataFrame.

  The transcription index is parsed from the file name: the text between the
  first "-" and the following "." is read as an integer and used to index
  ``texts`` (e.g. a name of the form ``<prefix>-12.<ext>`` maps to ``texts[12]``).

  Args:
    texts: indexable sequence of transcriptions.
    audio_files: iterable of audio file names (not full paths).
    audio_dir: directory prepended to every file name. Defaults to the
      notebook-level ``audio_path`` so existing two-argument calls still work.

  Returns:
    DataFrame with columns ``id`` (position in ``audio_files``), ``path``
    and ``sentence``.

  Raises:
    IndexError / ValueError: if a file name does not follow the expected
      ``<prefix>-<int>.<ext>`` pattern.
  """
  if audio_dir is None:
    audio_dir = audio_path
  records = []
  for i, af in enumerate(audio_files):
    idx = int(af.split("-")[1].split(".")[0])
    records.append({
        "id": i,
        "path": os.path.join(audio_dir, af),
        "sentence": texts[idx],
    })
  # Explicit columns keep the schema stable even when audio_files is empty.
  return pd.DataFrame(records, columns=["id", "path", "sentence"])

In [31]:
train_df = create_dataset_csv(train_data.iloc[:, 1].values, audio_files)
train_df.head()

Unnamed: 0,id,path,sentence
0,0,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME A SIMPLE SQUARE BASS
1,1,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME AN ORCHESTRAL STRING
2,2,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME A HARD DRUM
3,3,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME A SMOOTH OPERATOR
4,4,rg_speech_to_text/data/finetuning-dataset/audi...,A LOUD PIANO AND A FLUTE PLEASE


Let's listen to a few random samples from the dataset and check that the transcriptions match the audio.

In [32]:
num_samples = 5

# Play a few randomly chosen clips next to their transcription as a sanity check.
for n in range(num_samples):
  rand_int = random.randint(0, len(train_df)-1)
  sample_row = train_df.iloc[rand_int]
  speech_array, sampling_rate = torchaudio.load(sample_row["path"])
  speech = speech_array[0].numpy()  # first channel only
  print("Sample ",n+1)
  # Use the rate reported by the file instead of assuming 16 kHz, otherwise a
  # clip recorded at another rate is played back at the wrong speed/pitch.
  display(ipd.Audio(data=speech, autoplay=False, rate=sampling_rate))
  print(sample_row["sentence"])
  print(speech.shape)

Sample  1


GIVE ME A DIRTY ORGAN 
(62450,)
Sample  2


GIVE ME A DIRTY ORGAN 
(62450,)
Sample  3


GIVE ME A DRY ACOUSTIC GUITAR 
(62450,)
Sample  4


GET ME A SOFT MALE VOICE 
(62450,)
Sample  5


GIVE ME A SIMPLE SQUARE BASS 
(62450,)


# Speech Augmentations

In [71]:
class SpeechTransform:
  """Base class for augmentations that fire with probability ``p``.

  Subclasses implement ``apply``; calling the instance either returns
  ``apply(y)`` or the input unchanged, decided by one ``np.random.rand()``
  draw per call.
  """

  def __init__(self, p=0.5):
    self.p = p

  def __call__(self, y):
    # One uniform draw in [0, 1) per call decides whether to augment.
    should_apply = np.random.rand() < self.p
    return self.apply(y) if should_apply else y

  def apply(self, y: np.ndarray):
    """Transform ``y``; must be overridden by subclasses."""
    raise NotImplementedError

In [72]:
class AddNoise(SpeechTransform):
  """Additive-noise augmentation built on a WavAugment effect chain.

  Args:
    p: probability of applying the effect.
    sr: sampling rate reported to the effect chain.
    snr: value forwarded as ``snr`` to ``additive_noise``
      (presumably a signal-to-noise ratio in dB; confirm against WavAugment).
  """

  def __init__(self, p=0.5, sr=16000, snr=15):
    super().__init__(p)
    self.sr = sr
    self.snr = snr

  def apply(self, x):
    def noise_generator():
      # Fresh uniform noise with the same shape/dtype as the input.
      return torch.zeros_like(x).uniform_()

    noisy_chain = augment.EffectChain().additive_noise(noise_generator, snr=self.snr)
    return noisy_chain.apply(x, src_info={'rate': self.sr})

In [73]:
class ClipEffect(SpeechTransform):
  """Clipping augmentation built on a WavAugment effect chain.

  Args:
    p: probability of applying the effect.
    sr: sampling rate reported to the effect chain.
    rate: value forwarded to ``EffectChain.clip``.
  """

  def __init__(self, p=0.5, sr=16000, rate=0.25):
    super().__init__(p)
    self.sr = sr
    self.rate = rate

  def apply(self, x):
    source_info = {'rate': self.sr}
    return augment.EffectChain().clip(self.rate).apply(x, src_info=source_info)

In [74]:
class PitchShift(SpeechTransform):
  """Random pitch-shift augmentation built on a WavAugment effect chain.

  Args:
    p: probability of applying the effect.
    sr: sampling rate reported to the chain and used for the ``rate`` effect.
    max_rate: bound of the random shift; the shift is an integer drawn from
      ``[-max_rate, max_rate)`` (units are whatever ``EffectChain.pitch``
      expects; presumably cents, confirm against WavAugment).
  """

  def __init__(self, p=0.5, sr=16000, max_rate=200):
    super().__init__(p)
    self.max_rate = max_rate
    self.sr = sr

  def apply(self, x):
    # np.random.randint excludes the upper bound.
    shift = np.random.randint(-self.max_rate, self.max_rate)
    pitch_chain = augment.EffectChain().pitch(shift).rate(self.sr)
    return pitch_chain.apply(x, src_info={'rate': self.sr})

In [75]:
class ReverbEffect(SpeechTransform):
  """Random reverb augmentation built on a WavAugment effect chain.

  Args:
    p: probability of applying the effect.
    sr: sampling rate reported to the effect chain.
    max_rate: exclusive upper bound of the random third ``reverb`` argument,
      drawn as an integer from ``[0, max_rate)``.
  """

  def __init__(self, p=0.5, sr=16000, max_rate=101):
    super().__init__(p)
    self.max_rate = max_rate
    self.sr = sr

  def apply(self, x):
    third_reverb_arg = np.random.randint(0, self.max_rate)
    # The first two reverb arguments are fixed at 50; the output is reduced
    # to a single channel by the trailing channels(1) effect.
    reverb_chain = augment.EffectChain().reverb(50, 50, third_reverb_arg).channels(1)
    return reverb_chain.apply(x, src_info={'rate': self.sr})

In [76]:
class TimeDropout(SpeechTransform):
  """Time-dropout augmentation built on a WavAugment effect chain.

  Args:
    p: probability of applying the effect.
    sr: sampling rate reported to the effect chain.
    max_seconds: value forwarded as ``max_seconds`` to ``time_dropout``.
  """

  def __init__(self, p=0.5, sr=16000, max_seconds=0.5):
    super().__init__(p)
    self.max_seconds = max_seconds
    self.sr = sr

  def apply(self, x):
    dropout_chain = augment.EffectChain().time_dropout(max_seconds=self.max_seconds)
    return dropout_chain.apply(x, src_info={'rate': self.sr})

In [81]:
class TimeShift(SpeechTransform):
  """Random time-stretch augmentation (despite the name) using librosa.

  Args:
    p: probability of applying the effect.
    sr: stored for parity with the other transforms; not used by ``apply``.
    min_shift: lower bound of the random stretch rate.
    max_shift: upper bound of the random stretch rate.
  """

  def __init__(self, p=0.5, sr=16000, min_shift=0.7, max_shift=2):
    super().__init__(p)
    self.sr = sr
    self.max_shift = max_shift
    self.min_shift = min_shift

  def apply(self, x):
    """Stretch tensor ``x`` by a rate drawn uniformly from [min_shift, max_shift).

    Returns a new ``torch.Tensor``; its length differs from the input's
    whenever the sampled rate is not 1.
    """
    x = x.numpy()
    rate = np.random.uniform(self.min_shift, self.max_shift)
    # `rate` must be passed by keyword: it is keyword-only in librosa >= 0.10,
    # and the keyword form is also accepted by older releases.
    return torch.as_tensor(librosa.effects.time_stretch(x, rate=rate))

In [82]:
class Compose:
  """Chain several transforms into a single callable.

  Each transform receives the output of the previous one, so all selected
  augmentations accumulate on the same signal.

  NOTE(review): every transform must accept whatever the previous one
  returns (type and shape); verify this for the WavAugment-based effects.

  Args:
    transforms: iterable of callables taking and returning a signal.
  """

  def __init__(self, transforms):
    self.transforms = transforms

  def __call__(self, x):
    # Feed each result into the next transform instead of re-applying every
    # transform to the original input and keeping only the last result.
    for tr in self.transforms:
      x = tr(x)
    # With no transforms the input is returned unchanged.
    return x

In [83]:
aug_transforms = Compose([
                          AddNoise(p=0.6),
                          ClipEffect(p=0.6),
                          PitchShift(p=0.6),
                          ReverbEffect(p=0.6),
                          TimeDropout(p=0.6),
                          TimeShift(p=0.8)
])

Let's listen to a few randomly chosen, augmented samples from the dataset.

In [84]:
# Number of random clips to augment and play back.
num_samples = 5

for n in range(num_samples):
  # Pick a random row; column 1 holds the audio path, column 2 the sentence.
  rand_int = random.randint(0, len(train_df)-1)
  speech_array, sampling_rate = torchaudio.load(train_df.iloc[rand_int, 1])
  # Keep only the first channel before augmenting.
  speech = speech_array[0]
  # The augmentation pipeline is random, so every run sounds different.
  speech = aug_transforms(speech).numpy()
  print("Sample ",n+1)
  # NOTE(review): playback rate is fixed at 16000; the `sampling_rate`
  # returned by torchaudio.load is not used here.
  display(ipd.Audio(data=np.asarray(speech), autoplay=False, rate=16000))
  print(train_df.iloc[rand_int, 2])
  print(speech.shape)

Sample  1


GIVE ME A FUNKY GUITAR 
(53938,)
Sample  2


GIVE ME A CHORD PRESET 
(61272,)
Sample  3


GIVE ME A SOUND LIKE GENTLE BIRDS 
(35306,)
Sample  4


GIVE ME A SHARP SYNTHESIZER 
(74301,)
Sample  5


PLEASE GIVE ME A BRILLIANT ACOUSTIC GUITAR 
(62450,)


# Mounting google drive

In [46]:
from google.colab import drive
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


# Parameters used in notebooks

In [85]:
version = 2
model_save_path = "/content/gdrive/MyDrive/wav2vec_finetuning/"
model_name = f"wav2vec_osr_version_{version}"
batch_size = 8
epochs = 100
lr = 1e-4

In [86]:
os.makedirs(model_save_path, exist_ok=True)

# Creating wav2vec processor pipeline for feature extraction and tokenizer

In [87]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Creating a Dataset class using torch dataset

params
 - df : train data pandas dataframe, with audio path and transcriptions
 - processor : wav2vec2 processor object for converting audio and texts to features
 - transforms : augmentation callable (e.g. a `Compose` of augmentation transforms) applied to each waveform

In [88]:
class SpeechDataset(torch.utils.data.Dataset):
  """Torch dataset yielding (audio features, label ids) pairs.

  Args:
    df: DataFrame with a ``path`` column (audio file) and a ``sentence``
      column (transcription).
    processor: processor object used to turn the waveform into
      ``input_values`` and the sentence into ``input_ids``.
    transforms: optional callable applied to the waveform before feature
      extraction.
  """

  def __init__(self, df, processor, transforms=None):
    self.df = df
    self.transforms = transforms
    self.processor = processor

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return ``{"input_values": tensor, "labels": tensor}`` for row ``idx``."""
    # Positional lookup: DataLoader indices run 0..len-1, which only matches
    # the DataFrame *labels* when the index is the default RangeIndex.
    row = self.df.iloc[idx]
    sp_arr, sr = torchaudio.load(row["path"])
    sp_arr = sp_arr[0]  # first channel only
    if self.transforms:
      sp_arr = self.transforms(sp_arr)
    # NOTE(review): the loaded rate `sr` is not checked; features are
    # extracted assuming 16 kHz audio.
    X = torch.tensor(self.processor(sp_arr, sampling_rate=16000).input_values[0])
    with self.processor.as_target_processor():
      Y = torch.tensor(self.processor(row["sentence"]).input_ids)
    return {"input_values":X, "labels":Y}

Lets check how this class works

In [89]:
speech_dataset = SpeechDataset(train_df, processor, transforms=aug_transforms)

In [90]:
sample = speech_dataset[50]
audio = sample["input_values"]
transcription = sample["labels"]
print(audio)
print(audio.shape)
print(transcription)
print(transcription.shape)

tensor([-0.0023, -0.0125, -0.0206,  ...,  0.0006,  0.0006,  0.0006],
       dtype=torch.float64)
torch.Size([49500])
tensor([ 7,  4, 24, 13, 10, 15, 15, 10,  7,  9,  6,  4, 24,  7, 12, 12,  4])
torch.Size([17])


The collate function is passed to the dataloader to assemble batches: input audio features and labels have to be padded separately before being passed to the model, and the collate function takes care of this.

In [91]:
def collate_function(batch, processor, padding=True, max_length=None, 
                     max_length_labels=None, pad_to_multiple_of=None, pad_to_multiple_of_labels=None):
  """Pad a list of dataset samples into one training batch.

  Audio features and label ids are padded independently through
  ``processor.pad`` (labels inside ``processor.as_target_processor()``).
  Label positions whose attention mask is not 1 are set to -100.

  Args:
    batch: list of dicts with ``input_values`` and ``labels`` entries.
    processor: object providing ``pad`` and ``as_target_processor``.
    padding, max_length, pad_to_multiple_of: forwarded to the audio ``pad`` call.
    max_length_labels, pad_to_multiple_of_labels: forwarded to the label
      ``pad`` call as ``max_length`` / ``pad_to_multiple_of``.

  Returns:
    The padded audio batch returned by ``processor.pad`` with an extra
    ``labels`` entry.
  """
  audio_items = []
  label_items = []
  for sample in batch:
    audio_items.append({"input_values": sample["input_values"]})
    label_items.append({"input_ids": sample["labels"]})

  padded_audio = processor.pad(
      audio_items,
      padding=padding,
      max_length=max_length,
      pad_to_multiple_of=pad_to_multiple_of,
      return_tensors="pt",
  )
  with processor.as_target_processor():
    padded_labels = processor.pad(
        label_items,
        padding=padding,
        max_length=max_length_labels,
        pad_to_multiple_of=pad_to_multiple_of_labels,
        return_tensors="pt",
    )

  # Mark padded label positions with -100.
  is_padding = padded_labels.attention_mask.ne(1)
  padded_audio["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
  return padded_audio


In [92]:
collate_fn = partial(collate_function, processor=processor, padding=True, max_length=None, 
                     max_length_labels=None, pad_to_multiple_of=None, pad_to_multiple_of_labels=None)

In [93]:
train_dataloader = torch.utils.data.DataLoader(speech_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

check how this dataloader works

In [94]:
batch_sample = next(iter(train_dataloader))

In [95]:
print(batch_sample.keys())

dict_keys(['input_values', 'attention_mask', 'labels'])


In [96]:
print(batch_sample["input_values"].shape)
print(batch_sample["attention_mask"].shape)
print(batch_sample["labels"].shape)

torch.Size([8, 96067])
torch.Size([8, 96067])
torch.Size([8, 54])


In [97]:
len(train_dataloader)

15

# Creating the wav2vec model and initializing it with pretrained weights

In [98]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h",
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


freezing feature extractor

In [99]:
model.freeze_feature_extractor()

# Function to calculate WER (word error rate)

In [100]:
def compute_metrics(labels, preds):
    """Compute the word error rate of a batch of predictions.

    Side effect: ``labels`` is modified in place; every -100 entry is
    replaced by the tokenizer's pad token id, so callers that decode
    ``labels`` afterwards see pad ids rather than -100.

    Args:
        labels: tensor of label ids, with -100 at padded positions.
        preds: logits; the arg-max over the last axis is decoded.

    Returns:
        WER between the decoded labels and the decoded predictions.
    """
    predicted_ids = preds.argmax(dim=-1)
    pad_id = processor.tokenizer.pad_token_id
    # In-place on purpose (see the docstring).
    labels[labels == -100] = pad_id
    hypothesis_text = processor.batch_decode(predicted_ids)
    # group_tokens=False keeps repeated characters in the reference text.
    reference_text = processor.batch_decode(labels, group_tokens=False)
    return wer(reference_text, hypothesis_text)

# Model Training

In [101]:
optimizer = AdamW(model.parameters(), lr=lr)

In [102]:
num_training_steps = epochs*len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

for mixed precision training

In [103]:
scaler = torch.cuda.amp.GradScaler()

move model to training device and enable training 

In [104]:
model = model.to(device)
model = model.train()

In [105]:
torch.save(model, os.path.join(model_save_path, model_name+".pt"))
processor.save_pretrained(os.path.join(model_save_path, model_name+"_vocab"))

In [106]:
def train_step(train_dataloader, optimizer, lr_scheduler, processor, verbose_at=5, print_sentences=False):
    """Run one pass over ``train_dataloader`` with mixed-precision training.

    Relies on the notebook-level globals ``model``, ``scaler`` and ``device``.

    Args:
        train_dataloader: iterable of batches (dicts of tensors incl. "labels").
        optimizer: optimizer stepped once per batch through ``scaler``.
        lr_scheduler: scheduler stepped once per batch.
        processor: used to decode sentences when ``print_sentences`` is set.
        verbose_at: print WER/loss every ``verbose_at`` steps (must be non-zero,
            it is used as a modulus).
        print_sentences: also print reference/predicted text on verbose steps.

    Returns:
        dict with the mean batch loss ("loss") and mean batch WER ("wer").
    """
    losses = []
    wers = []
    for step, data in enumerate(train_dataloader):
        optimizer.zero_grad()
        # move inputs to device
        batch = {k: v.to(device) for k, v in data.items()}
        # Forward pass under autocast; the loss is computed by the model itself.
        with torch.cuda.amp.autocast():
          outputs = model(**batch)
        loss = outputs.loss
        losses.append(loss.cpu().detach().numpy())
        # Scaled backward pass, optimizer step and scale update (AMP).
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        preds = outputs.logits
        # CPU copy of the labels (the device copy lives in `batch`).
        labels = data["labels"]
        # compute_metrics replaces the -100 entries of `labels` in place with the
        # pad token id; the decode calls below see that modified tensor.
        metrics = compute_metrics(labels, preds)
        wers.append(metrics)
        if step%verbose_at == 0:
            print(f"WER: {metrics} \t LOSS: {loss.cpu().detach().numpy()}")
            if print_sentences:
              for l, p in zip(labels, preds):
                  predicted_str = processor.tokenizer.decode(torch.argmax(p, dim =-1))
                  label_str = processor.tokenizer.decode(l)
                  print(f"True: {label_str}; Predicted: {predicted_str}")
    return {"loss": np.mean(losses), "wer":np.mean(wers)}

In [107]:
# Train for `epochs` epochs; whenever the mean epoch loss improves, overwrite
# the checkpoint on disk with the whole model object.
epochs_progress_bar = tqdm(range(epochs))
curr_best_loss = 1e10
# Iterating the bar itself advances it once per epoch and closes it at the end.
for n in epochs_progress_bar:
    res = train_step(train_dataloader, optimizer, lr_scheduler, 
                     processor, verbose_at=len(train_dataloader)//2)
    print("EPOCH: ", n+1)
    if res["loss"] < curr_best_loss:
      print("Best model, saving at ",model_save_path)
      torch.save(model, os.path.join(model_save_path, model_name+".pt"))
      curr_best_loss = res["loss"]
    # Report the best loss seen so far, including this epoch.
    res["best_loss"] = curr_best_loss
    print(res)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

WER: 0.8372093023255814 	 LOSS: 2.6471290588378906
WER: 0.7727272727272727 	 LOSS: 1.6004754304885864
WER: 0.75 	 LOSS: 1.3208372592926025
EPOCH:  1
Best model, saving at  /content/gdrive/MyDrive/wav2vec_finetuning/
{'loss': 1.8730332, 'wer': 0.8258393248072637, 'best_loss': 1.8730332}
WER: 0.7608695652173914 	 LOSS: 1.5976390838623047
WER: 0.8260869565217391 	 LOSS: 2.0872325897216797
WER: 0.5454545454545454 	 LOSS: 1.2456591129302979
EPOCH:  2
Best model, saving at  /content/gdrive/MyDrive/wav2vec_finetuning/
{'loss': 1.7310504, 'wer': 0.7525975558246533, 'best_loss': 1.7310504}
WER: 0.7708333333333334 	 LOSS: 1.7127583026885986
WER: 0.6888888888888889 	 LOSS: 1.3197681903839111
WER: 0.75 	 LOSS: 1.6600995063781738
EPOCH:  3
Best model, saving at  /content/gdrive/MyDrive/wav2vec_finetuning/
{'loss': 1.432037, 'wer': 0.6837826235152793, 'best_loss': 1.432037}
WER: 0.6086956521739131 	 LOSS: 1.279391884803772
WER: 0.5918367346938775 	 LOSS: 1.3991526365280151
WER: 0.42857142857142855 	

# Testing the model

#### Recording and loading audio functions

Taken from [ricardodeazambuja.com](https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/)

In [108]:
!pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [109]:
# https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
import io
import ffmpeg
import librosa

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio(sr):
  """Record audio in the browser (Colab) and return it as a waveform.

  Displays the recording widget, waits for the JavaScript ``data`` promise,
  converts the recorded blob to WAV with ffmpeg through pipes and loads the
  result with librosa.

  Args:
    sr: target sampling rate passed to ``librosa.load``.

  Returns:
    Tuple ``(speech, rate)``: the waveform as loaded by librosa and the
    sampling rate librosa reports for it.

  Raises:
    RuntimeError: if no recording was captured or ffmpeg produced no WAV data.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  # `data` is expected to be a data URL: "<header>,<base64 payload>".
  _, separator, payload = data.partition(',')
  if not separator:
    raise RuntimeError("No recording was captured (the browser returned no audio data).")
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  if len(output) < 8:
    raise RuntimeError("ffmpeg produced no WAV data: " + err.decode(errors="replace"))

  # Replace bytes 4:8 of the WAV header with the actual RIFF chunk size
  # (little-endian, 4 bytes); presumably ffmpeg cannot fill it in when writing
  # to a pipe.
  riff_chunk_size = len(output) - 8
  riff = output[:4] + (riff_chunk_size & 0xFFFFFFFF).to_bytes(4, "little") + output[8:]

  # Honour the requested rate and return the rate librosa actually used, so
  # the returned pair is always consistent.
  speech, rate = librosa.load(io.BytesIO(riff), sr=sr)
  return speech, rate

##### Recording and loading audio

In [110]:
#load any audio file of your choice
speech, rate = get_audio(sr=16000)

In [111]:
model_path = os.path.join(model_save_path, model_name+".pt")
pipeline_path= os.path.join(model_save_path, model_name+"_vocab")

In [112]:
processor = Wav2Vec2Processor.from_pretrained(pipeline_path)
model = torch.load(model_path)

In [113]:
model = model.eval()

In [114]:
# Turn the recorded waveform into model inputs on the training device.
input_values = processor(
      speech, 
      sampling_rate=rate, 
      return_tensors="pt"
  ).input_values.to(device)
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
  logits = model(input_values).logits

decoding transcript

In [115]:
# Greedy decoding: most likely token per frame, then collapse to text.
predicted_ids = torch.argmax(logits, dim =-1)
# Decode with the processor reloaded from disk rather than the training-time
# `tokenizer` global, so this section also works on a fresh kernel.
transcriptions = processor.decode(predicted_ids[0])
print(transcriptions)

GIVE ME A FUNKY SOUND
