# Cloning SOUNDOFAIOSR speech to text

In [None]:
!git clone https://github.com/TheSoundOfAIOSR/rg_speech_to_text

Cloning into 'rg_speech_to_text'...
remote: Enumerating objects: 788, done.[K
remote: Counting objects: 100% (320/320), done.[K
remote: Compressing objects: 100% (227/227), done.[K
remote: Total 788 (delta 152), reused 240 (delta 86), pack-reused 468[K
Receiving objects: 100% (788/788), 14.67 MiB | 26.96 MiB/s, done.
Resolving deltas: 100% (397/397), done.


# Installing Dependencies

In [None]:
%%capture
!pip install transformers
!pip install torchaudio
!pip install librosa
!pip install jiwer

In [None]:
import re
import json
import os
import random

import pandas as pd
import numpy as np

import torch
import torchaudio

from transformers import Wav2Vec2CTCTokenizer
from transformers import Wav2Vec2FeatureExtractor
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
from transformers import get_scheduler
from transformers import AdamW

import IPython.display as ipd
from IPython.core.display import display
from tqdm.auto import tqdm

from functools import partial

from jiwer import wer

### Using Device

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print("USING DEVICE: ", device)

USING DEVICE:  cuda


# Loading and preprocessing data

In [None]:
csv_path = "rg_speech_to_text/data/finetuning-dataset/finetuning-dataset.csv"
audio_path = "rg_speech_to_text/data/finetuning-dataset/audiofiles"

In [None]:
train_data = pd.read_csv(csv_path, sep="|", header=None)
train_data.head()

Unnamed: 0,0,1
0,0,I would like a sharp cello
1,1,Give me a dry acoustic guitar
2,2,Give me a metal harp
3,3,Give me a dirty organ
4,4,Give me a hollow piano


converting digits into words

In [None]:
def change_digit_to_word(x):
  """Spell out every decimal digit in ``x`` as an English word plus a space.

  Digits are converted one at a time (``"909"`` -> ``"nine zero nine "``);
  multi-digit numbers are not read as a whole, and the space that follows the
  last converted digit is kept.
  """
  words = ("zero", "one", "two", "three", "four",
           "five", "six", "seven", "eight", "nine")
  # Translation table: code point of each digit -> "<word> ".
  table = {ord(str(value)): word + " " for value, word in enumerate(words)}
  return x.translate(table)

In [None]:
change_digit_to_word("Give me 909")

'Give me nine zero nine '

In [None]:
train_data.iloc[:, 1] = train_data.iloc[:, 1].map(change_digit_to_word)

In [None]:
train_data.head()

Unnamed: 0,0,1
0,0,I would like a sharp cello
1,1,Give me a dry acoustic guitar
2,2,Give me a metal harp
3,3,Give me a dirty organ
4,4,Give me a hollow piano


Removing any special characters from transcriptions.

No language model is used to account for these special symbols, and they have no sound for speakers to pronounce, so we strip them. For uniformity we also convert the text to uppercase.

In [None]:
# Character class of punctuation / stray symbols to strip from transcriptions.
# Kept as a raw string so the backslashes are handed to the regex engine as
# written instead of being parsed as (invalid) Python string escapes, which
# emits a warning on current Python versions. The set of matched characters is
# the same as before.
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(text):
    """Strip ignored symbols from ``text``, upper-case it and append one space.

    Args:
        text: a single transcription string.

    Returns:
        The cleaned, upper-cased transcription followed by a trailing space.
    """
    return re.sub(chars_to_ignore_regex, '', text).upper() + " "

In [None]:
train_data.iloc[:, 1] = train_data.iloc[:, 1].map(remove_special_characters)

In [None]:
train_data.head()

Unnamed: 0,0,1
0,0,I WOULD LIKE A SHARP CELLO
1,1,GIVE ME A DRY ACOUSTIC GUITAR
2,2,GIVE ME A METAL HARP
3,3,GIVE ME A DIRTY ORGAN
4,4,GIVE ME A HOLLOW PIANO


### Creating vocab list from data

- Added ["pad", "s", "/s", "unk"] to specify pad, start, end, unknown tags
- We compare it with pretrained model vocab list.
- create a new vocab list that contains union of both list
- "|" specify " " (space)

In [None]:
def extract_all_chars(texts):
  """Build a character-level vocabulary (token -> id) from transcriptions.

  Ids 0-3 are reserved for the special tokens ``<pad>``, ``<s>``, ``</s>`` and
  ``<unk>``. The distinct characters of ``texts`` (joined with single spaces)
  receive the following ids in sorted order, with the space character stored
  under the key ``"|"``.
  """
  special_tokens = ["<pad>", "<s>", "</s>", "<unk>"]
  offset = len(special_tokens)
  vocab = dict(zip(special_tokens, range(offset)))
  unique_chars = sorted(set(" ".join(texts)))
  vocab.update(
      ("|" if ch == " " else ch, position + offset)
      for position, ch in enumerate(unique_chars)
  )
  return vocab

In [None]:
vocab_train = extract_all_chars(train_data.iloc[:, 1].values)
print(vocab_train)

{'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '|': 4, 'A': 5, 'B': 6, 'C': 7, 'D': 8, 'E': 9, 'F': 10, 'G': 11, 'H': 12, 'I': 13, 'K': 14, 'L': 15, 'M': 16, 'N': 17, 'O': 18, 'P': 19, 'Q': 20, 'R': 21, 'S': 22, 'T': 23, 'U': 24, 'V': 25, 'W': 26, 'Y': 27, 'Z': 28}


#### pretrained vocab list

In [None]:
pre_vocab = {"<pad>": 0, "<s>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}

In [None]:
print(len(vocab_train))
print(len(pre_vocab))

29
32


creating sets from these vocab letters

In [None]:
set_vocab = set([k for k, v in vocab_train.items()])
set_pre_vocab = set([k for k, v in pre_vocab.items()])

elements in vocab set but not in pretrained model vocab list

In [None]:
set_vocab.difference(set_pre_vocab)

set()

these characters are not currently in our dataset

In [None]:
set_pre_vocab.difference(set_vocab)

{"'", 'J', 'X'}

creating a new vocab list by adding elements not in pretrained model vocab list

In [None]:
# Append every character that occurs in our transcriptions but is missing from
# the pretrained vocabulary. `len(pre_vocab)` grows with each insertion, so it
# already is the next free id; adding the loop index on top of it would skip
# ids and leave gaps in the vocabulary.
# Sorting makes the id assignment reproducible (set iteration order is not).
for elem in sorted(set_vocab.difference(set_pre_vocab)):
  pre_vocab[elem] = len(pre_vocab)

In [None]:
pre_vocab

{"'": 27,
 '</s>': 2,
 '<pad>': 0,
 '<s>': 1,
 '<unk>': 3,
 'A': 7,
 'B': 24,
 'C': 19,
 'D': 14,
 'E': 5,
 'F': 20,
 'G': 21,
 'H': 11,
 'I': 10,
 'J': 29,
 'K': 26,
 'L': 15,
 'M': 17,
 'N': 9,
 'O': 8,
 'P': 23,
 'Q': 30,
 'R': 13,
 'S': 12,
 'T': 6,
 'U': 16,
 'V': 25,
 'W': 18,
 'X': 28,
 'Y': 22,
 'Z': 31,
 '|': 4}

In [None]:
len(pre_vocab)

32

Let's now save the vocabulary as a json file.

In [None]:
with open('vocab.json', 'w') as vocab_file:
    json.dump(pre_vocab, vocab_file)

Creating CSV file with audio path with their respective transcription.

In [None]:
audio_files = os.listdir(audio_path)

In [None]:
len(audio_files)

116

In [None]:
def create_dataset_csv(texts, audio_files, audio_dir=None):
  """Pair every audio file with its transcription in a DataFrame.

  The transcription row is taken from the file name: the text between the
  first ``-`` and the following ``.`` is parsed as an integer index into
  ``texts`` (so names are presumably of the form ``<prefix>-<row>.<ext>`` --
  confirm against the dataset).

  Args:
    texts: indexable collection of transcriptions.
    audio_files: iterable of audio file names (not full paths).
    audio_dir: directory the file names are relative to. Defaults to the
      notebook-level ``audio_path`` so existing calls keep working.

  Returns:
    DataFrame with columns ``id`` (position in ``audio_files``), ``path`` and
    ``sentence``.
  """
  if audio_dir is None:
    # Fall back to the notebook-level setting only when nothing was passed.
    audio_dir = audio_path
  df_dict = {"id":[], "path":[], "sentence":[]}
  for i, af in enumerate(audio_files):
    idx = int(af.split("-")[1].split(".")[0])
    df_dict["id"].append(i)
    df_dict["path"].append(os.path.join(audio_dir, af))
    df_dict["sentence"].append(texts[idx])
  return pd.DataFrame.from_dict(df_dict)

In [None]:
train_df = create_dataset_csv(train_data.iloc[:, 1].values, audio_files)
train_df.head()

Unnamed: 0,id,path,sentence
0,0,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME A SIMPLE SQUARE BASS
1,1,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME AN ORCHESTRAL STRING
2,2,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME A HARD DRUM
3,3,rg_speech_to_text/data/finetuning-dataset/audi...,GIVE ME A SMOOTH OPERATOR
4,4,rg_speech_to_text/data/finetuning-dataset/audi...,A LOUD PIANO AND A FLUTE PLEASE


Let's listen to a few random samples from the dataset and check that the transcriptions match.

In [None]:
# Number of random clips to play back for a manual sanity check.
num_samples = 5

for n in range(num_samples):
  rand_int = random.randint(0, len(train_df)-1)
  speech_array, sampling_rate = torchaudio.load(train_df.iloc[rand_int, 1])
  # Keep only the first channel of the loaded tensor.
  speech = speech_array[0].numpy()
  print("Sample ",n+1)
  # Play the clip at the rate it was actually stored at; a hard-coded rate would
  # change pitch and speed for any file that is not 16 kHz.
  display(ipd.Audio(data=np.asarray(speech), autoplay=False, rate=sampling_rate))
  print(train_df.iloc[rand_int, 2])
  print(speech.shape)

Sample  1


GIVE ME A CHORD PRESET 
(62450,)
Sample  2


GIVE ME A DRY ACOUSTIC GUITAR 
(62450,)
Sample  3


A LOUD PIANO AND A FLUTE PLEASE 
(62450,)
Sample  4


I WOULD LIKE A SWEET BASS 
(62450,)
Sample  5


GIVE ME A HOLLOW PIANO 
(62450,)


# Mounting google drive

In [None]:
from google.colab import drive
drive.mount("/content/gdrive/")

Mounted at /content/gdrive/


# Parameters used in notebooks

In [None]:
# Run identifier; it is embedded in `model_name` below.
version = 1
# Directory (on the mounted Google Drive) where the model and processor are saved.
model_save_path = "/content/gdrive/MyDrive/wav2vec_finetuning/"
# Base name for the saved model file and the processor folder.
model_name = f"wav2vec_osr_version_{version}"
# Number of samples per DataLoader batch.
batch_size = 16
# Number of passes over the training DataLoader.
epochs = 30
# Initial learning rate passed to the optimizer.
lr = 1e-4

In [None]:
os.makedirs(model_save_path, exist_ok=True)

# Creating wav2vec processor pipeline for feature extraction and tokenizer

In [None]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="<unk>", pad_token="<pad>", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Creating a Dataset class using torch dataset

params
 - df : train data pandas dataframe, with audio path and transcriptions
 - processor : wav2vec processor object for converting audio and texts to features
 - transforms : list of augmentation function

In [None]:
class SpeechDataset(torch.utils.data.Dataset):
  """Torch dataset that yields (audio features, label ids) pairs for CTC training.

  Args:
    df: DataFrame with a ``path`` column (audio file locations) and a
      ``sentence`` column (transcriptions).
    processor: wav2vec2 processor used to turn audio into ``input_values`` and
      text into label ids.
    transforms: optional iterable of callables applied in order to the raw
      waveform (a 1-D numpy array) before feature extraction.
    sampling_rate: rate, in Hz, that the processor is told the audio has.
      Files stored at a different rate are resampled to it.
  """

  def __init__(self, df, processor, transforms=None, sampling_rate=16000):
    self.df = df
    self.transforms = transforms
    self.processor = processor
    self.sampling_rate = sampling_rate

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    # Positional lookup: `idx` comes from the sampler as 0..len-1, which only
    # coincides with the DataFrame's labels when it has a default index.
    path = self.df["path"].iloc[idx]
    sentence = self.df["sentence"].iloc[idx]

    sp_arr, sr = torchaudio.load(path)
    if sr != self.sampling_rate:
      # The processor is told the audio is at `self.sampling_rate`, so make
      # that true instead of silently feeding it mis-timed audio.
      resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=self.sampling_rate)
      sp_arr = resample(sp_arr)
    # Keep only the first channel.
    sp_arr = sp_arr.numpy()[0]
    if self.transforms:
      for transform in self.transforms:
        sp_arr = transform(sp_arr)
    X = torch.tensor(self.processor(sp_arr, sampling_rate=self.sampling_rate).input_values[0])
    # Inside this context the processor tokenizes text instead of extracting
    # audio features.
    with self.processor.as_target_processor():
      Y = torch.tensor(self.processor(sentence).input_ids)
    return {"input_values":X, "labels":Y}

Lets check how this class works

In [None]:
speech_dataset = SpeechDataset(train_df, processor, transforms=None)

In [None]:
sample = speech_dataset[12]
audio = sample["input_values"]
transcription = sample["labels"]
print(audio)
print(audio.shape)
print(transcription)
print(transcription.shape)

tensor([ 3.7021e-03,  7.8298e-03, -5.0268e-05,  ..., -5.0268e-05,
        -5.0268e-05, -5.0268e-05], dtype=torch.float64)
torch.Size([62450])
tensor([21, 10, 25,  5,  4, 17,  5,  4,  7,  4, 14,  7, 13, 26,  4,  6, 10,  9,
         4, 12,  8, 16,  9, 14,  4])
torch.Size([25])


The collate function is passed to the dataloader to process batches: the input audio features and the labels have to be padded separately before being passed to the model, and the collate function takes care of this.

In [None]:
def collate_function(batch, processor, padding=True, max_length=None,
                     max_length_labels=None, pad_to_multiple_of=None, pad_to_multiple_of_labels=None):
  """Pad a list of dataset samples into a single batch.

  Audio features and label ids are padded independently through
  ``processor.pad``; label positions that the padded labels' attention mask
  marks as padding are replaced with ``-100``.

  Args:
    batch: list of samples, each with ``"input_values"`` and ``"labels"``.
    processor: object providing ``pad`` and ``as_target_processor``.
    padding, max_length, pad_to_multiple_of: forwarded to ``pad`` for the audio.
    max_length_labels, pad_to_multiple_of_labels: forwarded to ``pad`` for the labels.

  Returns:
    The padded audio batch with an additional ``"labels"`` entry.
  """
  audio_items, label_items = [], []
  for sample in batch:
    audio_items.append({"input_values": sample["input_values"]})
    label_items.append({"input_ids": sample["labels"]})

  padded = processor.pad(audio_items, padding=padding, max_length=max_length,
                         pad_to_multiple_of=pad_to_multiple_of, return_tensors="pt")

  # Labels go through the same `pad` call, but in target-processor mode.
  with processor.as_target_processor():
    padded_labels = processor.pad(label_items, padding=padding, max_length=max_length_labels,
                                  pad_to_multiple_of=pad_to_multiple_of_labels, return_tensors="pt")

  is_padding = padded_labels.attention_mask.ne(1)
  padded["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
  return padded


In [None]:
collate_fn = partial(collate_function, processor=processor, padding=True, max_length=None, 
                     max_length_labels=None, pad_to_multiple_of=None, pad_to_multiple_of_labels=None)

In [None]:
train_dataloader = torch.utils.data.DataLoader(speech_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

check how this dataloader works

In [None]:
batch_sample = next(iter(train_dataloader))

In [None]:
print(batch_sample.keys())

dict_keys(['input_values', 'attention_mask', 'labels'])


In [None]:
print(batch_sample["input_values"].shape)
print(batch_sample["attention_mask"].shape)
print(batch_sample["labels"].shape)

torch.Size([16, 62450])
torch.Size([16, 62450])
torch.Size([16, 43])


In [None]:
len(train_dataloader)

8

# Creating the wav2vec model and initializing it with pretrained weights

In [None]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base-960h",
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


freezing feature extractor

In [None]:
model.freeze_feature_extractor()

# Function to calculate WER (word error rate)

In [None]:
def compute_metrics(labels, preds):
    """Return the word error rate between decoded labels and decoded predictions.

    Uses the notebook-level ``processor`` for decoding.

    Args:
        labels: tensor of label ids. Entries equal to -100 are overwritten
            IN PLACE with the tokenizer's pad token id, so the caller's tensor
            is modified.
        preds: tensor of model logits; the arg-max over the last dimension is
            taken as the predicted token id.
    """
    pad_id = processor.tokenizer.pad_token_id
    predicted_ids = preds.argmax(dim=-1)
    # In-place replacement of the ignore index, so the ids can be decoded.
    labels.masked_fill_(labels == -100, pad_id)
    hypotheses = processor.batch_decode(predicted_ids)
    # Labels are decoded without collapsing repeated tokens.
    references = processor.batch_decode(labels, group_tokens=False)
    return wer(references, hypotheses)

# Model Training

In [None]:
optimizer = AdamW(model.parameters(), lr=lr)

In [None]:
num_training_steps = epochs*len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

for mixed precision training

In [None]:
scaler = torch.cuda.amp.GradScaler()

move model to training device and enable training 

In [None]:
model = model.to(device)
model = model.train()

In [None]:
torch.save(model, os.path.join(model_save_path, model_name+".pt"))
processor.save_pretrained(os.path.join(model_save_path, model_name+"_vocab"))

In [None]:
def train_step(train_dataloader, optimizer, lr_scheduler, processor, verbose_at=5, print_sentences=False):
    """Run one pass over ``train_dataloader``, updating the model after every batch.

    Besides its arguments, this function relies on the notebook-level ``model``,
    ``scaler`` and ``device`` objects.

    Args:
        train_dataloader: iterable of batches; each batch is a mapping of tensors
            that includes a ``"labels"`` entry and is passed to the model as
            keyword arguments.
        optimizer: optimizer stepped (through ``scaler``) once per batch.
        lr_scheduler: scheduler stepped once per batch.
        processor: used to decode ids into text when ``print_sentences`` is set.
        verbose_at: print WER and loss whenever ``step % verbose_at == 0``. Must
            be non-zero because it is used as a modulus.
        print_sentences: when true, also print every reference/prediction pair
            of the batch on logging steps.

    Returns:
        dict with the mean ``"loss"`` and mean ``"wer"`` over all batches.
    """
    losses = []
    wers = []
    for step, data in enumerate(train_dataloader):
        optimizer.zero_grad()
        # move inputs to device
        batch = {k: v.to(device) for k, v in data.items()}
        # forward pass under autocast (mixed precision)
        with torch.cuda.amp.autocast():
          outputs = model(**batch)
        loss = outputs.loss
        losses.append(loss.cpu().detach().numpy())
        # backprop on the scaled loss; the scaler then drives the optimizer step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        lr_scheduler.step()
        scaler.update()
        preds = outputs.logits
        # labels come from the original batch (`data`), not from the `batch` copy
        labels = data["labels"]
        # NOTE: compute_metrics overwrites the -100 entries of `labels` in place
        # with the pad token id; the decoding below operates on those modified labels.
        metrics = compute_metrics(labels, preds)
        wers.append(metrics)
        if step%verbose_at == 0:
            print(f"WER: {metrics} \t LOSS: {loss.cpu().detach().numpy()}")
            if print_sentences:
              for l, p in zip(labels, preds):
                  predicted_str = processor.tokenizer.decode(torch.argmax(p, dim =-1))
                  label_str = processor.tokenizer.decode(l)
                  print(f"True: {label_str}; Predicted: {predicted_str}")
    return {"loss": np.mean(losses), "wer":np.mean(wers)}

In [None]:
epochs_progress_bar = tqdm(range(epochs))
# Lowest mean epoch loss seen so far; starts high so the first epoch always saves.
curr_best_loss = 1e10
for n in range(epochs):
    # Log about twice per epoch. The interval is clamped to at least 1 because
    # train_step uses it as a modulus: with a single-batch dataloader the plain
    # `len(...)//2` would be 0 and raise ZeroDivisionError.
    res = train_step(train_dataloader, optimizer, lr_scheduler,
                     processor, verbose_at=max(1, len(train_dataloader)//2))
    print("EPOCH: ", n+1)
    if curr_best_loss > res["loss"]:
      # Checkpoint only when the mean training loss improved.
      print("Best model, saving at ",model_save_path)
      torch.save(model, os.path.join(model_save_path, model_name+".pt"))
      curr_best_loss = res["loss"]
    res["best_loss"] = curr_best_loss
    print(res)
    epochs_progress_bar.update(1)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))

WER: 0.3225806451612903 	 LOSS: 0.42587608098983765
WER: 0.23255813953488372 	 LOSS: 0.3128780722618103
EPOCH:  1
Best model, saving at  /content/gdrive/MyDrive/wav2vec_finetuning/
{'loss': 0.39306483, 'wer': 0.30538886624482514, 'best_loss': 0.39306483}
WER: 0.33653846153846156 	 LOSS: 0.5179641246795654
WER: 0.25 	 LOSS: 0.23975153267383575
EPOCH:  2
Best model, saving at  /content/gdrive/MyDrive/wav2vec_finetuning/
{'loss': 0.38744265, 'wer': 0.29685971370548014, 'best_loss': 0.38744265}
WER: 0.3191489361702128 	 LOSS: 0.2723190188407898
WER: 0.35555555555555557 	 LOSS: 0.46631675958633423
EPOCH:  3
Best model, saving at  /content/gdrive/MyDrive/wav2vec_finetuning/
{'loss': 0.36487395, 'wer': 0.3003870645773815, 'best_loss': 0.36487395}
WER: 0.36666666666666664 	 LOSS: 0.42233675718307495
WER: 0.18947368421052632 	 LOSS: 0.2357662171125412
EPOCH:  4
{'loss': 0.38653994, 'wer': 0.29859246371209336, 'best_loss': 0.36487395}
WER: 0.30851063829787234 	 LOSS: 0.4374377727508545
WER: 0.24

# Testing the model

#### Recording and loading audio functions

Taken from [ricardodeazambuja.com](https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/)

In [None]:
!pip install ffmpeg-python

Collecting ffmpeg-python
  Downloading https://files.pythonhosted.org/packages/d7/0c/56be52741f75bad4dc6555991fabd2e07b432d333da82c11ad701123888a/ffmpeg_python-0.2.0-py3-none-any.whl
Installing collected packages: ffmpeg-python
Successfully installed ffmpeg-python-0.2.0


In [None]:
# https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
import io
import ffmpeg
import librosa

AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };            
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {            
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data); 
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
  };

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve=>{
//recordButton.addEventListener("click", toggleRecording);
recordButton.onclick = ()=>{
toggleRecording()

sleep(2000).then(() => {
  // wait 2000ms for the data to be available...
  // ideally this should use something like await...
  //console.log("Inside data:" + base64data)
  resolve(base64data.toString())

});

}
});
      
</script>
"""

def get_audio(sr):
  """Record audio in the browser and return it as a waveform at rate ``sr``.

  Shows the recording widget, waits for the base64 data URL produced by the
  JavaScript side, converts the recording to WAV with ffmpeg and loads it with
  librosa.

  Args:
    sr: sampling rate passed to ``librosa.load`` for the returned waveform.

  Returns:
    ``(speech, rate)``: the waveform and the sampling rate reported by
    ``librosa.load``.

  Raises:
    RuntimeError: if no recording was captured or ffmpeg produced no output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")
  # A data URL looks like "<header>,<base64 payload>"; without a comma the
  # browser never delivered a recording.
  _, separator, payload = data.partition(',')
  if not separator:
    raise RuntimeError("No audio was recorded: the browser returned no data.")
  binary = b64decode(payload)
  
  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  if not output:
    raise RuntimeError("ffmpeg produced no audio: " + (err or b"").decode(errors="replace"))
  
  # Replace bytes 4:8 of the WAV data with the actual size of the RIFF chunk
  # (total length minus the 8-byte RIFF header), as 4 little-endian bytes.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, "little") + output[8:]

  # Honour the requested rate and report the rate the data really has; the
  # previous code always loaded at 16000 but returned whatever `sr` was passed.
  speech, rate = librosa.load(io.BytesIO(riff), sr=sr)
  return speech, rate

##### Recording and loading audio

In [None]:
# record a short clip from the browser microphone (returned at 16 kHz)
speech, rate = get_audio(sr=16000)

In [None]:
model_path = os.path.join(model_save_path, model_name+".pt")
pipeline_path= os.path.join(model_save_path, model_name+"_vocab")

In [None]:
processor = Wav2Vec2Processor.from_pretrained(pipeline_path)
model = torch.load(model_path)

In [None]:
model = model.eval()

In [None]:
# Turn the recorded waveform into model inputs on the training/inference device.
input_values = processor(
      speech, 
      sampling_rate=rate, 
      return_tensors="pt"
  ).input_values.to(device)
# Inference only: disable autograd so no graph is built and memory use stays low.
with torch.no_grad():
  logits = model(input_values).logits

decoding transcript

In [None]:
# Greedy CTC decoding: most likely token per frame, collapsed into text.
predicted_ids = torch.argmax(logits, dim =-1)
# Decode with the processor that was loaded together with the model, rather than
# the `tokenizer` variable left over from the training section, so this cell also
# works when only the testing section is run.
transcriptions = processor.decode(predicted_ids[0])
print(transcriptions)

I KNEW THA CHERPING SOUND
