In [1]:
## Install dependencies
!pip install openai-whisper
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2
## Install NeMo
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]


Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/798.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m788.5/798.6 kB[0m [31m25.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torc

In [2]:
import os
import glob
import os
import random
import subprocess
import editdistance
import tarfile
import wget
import librosa
import IPython.display as ipd
import numpy as np
import json
from google.colab import drive
import soundfile as sf
import argparse
from datasets import load_dataset, Dataset
from whisper.normalizers import EnglishTextNormalizer
import torch
from joblib import Parallel, delayed
from tqdm import tqdm
import json
import pandas as pd
import nemo.collections.asr as nemo_asr
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Run this once
data_dir = '/content/drive/MyDrive'
os.listdir(data_dir)
if not os.path.exists(data_dir + '/svarah.tar'):
    svarah_url = 'https://indic-asr-public.objectstore.e2enetworks.net/svarah.tar'
    svarah_path = wget.download(svarah_url, data_dir)
    print(f"Dataset downloaded at: {svarah_path}")
    tar = tarfile.open(svarah_path)
    tar.extractall(path=data_dir)
else:
  print('data already downloaded')

Dataset downloaded at: /content/drive/MyDrive/svarah.tar


In [4]:
def get_data(split):
    js_data = json.loads(split)
    aud = {}
    aud['path'] = js_data['audio_filepath']
    y, sr = sf.read(aud['path'])
    aud['array'] = y
    aud['sampling_rate'] = sr
    return (aud, js_data['text'])

In [5]:
class eval_dataset(Dataset):

  def __init__(self):
      self.audios = []
      self.sents = []

  def __len__(self):
      return len(self.audios)

  def __getitem__(self, i):
      return {"raw": self.audios[i]['array'], "sampling_rate":self.audios[i]['sampling_rate'],"audio_path" :self.audios[i]['path'] , "reference":self.sents[i]}

  def fill_data(self, aud, sent):
      self.audios.append(aud)
      self.sents.append(sent)


In [6]:
manifest_path = '/content/drive/MyDrive/svarah/svarah_manifest.json'
train_portion = 0.7
with open(manifest_path, 'r') as f:
    data = f.read()
    splits = data.split('\n')[:-1]
    jsons = [json.loads(split) for split in splits]
    for js in jsons:
      js['audio_filepath'] = '/content/drive/MyDrive/svarah/'+js['audio_filepath']
    splits = [json.dumps(js) for js in jsons]
    random.seed(0)
    random.shuffle(splits)
    train_last_idx = int(len(splits)*train_portion)
    train_splits = splits[:train_last_idx]
    eval_splits =  splits[train_last_idx:]

In [7]:
da = Parallel(n_jobs=-240)(delayed(get_data)(split) for split in tqdm(eval_splits))
eval_set = eval_dataset()
for d in da:
    eval_set.fill_data(d[0], d[1])

100%|██████████| 1997/1997 [00:08<00:00, 226.06it/s]


In [8]:
from torch import nn

In [9]:
class AccentASRDecoder(nemo_asr.modules.ConvASRDecoder):
  """
  Alternative ConvASRDecoder including one additional linear layer for learning
  the accent transformation matrix. all other layers of the model should be frozen.
  """
  def __init__(self,decoder):
    super(AccentASRDecoder, self).__init__(
        decoder._feat_in,
        decoder._num_classes-1,
        vocabulary = decoder.vocabulary
        )

    self.decoder_layers = decoder.decoder_layers
    self.linear = nn.Linear(
        in_features=decoder._num_classes,
        out_features=decoder._num_classes,
        bias=False
        )
    nn.init.xavier_uniform_(self.linear.weight) # this does not change the logprobs

  def forward(self, encoder_output):
      # Adapter module forward step
      if self.is_adapter_available():
          encoder_output = encoder_output.transpose(1, 2)  # [B, T, C]
          encoder_output = self.forward_enabled_adapters(encoder_output)
          encoder_output = encoder_output.transpose(1, 2)  # [B, C, T]
      out = self.decoder_layers(encoder_output)
      out = self.linear(out.transpose(1, 2)).transpose(1, 2)

      if self.temperature != 1.0:
          return torch.nn.functional.log_softmax(
              out.transpose(1, 2) / self.temperature, dim=-1
          )
      return torch.nn.functional.log_softmax(out.transpose(1, 2), dim=-1)

In [10]:
# load pretrained
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

# change to our custom decoder
quartznet.decoder = AccentASRDecoder(quartznet.decoder)

# freeze all layers but the last
quartznet.encoder.freeze()
for param in quartznet.decoder.decoder_layers.parameters():
  param.requires_grad=False

[NeMo I 2024-08-23 08:19:53 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemospeechmodels/versions/1.0.0a5/files/QuartzNet15x5Base-En.nemo to /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2024-08-23 08:19:56 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2024-08-23 08:19:57 features:305] PADDING: 16
[NeMo I 2024-08-23 08:19:58 save_restore_connector:275] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


## Eval on quartznet

In [11]:
!nvidia-smi

Fri Aug 23 08:20:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0              26W /  70W |    195MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [12]:
hypothesis = []
ground_truth = []
whisper_norm = EnglishTextNormalizer()
model = quartznet.to("cuda").eval()
for i in tqdm(range(len(eval_set))):
    op = model.transcribe([eval_set[i]['audio_path']], verbose=False)
    hypothesis.append(op[0])
    ground_truth.append(eval_set[i]['reference'])


100%|██████████| 1997/1997 [02:27<00:00, 13.54it/s]


In [None]:

normalized_hypothesis = [whisper_norm(x) if len(whisper_norm(x)) > 0 else 'NA' for x in hypothesis]
normalized_reference = [whisper_norm(x) if len(whisper_norm(x)) > 0 else 'NA' for x in ground_truth]

ref = ' '.join(normalized_reference).split()
pred = ' '.join(normalized_hypothesis).split()
print(f'WER is: {editdistance.distance(ref,pred)/len(ref)}')

WER is: 2.780687553566327


### Training with PyTorch Lightning

NeMo models and modules can be used in any PyTorch code where torch.nn.Module is expected.

However, NeMo's models are based on [PytorchLightning's](https://github.com/PyTorchLightning/pytorch-lightning) LightningModule and we recommend you use PytorchLightning for training and fine-tuning as it makes using mixed precision and distributed training very easy. So to start, let's create Trainer instance for training on GPU for 50 epochs

In [13]:
with open('train_manifest.json','w') as f:
  f.write('\n'.join(train_splits)+'\n')

with open('eval_manifest.json','w') as f:
  f.write('\n'.join(eval_splits)+'\n')


In [14]:
# --- Config Information ---#
try:
    from omegaconf import DictConfig
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML
config_path = './configs/config.yaml'

if not os.path.exists(config_path):
    # Grab the config we'll use in this example
    BRANCH = 'main'
    !mkdir configs
    !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

yaml = YAML(typ='safe')
with open(config_path) as f:
    params = yaml.load(f)

params['model']['train_ds']['manifest_filepath'] = 'train_manifest.json'
params['model']['validation_ds']['manifest_filepath'] = 'eval_manifest.json'
params['model']['optim']['lr'] = 1e-6


--2024-08-23 08:23:49--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/config.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4280 (4.2K) [text/plain]
Saving to: ‘configs/config.yaml’


2024-08-23 08:23:49 (58.5 MB/s) - ‘configs/config.yaml’ saved [4280/4280]



In [15]:
params['model']

{'train_ds': {'manifest_filepath': 'train_manifest.json',
  'sample_rate': 16000,
  'labels': [' ',
   'a',
   'b',
   'c',
   'd',
   'e',
   'f',
   'g',
   'h',
   'i',
   'j',
   'k',
   'l',
   'm',
   'n',
   'o',
   'p',
   'q',
   'r',
   's',
   't',
   'u',
   'v',
   'w',
   'x',
   'y',
   'z',
   "'"],
  'batch_size': 32,
  'trim_silence': True,
  'max_duration': 16.7,
  'shuffle': True,
  'num_workers': 8,
  'pin_memory': True,
  'is_tarred': False,
  'tarred_audio_filepaths': None,
  'shuffle_n': 2048,
  'bucketing_strategy': 'synced_randomized',
  'bucketing_batch_size': None},
 'validation_ds': {'manifest_filepath': 'eval_manifest.json',
  'sample_rate': 16000,
  'labels': [' ',
   'a',
   'b',
   'c',
   'd',
   'e',
   'f',
   'g',
   'h',
   'i',
   'j',
   'k',
   'l',
   'm',
   'n',
   'o',
   'p',
   'q',
   'r',
   's',
   't',
   'u',
   'v',
   'w',
   'x',
   'y',
   'z',
   "'"],
  'batch_size': 32,
  'shuffle': False,
  'num_workers': 8,
  'pin_memory': Tr

In [16]:
params['model']['optim']['lr'] = 0.1

In [17]:
import pytorch_lightning as pl

In [None]:
# Use the smaller learning rate we set before
quartznet.setup_optimization(optim_config=DictConfig(params['model']['optim']))

# Point to the data we'll use for fine-tuning as the training set
quartznet.setup_training_data(train_data_config=params['model']['train_ds'])

# Point to the new validation data for fine-tuning
quartznet.setup_validation_data(val_data_config=params['model']['validation_ds'])

# And now we can create a PyTorch Lightning trainer and call `fit` again.
trainer = pl.Trainer(devices=1, max_epochs=1)
trainer.fit(quartznet)

[NeMo W 2024-08-23 08:26:02 modelPT:665] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2024-08-23 08:26:02 modelPT:786] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.1
        weight_decay: 0.001
    )


[NeMo W 2024-08-23 08:26:02 lr_scheduler:928] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `sample_rate`, please explicitly provide `sample_rate` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `labels`, please explicitly provide `labels` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 collections:196] Dataset loaded with 4517 files totalling 5.89 hours
[NeMo I 2024-08-23 08:26:02 collections:197] 142 files were filtered totalling 0.84 hours


    


[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `sample_rate`, please explicitly provide `sample_rate` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `labels`, please explicitly provide `labels` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 collections:196] Dataset loaded with 1997 files totalling 2.89 hours
[NeMo I 2024-08-23 08:26:02 collections:197] 0 files were filtered totalling 0.00 hours


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-08-23 08:26:09 modelPT:786] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.1
        weight_decay: 0.001
    )
[NeMo I 2024-08-23 08:26:09 lr_scheduler:948] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x79abd65933a0>" 
    will be used during training (effective maximum steps = 142) - 
    Parameters : 
    (warmup_steps: null
    warmup_ratio: null
    min_lr: 0.0
    last_epoch: -1
    max_steps: 142
    )


INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | eval 
1 | encoder           | ConvASREncoder                    | 18.9 M | train
2 | decoder           | AccentASRDecoder                  | 30.6 K | train
3 | loss              | CTCLoss                           | 0      | eval 
4 | spec_augmentation | SpectrogramAugmentation           | 0      | eval 
5 | wer               | WER                               | 0      | eval 
--------------------------------------------------------------------------------
841       Trainable params
18.9 M    Non-trainable params
18.9 M    Total params
75.701    Total estimated model params size (MB)
602       Modules in train mode
6         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

      self.pid = os.fork()
    


Training: |          | 0/? [00:00<?, ?it/s]

## Eval on ckpt

In [None]:
# load model
model = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

# change decoder
model.decoder = AccentASRDecoder(quartznet.decoder).to('cuda')

# load weights
model_ckpt='/content/lightning_logs/version_1/checkpoints/epoch=0-step=142.ckpt'
model.load_state_dict(torch.load(model_ckpt)['state_dict'], strict=True)
model = model.to('cuda')

[NeMo I 2024-08-21 21:01:29 cloud:58] Found existing object /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.
[NeMo I 2024-08-21 21:01:29 cloud:64] Re-using file from: /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2024-08-21 21:01:29 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2024-08-21 21:01:31 features:305] PADDING: 16
[NeMo I 2024-08-21 21:01:34 save_restore_connector:275] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


In [None]:
hypothesis = []
ground_truth = []
whisper_norm = EnglishTextNormalizer()
model = quartznet.to("cuda").eval()
for i in tqdm(range(len(eval_set))):
    op = model.transcribe([eval_set[i]['audio_path']],verbose=False)
    hypothesis.append(op[0])
    ground_truth.append(eval_set[i]['reference'])

100%|██████████| 1997/1997 [02:21<00:00, 14.16it/s]


In [None]:
normalized_hypothesis = [whisper_norm(x) if len(whisper_norm(x)) > 0 else 'NA' for x in hypothesis]
normalized_reference = [whisper_norm(x) if len(whisper_norm(x)) > 0 else 'NA' for x in ground_truth]

ref = ' '.join(normalized_reference).split()
pred = ' '.join(normalized_hypothesis).split()
print(f'WER is: {editdistance.distance(ref,pred)/len(ref)}')

WER is: 0.6914579563851062


With that, we can start training with just one line!

There we go! We've put together a full training pipeline for the model and trained it for 50 epochs.

If you'd like to save this model checkpoint for loading later (e.g. for fine-tuning, or for continuing training), you can simply call `first_asr_model.save_to(<checkpoint_path>)`. Then, to restore your weights, you can rebuild the model using the config (let's say you call it `first_asr_model_continued` this time) and call `first_asr_model_continued.restore_from(<checkpoint_path>)`.

### After Training: Monitoring Progress and Changing Hyperparameters
We can  now start Tensorboard to see how training went. Recall that WER stands for Word Error Rate and so the lower it is, the better.

In [None]:
try:
  from google import colab
  COLAB_ENV = True
except (ImportError, ModuleNotFoundError):
  COLAB_ENV = False

# Load the TensorBoard notebook extension
if COLAB_ENV:
  %load_ext tensorboard
  %tensorboard --logdir lightning_logs/
else:
  print("To use tensorboard, please use this notebook in a Google Colab environment.")

We could improve this model by playing with hyperparameters. We can look at the current hyperparameters with the following:

In [None]:
print(params['model']['optim'])

Let's say we wanted to change the learning rate. To do so, we can create a `new_opt` dict and set our desired learning rate, then call `<model>.setup_optimization()` with the new optimization parameters.

In [None]:
import copy
new_opt = copy.deepcopy(params['model']['optim'])
new_opt['lr'] = 0.001
first_asr_model.setup_optimization(optim_config=DictConfig(new_opt))
# And then you can invoke trainer.fit(first_asr_model)

## Inference

Let's have a quick look at how one could run inference with NeMo's ASR model.

First, ``EncDecCTCModel`` and its subclasses contain a handy ``transcribe`` method which can be used to simply obtain audio files' transcriptions. It also has batch_size argument to improve performance.

In [None]:
audio = [os.path.join(data_dir, 'an4/wav/an4_clstk/mgah/cen2-mgah-b.wav'),
                     os.path.join(data_dir, 'an4/wav/an4_clstk/fmjd/cen7-fmjd-b.wav'),
                     os.path.join(data_dir, 'an4/wav/an4_clstk/fmjd/cen8-fmjd-b.wav'),
                     os.path.join(data_dir, 'an4/wav/an4_clstk/fkai/cen8-fkai-b.wav')]
print(first_asr_model.transcribe(audio=audio,
                                 batch_size=4))

Below is an example of a simple inference loop in pure PyTorch. It also shows how one can compute Word Error Rate (WER) metric between predictions and references.

In [None]:
# Bigger batch-size = bigger throughput
params['model']['validation_ds']['batch_size'] = 16

# Setup the test data loader and make sure the model is on GPU
first_asr_model.setup_test_data(test_data_config=params['model']['validation_ds'])
first_asr_model.cuda()
first_asr_model.eval()

# We will be computing Word Error Rate (WER) metric between our hypothesis and predictions.
# WER is computed as numerator/denominator.
# We'll gather all the test batches' numerators and denominators.
wer_nums = []
wer_denoms = []

# Loop over all test batches.
# Iterating over the model's `test_dataloader` will give us:
# (audio_signal, audio_signal_length, transcript_tokens, transcript_length)
# See the AudioToCharDataset for more details.
for test_batch in first_asr_model.test_dataloader():
        test_batch = [x.cuda() for x in test_batch]
        targets = test_batch[2]
        targets_lengths = test_batch[3]
        log_probs, encoded_len, greedy_predictions = first_asr_model(
            input_signal=test_batch[0], input_signal_length=test_batch[1]
        )
        # Notice the model has a helper object to compute WER
        first_asr_model.wer.update(predictions=greedy_predictions, predictions_lengths=None, targets=targets, targets_lengths=targets_lengths)
        _, wer_num, wer_denom = first_asr_model.wer.compute()
        first_asr_model.wer.reset()
        wer_nums.append(wer_num.detach().cpu().numpy())
        wer_denoms.append(wer_denom.detach().cpu().numpy())

        # Release tensors from GPU memory
        del test_batch, log_probs, targets, targets_lengths, encoded_len, greedy_predictions

# We need to sum all numerators and denominators first. Then divide.
print(f"WER = {sum(wer_nums)/sum(wer_denoms)}")

This WER is not particularly impressive and could be significantly improved. You could train longer (try 100 epochs) to get a better number. Check out the next section on how to improve it further.

## Model Improvements

You already have all you need to create your own ASR model in NeMo, but there are a few more tricks that you can employ if you so desire. In this section, we'll briefly cover a few possibilities for improving an ASR model.

### Data Augmentation

There exist several ASR data augmentation methods that can increase the size of our training set.

For example, we can perform augmentation on the spectrograms by zeroing out specific frequency segments ("frequency masking") or time segments ("time masking") as described by [SpecAugment](https://arxiv.org/abs/1904.08779), or zero out rectangles on the spectrogram as in [Cutout](https://arxiv.org/pdf/1708.04552.pdf). In NeMo, we can do all three of these by simply adding in a `SpectrogramAugmentation` neural module. (As of now, it does not perform the time warping from the SpecAugment paper.)

Our toy model does not do spectrogram augmentation. But the real one we got from cloud does:

In [None]:
print(quartznet._cfg['spec_augment'])

If you want to enable SpecAugment in your model, make sure your .yaml config file contains 'model/spec_augment' section which looks like the one above.

### Transfer learning

Transfer learning is an important machine learning technique that uses a model’s knowledge of one task to make it perform better on another. Fine-tuning is one of the techniques to perform transfer learning. It is an essential part of the recipe for many state-of-the-art results where a base model is first pretrained on a task with abundant training data and then fine-tuned on different tasks of interest where the training data is less abundant or even scarce.

In ASR you might want to do fine-tuning in multiple scenarios, for example, when you want to improve your model's performance on a particular domain (medical, financial, etc.) or on accented speech. You can even transfer learn from one language to another! Check out [this paper](https://arxiv.org/abs/2005.04290) for examples.

Transfer learning with NeMo is simple. Let's demonstrate how the model we got from the cloud could be fine-tuned on AN4 data. (NOTE: this is a toy example). And, while we are at it, we will change model's vocabulary, just to demonstrate how it's done.

In [None]:
# Check what kind of vocabulary/alphabet the model has right now
print(quartznet.decoder.vocabulary)

# Let's add "!" symbol there. Note that you can (and should!) change the vocabulary
# entirely when fine-tuning using a different language.
quartznet.change_vocabulary(
    new_vocabulary=[
        ' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
        'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "'", "!"
    ]
)

After this, our decoder has completely changed, but our encoder (which is where most of the weights are) remained intact. Let's fine tune-this model for 2 epochs on AN4 dataset. We will also use the smaller learning rate from ``new_opt` (see the "After Training" section)`.

In [None]:
# Use the smaller learning rate we set before
quartznet.setup_optimization(optim_config=DictConfig(new_opt))

# Point to the data we'll use for fine-tuning as the training set
quartznet.setup_training_data(train_data_config=params['model']['train_ds'])

# Point to the new validation data for fine-tuning
quartznet.setup_validation_data(val_data_config=params['model']['validation_ds'])

# And now we can create a PyTorch Lightning trainer and call `fit` again.
trainer = pl.Trainer(devices=1, accelerator='gpu', max_epochs=2)
trainer.fit(quartznet)

### Fast Training

Last but not least, we could simply speed up training our model! If you have the resources, you can speed up training by splitting the workload across multiple GPUs. Otherwise (or in addition), there's always mixed precision training, which allows you to increase your batch size.

You can use [PyTorch Lightning's Trainer object](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html?highlight=Trainer) to handle mixed-precision and distributed training for you. Below are some examples of flags you would pass to the `Trainer` to use these features:

```python
# Mixed precision:
trainer = pl.Trainer(amp_level='O1', precision=16)

# Trainer with a distributed backend:
trainer = pl.Trainer(devices=2, num_nodes=2, accelerator='gpu', strategy='ddp')

# Of course, you can combine these flags as well.
```

Finally, have a look at [example scripts in NeMo repository](https://github.com/NVIDIA/NeMo/blob/stable/examples/asr/asr_ctc/speech_to_text_ctc.py) which can handle mixed precision and distributed training using command-line arguments.

### Deployment

Note: It is recommended to run the deployment code from the NVIDIA PyTorch container.

Let's get back to our pre-trained model and see how easy it can be exported to an ONNX file
in order to run it in an inference engine like TensorRT or ONNXRuntime.

If you are running in an environment outside of the NVIDIA PyTorch container (like Google Colab for example) then you will have to build the onnxruntime and onnxruntime-gpu. The cell below gives an example of how to build those runtimes but the example may have to be adapted depending on your environment.

In [None]:
!pip install --upgrade onnxruntime # for gpu, use onnxruntime-gpu
#!mkdir -p ort
#%cd ort
#!git clean -xfd
#!git clone --depth 1 --branch v1.8.0 https://github.com/microsoft/onnxruntime.git .
#!./build.sh --skip_tests --config Release --build_shared_lib --parallel --use_cuda --cuda_home /usr/local/cuda --cudnn_home /usr/lib/#x86_64-linux-gnu --build_wheel
#!pip uninstall -y onnxruntime
#!pip uninstall -y onnxruntime-gpu
#!pip install  --upgrade --force-reinstall ./build/Linux/Release/dist/onnxruntime*.whl
#%cd ..

Then run

In [None]:
import json
import os
import tempfile
import onnxruntime
import torch

import numpy as np
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.data.audio_to_text import AudioToCharDataset
from nemo.collections.asr.metrics.wer import WER

def to_numpy(tensor):
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

def setup_transcribe_dataloader(cfg, vocabulary):
    config = {
        'manifest_filepath': os.path.join(cfg['temp_dir'], 'manifest.json'),
        'sample_rate': 16000,
        'labels': vocabulary,
        'batch_size': min(cfg['batch_size'], len(cfg['audio'])),
        'trim_silence': True,
        'shuffle': False,
    }
    dataset = AudioToCharDataset(
        manifest_filepath=config['manifest_filepath'],
        labels=config['labels'],
        sample_rate=config['sample_rate'],
        int_values=config.get('int_values', False),
        augmentor=None,
        max_duration=config.get('max_duration', None),
        min_duration=config.get('min_duration', None),
        max_utts=config.get('max_utts', 0),
        blank_index=config.get('blank_index', -1),
        unk_index=config.get('unk_index', -1),
        normalize=config.get('normalize_transcripts', False),
        trim=config.get('trim_silence', True),
        parser=config.get('parser', 'en'),
    )
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=config['batch_size'],
        collate_fn=dataset.collate_fn,
        drop_last=config.get('drop_last', False),
        shuffle=False,
        num_workers=config.get('num_workers', 0),
        pin_memory=config.get('pin_memory', False),
    )

quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

quartznet.export('qn.onnx')

ort_session = onnxruntime.InferenceSession('qn.onnx', providers=['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'])

with tempfile.TemporaryDirectory() as tmpdir:
    with open(os.path.join(tmpdir, 'manifest.json'), 'w') as fp:
        for audio_file in files:
            entry = {'audio_filepath': audio_file, 'duration': 100000, 'text': 'nothing'}
            fp.write(json.dumps(entry) + '\n')

    config = {'audio': files, 'batch_size': 4, 'temp_dir': tmpdir}
    temporary_datalayer = setup_transcribe_dataloader(config, quartznet.decoder.vocabulary)
    for test_batch in temporary_datalayer:
        processed_signal, processed_signal_len = quartznet.preprocessor(
            input_signal=test_batch[0].to(quartznet.device), length=test_batch[1].to(quartznet.device)
        )
        ort_inputs = {ort_session.get_inputs()[0].name: to_numpy(processed_signal),}
        ologits = ort_session.run(None, ort_inputs)
        alogits = np.asarray(ologits)
        logits = torch.from_numpy(alogits[0])
        greedy_predictions = logits.argmax(dim=-1, keepdim=False)
        wer = WER(decoding=quartznet.decoding, use_cer=False)
        hypotheses, _ = wer.decoding.ctc_decoder_predictions_tensor(greedy_predictions)
        print(hypotheses)
        break


## Under the Hood

NeMo is open-source and we do all our model development in the open, so you can inspect our code if you wish.

In particular, ``nemo_asr.model.EncDecCTCModel`` is an encoder-decoder model which is constructed using several ``Neural Modules`` taken from ``nemo_asr.modules.`` Here is what its forward pass looks like:
```python
def forward(self, input_signal, input_signal_length):
    processed_signal, processed_signal_len = self.preprocessor(
        input_signal=input_signal, length=input_signal_length,
    )
    # Spec augment is not applied during evaluation/testing
    if self.spec_augmentation is not None and self.training:
        processed_signal = self.spec_augmentation(input_spec=processed_signal)
    encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_len)
    log_probs = self.decoder(encoder_output=encoded)
    greedy_predictions = log_probs.argmax(dim=-1, keepdim=False)
    return log_probs, encoded_len, greedy_predictions
```
Here:

* ``self.preprocessor`` is an instance of ``nemo_asr.modules.AudioToMelSpectrogramPreprocessor``, which is a neural module that takes audio signal and converts it into a Mel-Spectrogram
* ``self.spec_augmentation`` - is a neural module of type ```nemo_asr.modules.SpectrogramAugmentation``, which implements data augmentation.
* ``self.encoder`` - is a convolutional Jasper/QuartzNet-like encoder of type ``nemo_asr.modules.ConvASREncoder``
* ``self.decoder`` - is a ``nemo_asr.modules.ConvASRDecoder`` which simply projects into the target alphabet (vocabulary).

Also, ``EncDecCTCModel`` uses the audio dataset class ``nemo_asr.data.AudioToCharDataset`` and CTC loss implemented in ``nemo_asr.losses.CTCLoss``.

You can use these and other neural modules (or create new ones yourself!) to construct new ASR models.

# Further Reading/Watching:

That's all for now! If you'd like to learn more about the topics covered in this tutorial, here are some resources that may interest you:
- [Stanford Lecture on ASR](https://www.youtube.com/watch?v=3MjIkWxXigM)
- ["An Intuitive Explanation of Connectionist Temporal Classification"](https://towardsdatascience.com/intuitively-understanding-connectionist-temporal-classification-3797e43a86c)
- [Explanation of CTC with Prefix Beam Search](https://medium.com/corti-ai/ctc-networks-and-language-models-prefix-beam-search-explained-c11d1ee23306)
- [Listen Attend and Spell Paper (seq2seq ASR model)](https://arxiv.org/abs/1508.01211)
- [Explanation of the mel spectrogram in more depth](https://towardsdatascience.com/getting-to-know-the-mel-spectrogram-31bca3e2d9d0)
- [Jasper Paper](https://arxiv.org/abs/1904.03288)
- [QuartzNet paper](https://arxiv.org/abs/1910.10261)
- [SpecAugment Paper](https://arxiv.org/abs/1904.08779)
- [Explanation and visualization of SpecAugment](https://towardsdatascience.com/state-of-the-art-audio-data-augmentation-with-google-brains-specaugment-and-pytorch-d3d1a3ce291e)
- [Cutout Paper](https://arxiv.org/pdf/1708.04552.pdf)
- [Transfer Learning Blogpost](https://developer.nvidia.com/blog/jump-start-training-for-speech-recognition-models-with-nemo/)