## install dependencies

In [None]:
!pip install openai-whisper
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2
BRANCH = 'main'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]


## imports

In [None]:
import os
import glob
import os
import random
import subprocess
import editdistance
import tarfile
import wget
import librosa
import IPython.display as ipd
import numpy as np
import json
from google.colab import drive
import soundfile as sf
import argparse
from datasets import load_dataset, Dataset
from whisper.normalizers import EnglishTextNormalizer
import torch
from joblib import Parallel, delayed
from tqdm import tqdm
import json
import pandas as pd
import nemo.collections.asr as nemo_asr
# Mount Google Drive; later cells read and write the dataset under /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


## Download and extract the Svarah dataset

In [None]:
# Run this once: downloads the Svarah archive into Drive and unpacks it there.
data_dir = '/content/drive/MyDrive'
os.listdir(data_dir)  # raises FileNotFoundError early if the directory is missing
if not os.path.exists(data_dir + '/svarah.tar'):
    svarah_url = 'https://indic-asr-public.objectstore.e2enetworks.net/svarah.tar'
    svarah_path = wget.download(svarah_url, data_dir)
    print(f"Dataset downloaded at: {svarah_path}")
    # The context manager closes the archive handle even if extraction fails.
    # NOTE(review): extractall trusts member paths inside the archive; only use
    # it on archives from a trusted source.
    with tarfile.open(svarah_path) as tar:
        tar.extractall(path=data_dir)
else:
    print('data already downloaded')

Dataset downloaded at: /content/drive/MyDrive/svarah.tar


## data loader

In [None]:
def get_data(split):
    """Load the audio referenced by one manifest line.

    Args:
        split: a JSON string with at least the keys 'audio_filepath' and 'text'.

    Returns:
        A tuple ``(audio, text)`` where ``audio`` is a dict with the keys
        'path', 'array' and 'sampling_rate' (the latter two as returned by
        ``sf.read``) and ``text`` is the manifest's 'text' value.
    """
    record = json.loads(split)
    audio_path = record['audio_filepath']
    samples, sample_rate = sf.read(audio_path)
    audio = {
        'path': audio_path,
        'array': samples,
        'sampling_rate': sample_rate,
    }
    return (audio, record['text'])

In [None]:
class eval_dataset(Dataset):
    """In-memory list of (audio, reference transcript) pairs.

    Items are added with ``fill_data`` and read back by integer index.
    NOTE(review): ``Dataset`` here is whatever the imports cell bound to that
    name, and its constructor is never called — confirm this base class is
    the intended one.
    """

    def __init__(self):
        # Parallel lists: audios[k] belongs with sents[k].
        self.audios, self.sents = [], []

    def __len__(self):
        """Number of stored audio entries."""
        return len(self.audios)

    def __getitem__(self, i):
        """Return the i-th sample as a flat dict."""
        audio = self.audios[i]
        return {
            "raw": audio['array'],
            "sampling_rate": audio['sampling_rate'],
            "audio_path": audio['path'],
            "reference": self.sents[i],
        }

    def fill_data(self, aud, sent):
        """Append one audio dict and its reference sentence."""
        self.audios.append(aud)
        self.sents.append(sent)


In [None]:
manifest_path = '/content/drive/MyDrive/svarah/svarah_manifest.json'
train_portion = 0.7

# Read the manifest, then release the file handle before any processing.
with open(manifest_path, 'r') as f:
    data = f.read()

# One JSON object per line. Blank lines are skipped rather than dropping the
# last element unconditionally, so the final record survives when the file
# has no trailing newline.
jsons = [json.loads(line) for line in data.split('\n') if line.strip()]

# Audio paths are prefixed with the directory that contains the manifest.
dataset_root = os.path.dirname(manifest_path)
for js in jsons:
    js['audio_filepath'] = dataset_root + '/' + js['audio_filepath']
splits = [json.dumps(js) for js in jsons]

# Fixed seed so the train/eval partition is the same on every run.
random.seed(0)
random.shuffle(splits)
train_last_idx = int(len(splits) * train_portion)
train_splits = splits[:train_last_idx]
eval_splits = splits[train_last_idx:]

In [None]:
# Decode the evaluation audio in parallel. n_jobs=-1 asks joblib for every
# available core; a large negative value such as -240 presumably resolves to
# n_cpus + 1 + n_jobs and collapses to one worker — verify against joblib docs.
da = Parallel(n_jobs=-1)(delayed(get_data)(split) for split in tqdm(eval_splits))

# Parallel returns results in input order, so audio and text stay aligned.
eval_set = eval_dataset()
for aud, sent in da:
    eval_set.fill_data(aud, sent)

100%|██████████| 1997/1997 [00:08<00:00, 226.06it/s]


In [None]:
from torch import nn

## ASR decoder with custom head

In [None]:
class AccentASRDecoder(nemo_asr.modules.ConvASRDecoder):
    """
    ConvASRDecoder variant with one extra bias-free linear layer applied to the
    pretrained decoder's class scores, used to learn an accent transformation
    matrix. All other layers of the model should be frozen by the caller.

    Args:
        decoder: the pretrained decoder whose ``decoder_layers``, vocabulary
            and dimensions are reused.
    """

    def __init__(self, decoder):
        # Rebuild the parent with the wrapped decoder's dimensions.
        # NOTE(review): `_num_classes - 1` presumably compensates for a class
        # the parent constructor adds itself — confirm against ConvASRDecoder.
        super(AccentASRDecoder, self).__init__(
            decoder._feat_in,
            decoder._num_classes - 1,
            vocabulary=decoder.vocabulary,
        )

        # Reuse the pretrained layers instead of the freshly created ones.
        self.decoder_layers = decoder.decoder_layers
        self.linear = nn.Linear(
            in_features=decoder._num_classes,
            out_features=decoder._num_classes,
            bias=False,
        )
        # Identity initialisation: the new layer starts as a no-op, so the
        # log-probabilities initially equal those of the wrapped decoder.
        # (A Xavier-initialised matrix would scramble the pretrained logits.)
        nn.init.eye_(self.linear.weight)

    def forward(self, encoder_output):
        """Return log-probabilities over classes along the last dimension.

        Args:
            encoder_output: encoder features, [B, C, T] per the layout
                comments below.
        """
        # Adapter module forward step
        if self.is_adapter_available():
            encoder_output = encoder_output.transpose(1, 2)  # [B, T, C]
            encoder_output = self.forward_enabled_adapters(encoder_output)
            encoder_output = encoder_output.transpose(1, 2)  # [B, C, T]

        # Move the class axis last once, apply the accent matrix there, and
        # keep that layout for the softmax (no transpose back and forth).
        logits = self.linear(self.decoder_layers(encoder_output).transpose(1, 2))

        if self.temperature != 1.0:
            logits = logits / self.temperature
        return torch.nn.functional.log_softmax(logits, dim=-1)

In [None]:
# Start from the pretrained English QuartzNet checkpoint.
quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")

# Wrap the pretrained decoder in the custom accent decoder.
quartznet.decoder = AccentASRDecoder(quartznet.decoder)

# Freeze the encoder and the reused decoder layers; only the decoder's new
# linear layer is left trainable.
quartznet.encoder.freeze()
quartznet.decoder.decoder_layers.requires_grad_(False)

[NeMo I 2024-08-23 08:19:53 cloud:68] Downloading from: https://api.ngc.nvidia.com/v2/models/nvidia/nemospeechmodels/versions/1.0.0a5/files/QuartzNet15x5Base-En.nemo to /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo
[NeMo I 2024-08-23 08:19:56 common:826] Instantiating model from pre-trained checkpoint
[NeMo I 2024-08-23 08:19:57 features:305] PADDING: 16
[NeMo I 2024-08-23 08:19:58 save_restore_connector:275] Model EncDecCTCModel was successfully restored from /root/.cache/torch/NeMo/NeMo_2.0.0rc2/QuartzNet15x5Base-En/2b066be39e9294d7100fb176ec817722/QuartzNet15x5Base-En.nemo.


## Evaluate QuartzNet before fine-tuning

In [None]:
# Show the GPU attached to this runtime; the evaluation cell below moves the model to "cuda".
!nvidia-smi

Fri Aug 23 08:20:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0              26W /  70W |    195MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Transcribe every evaluation file and collect its reference transcript.
whisper_norm = EnglishTextNormalizer()
model = quartznet.to("cuda").eval()
hypothesis, ground_truth = [], []
for idx in tqdm(range(len(eval_set))):
    sample = eval_set[idx]
    # transcribe takes a list of paths; one file is passed per call.
    transcripts = model.transcribe([sample['audio_path']], verbose=False)
    hypothesis.append(transcripts[0])
    ground_truth.append(sample['reference'])


100%|██████████| 1997/1997 [02:27<00:00, 13.54it/s]


In [None]:
# Normalise each transcript once; `or 'NA'` substitutes the placeholder when
# normalisation leaves an empty string.
normalized_hypothesis = [whisper_norm(x) or 'NA' for x in hypothesis]
normalized_reference = [whisper_norm(x) or 'NA' for x in ground_truth]

# Word-level edit distance over the concatenated corpus, divided by the number
# of reference words.
ref = ' '.join(normalized_reference).split()
pred = ' '.join(normalized_hypothesis).split()
# An empty reference (e.g. an empty eval set) yields NaN instead of raising
# ZeroDivisionError.
wer = editdistance.distance(ref, pred) / len(ref) if ref else float('nan')
print(f'WER is: {wer}')

WER is: 2.780687553566327


In [None]:
# Write the train and eval splits as newline-terminated manifests (one JSON
# object per line) in the working directory.
for manifest_name, manifest_lines in (
    ('train_manifest.json', train_splits),
    ('eval_manifest.json', eval_splits),
):
    with open(manifest_name, 'w') as f:
        f.write('\n'.join(manifest_lines) + '\n')


In [None]:
# --- Config Information ---#
try:
    from omegaconf import DictConfig
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML
config_path = './configs/config.yaml'

if not os.path.exists(config_path):
    # Grab the config we'll use in this example
    BRANCH = 'main'
    !mkdir configs
    !wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/config.yaml

yaml = YAML(typ='safe')
with open(config_path) as f:
    params = yaml.load(f)

params['model']['train_ds']['manifest_filepath'] = 'train_manifest.json'
params['model']['validation_ds']['manifest_filepath'] = 'eval_manifest.json'
params['model']['optim']['lr'] = 1e-6


--2024-08-23 08:23:49--  https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/asr/conf/config.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4280 (4.2K) [text/plain]
Saving to: ‘configs/config.yaml’


2024-08-23 08:23:49 (58.5 MB/s) - ‘configs/config.yaml’ saved [4280/4280]



In [None]:
import pytorch_lightning as pl

In [None]:
# Build the optimizer from the edited `optim` section of the loaded config.
# NOTE(review): the captured log for this cell prints `lr: 0.1`, not the 1e-6
# assigned to params['model']['optim']['lr'] — confirm the override takes effect.
quartznet.setup_optimization(optim_config=DictConfig(params['model']['optim']))

# Point to the data we'll use for fine-tuning as the training set
quartznet.setup_training_data(train_data_config=params['model']['train_ds'])

# Point to the new validation data for fine-tuning
quartznet.setup_validation_data(val_data_config=params['model']['validation_ds'])

# Create a PyTorch Lightning trainer (one device, one epoch) and fine-tune the model.
trainer = pl.Trainer(devices=1, max_epochs=1)
trainer.fit(quartznet)

[NeMo W 2024-08-23 08:26:02 modelPT:665] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2024-08-23 08:26:02 modelPT:786] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.1
        weight_decay: 0.001
    )


[NeMo W 2024-08-23 08:26:02 lr_scheduler:928] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `sample_rate`, please explicitly provide `sample_rate` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `labels`, please explicitly provide `labels` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 collections:196] Dataset loaded with 4517 files totalling 5.89 hours
[NeMo I 2024-08-23 08:26:02 collections:197] 142 files were filtered totalling 0.84 hours


    


[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `sample_rate`, please explicitly provide `sample_rate` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 audio_to_text_dataset:49] Model level config does not contain `labels`, please explicitly provide `labels` to the dataloaders.
[NeMo I 2024-08-23 08:26:02 collections:196] Dataset loaded with 1997 files totalling 2.89 hours
[NeMo I 2024-08-23 08:26:02 collections:197] 0 files were filtered totalling 0.00 hours


INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2024-08-23 08:26:09 modelPT:786] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.8, 0.5]
        eps: 1e-08
        grad_averaging: False
        lr: 0.1
        weight_decay: 0.001
    )
[NeMo I 2024-08-23 08:26:09 lr_scheduler:948] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x79abd65933a0>" 
    will be used during training (effective maximum steps = 142) - 
    Parameters : 
    (warmup_steps: null
    warmup_ratio: null
    min_lr: 0.0
    last_epoch: -1
    max_steps: 142
    )


INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type                              | Params | Mode 
--------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | eval 
1 | encoder           | ConvASREncoder                    | 18.9 M | train
2 | decoder           | AccentASRDecoder                  | 30.6 K | train
3 | loss              | CTCLoss                           | 0      | eval 
4 | spec_augmentation | SpectrogramAugmentation           | 0      | eval 
5 | wer               | WER                               | 0      | eval 
--------------------------------------------------------------------------------
841       Trainable params
18.9 M    Non-trainable params
18.9 M    Total params
75.701    Total estimated model params size (MB)
602       Modules in train mode
6         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

      self.pid = os.fork()
    


Training: |          | 0/? [00:00<?, ?it/s]

## Evaluate the accent-fine-tuned model

In [None]:
# Transcribe every evaluation file again with the current model state and
# collect the reference transcripts alongside.
whisper_norm = EnglishTextNormalizer()
model = quartznet.to("cuda").eval()
hypothesis, ground_truth = [], []
for idx in tqdm(range(len(eval_set))):
    sample = eval_set[idx]
    # transcribe takes a list of paths; one file is passed per call.
    transcripts = model.transcribe([sample['audio_path']], verbose=False)
    hypothesis.append(transcripts[0])
    ground_truth.append(sample['reference'])


In [None]:
# Normalise each transcript once; `or 'NA'` substitutes the placeholder when
# normalisation leaves an empty string.
normalized_hypothesis = [whisper_norm(x) or 'NA' for x in hypothesis]
normalized_reference = [whisper_norm(x) or 'NA' for x in ground_truth]

# Word-level edit distance over the concatenated corpus, divided by the number
# of reference words.
ref = ' '.join(normalized_reference).split()
pred = ' '.join(normalized_hypothesis).split()
# An empty reference (e.g. an empty eval set) yields NaN instead of raising
# ZeroDivisionError.
wer = editdistance.distance(ref, pred) / len(ref) if ref else float('nan')
print(f'WER is: {wer}')