In [2]:
import os
from omegaconf import OmegaConf, open_dict
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.metrics.wer import word_error_rate
from nemo.utils import exp_manager
import nemo
from omegaconf import OmegaConf
# Manifest Utils
from tqdm.auto import tqdm
import json
# Preprocessing steps
import torch.nn as nn
import pytorch_lightning as ptl
import json
from datetime import datetime
from collections import defaultdict
import copy
import torch

In [36]:
WANDB_LOGGER = True
ASR_DIR = "/home/khoatlv/ASR-NEMO"
tokenizer_dir = os.path.join(ASR_DIR, "tokenizers", "tokenizers_citrinet")
TOKENIZER_TYPE = "bpe" #@param ["bpe", "unigram"]

model_config = "config/citrinet_256.yaml"
config_path = os.path.join(ASR_DIR, model_config)

train_manifest_cleaned = "/home/khoatlv/manifests/train_manifest_processed.json"
test_manifest_cleaned = "/home/khoatlv/manifests/test_manifest_processed.json"

train_set = None
test_set = None

In [37]:
def load_config(path):
    config = OmegaConf.load(path)
    config = OmegaConf.to_container(config, resolve=True)
    config = OmegaConf.create(config)
    
    config.model.train_ds.manifest_filepath = train_manifest_cleaned
    config.model.validation_ds.manifest_filepath = test_manifest_cleaned
    config.model.test_ds.manifest_filepath = test_manifest_cleaned
    
    # config.model.train_ds.labels = list(train_set)
    # config.model.validation_ds.labels = list(train_set)
    # config.model.test_ds.labels = list(train_set)
    
    return config

In [30]:
VOCAB_SIZE = 512
!python3 scripts/process_asr_text_tokenizer.py \
  --manifest=$train_manifest_cleaned \
  --vocab_size=$VOCAB_SIZE \
  --data_root=$tokenizer_dir \
  --tokenizer="spe" \
  --spe_type=$TOKENIZER_TYPE \
  --spe_character_coverage=1.0 \
  --no_lower_case \
  --log

INFO:root:Corpus already exists at path : /home/khoatlv/ASR-NEMO/tokenizers_citrinet/text_corpus/document.txt
[NeMo I 2022-03-28 10:11:43 sentencepiece_tokenizer:307] Processing /home/khoatlv/ASR-NEMO/tokenizers_citrinet/text_corpus/document.txt and store at /home/khoatlv/ASR-NEMO/tokenizers_citrinet/tokenizer_spe_bpe_v512
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/home/khoatlv/ASR-NEMO/tokenizers_citrinet/text_corpus/document.txt --model_prefix=/home/khoatlv/ASR-NEMO/tokenizers_citrinet/tokenizer_spe_bpe_v512/tokenizer --vocab_size=512 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /home/khoatlv/ASR-NEMO/tokenizers_citrinet/text_corpus/document.txt
  input_format: 
  model_prefix: /home/khoatlv/ASR-NEMO/tokenizers_citrinet/tokenizer_spe_bpe_v512/tokenizer
  model_type: BPE
  vocab_size: 512
  self_test_s

In [31]:
TOKENIZER_DIR = f"{tokenizer_dir}/tokenizer_spe_{TOKENIZER_TYPE}_v{VOCAB_SIZE}/"
print("Tokenizer directory :", TOKENIZER_DIR)

# Number of tokens in tokenizer - 
with open(os.path.join(TOKENIZER_DIR, 'tokenizer.vocab')) as f:
  tokens = f.readlines()

num_tokens = len(tokens)
print("Number of tokens : ", num_tokens)
if num_tokens < VOCAB_SIZE:
    print(
        f"The text in this dataset is too small to construct a tokenizer "
        f"with vocab size = {VOCAB_SIZE}. Current number of tokens = {num_tokens}. "
        f"Please reconstruct the tokenizer with fewer tokens"
    )

Tokenizer directory : /home/khoatlv/ASR-NEMO/tokenizers_citrinet/tokenizer_spe_bpe_v512/
Number of tokens :  512


In [None]:
asr_model = nemo_asr.models.ASRModel.from_pretrained("stt_en_citrinet_256", map_location='cuda')
# asr_model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type="bpe")
print(asr_model.cfg)

In [5]:
asr_model.summarize()

      asr_model.summarize()
    
      rank_zero_deprecation(
    


  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConvASREncoder                    | 36.3 M
2 | decoder           | ConvASRDecoder                    | 657 K 
3 | loss              | CTCLoss                           | 0     
4 | spec_augmentation | SpectrogramAugmentation           | 0     
5 | _wer              | WERBPE                            | 0     
------------------------------------------------------------------------
37.0 M    Trainable params
0         Non-trainable params
37.0 M    Total params
147.977   Total estimated model params size (MB)

In [34]:
config = load_config(model_config)

cfg = copy.deepcopy(asr_model.cfg)
# Setup new tokenizer
cfg.tokenizer.dir = TOKENIZER_DIR
cfg.tokenizer.type = "bpe"

# Set tokenizer config
asr_model.cfg.tokenizer = cfg.tokenizer

# Setup train, validation, test configs
with open_dict(cfg):
  # Train dataset
  cfg.train_ds.manifest_filepath = train_manifest_cleaned
  cfg.train_ds.batch_size = 32
  cfg.train_ds.num_workers = 8
  cfg.train_ds.pin_memory = True
  cfg.train_ds.use_start_end_token = True
  cfg.train_ds.trim_silence = True

  # Validation dataset
  cfg.validation_ds.manifest_filepath = test_manifest_cleaned
  cfg.validation_ds.batch_size = 8
  cfg.validation_ds.num_workers = 8
  cfg.validation_ds.pin_memory = True
  cfg.validation_ds.use_start_end_token = True
  cfg.validation_ds.trim_silence = True

  # Test dataset
  cfg.test_ds.manifest_filepath = test_manifest_cleaned
  cfg.test_ds.batch_size = 8
  cfg.test_ds.num_workers = 8
  cfg.test_ds.pin_memory = True
  cfg.test_ds.use_start_end_token = True
  cfg.test_ds.trim_silence = True

# setup model with new configs
asr_model.setup_training_data(cfg.train_ds)
asr_model.setup_multiple_validation_data(cfg.validation_ds)
asr_model.setup_multiple_test_data(cfg.test_ds)

with open_dict(asr_model.cfg.optim):
  asr_model.cfg.optim.lr = 0.025
  asr_model.cfg.optim.weight_decay = 0.001
  asr_model.cfg.optim.sched.warmup_steps = None  # Remove default number of steps of warmup
  asr_model.cfg.optim.sched.warmup_ratio = 0.10  # 10 % warmup
  asr_model.cfg.optim.sched.min_lr = 1e-9

with open_dict(asr_model.cfg.spec_augment):
  asr_model.cfg.spec_augment.freq_masks = 2
  asr_model.cfg.spec_augment.freq_width = 27
  asr_model.cfg.spec_augment.time_masks = 2
  asr_model.cfg.spec_augment.time_width = 0.05

asr_model.spec_augmentation = asr_model.from_config_dict(asr_model.cfg.spec_augment)

asr_model._wer.use_cer = True
asr_model._wer.log_prediction = True

trainer = ptl.Trainer(**config.trainer)
asr_model.set_trainer(trainer)
asr_model.cfg = asr_model._cfg

exp_config = exp_manager.ExpManagerConfig(**config.exp_manager)
exp_config = OmegaConf.structured(exp_config)
logdir = exp_manager.exp_manager(trainer, exp_config)

[NeMo I 2022-03-28 10:12:45 collections:173] Dataset loaded with 158604 files totalling 183.65 hours
[NeMo I 2022-03-28 10:12:45 collections:174] 1591 files were filtered totalling 10.04 hours
[NeMo I 2022-03-28 10:12:46 collections:173] Dataset loaded with 17416 files totalling 20.85 hours
[NeMo I 2022-03-28 10:12:46 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-03-28 10:12:47 collections:173] Dataset loaded with 17416 files totalling 20.85 hours
[NeMo I 2022-03-28 10:12:47 collections:174] 0 files were filtered totalling 0.00 hours


In [35]:
def analyse_ctc_failures_in_model(model):
    count_ctc_failures = 0
    am_seq_lengths = []
    target_seq_lengths = []

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    mode = model.training
    
    train_dl = model.train_dataloader()

    with torch.no_grad():
      model = model.eval()
      for batch in tqdm(train_dl, desc='Checking for CTC failures'):
          x, x_len, y, y_len = batch
          x, x_len = x.to(device), x_len.to(device)
          x_logprobs, x_len, greedy_predictions = model(input_signal=x, input_signal_length=x_len)

          # Find how many CTC loss computation failures will occur
          for xl, yl in zip(x_len, y_len):
              if xl <= yl:
                  count_ctc_failures += 1

          # Record acoustic model lengths=
          am_seq_lengths.extend(x_len.to('cpu').numpy().tolist())

          # Record target sequence lengths
          target_seq_lengths.extend(y_len.to('cpu').numpy().tolist())
          
          del x, x_len, y, y_len, x_logprobs, greedy_predictions
    
    if mode:
      model = model.train()
      
    return count_ctc_failures, am_seq_lengths, target_seq_lengths

results = analyse_ctc_failures_in_model(asr_model)
num_ctc_failures, am_seq_lengths, target_seq_lengths = results
if num_ctc_failures > 0:
  print(f"\nCTC loss will fail for {num_ctc_failures} samples ({num_ctc_failures * 100./ float(len(am_seq_lengths))} % of samples)!\n"
                  f"Increase the vocabulary size of the tokenizer so that this number becomes close to zero !")
else:
  print("No CTC failure cases !")
# Compute average ratio of T / U
avg_T = sum(am_seq_lengths) / float(len(am_seq_lengths))
avg_U = sum(target_seq_lengths) / float(len(target_seq_lengths))

avg_length_ratio = 0
for am_len, tgt_len in zip(am_seq_lengths, target_seq_lengths):
  avg_length_ratio += (am_len / float(tgt_len))
avg_length_ratio = avg_length_ratio / len(am_seq_lengths)

print(f"Average Acoustic model sequence length = {avg_T}")
print(f"Average Target sequence length = {avg_U}")
print()
print(f"Ratio of Average AM sequence length to target sequence length = {avg_length_ratio}")


Checking for CTC failures:   0%|          | 0/4957 [00:01<?, ?it/s]


CTC loss will fail for 33 samples (0.020806537035635925 % of samples)!
Increase the vocabulary size of the tokenizer so that this number becomes close to zero !
Average Acoustic model sequence length = 52.62409523088951
Average Target sequence length = 24.510686994022848

Ratio of Average AM sequence length to target sequence length = 2.344184373505856
