In [17]:
import os
import nemo.collections.asr as nemo_asr
from omegaconf import OmegaConf, open_dict
import pytorch_lightning as ptl
from nemo.utils import exp_manager
import torch.nn as nn
import torch
import sys
import time
from tqdm.auto import tqdm

ASR_DIR = "/home/khoatlv"
sys.path.append(ASR_DIR)
from Conformer_ASR.scripts.utils import config, Logger, Config

In [24]:
ASR_DIR = "/home/khoatlv/Conformer_ASR"

tokenizer_cfg = config.get_config(["training", "tokenizer"])
tokenizer_dir = tokenizer_cfg.tokenizer_dir + "_" + str(int(round(time.time(), 0)))
vocab_size = config.get_config(["training", "vocab_size"])
type = tokenizer_cfg.type                # can be wpe or spe
type_cfg = tokenizer_cfg.type_cfg        # ["bpe", "unigram"]

train_manifest_cleaned = "/home/khoatlv/manifests/train_manifest_processed.json"
test_manifest_cleaned = "/home/khoatlv/manifests/test_manifest_processed.json"

! wc -l {train_manifest_cleaned}
! wc -l {test_manifest_cleaned}

# Tokenizer path
print("Tokenizer directory :", tokenizer_dir)

183819 /home/khoatlv/manifests/train_manifest_processed.json
22184 /home/khoatlv/manifests/test_manifest_processed.json
Tokenizer directory : /home/khoatlv/Conformer_ASR/tokenizers/tokenizers_conformer_1654996267


In [25]:
!python3 tokenizers/process_asr_text_tokenizer.py \
   --manifest=$train_manifest_cleaned \
   --data_root=$tokenizer_dir \
   --tokenizer=$type \
   --spe_type=$type_cfg \
   --spe_character_coverage=1.0 \
   --no_lower_case \
   --log \
   --vocab_size=$vocab_size
# ------------------------------------------------------------------- #

INFO:root:Finished extracting manifest : /home/khoatlv/manifests/train_manifest_processed.json
INFO:root:Finished extracting all manifests ! Number of sentences : 183819
[NeMo I 2022-06-12 01:11:15 sentencepiece_tokenizer:307] Processing /home/khoatlv/Conformer_ASR/tokenizers/tokenizers_conformer_1654996267/text_corpus/document.txt and store at /home/khoatlv/Conformer_ASR/tokenizers/tokenizers_conformer_1654996267/tokenizer_spe_bpe_v256
sentencepiece_trainer.cc(177) LOG(INFO) Running command: --input=/home/khoatlv/Conformer_ASR/tokenizers/tokenizers_conformer_1654996267/text_corpus/document.txt --model_prefix=/home/khoatlv/Conformer_ASR/tokenizers/tokenizers_conformer_1654996267/tokenizer_spe_bpe_v256/tokenizer --vocab_size=256 --shuffle_input_sentence=true --hard_vocab_limit=false --model_type=bpe --character_coverage=1.0 --bos_id=-1 --eos_id=-1
sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /home/khoatlv/Conformer_ASR/tokenizers/tokenizers_conf

In [1]:
def load_config(path):
    """Load the config file at `path` and point its datasets at the cleaned manifests.

    The file is read with OmegaConf, converted to a plain container with
    interpolations resolved, and wrapped as an OmegaConf config again. The
    train split is then set to the module-level `train_manifest_cleaned`,
    and the validation and test splits to `test_manifest_cleaned`.

    Returns:
        The resulting OmegaConf config object.
    """
    resolved = OmegaConf.to_container(OmegaConf.load(path), resolve=True)
    cfg = OmegaConf.create(resolved)

    # Dataset section name -> manifest file it should read.
    manifest_by_split = {
        "train_ds": train_manifest_cleaned,
        "validation_ds": test_manifest_cleaned,
        "test_ds": test_manifest_cleaned,
    }
    for split_name, manifest_path in manifest_by_split.items():
        getattr(cfg.model, split_name).manifest_filepath = manifest_path

    return cfg

def enable_bn_se(m):
    """Put a batch-norm or squeeze-excite module back into trainable mode.

    If `m` is an `nn.BatchNorm1d` (or a subclass of it), or its class name
    contains 'SqueezeExcite', the module is switched to training mode and
    every one of its parameters has `requires_grad` set to True. Any other
    module is left untouched.

    The check deliberately avoids calling the builtin `type`, because this
    notebook rebinds the global name `type` to the tokenizer type string,
    which would make `type(m)` raise a TypeError.

    Args:
        m: a torch module, typically supplied by `module.apply(enable_bn_se)`.
    """
    is_batch_norm = isinstance(m, nn.BatchNorm1d)
    is_squeeze_excite = 'SqueezeExcite' in m.__class__.__name__

    if is_batch_norm or is_squeeze_excite:
        m.train()
        for param in m.parameters():
            param.requires_grad_(True)

In [2]:
# asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_small",map_location='cuda')
# asr_model.change_vocabulary(new_tokenizer_dir=TOKENIZER_DIR, new_tokenizer_type=TOKENIZER_TYPE_CFG)

# Restore the model from a local .nemo checkpoint. The weights are mapped to
# the GPU when one is available and to the CPU otherwise, so the restore
# itself does not fail on a machine without CUDA.
asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(
    restore_path="/home/khoatlv/Conformer_ASR/models/conformer/Conformer_small_epoch=98.nemo",
    map_location='cuda' if torch.cuda.is_available() else 'cpu'
)

[NeMo I 2022-05-27 15:10:18 mixins:146] Tokenizer SentencePieceTokenizer initialized with 256 tokens


[NeMo W 2022-05-27 15:10:19 modelPT:148] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /home/nhan/NovaIntechs/data/ASR_Data/manifests/train_manifest_processed.json
    sample_rate: 16000
    max_duration: 16.7
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    shuffle: true
    batch_size: 32
    pin_memory: true
    trim_silence: true
    use_start_end_token: true
    normalize_transcripts: false
    num_workers: 16
    
[NeMo W 2022-05-27 15:10:19 modelPT:155] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manif

[NeMo I 2022-05-27 15:10:19 features:255] PADDING: 0
[NeMo I 2022-05-27 15:10:19 features:272] STFT using torch
[NeMo I 2022-05-27 15:10:40 save_restore_connector:157] Model EncDecCTCModelBPE was successfully restored from /home/khoatlv/Conformer_ASR/models/conformer/Conformer_small_epoch=98.nemo.


In [3]:
# Summarize the restored model.
# NOTE(review): the recorded output of this cell includes a
# `rank_zero_deprecation` warning pointing at this call — check the installed
# PyTorch Lightning version for the non-deprecated replacement.
asr_model.summarize()

      asr_model.summarize()
    
      rank_zero_deprecation(
    


{'sample_rate': 16000, 'log_prediction': True, 'ctc_reduction': 'mean_batch', 'num_workers': 16, 'train_ds': {'manifest_filepath': '/home/nhan/NovaIntechs/data/ASR_Data/manifests/train_manifest_processed.json', 'sample_rate': 16000, 'max_duration': 16.7, 'min_duration': 0.1, 'is_tarred': False, 'tarred_audio_filepaths': None, 'shuffle_n': 2048, 'bucketing_strategy': 'synced_randomized', 'bucketing_batch_size': None, 'shuffle': True, 'batch_size': 32, 'pin_memory': True, 'trim_silence': True, 'use_start_end_token': True, 'normalize_transcripts': False, 'num_workers': 16}, 'validation_ds': {'manifest_filepath': '/home/nhan/NovaIntechs/data/ASR_Data/manifests/test_manifest_processed.json', 'sample_rate': 16000, 'pin_memory': True, 'shuffle': False, 'batch_size': 8, 'trim_silence': True, 'use_start_end_token': True, 'normalize_transcripts': False, 'num_workers': 16}, 'test_ds': {'manifest_filepath': '/home/nhan/NovaIntechs/data/ASR_Data/manifests/test_manifest_processed.json', 'sample_rate

In [None]:
# freeze_encoder = False 
# freeze_encoder = bool(freeze_encoder)

# if freeze_encoder:
#     asr_model.encoder.freeze()
#     asr_model.encoder.apply(enable_bn_se)
#     print("Model encoder has been frozen, and batch normalization has been unfrozen")
# else:
#     asr_model.encoder.unfreeze()
#     print("Model encoder has been un-frozen")

# NOTE(review): `model_config`, `TOKENIZER_DIR` and `TOKENIZER_TYPE_CFG` are not
# assigned in any cell visible in this notebook — define them before running
# this cell on a fresh kernel.

# Load the training configuration. It is bound to `train_cfg` rather than
# `config` so that the imported `config` helper (used via
# `config.get_config(...)` in the tokenizer cell) is not overwritten.
train_cfg = load_config(model_config)

# Set tokenizer config
asr_model.cfg.tokenizer.dir = TOKENIZER_DIR
asr_model.cfg.tokenizer.type = TOKENIZER_TYPE_CFG

# Attach the train / validation / test data described in the loaded config.
asr_model.setup_training_data(train_cfg.model.train_ds)
asr_model.setup_validation_data(train_cfg.model.validation_ds)
asr_model.setup_multiple_test_data(train_cfg.model.test_ds)

# Copy the optimizer and spec-augment sections into the model's own config;
# open_dict allows the assignment of these keys on the model config.
with open_dict(asr_model.cfg):
    asr_model.cfg.optim = train_cfg.model.optim
    asr_model.cfg.spec_augment = train_cfg.model.spec_augment

# Rebuild the augmentation module and the optimizer from the same sections.
asr_model.spec_augmentation = asr_model.from_config_dict(train_cfg.model.spec_augment)
asr_model.setup_optimization(train_cfg.model.optim)

# Metric flags on the model's WER object (presumably: report character error
# rate and log predictions — confirm against the NeMo version in use).
asr_model._wer.use_cer = True
asr_model._wer.log_prediction = True

# Build the Lightning trainer from the config and attach it to the model.
trainer = ptl.Trainer(**train_cfg.trainer)
asr_model.set_trainer(trainer)
asr_model.cfg = asr_model._cfg

# Set up experiment management for this trainer; `logdir` is the value
# returned by the experiment manager.
exp_config = exp_manager.ExpManagerConfig(**train_cfg.exp_manager)
exp_config = OmegaConf.structured(exp_config)
logdir = exp_manager.exp_manager(trainer, exp_config)

In [None]:
# def analyse_ctc_failures_in_model(model):
#     count_ctc_failures = 0
#     am_seq_lengths = []
#     target_seq_lengths = []

#     device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
#     model = model.to(device)
#     mode = model.training
    
#     train_dl = model.train_dataloader()

#     with torch.no_grad():
#       model = model.eval()
#       for batch in tqdm(train_dl, desc='Checking for CTC failures'):
#           x, x_len, y, y_len = batch
#           x, x_len = x.to(device), x_len.to(device)
#           x_logprobs, x_len, greedy_predictions = model(input_signal=x, input_signal_length=x_len)

#           # Find how many CTC loss computation failures will occur
#           for xl, yl in zip(x_len, y_len):
#               if xl <= yl:
#                   count_ctc_failures += 1

#           # Record acoustic model lengths
#           am_seq_lengths.extend(x_len.to('cpu').numpy().tolist())

#           # Record target sequence lengths
#           target_seq_lengths.extend(y_len.to('cpu').numpy().tolist())
          
#           del x, x_len, y, y_len, x_logprobs, greedy_predictions
    
#     if mode:
#       model = model.train()
      
#     return count_ctc_failures, am_seq_lengths, target_seq_lengths

# results = analyse_ctc_failures_in_model(asr_model)
# num_ctc_failures, am_seq_lengths, target_seq_lengths = results
# if num_ctc_failures > 0:
#   print(f"\nCTC loss will fail for {num_ctc_failures} samples ({num_ctc_failures * 100./ float(len(am_seq_lengths))} % of samples)!\n"
#                   f"Increase the vocabulary size of the tokenizer so that this number becomes close to zero !")
# else:
#   print("No CTC failure cases !")
# # Compute average ratio of T / U
# avg_T = sum(am_seq_lengths) / float(len(am_seq_lengths))
# avg_U = sum(target_seq_lengths) / float(len(target_seq_lengths))

# avg_length_ratio = 0
# for am_len, tgt_len in zip(am_seq_lengths, target_seq_lengths):
#   avg_length_ratio += (am_len / float(tgt_len))
# avg_length_ratio = avg_length_ratio / len(am_seq_lengths)

# print(f"Average Acoustic model sequence length = {avg_T}")
# print(f"Average Target sequence length = {avg_U}")
# print()
# print(f"Ratio of Average AM sequence length to target sequence length = {avg_length_ratio}")


In [None]:
# # Train the model
# trainer.fit(asr_model)

## save checkpoint to nemo

In [10]:
# config = load_config(model_config)
# config.model.tokenizer.dir = TOKENIZER_DIR
# config.model.tokenizer.type = TOKENIZER_TYPE_CFG

# asr_model = nemo_asr.models.EncDecCTCModelBPE(config.model)
# checkpoint = torch.load(
#     "/home/khoatlv/Conformer_ASR/experiments/Conformer_small_Model_Language_vi/2022-04-08_04-39-29/checkpoints/Conformer_small_Model_Language_vi--val_wer=0.0438-epoch=47.ckpt",
#     map_location='cuda'
# )
# asr_model.load_state_dict(state_dict=checkpoint['state_dict'])
# asr_model.save_to("/home/khoatlv/Conformer_ASR/models/conformer/Conformer_small_epoch=98+20+11+47=176.nemo")

[NeMo I 2022-05-01 11:40:46 mixins:146] Tokenizer SentencePieceTokenizer initialized with 256 tokens
[NeMo I 2022-05-01 11:40:46 ctc_bpe_models:206] 
    Replacing placeholder number of classes (-1) with actual number of classes - 256
[NeMo I 2022-05-01 11:40:54 collections:173] Dataset loaded with 169524 files totalling 185.88 hours
[NeMo I 2022-05-01 11:40:54 collections:174] 1589 files were filtered totalling 10.03 hours
[NeMo I 2022-05-01 11:40:55 collections:173] Dataset loaded with 19050 files totalling 21.30 hours
[NeMo I 2022-05-01 11:40:55 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-05-01 11:40:56 collections:173] Dataset loaded with 19050 files totalling 21.30 hours
[NeMo I 2022-05-01 11:40:56 collections:174] 0 files were filtered totalling 0.00 hours
[NeMo I 2022-05-01 11:40:56 features:255] PADDING: 0
[NeMo I 2022-05-01 11:40:56 features:272] STFT using torch


In [None]:
# asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(
#     "/home/khoatlv/Conformer_ASR/models/conformer/Conformer_small_epoch=98+20+11+47+10=186.nemo",
#     map_location='cuda'
# )

In [None]:
# audio_name = "FPTOpenSpeechData_Set002_V0.1_011692.wav"
# AUDIO_FILENAME = os.path.join("/home/khoatlv/data/FPT/wav", audio_name)

# text = asr_model.transcribe([AUDIO_FILENAME])
# print(text)