# Installing required packages

In [None]:
!pip install nemo_toolkit[all]
!pip install pydub
!pip install jsonlines
!pip install jiwer
!pip install pyaspeller

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nemo_toolkit[all]
  Downloading nemo_toolkit-1.18.1-py3-none-any.whl (2.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub (from nemo_toolkit[all])
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collecting onnx>=1.7.0 (from nemo_toolkit[all])
  Downloading onnx-1.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting ruamel.yaml (from nemo_toolkit[all])
  Downloading ruamel.yaml-0.17.26-py3-none-any.whl (109 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 kB[

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os
import re

import jsonlines
import librosa
import nemo
import nemo.collections.asr as nemo_asr
import pandas as pd
import pytorch_lightning as pl
from google.colab import drive
from jiwer import wer, cer
from omegaconf import DictConfig
from pyaspeller import YandexSpeller
from sklearn.model_selection import train_test_split
from tqdm import tqdm
# Mount Google Drive; the checkpoint and the manifest are read from /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


# Define the model

In [None]:
# Load the CTC model from the Lightning checkpoint stored on Google Drive.
asr_model = nemo_asr.models.EncDecCTCModel.load_from_checkpoint(
    "/content/drive/MyDrive/epoch=19-step=13880.ckpt"
)

[NeMo W 2023-05-24 19:44:05 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /content/train.jsonl
    sample_rate: 16000
    labels:
    - ' '
    - а
    - б
    - в
    - г
    - д
    - е
    - ж
    - з
    - и
    - й
    - к
    - л
    - м
    - н
    - о
    - п
    - р
    - с
    - т
    - у
    - ф
    - х
    - ц
    - ч
    - ш
    - щ
    - ъ
    - ы
    - ь
    - э
    - ю
    - я
    batch_size: 8
    trim_silence: false
    max_duration: 37.02
    min_duration: 0.01
    num_workers: 0
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    parser: ru
    
[NeMo W 2023-05-24 19:44:05 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a vali

[NeMo I 2023-05-24 19:44:05 features:287] PADDING: 16


# Load the manifest

The manifest contains, for each recording, the path to the audio file, its transcription, and its duration.

In [None]:
# Read every JSON-lines record of the manifest into a list of dicts.
manifest = []
with jsonlines.open('/content/drive/MyDrive/manifest_opochka.jsonl') as reader:
    manifest.extend(reader)

# Train

## Define the PyTorch Lightning trainer

In [None]:
# Lightning trainer: 20 epochs on the CUDA accelerator (fails to start without a GPU runtime).
trainer = pl.Trainer(
    accelerator="cuda",
    max_epochs=20,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


## Split the data into train and test sets

In [None]:
# Hold out 30% of the manifest records for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train, test = train_test_split(manifest, test_size=0.3, random_state=22)

# Write each split to the absolute path that the dataset config reads
# ('/content/train.jsonl', '/content/test.jsonl') instead of a path relative to the
# current working directory, so the two stay in sync even if the cwd has changed.
for split_path, split_records in (
    ('/content/train.jsonl', train),
    ('/content/test.jsonl', test),
):
    with jsonlines.open(split_path, 'w') as writer:
        writer.write_all(split_records)

## Change parameters in the config

In [None]:
# Edit the model's config in place.
# NOTE(review): `_cfg` is a private attribute of the model and is mutated directly here.
params = asr_model._cfg

# presumably turns off the rectangular masks of the spectrogram augmentation — confirm
params['spec_augment']['rect_masks'] = 0

# Every dataset section gets the 'ru' parser, its manifest path and a batch size of 8.
# Validation and test both read the held-out test manifest.
for ds_key, ds_manifest in (
    ('train_ds', '/content/train.jsonl'),
    ('validation_ds', '/content/test.jsonl'),
    ('test_ds', '/content/test.jsonl'),
):
    params[ds_key]['parser'] = 'ru'
    params[ds_key]['manifest_filepath'] = ds_manifest
    params[ds_key]['batch_size'] = 8

# Only the validation section sets num_workers explicitly.
params['validation_ds']['num_workers'] = 0

## Restore model with our new config

In [None]:
# Wrap the edited parameters in a DictConfig and reload the checkpoint with it.
cfg_ = DictConfig(params)

# NOTE(review): `override_config_path` is passed through to `load_from_checkpoint` as an
# extra keyword — confirm that this loader actually applies it; the explicit setup_* calls
# below are what attach the data loaders built from the edited config.
asr_model = nemo_asr.models.EncDecCTCModel.load_from_checkpoint(
    "/content/drive/MyDrive/epoch=19-step=13880.ckpt",
    override_config_path=cfg_,
)

# Build the train / validation / test data loaders from the matching config sections.
asr_model.setup_training_data(cfg_.train_ds)
asr_model.setup_validation_data(cfg_.validation_ds)
asr_model.setup_test_data(cfg_.test_ds)

[NeMo W 2023-05-24 19:49:07 modelPT:161] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /content/train.jsonl
    sample_rate: 16000
    labels:
    - ' '
    - а
    - б
    - в
    - г
    - д
    - е
    - ж
    - з
    - и
    - й
    - к
    - л
    - м
    - н
    - о
    - п
    - р
    - с
    - т
    - у
    - ф
    - х
    - ц
    - ч
    - ш
    - щ
    - ъ
    - ы
    - ь
    - э
    - ю
    - я
    batch_size: 8
    trim_silence: false
    max_duration: 37.02
    min_duration: 0.01
    num_workers: 0
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    parser: ru
    
[NeMo W 2023-05-24 19:49:07 modelPT:168] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a vali

[NeMo I 2023-05-24 19:49:07 features:287] PADDING: 16
[NeMo I 2023-05-24 19:49:08 collections:193] Dataset loaded with 1579 files totalling 1.82 hours
[NeMo I 2023-05-24 19:49:08 collections:194] 0 files were filtered totalling 0.00 hours
[NeMo I 2023-05-24 19:49:08 collections:193] Dataset loaded with 677 files totalling 0.82 hours
[NeMo I 2023-05-24 19:49:08 collections:194] 0 files were filtered totalling 0.00 hours
[NeMo I 2023-05-24 19:49:08 collections:193] Dataset loaded with 677 files totalling 0.82 hours
[NeMo I 2023-05-24 19:49:08 collections:194] 0 files were filtered totalling 0.00 hours


## Train model

In [None]:
# Fine-tune the model using the data loaders attached to it in the previous step.
trainer.fit(model=asr_model)

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2023-05-24 19:49:18 modelPT:722] Optimizer config = Novograd (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        eps: 1e-08
        grad_averaging: False
        lr: 0.001
        weight_decay: 0.001
    )
[NeMo I 2023-05-24 19:49:18 lr_scheduler:910] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f08546af3a0>" 
    will be used during training (effective maximum steps = 3960) - 
    Parameters : 
    (warmup_steps: 500
    warmup_ratio: null
    min_lr: 0.0
    last_epoch: -1
    max_steps: 3960
    )


INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type                              | Params
------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0     
1 | encoder           | ConvASREncoder                    | 18.9 M
2 | decoder           | ConvASRDecoder                    | 34.9 K
3 | loss              | CTCLoss                           | 0     
4 | spec_augmentation | SpectrogramAugmentation           | 0     
5 | _wer              | WER                               | 0     
------------------------------------------------------------------------
18.9 M    Trainable params
0         Non-trainable params
18.9 M    Total params
75.718    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=20` reached.


## Save model to disk

In [None]:
!cp /content/lightning_logs/version_0/checkpoints/epoch=19-step=3960.ckpt /content/drive/MyDrive

# Evaluate

In [None]:
# Collect the audio path of every held-out record, in the same order as `test`,
# and transcribe them with the fine-tuned model.
files = [record['audio_filepath'] for record in test]
transcriptions = asr_model.transcribe(paths2audio_files=files)

Transcribing:   0%|          | 0/170 [00:00<?, ?it/s]

In [None]:
# Score the raw model output against the reference text of every test record.
# The reported numbers are means of per-utterance WER/CER (each utterance weighs the
# same regardless of its length), not corpus-level error rates.
wers = []
cers = []

# The pairing below relies on transcriptions being in the same order as `test`.
assert len(transcriptions) == len(test), "transcriptions and test records are misaligned"

for record, transcription in zip(test, transcriptions):
    reference = record['text']
    # WER/CER are undefined for an empty reference, so skip records whose text is
    # empty or consists only of whitespace (not just the exact strings '' and ' ').
    if not reference.strip():
        continue
    wers.append(wer(reference, transcription))
    cers.append(cer(reference, transcription))
    # Keep the hypothesis next to the reference for the spreadsheet export.
    record['transcript'] = transcription

if wers:
    print('Mean WER: ', sum(wers)/len(wers))
    print('Mean CER: ', sum(cers)/len(cers))
else:
    # Avoid a ZeroDivisionError when no record has a usable reference.
    print('No test records with a non-empty reference text; WER/CER not computed.')

Mean WER:  0.651518846912585
Mean CER:  0.35396722724849966


In [None]:
# Export the test records (including the 'transcript' field added during scoring)
# to an Excel sheet.
df = pd.DataFrame.from_records(data=test)
df.to_excel(excel_writer='quartznet_opochka_without_spellcheck.xlsx')

# Apply a spellchecker to the transcriptions

In [None]:
# Run every transcription through the Yandex speller, keeping the original order.
# tqdm shows progress for this loop.
speller = YandexSpeller()
transcrtiptions_spelled = [
    speller.spelled(transcription) for transcription in tqdm(transcriptions)
]

100%|██████████| 677/677 [10:04<00:00,  1.12it/s]


In [None]:
# Score the spellchecked transcriptions against the reference text of every test record.
# The reported numbers are means of per-utterance WER/CER (each utterance weighs the
# same regardless of its length), not corpus-level error rates.
wers = []
cers = []

# The pairing below relies on the spellchecked list being in the same order as `test`.
assert len(transcrtiptions_spelled) == len(test), "spellchecked transcriptions and test records are misaligned"

for record, transcrtiption_spelled in zip(test, transcrtiptions_spelled):
    reference = record['text']
    # WER/CER are undefined for an empty reference, so skip records whose text is
    # empty or consists only of whitespace (not just the exact strings '' and ' ').
    if not reference.strip():
        continue
    wers.append(wer(reference, transcrtiption_spelled))
    cers.append(cer(reference, transcrtiption_spelled))
    # Replaces any 'transcript' value already stored on the record with the
    # spellchecked hypothesis for the spreadsheet export.
    record['transcript'] = transcrtiption_spelled

if wers:
    print('Mean WER: ', sum(wers)/len(wers))
    print('Mean CER: ', sum(cers)/len(cers))
else:
    # Avoid a ZeroDivisionError when no record has a usable reference.
    print('No test records with a non-empty reference text; WER/CER not computed.')

Mean WER:  0.5962078962023646
Mean CER:  0.3520574026329374


In [None]:
# Export the test records, whose 'transcript' field now holds the spellchecked text,
# to a second Excel sheet.
df = pd.DataFrame.from_records(data=test)
df.to_excel(excel_writer='quartznet_opochka_with_spellcheck.xlsx')