<a href="https://colab.research.google.com/github/vladk17/End-to-end-Deep-Learning-Speech-Recognition-Platform/blob/master/ASR_made_simple_reproduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
"""
You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.

Instructions for setting up Colab are as follows:
1. Open a new Python 3 notebook.
2. Import this notebook from GitHub (File -> Upload Notebook -> "GITHUB" tab -> copy/paste GitHub URL)
3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select "GPU" for hardware accelerator)
4. Run this cell to set up dependencies.
"""
# If you're using Google Colab and not running locally, run this cell.
!pip install wget
!apt-get install sox
!pip install git+https://github.com/NVIDIA/apex.git

!pip install unidecode

!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/master/examples/asr/configs/jasper_an4.
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/master/tests/data/jasper_smaller.yaml

In [0]:
# Install the NeMo core toolkit and its ASR collection from PyPI.
# NOTE(review): a later cell also installs NeMo from a git clone
# (`pip install NeMo/.`); confirm which of the two installs is meant to win.
!pip install nemo-toolkit
!pip install nemo-asr

In [0]:
# Provides the `soundfile` module that is imported below to read the example .wav file.
!pip install SoundFile #vk

In [0]:
# Root directory for the dataset: the an4/ folder and the downloaded archive
# both end up here. Point it somewhere else if the current working directory
# is not where the data should be extracted.
data_dir = "."

In [0]:
import glob
import os
import subprocess
import tarfile
import wget

# Download the dataset. This will take a few moments...
print("******")
an4_archive = data_dir + '/an4_sphere.tar.gz'
if not os.path.exists(an4_archive):
    an4_url = 'http://www.speech.cs.cmu.edu/databases/an4/an4_sphere.tar.gz'
    an4_path = wget.download(an4_url, data_dir)
    print(f"Dataset downloaded at: {an4_path}")
else:
    print("Tarfile already exists.")
    an4_path = an4_archive

# Untar and convert .sph to .wav (using sox).
# The context manager closes the archive handle even if extraction fails.
# NOTE(review): extractall() trusts the member paths inside the downloaded
# archive; acceptable for this known dataset, but do not reuse on untrusted files.
with tarfile.open(an4_path) as tar:
    tar.extractall(path=data_dir)

print("Converting .sph to .wav...")
sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)
for sph_path in sph_list:
    wav_path = sph_path[:-4] + '.wav'
    # check=True surfaces a failing sox call here instead of letting later
    # cells fail on a missing .wav file.
    subprocess.run(["sox", sph_path, wav_path], check=True)
print("Finished conversion.\n******")

In [0]:
%matplotlib inline
import soundfile as sf
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Read one AN4 utterance with soundfile; `sample_rate` is the rate stored in the file.
example_file_name = data_dir + '/an4/wav/an4_clstk/mgah/cen2-mgah-b.wav'
audio, sample_rate = sf.read(example_file_name)

plt.rcParams['figure.figsize'] = (15,7)
plt.title(f'Waveform of Audio Example: {example_file_name}')
plt.ylabel('Amplitude')

# Pass the file's own sample rate: without `sr`, librosa assumes 22050 Hz and
# the time axis is scaled incorrectly.
# NOTE(review): newer librosa releases replace `waveplot` with `waveshow`;
# switch if the installed version no longer provides `waveplot`.
_ = librosa.display.waveplot(audio, sr=sample_rate)

In [0]:
# Get spectrogram using Librosa's Short-Time Fourier Transform (stft)
spec = np.abs(librosa.stft(audio))
spec_db = librosa.amplitude_to_db(spec, ref=np.max)  # Decibels

# Use log scale to view frequencies.
# `sr` must match the audio, otherwise specshow labels both axes as if the
# signal were sampled at its 22050 Hz default.
librosa.display.specshow(spec_db, sr=sample_rate, y_axis='log', x_axis='time')
plt.colorbar()
plt.title('Audio Spectrogram');

In [0]:
# Plot the mel spectrogram of our sample.
# The signal is passed by keyword (`y=`): this works on old and new librosa
# alike, whereas newer releases reject a positional audio argument.
mel_spec = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

# Give specshow the same sample rate so the time and mel axes are labelled correctly.
librosa.display.specshow(
    mel_spec_db, sr=sample_rate, x_axis='time', y_axis='mel')
plt.colorbar()
plt.title('Mel Spectrogram');

In [0]:
# Fetch the NeMo sources into ./NeMo (installed from this checkout in the next cell).
!git clone https://github.com/NVIDIA/NeMo.git

In [0]:
# Install NeMo from the local ./NeMo checkout.
!pip install NeMo/.

In [0]:
# NOTE(review): presumably a runtime dependency of NeMo that its install did not pull in — confirm.
!pip install frozendict

In [0]:
# Install the ASR collection from collections/nemo_asr
!apt-get install libsndfile1
# !pip install NeMo/nemo/collections/asr/.

# Install the NLP collection from collections/nemo_nlp
# !pip install NeMo/nemo/collections/nlp/.

# Install the TTS collection from collections/nemo_tts
# !pip install NeMo/nemo/collections/tts/.

In [0]:
# NeMo's "core" package
import nemo
# NeMo's ASR collection
import nemo.collections.asr as nemo_asr

In [0]:
# Print the GPU status reported by the NVIDIA driver for this runtime.
! nvidia-smi

In [0]:
#import nemo
#import nemo_asr #nemo asr collection

In [0]:
# --- Building Manifest Files --- #
import json

# Function to build a manifest
def build_manifest(transcripts_path, manifest_path, wav_path):
    """Convert an AN4 transcription file into a JSON-lines manifest.

    Each non-blank input line is expected to look like
    ``<s> transcript </s> (fileID)``. For every such line one JSON object is
    written to ``manifest_path`` with the keys ``audio_filepath``,
    ``duration`` (as reported by ``librosa.core.get_duration``) and ``text``
    (the lower-cased transcript without the ``<s>``/``</s>`` markers).

    Args:
        transcripts_path: Path of the transcription file to read.
        manifest_path: Path of the manifest to write; an existing file is
            overwritten.
        wav_path: Directory, relative to the global ``data_dir``, that holds
            the audio. The file is looked up at
            ``<data_dir>/<wav_path>/<middle part of fileID>/<fileID>.wav``.

    Raises:
        ValueError: If a non-blank line has no trailing ``(fileID)`` part.
    """
    with open(transcripts_path, 'r') as fin, open(manifest_path, 'w') as fout:
        for line_no, raw_line in enumerate(fin, start=1):
            line = raw_line.strip()
            if not line:
                continue  # tolerate blank lines (e.g. at the end of the file)

            # Locate the "(fileID)" suffix explicitly rather than slicing by
            # fixed offsets, so a missing final newline or trailing
            # whitespace cannot truncate the file ID.
            id_start = line.rfind('(')
            id_end = line.rfind(')')
            if id_start == -1 or id_end < id_start:
                raise ValueError(
                    f"{transcripts_path}:{line_no}: expected "
                    f"'<s> transcript </s> (fileID)', got {line!r}")

            transcript = line[:id_start].lower()
            transcript = transcript.replace('<s>', '').replace('</s>', '')
            transcript = transcript.strip()

            file_id = line[id_start + 1:id_end].strip()  # e.g. "cen4-fash-b"
            # The sub-directory is the part between the first and last dash
            # of the file ID ("fash" in the example above).
            subdir = file_id[file_id.find('-') + 1:file_id.rfind('-')]
            audio_path = os.path.join(
                data_dir, wav_path, subdir, file_id + '.wav')

            duration = librosa.core.get_duration(filename=audio_path)

            # Write the metadata to the manifest, one JSON object per line
            metadata = {
                "audio_filepath": audio_path,
                "duration": duration,
                "text": transcript
            }
            json.dump(metadata, fout)
            fout.write('\n')
                
# Create the train and test manifests next to the extracted an4/ data.
print("******")

train_transcripts = f"{data_dir}/an4/etc/an4_train.transcription"
train_manifest = f"{data_dir}/an4/train_manifest.json"
build_manifest(
    transcripts_path=train_transcripts,
    manifest_path=train_manifest,
    wav_path='an4/wav/an4_clstk',
)
print("Training manifest created.")

test_transcripts = f"{data_dir}/an4/etc/an4_test.transcription"
test_manifest = f"{data_dir}/an4/test_manifest.json"
build_manifest(
    transcripts_path=test_transcripts,
    manifest_path=test_manifest,
    wav_path='an4/wav/an4test_clstk',
)
print("Test manifest created.")

print("******")

1. A __Neural Module__ (NM) is a block that computes a set of outputs from a set of inputs.
2. The inputs and outputs of Neural Modules have a __NeuralType__.
3. A NeMo application is a __DAG__ (directed acyclic graph) of connected NMs.

### Path to your dataset

In [0]:
# Show the manifest paths created above.
for manifest_path in (train_manifest, test_manifest):
    print(manifest_path)

### Model description

In [0]:
from ruamel.yaml import YAML

# Read the YAML model description; the module constructors below take their
# arguments from it.
config_path = './configs/jasper_smaller.yaml'
yaml = YAML(typ='safe')
with open(config_path) as config_file:
    params = yaml.load(config_file)
    print("******\nLoaded config file.\n******")

# Token vocabulary and audio sample rate declared by the config
labels = params['labels']
sample_rate = params['sample_rate']

In [0]:
# The rest of the notebook refers to the parsed YAML config by this name.
jasper_model_definition = params
# Same vocabulary as params['labels']
labels = jasper_model_definition['labels']
# Display the whole config
jasper_model_definition

### Instantiate necessary Neural Modules

In [0]:
jasper_model_definition['AudioToTextDataLayer']['train']

In [0]:
# neural_factory = nemo.core.NeuralModuleFactory(
#     log_dir=data_dir+f'/an4_tutorial/')    
# #    log_dir=data_dir+'/an4_tutorial/')
# #    log_dir=data_dir+'/ls_tutorial/')

# logger = neural_factory.logger
# logger1 = nemo.logging

In [0]:
# Factory bound to the PyTorch backend; it is used further down to run training.
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
)

In [0]:
# `logger1` was only ever assigned inside a commented-out cell, so this cell
# raised NameError on a fresh kernel. Bind it here, to the same object that
# commented-out assignment used, before inspecting it.
logger1 = nemo.logging
print(dir(logger1))

In [0]:
jasper_model_definition

In [0]:
jasper_model_definition['sample_rate']

In [0]:
# Data layer over the training manifest. The arguments are spelled out here
# instead of being unpacked from
# jasper_model_definition['AudioToTextDataLayer']['train'].
data_layer = nemo_asr.AudioToTextDataLayer(
    manifest_filepath=train_manifest,
    sample_rate=jasper_model_definition['sample_rate'],
    labels=labels,
    batch_size=32,
    shuffle=True,
)

In [0]:
# Converts the raw audio signal to a mel spectrogram; every constructor
# argument comes from the matching section of the YAML config.
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
    **jasper_model_definition['AudioToMelSpectrogramPreprocessor']
)

In [0]:
print(dir(nemo_asr))

In [0]:
#help(nemo_asr.SpectrogramAugmentation)

In [0]:
# Spectrogram augmentation module, configured explicitly with rect_masks=5
# rather than from the YAML config.
spec_augment = nemo_asr.SpectrogramAugmentation(
    rect_masks=5,
)


In [0]:
# Take the encoder's input feature count from the preprocessor section of the
# config so the two cannot drift apart; 64 (the value previously hard-coded
# here) is kept as the fallback when the config does not set `features`.
jasper_encoder = nemo_asr.JasperEncoder(
    feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor'].get('features', 64),
    **jasper_model_definition['JasperEncoder']
)

In [0]:
# The decoder input width must equal the number of filters of the encoder's
# last block. Read it from the config instead of hard-coding 1024, so a
# different encoder config keeps the two modules compatible.
# NOTE(review): assumes the 'JasperEncoder' section holds a 'jasper' list of
# blocks that each define 'filters' — confirm against the YAML in use.
jasper_decoder = nemo_asr.JasperDecoderForCTC(
    feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'],
    num_classes=len(labels)
)

In [0]:
# CTC loss module, sized to the label vocabulary.
ctc_loss = nemo_asr.CTCLossNM(
    num_classes=len(labels),
)

In [0]:
# Decoder module that is applied to the model's log-probabilities below to obtain predictions.
greedy_decoder = nemo_asr.GreedyCTCDecoder()

### Describe how Neural Modules are connected together

In [0]:
# Wire the first half of the graph:
# data layer -> preprocessor -> augmentation -> encoder.
(audio_signal,
 audio_signal_len,
 transcript,
 transcript_len) = data_layer()

processed_signal, processed_signal_len = data_preprocessor(
    input_signal=audio_signal,
    length=audio_signal_len,
)

aug_signal = spec_augment(input_spec=processed_signal)

encoded, encoded_len = jasper_encoder(
    audio_signal=aug_signal,
    length=processed_signal_len,
)

In [0]:
# 1) CTC model: the decoder produces log-probabilities, from which we derive
#    both the greedy predictions and the training loss.
log_probs = jasper_decoder(encoder_output=encoded)
predictions = greedy_decoder(log_probs=log_probs)
loss = ctc_loss(
    log_probs=log_probs,
    targets=transcript,
    input_length=encoded_len,
    target_length=transcript_len,
)
tensors_to_evaluate = [predictions, transcript, transcript_len]

In [0]:
# #2) Instantiate additional Neural Modules
# connector=nemo_asr.JasperRNNConnector(in_channels=1024, out_channels=jasper_model_definition['rnn_decoder']['hidden_size'])
# #RNN decoder with attention
# rnn_decoder=nemo.backends.pytorch.common.DecoderRNN(voc_size=len(labels), bos_id=1, **jasper_model_definition['rnn_decoder'])
# seq_loss=nemo.backends.pytorch.common.SequentialLoss(pad_id=0, smoothing_coef=0.0)

# #define second part of DAG
# encoded2=connector(tensor=encoded)
# log_probs, _=rnn_decoder(targets=transcript,
#                         encoder_outputs=encoded2)
# loss=seq_loss(log_probs=log_probs, targets=transcript)

# #some bookkeeping
# labels=['pad', 'bos', 'eos']+labels
# tensor_to_evaluate=None

### Run training

In [0]:
#from nemo_asr.helpers import monitor_asr_train_process
from nemo.collections.asr.helpers import monitor_asr_train_progress, \
    process_evaluation_batch, process_evaluation_epoch
from functools import partial
# Logs training progress.
train_callback = nemo.core.SimpleLossLoggerCallback(
    # Notice that we pass in loss, predictions, and the transcript info.
    # Of course we would like to see our training loss, but we need the
    # other arguments to calculate the WER.
    # (The second entry used to be the config dict `jasper_model_definition`
    # instead of the `predictions` tensor the comment above describes.)
    tensors=[loss, predictions, transcript, transcript_len],
    # The print_func defines what gets printed.
    print_func=partial(
        monitor_asr_train_progress,
        labels=labels,
        )
    )

# NOTE(review): these eval tensors come from the graph built on the training
# data layer, not from a separate test-manifest data layer, and this callback
# is not passed to neural_factory.train() below — confirm the intended setup.
eval_callback = nemo.core.EvaluatorCallback(
    eval_tensors=[loss, predictions, transcript, transcript_len],
    user_iter_callback=partial(
        process_evaluation_batch, labels=labels),
    user_epochs_done_callback=partial(
        process_evaluation_epoch,
        ),
    eval_step=500  # How often the evaluation is run
    )

checkpoint_saver_callback = nemo.core.CheckpointCallback(
    folder=data_dir+'/an4_checkpoints',
    step_freq=1000  # How often checkpoints are saved
    )

In [0]:
#instantiate Neural Factory with supported backend
#neural_factory=nemo.core.NeuralModelFactory(backend=nemo.core.Backend.PyTorch)

In [0]:
help(neural_factory.get_trainer)

In [0]:
# optimizer = neural_factory.get_trainer(
#      params={'optimizer_kind':'novograd',
#             'optimization_params': {'num_epochs':15, 'lr':2e-2,
#                                    'weight_decay':1e-4}})

In [0]:
# optimizer.train(tensors_to_optimize=[loss],
#                callback=[train_callback],
#                tensors_to_evaluate=tensors_to_evaluate)

In [0]:
help(neural_factory.train)

In [0]:
# neural_factory.train(
#     tensors_to_optimize=[loss],
#     callbacks=[train_callback, eval_callback, checkpoint_saver_callback],
#     optimizer='novograd',
#     optimization_params={
#         "num_epochs": 100, "lr": 0.01, "weight_decay": 1e-4
#     })

# Optimize the CTC loss with NovoGrad; only the training-progress callback is attached.
neural_factory.train(
    tensors_to_optimize=[loss],
    callbacks=[train_callback],
    optimizer='novograd',
    optimization_params={"num_epochs": 100, "lr": 0.01, "weight_decay": 1e-4},
)


In [0]:
#jasper_encoder.save_to('jasper_encoder.pt')
#jasper_encoder.freeze()

In [0]:
# Factory variant with mixed-precision optimization (mxprO1).
# Fixes relative to the previous version of this cell:
#   * the class is `NeuralModuleFactory` (as used earlier in the notebook),
#     not `NeuralModelFactory`;
#   * `args.local_rank` and `device` are not defined anywhere in the visible
#     notebook, so local_rank is set to None and `placement` is left at the
#     factory's default.
# NOTE(review): None is assumed to mean single-process (non-distributed)
# here; pass an explicit `placement` if a specific device is required.
neural_factory = nemo.core.NeuralModuleFactory(
    backend=nemo.core.Backend.PyTorch,
    local_rank=None,
    optimization_level=nemo.core.Optimization.mxprO1,
)