In [None]:
# Update numba and restart

# In a conda environment, you would use the following command
# Update Numba to > 0.54
# conda install -c conda-forge numba>=0.54
# or
# conda update -c conda-forge numba>=0.54

# For pip based environments,
# Update Numba to > 0.54
import os
import signal

!pip install --upgrade numba

# This will kill the kernel, click next cell to import the latest numba
os.kill(os.getpid(), signal.SIGKILL)

Collecting numba
  Downloading numba-0.55.1-1-cp37-cp37m-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 11.4 MB/s 
Collecting llvmlite<0.39,>=0.38.0rc1
  Downloading llvmlite-0.38.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.5 MB)
[K     |████████████████████████████████| 34.5 MB 7.0 kB/s 
Installing collected packages: llvmlite, numba
  Attempting uninstall: llvmlite
    Found existing installation: llvmlite 0.34.0
    Uninstalling llvmlite-0.34.0:
      Successfully uninstalled llvmlite-0.34.0
  Attempting uninstall: numba
    Found existing installation: numba 0.51.2
    Uninstalling numba-0.51.2:
      Successfully uninstalled numba-0.51.2
Successfully installed llvmlite-0.38.0 numba-0.55.1


# Naive comparison between WarpRNNT Numba and Torchaudio RNNT [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/titu1994/warprnnt_numba/blob/master/scripts/naive_comparisons/warprnnt_numba_vs_torch_audio.ipynb)

This notebook is a colab compatible way to do a **naive comparison** between the two loss functions. 

*Therefore no conclusions can be reached from this notebook, both are useful in many contexts.*

-----

Note that due to some dangling reference issue with running PyTorch `benchmark.Timer` with global variables for the inputs to the function, we will be writing the code in the notebook and in parallel exporting  the code snippets into a new file called `script.py` which will then be executed to write out the results.

-----

## THIS NOTEBOOK MUST BE RUN TOP TO BOTTOM ONLY. 

-----




Check that a recent Numba version has been installed. Anything > 0.53 will do.

In [1]:
import numba
print(numba.__version__)

0.55.1


Install the `warprnnt_numba` library from https://github.com/titu1994/warprnnt_numba.git

In [2]:
!pip install git+https://github.com/titu1994/warprnnt_numba.git

Collecting git+https://github.com/titu1994/warprnnt_numba.git
  Cloning https://github.com/titu1994/warprnnt_numba.git to /tmp/pip-req-build-qg1cdna3
  Running command git clone -q https://github.com/titu1994/warprnnt_numba.git /tmp/pip-req-build-qg1cdna3
Building wheels for collected packages: warprnnt-numba
  Building wheel for warprnnt-numba (setup.py) ... [?25l[?25hdone
  Created wheel for warprnnt-numba: filename=warprnnt_numba-0.4.0-py2.py3-none-any.whl size=46593 sha256=fb860d7b7248fd879ce4551f3cb5efe7359fe1e8f6013fdb744ee8e97ca08848
  Stored in directory: /tmp/pip-ephem-wheel-cache-cjg4enth/wheels/4f/a0/b1/077219f288994e18d6b0fb5bf326931aecce95cdc804e5203d
Successfully built warprnnt-numba
Installing collected packages: warprnnt-numba
Successfully installed warprnnt-numba-0.4.0


## Utility IPython Magic functions

The following two functions are cell magics that mark cells whose code content is exported into `script.py`. 

-----

## NOTE

Rerunning a cell multiple times will duplicate its code inside the script, so only run this notebook top to bottom; to run it again, come back to this point and re-run from here!

In [3]:
from IPython.core.magic import register_cell_magic

@register_cell_magic
def exec_write_cell(line, cell):
    """Cell magic: append the cell's source to a file, then execute it.

    Usage: ``%%exec_write_cell path/to/file.py``

    Args:
        line: Text following the magic name; used as the output file path.
        cell: Source code of the cell body.

    Raises:
        SyntaxError: If ``cell`` is not valid Python. Nothing is written to
            the file in that case.
    """
    path = line.strip()

    # Compile before touching the file, so a cell with a syntax error is
    # never appended to the script (it would break the script when run later).
    code = compile(cell, path, 'exec')

    # Append mode: every run of the cell adds another copy to the file.
    with open(path, 'a', encoding='utf8') as pyf:
        pyf.write(cell)
        pyf.write("\n\n")

    # Execute with the globals of the module that defines this magic
    # (the notebook namespace when the magic is defined in a cell).
    exec(code, globals())
    print("---> wrote cells to file :", path)

@register_cell_magic
def write_cell(line, cell):
    """Cell magic: append the cell's source to the file named by ``line`` without running it.

    Args:
        line: Text following the magic name; used as the output file path.
        cell: Source code of the cell body.
    """
    # Append mode: every run of the cell adds another copy to the file.
    with open(line, mode='a', encoding='utf8') as out_file:
        out_file.write(cell + "\n\n")

    print("---> wrote cells to file (and did not execute):", line)

In [4]:
import os

# Delete any script left over from a previous run before exporting cells again:
# the export magics open the file in append mode, so a stale copy would
# end up containing duplicated code.
if os.path.exists('script.py'):
    os.remove('script.py')

In [5]:
%%exec_write_cell script.py

import torch
import torchaudio
import os

print("Torch :", torch.__version__)
print("Torch Audio:", torchaudio.__version__)
print("[Note]: Torch audio version must be >= 0.10.0")

Torch : 1.10.0+cu111
Torch Audio: 0.10.0+cu111
[Note]: Torch audio version must be >= 0.10.0
---> wrote cells to file : script.py


In [6]:
%%exec_write_cell script.py

import warprnnt_numba
print("warprnnt_numba:", warprnnt_numba.__version__)

warprnnt_numba: 0.4.0
---> wrote cells to file : script.py


In [7]:
%%exec_write_cell script.py

import numba
cuda_supported = warprnnt_numba.numba_utils.numba_cuda_is_supported(numba.__version__)
print("Numba supports CUDA:", cuda_supported)

Numba supports CUDA: True
---> wrote cells to file : script.py


# Helper methods 

Below are some helper methods that will build the loss functions and then call them on some random data. 

-----

Note that for a given set of input arguments to `data_gen()`, the seeds are set in such a way that if the shape matches (via variable `bs`, `t`, `u` and `v`) then the same tensor is generated. This is for fair comparison between same inputs for two different loss functions. 

In [8]:
%%exec_write_cell script.py

import os
import pickle
import subprocess
import traceback

import torch
import torch.utils.benchmark as benchmark

from torchaudio.transforms import RNNTLoss
from warprnnt_numba.rnnt_loss import RNNTLossNumba


DEVICE = 'cuda'

---> wrote cells to file : script.py


# Losses

For comparison, we use <br>
1) Torchaudio RNNT Loss <br>
2) Numba WarpRNNT Loss <br>

-----

Differences between the two losses - 
* Torchaudio does not currently support [FastEmit Regularization](https://arxiv.org/abs/2010.11148). For such cases we skip the calculation of the loss and leave the result as a blank row.

* Numba does not currently support float16 CUDA calls, therefore we will test only fp32. If an fp16 tensor is passed to the Numba loss, it will be explicitly upcast to fp32 before computing the loss (at the cost of 2x memory for the input tensor). 

* At batch size 32 with large dimensions of T, U and V, the Torchaudio RNNT loss sometimes hard-crashes with a CUDA illegal memory access error. It does not always happen, but when it does, try/except doesn't help, since the error corrupts the CUDA context and requires the script to be rerun. It may be related to the GPU configuration, but I haven't debugged it, so for now those configurations will be commented out and skipped.


In [9]:
%%exec_write_cell script.py

global x, x_len, y, y_len

def data_gen(bs, t=200, u=100, v=1024, dtype=torch.float32):
    """Generate one random batch of RNNT loss inputs on ``DEVICE``.

    The torch RNG is re-seeded with 0 on every call, so calls with the same
    ``bs``, ``t``, ``u`` and ``v`` are intended to produce the same tensors.

    Args:
        bs: Batch size (first dimension of every returned tensor).
        t: Second dimension of ``x``; exclusive upper bound for the random ``x_len`` values.
        u: Third dimension of ``x``; ``y`` has ``u - 1`` columns.
        v: Exclusive upper bound for the values in ``y``; the last dimension of ``x`` is ``v + 1``.
        dtype: dtype of ``x``.

    Returns:
        Tuple ``(x, x_len, y, y_len)``:
        ``x`` is a ``[bs, t, u, v + 1]`` normal-random tensor without grad,
        ``x_len`` is ``[bs]`` int32, ``y`` is ``[bs, u - 1]`` int32 and
        ``y_len`` is ``[bs]`` int32.

    Side effects:
        Rebinds the module-level globals ``x``, ``x_len``, ``y`` and ``y_len``
        to the new tensors and calls ``torch.cuda.empty_cache()``.
    """
    global x, x_len, y, y_len
    torch.cuda.empty_cache()

    shape = [bs, t, u, v + 1]
    # Fixed seed; the order of the random calls below determines the generated data.
    torch.manual_seed(0)
    x = torch.randn(*shape, dtype=dtype, device=DEVICE, requires_grad=False)
    x_len = torch.randint(t, size=[bs], device=DEVICE, dtype=torch.int32)
    y = torch.randint(v, size=[bs, u - 1], device=DEVICE, dtype=torch.int32)
    y_len = torch.randint(u, size=[bs], device=DEVICE, dtype=torch.int32)

    # enforce some RNNT input constraints:
    # one randomly chosen sample gets the full lengths (t and u - 1).
    rand_idx = torch.randint(bs, size=[1])
    x_len[rand_idx] = t
    y_len[rand_idx] = u - 1

    return x, x_len, y, y_len


def check_time_pt(x, x_len, y, y_len, fastemit_lambda=None, clamp=-1.0):
    """Build the Torchaudio RNNT loss and call it once on the given inputs.

    Args:
        x: Logits tensor; the last index of its final dimension is used as the blank id.
        x_len: Passed to the loss as ``logit_lengths``.
        y: Passed to the loss as ``targets``.
        y_len: Passed to the loss as ``target_lengths``.
        fastemit_lambda: Unused here (presumably kept for signature parity
            with ``check_time_numba`` - confirm before removing).
        clamp: Forwarded to ``RNNTLoss(clamp=...)``.

    Raises:
        SystemExit: With status 1 if the loss raises ``NotImplementedError``.
    """
    # Local import so this block does not depend on a file-level `import sys`.
    import sys

    blank = x.shape[-1] - 1
    rnnt_loss = RNNTLoss(blank=blank, clamp=clamp, reduction="none")

    try:
        _ = rnnt_loss(logits=x, targets=y, logit_lengths=x_len, target_lengths=y_len)
    except NotImplementedError:
        print()
        print("RNNT Loss not available on this platform. Could not compute Pytorch Audio RNNT Loss.")
        print("Original error below :")
        print(traceback.format_exc())
        # Use sys.exit rather than the `exit` helper: that name is only injected
        # by the `site` module for interactive use and is not guaranteed to exist.
        sys.exit(1)


def check_time_numba(x, x_len, y, y_len, fastemit_lambda=0.0, clamp=-1.0):
    """Build the Numba WarpRNNT loss and call it once on the given inputs.

    Args:
        x: Activations tensor; the last index of its final dimension is used as the blank id.
        x_len: Passed to the loss as ``act_lens``.
        y: Passed to the loss as ``labels``.
        y_len: Passed to the loss as ``label_lens``.
        fastemit_lambda: Forwarded to ``RNNTLossNumba(fastemit_lambda=...)``.
        clamp: Forwarded to ``RNNTLossNumba(clamp=...)``.
    """
    loss_fn = RNNTLossNumba(
        blank=x.shape[-1] - 1,
        reduction='none',
        fastemit_lambda=fastemit_lambda,
        clamp=clamp,
    )

    # Numba doesnt support fp16: anything that is not fp32 is upcast first.
    acts = x if x.dtype == torch.float32 else x.float()

    _ = loss_fn(acts=acts, labels=y, act_lens=x_len, label_lens=y_len)


def load_results(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code; only use this on files
    written by this notebook/script itself.
    """
    with open(path, mode='rb') as pkl_file:
        return pickle.load(pkl_file)

def save_results(results, path):
    """Pickle ``results`` into the file at ``path``, overwriting any existing content."""
    with open(path, mode='wb') as pkl_file:
        pickle.dump(results, pkl_file)

---> wrote cells to file : script.py


## System Info

The script should emit some key info such as which GPU is being used, how much memory it has and how much is free/allocated at the moment.

In [10]:
%%exec_write_cell script.py

# Print CUDA environment: first the default nvidia-smi report, then the output of `nvidia-smi -L`.
for smi_cmd in (['nvidia-smi'], ['nvidia-smi', '-L']):
    result = subprocess.run(smi_cmd, capture_output=True, text=True, encoding='utf-8')
    print(result.stdout)

Sun Jan 30 10:04:18 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    33W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
%%exec_write_cell script.py

torch.cuda.empty_cache()
print("GPU Memory :", torch.cuda.memory_summary())

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------------------|
| Active memory         |       0 B  |       0 B  |       0 B  |       0 B  |
|       from large pool |       0 B  |       0 B  |       0 B  |       0 B  |
|       from small pool |       0 B  |       0 B  |       0 B  |       0 B  |
|---------------------------------------------------------------

In [12]:
%%exec_write_cell script.py

# Directory the benchmark results are written to.
# Plain string: the original f-string had no placeholders.
basedir = "results/numba_vs_torch_audio/"
# exist_ok=True already makes this a no-op when the directory exists,
# so no separate existence check is needed.
os.makedirs(basedir, exist_ok=True)

---> wrote cells to file : script.py


## Core script

This cell is the main portion of the notebook, which normally should execute with certain combinations noted below.

However, during testing it seems memory is not properly released inside the loop, even with explicit assignment to None and global variables to prevent duplicate references. So this snippet is only written out to the script instead of being executed here.

-----

### Permutations

The ranges have been selected for general Librispeech training. 

Batch size depends on the variable `REQUIRES_GRAD`. Since gradient shape is same as shape of the input joint, it requires roughly 2x the memory so batch size must be halved. 

* `b`: [1, 4, 8, 16]. [32] is added if the loss is being computed for inference only. A GPU with 32 GB of memory can go up to batch size 64 with Numba for inference and 32 for training.
* `t`: [200, 400]. Average length of LS is 16 seconds, with 4x stride ~ 400 timesteps, and with 8x stride of encoder its ~ 200 timesteps.
* `u`: [100, 200]. Depends on how the text was encoded - character encoding (upto 400+ characters) to subword encoding (100-200 sub-words). 
* `v`: [28, 1024]. Represents vocabulary size of the model. 28 is for character encoding - 26 lower case alphabet, space and apostrophe. 1024 is for sub-word encoding with fixed vocabulary size - Google papers tend towards 1024 for their RNNT models (though some are upto 4096).
* `fastemit_lambda`: [0.0, 0.001]. FastEmit regularization strength. 0.0 means it is disabled, and any value > 0 will perform fastemit regularization for numba loss. Skipped for Torchaudio loss.
* `dtype`: [torch.float32]. Fixed to float32 for now, since we cant do the largest test suite due to memory constraints. Will be removed once numba supports float16 for CUDA.
* `clamp`: [-1, 0.1]. Factor for gradient clamping. If -1, it is disabled and any value > 0 will enable the gradient clamping step in numba and torchaudio losses.

In [13]:
%%exec_write_cell script.py

REQUIRES_GRAD = True

print("Gradients will be computed :", REQUIRES_GRAD)

Gradients will be computed : True
---> wrote cells to file : script.py


In [14]:
%%write_cell script.py

# benchmark.Compare takes a list of measurements. Start every run by writing
# an empty list to the results file; measurements are appended to it later.
global results
torch.cuda.empty_cache()

results_path = os.path.join(basedir, 'rnnt_results.pkl')
results = []
save_results(results, results_path)
del results


# Batch size 32 is only added when gradients are not being computed.
batchsizes = [1, 4, 8, 16] + ([] if REQUIRES_GRAD else [32])

# Benchmark every configuration with both losses. Each measurement is appended
# to the pickle at `results_path` as soon as it is taken, so measurements
# already written survive a later crash of the script.
for b in batchsizes:  # 1, 4, 8, 16, 32, 64 (on 32 GB GPUs)
    for t in [200, 400]:  # 200, 400, 600 (LibriSpeech with 4x and 8x stride, on 32 GB GPUs)
        for u in [100, 200]:  # 100, 200  # (char enc, subword enc)
            for v in [28, 1024,]:  # 28, 1024  # (char encoding, Conformer RNNT Vocab Size)
                for fastemit_lambda in [0.0, 0.001]:  # 0.0, 0.001  # (Google FastEmit regularization, no extra memory)
                    for dtype in [torch.float32]:  # (AMP / FP32; Note: Numba impl will force cast to fp32)
                        for clamp in [-1.0, 0.1]:  # Gradient clamping
                            global x, x_len, y, y_len
                            # Rebind the shared input names to None before clearing the CUDA cache.
                            x = None
                            x_len = None
                            y = None
                            y_len = None

                            torch.cuda.empty_cache()

                            # label and sub_label are the rows
                            # description is the column
                            label = 'RNNTLoss'
                            sub_label = (
                                f'[b={b}, t={t}, u={u}, v={v}, '
                                f'fastemit_lambda={fastemit_lambda}, '
                                f'clamp={clamp}, '
                                f'dtype={dtype}]'
                            )

                            print("Computing :", sub_label)

                            # Pytorch Audio
                            env = 'TorchAudio'

                            # Torchaudio is only measured without FastEmit
                            # (its loss is called without any fastemit argument).
                            if fastemit_lambda == 0.0:
                                x, x_len, y, y_len = data_gen(b, t, u, v, dtype=dtype)

                                if REQUIRES_GRAD:
                                    x.requires_grad = True

                                # Weird case of cuda illegal mem access beyond this config for fp 16 / fp 32 for batchsize=32
                                # TODO: debug if its hardware issue or something else.
                                # Works uptil b=32, t=329, u=200, v=1024 then fails above that for fp16
                                # Also, setup b=32, t=600, u=100, v=1024 and above fails for fp32
                                if (b * t * u * v) < (2 ** 31):
                                    # fmt: off
                                    t0 = benchmark.Timer(
                                        stmt='check_time_pt(x, x_len, y, y_len, fastemit_lambda, clamp)',
                                        setup="from __main__ import check_time_pt;",
                                        globals={'x': x, 'x_len': x_len, 'y': y, 'y_len': y_len,
                                                  'fastemit_lambda': fastemit_lambda, 'clamp': clamp},
                                        label=label,
                                        sub_label=sub_label,
                                        description=env,
                                        num_threads=torch.get_num_threads(),
                                    ).blocked_autorange(min_run_time=1.0)
                                    # fmt: on

                                    # Persist the measurement immediately (load, append, save).
                                    results = load_results(results_path)
                                    results.append(t0)
                                    save_results(results, results_path)
                                    del results, t0

                                # Drop all four inputs (the original left `y` out),
                                # matching the cleanup after the Numba measurement.
                                del x, x_len, y, y_len

                            torch.cuda.empty_cache()

                            # Numba
                            env = 'Numba'
                            x, x_len, y, y_len = data_gen(b, t, u, v, dtype=dtype)

                            if REQUIRES_GRAD:
                                x.requires_grad = True

                            # fmt: off
                            t0 = benchmark.Timer(
                                stmt='check_time_numba(x, x_len, y, y_len, fastemit_lambda, clamp);',
                                setup="from __main__ import check_time_numba;",
                                globals={'x': x, 'x_len': x_len, 'y': y, 'y_len': y_len,
                                          'fastemit_lambda': fastemit_lambda, 'clamp': clamp},
                                label=label,
                                sub_label=sub_label,
                                description=env,
                                num_threads=torch.get_num_threads(),
                            ).blocked_autorange(min_run_time=1.0)
                            # fmt: on

                            # Persist the measurement immediately (load, append, save).
                            results = load_results(results_path)
                            results.append(t0)
                            save_results(results, results_path)
                            del results, t0

                            del x, x_len, y, y_len
                            torch.cuda.empty_cache()


---> wrote cells to file (and did not execute): script.py


# Execute script

Now that the script has the code contents necessary to perform the evaluations, execute it from the shell

## Training mode

In [15]:
!python script.py

Torch : 1.10.0+cu111
Torch Audio: 0.10.0+cu111
[Note]: Torch audio version must be >= 0.10.0
warprnnt_numba: 0.4.0
Numba supports CUDA: True
Sun Jan 30 10:04:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P8    33W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                           

### Print out results

Since the output has been written to a pickle file, print out the output of the script above.

In [16]:
# Two blank lines to separate the table from the output above.
print("\n")

# Load the measurements pickled by the script run.
results_path = os.path.join(basedir, 'rnnt_results.pkl')
results = load_results(results_path)

# Render them as a colorized comparison table.
compare = benchmark.Compare(results)
compare.colorize()
compare.print()



[---------------------------------------------------- RNNTLoss ---------------------------------------------------]
                                                                                            |  TorchAudio  |  Numba
1 threads: --------------------------------------------------------------------------------------------------------
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.0, clamp=-1.0, dtype=torch.float32]       |  [92m[1m    3.8   [0m[0m  |  [34m[1m  8.1[0m[0m
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.0, clamp=0.1, dtype=torch.float32]        |  [34m[1m    3.8   [0m[0m  |  [92m[1m  7.7[0m[0m
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.001, clamp=-1.0, dtype=torch.float32]     |              |  [92m[1m  7.7[0m[0m
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.001, clamp=0.1, dtype=torch.float32]      |              |  [34m[1m  7.8[0m[0m
      [b=1, t=200, u=100, v=1024, fastemit_lambda=0.0, clamp=-1.0, dtype=torch.float

## Inference Mode

In [17]:
!sed -i 's/REQUIRES_GRAD = True/REQUIRES_GRAD = False/g' script.py
!python script.py

Torch : 1.10.0+cu111
Torch Audio: 0.10.0+cu111
[Note]: Torch audio version must be >= 0.10.0
warprnnt_numba: 0.4.0
Numba supports CUDA: True
Sun Jan 30 10:11:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.46       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    32W / 149W |      3MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                           

### Print out results

Since the output has been written to a pickle file, print out the output of the script above.

In [18]:
# Two blank lines to separate the table from the output above.
print("\n")

# Load the measurements pickled by the script run.
results_path = os.path.join(basedir, 'rnnt_results.pkl')
results = load_results(results_path)

# Render them as a colorized comparison table.
compare = benchmark.Compare(results)
compare.colorize()
compare.print()



[---------------------------------------------------- RNNTLoss ---------------------------------------------------]
                                                                                            |  TorchAudio  |  Numba
1 threads: --------------------------------------------------------------------------------------------------------
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.0, clamp=-1.0, dtype=torch.float32]       |  [92m[1m    3.8   [0m[0m  |    7.8
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.0, clamp=0.1, dtype=torch.float32]        |  [34m[1m    3.8   [0m[0m  |  [34m[1m  5.3[0m[0m
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.001, clamp=-1.0, dtype=torch.float32]     |              |  [34m[1m  5.2[0m[0m
      [b=1, t=200, u=100, v=28, fastemit_lambda=0.001, clamp=0.1, dtype=torch.float32]      |              |  [92m[1m  5.1[0m[0m
      [b=1, t=200, u=100, v=1024, fastemit_lambda=0.0, clamp=-1.0, dtype=torch.float32]     |  [31m