# Helper for Numba Simulation

> Check if the Numba CUDA simulator is enabled

In [None]:
#| default_exp NumbaSimSetup

In [None]:
#| export
import os
import sys

In [None]:
#| export
def is_sim():
    """Return True when the Numba CUDA simulator is switched on.

    The simulator counts as enabled only when the ``NUMBA_ENABLE_CUDASIM``
    environment variable is set to exactly ``'1'``; an unset variable or
    any other value yields False.
    """
    flag = os.getenv('NUMBA_ENABLE_CUDASIM')
    return flag == '1'

In [None]:
# Sanity check: NUMBA_ENABLE_CUDASIM must not already be '1' before set_sim() runs.
assert not is_sim()

## Set sim

`set_sim()` has to be called before importing **cuda** (`from numba import cuda`).

In [None]:
#| export
def set_sim():
    """Set up the Numba CUDA simulator for the current process.

    Writes ``'1'`` to the ``NUMBA_ENABLE_CUDASIM`` environment variable
    unless ``is_sim()`` already reports the simulator as enabled. Per the
    accompanying notebook note, call this before importing ``numba.cuda``.
    """
    if is_sim():
        # Already enabled: leave the environment untouched.
        return
    os.environ['NUMBA_ENABLE_CUDASIM'] = '1'

In [None]:
# Turn the simulator flag on, then confirm is_sim() now sees it.
set_sim()
assert is_sim()

## Check whether CUDA is available

In [None]:
#| export
def cuda_avail():
    """Return the result of ``numba.cuda.is_available()``.

    The import happens inside the function, so ``numba.cuda`` is not loaded
    merely by importing this module.
    """
    from numba import cuda as _cuda

    available = _cuda.is_available()
    return available


In [None]:
# NOTE(review): this runs after set_sim(), so it presumably passes through the
# simulator even on a machine without a GPU — confirm.
assert cuda_avail()

## device api mimics **torch.device**

In [None]:
#| eval: false
# Allocate a one-element device array and inspect its concrete type.
from numba import cuda
d = cuda.device_array(1)
type(d)

numba.cuda.simulator.cudadrv.devicearray.FakeCUDAArray

For a tensor `d` that is allocated on a CUDA device:
```
isinstance(d, cuda.cudadrv.devicearray.DeviceNDArray)
```

So we are going to use the presence of `copy_to_host` to check whether the tensor already lives on the device and can be moved to the host.

In [None]:
#| export
def device(x):
    """Return ``"cuda"`` or ``"cpu"`` depending on where `x` appears to live.

    Any object exposing a ``copy_to_host`` attribute is reported as
    ``"cuda"``; everything else is reported as ``"cpu"``.
    """
    if hasattr(x, 'copy_to_host'):
        return "cuda"
    return "cpu"

In [None]:
# `d` is the device array created earlier in the notebook; its host copy should report 'cpu'.
assert device(d) == 'cuda'
assert device(d.copy_to_host()) == 'cpu'

In [None]:
#| export
import numpy as np

def test_close(a, b, tol=1e-4):
    """Return True when `a` and `b` are element-wise equal within `tol`.

    `tol` is forwarded to ``np.allclose`` as both the relative (``rtol``)
    and the absolute (``atol``) tolerance.
    """
    return np.allclose(a, b, rtol=tol, atol=tol)


# The ``test_`` prefix makes pytest collect this helper as a test case whenever
# it is imported into a test module (failing on the missing `a`/`b` fixtures).
# Opt out of collection explicitly.
test_close.__test__ = False


In [None]:
# Each element differs by about 1e-4, which is within the combined atol + rtol budget.
a = np.array([1.0, 2.0, 3.0], dtype=np.float32)
b = np.array([1.0001, 2.0001, 3.0001], dtype=np.float32)
assert test_close(a, b, tol=1e-4)

In [None]:
#| export
def is_colab():
    """Return True when the ``google.colab`` module is already loaded.

    Detection is based solely on ``'google.colab'`` being a key in
    ``sys.modules``; nothing is imported by this check.
    """
    loaded_modules = sys.modules
    return 'google.colab' in loaded_modules

In [None]:
# Expected to hold whenever the notebook is run outside Google Colab.
assert not is_colab()

In [None]:
#| export
import math
def dim(base:float, th:float):
    """Return ``base / th`` rounded up to the nearest integer.

    In other words, the number of chunks of size `th` needed to cover
    `base`. Raises ``ZeroDivisionError`` when `th` is zero.
    """
    ratio = base / th
    return math.ceil(ratio)

In [None]:
# 8/5 rounds up to 2; an exact multiple stays at 1.
assert dim(8, 5) == 2
assert dim(8, 8) == 1

## Performance Capture

In [None]:
#| export

import time
from functools import wraps

def perf(warmup=2, iters=20):
    """Decorator factory that times a function and prints its average latency.

    The decorated function is first called `warmup` times (results discarded),
    then called `iters` times while being timed. Under the Numba CUDA
    simulator (``is_sim()`` is true) wall-clock time from
    ``time.perf_counter`` is used; otherwise each call is bracketed by CUDA
    events and the per-call event times are averaged.

    Args:
        warmup: Number of untimed calls made before measuring.
        iters: Number of timed calls; must be at least 1.

    Returns:
        A decorator. The wrapped function prints
        ``"<name>: <ms> ms (avg of <iters> runs)"`` and returns the result of
        the last timed call.

    Raises:
        ValueError: If `iters` is smaller than 1 (the average would otherwise
            fail with a division by zero only after the warmup calls ran).
    """
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    def decorator(fn):
        @wraps(fn)
        def wrapper(*args, **kwargs):
            # Warmup runs: absorb one-off costs before measuring.
            for _ in range(warmup):
                fn(*args, **kwargs)

            if is_sim():
                # Simulator path: time the whole batch with the host clock.
                start = time.perf_counter()
                for _ in range(iters):
                    result = fn(*args, **kwargs)
                elapsed_ms = (time.perf_counter() - start) * 1000 / iters
            else:
                # Imported here because no exported cell imports `cuda` at
                # module level; without this the exported module would raise
                # NameError on the real-GPU path.
                from numba import cuda

                # Drain pending device work so it is not billed to the first run.
                cuda.synchronize()

                start = cuda.event()
                end = cuda.event()
                elapsed_ms = 0

                for _ in range(iters):
                    start.record()
                    result = fn(*args, **kwargs)
                    end.record()
                    # Wait for the end event before reading the elapsed time.
                    end.synchronize()
                    elapsed_ms += cuda.event_elapsed_time(start, end)

                elapsed_ms /= iters

            print(f"{fn.__name__}: {elapsed_ms:.4f} ms (avg of {iters} runs)")
            return result
        return wrapper
    return decorator


### CPU-only setup for Numba CUDA simulator

In [None]:
# Time a function with a known duration using perf's default warmup/iters;
# the reported average should land near the 10 ms sleep.
@perf()
def test_sleep():
    time.sleep(0.01)  # 10ms sleep

test_sleep()

test_sleep: 10.6071 ms (avg of 20 runs)


### NumbaSim Setup

In [None]:
# Element-wise addition kernel: each thread handles the element at its
# 1-D grid index and writes a[idx] + b[idx] into c[idx].
@cuda.jit
def add_kernel(a, b, c):
    idx = cuda.grid(1)
    # Guard against threads whose index falls past the end of the input.
    if idx < a.size:
        c[idx] = a[idx] + b[idx]

# Test data
N = 1
a = cuda.to_device(np.ones(N, dtype=np.float32))
b = cuda.to_device(np.ones(N, dtype=np.float32))
c = cuda.device_array(N, dtype=np.float32)

# Time the kernel launch: 2 warmup calls, then the average over 10 timed calls.
@perf(warmup=2, iters=10)
def run_add():
    # Launch configuration [1, 1]: one block containing one thread.
    add_kernel[1, 1](a, b, c)

run_add()

run_add: 1.2910 ms (avg of 10 runs)


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()