# numba

In [1]:
from numba import cuda

In [2]:
[i for i in cuda.devices.gpus]

[<numba.cuda.cudadrv.devices._DeviceContextManager at 0x7f3adc119ac0>]

In [3]:
import numba

In [4]:
numba.cuda.detect()

Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 2060 SUPER'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 0
                              PCI Bus ID: 38
                                    UUID: GPU-f5dcddd0-bc57-5ebb-7578-229367d62be8
                                Watchdog: Enabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [5]:
device = cuda.get_current_device()

In [6]:
cd = device.get_primary_context()

In [7]:
cd.get_max_potential_block_size?

[0;31mSignature:[0m
[0mcd[0m[0;34m.[0m[0mget_max_potential_block_size[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mfunc[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mb2d_func[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmemsize[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mblocksizelimit[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mflags[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Suggest a launch configuration with reasonable occupancy.
:param func: kernel for which occupancy is calculated
:param b2d_func: function that calculates how much per-block dynamic
                 shared memory 'func' uses based on the block size.
                 Can also be the address of a C function.
                 Use `0` to pass `NULL` to the underlying CUDA API.
:param memsize: per-block dynamic shared memory usage intended, in bytes
:param blocksizelimit: maximum block size the kernel is designed to
        

In [8]:
cd.get_memory_info()

MemoryInfo(free=7363690496, total=8346664960)

In [27]:
device

{}

# module

In [9]:
import torch

In [10]:
from pysdtw import SoftDTW

In [11]:
sdtw = SoftDTW(True)

In [12]:
sdtw.dist_func

<function pysdtw.SoftDTW._euclidean_dist_func(x, y)>

In [13]:
batch_size, seq_len_a, seq_len_b, dims = 10, 512, 1023, 15

In [14]:
a_cpu = torch.rand((batch_size, seq_len_a, dims), requires_grad=True)
b_cpu = torch.rand((batch_size, seq_len_b, dims))
a_gpu = a_cpu.cuda()
b_gpu = b_cpu.cuda()

In [15]:
cd.get_memory_info()

MemoryInfo(free=6549995520, total=8346664960)

In [16]:
6514999296 / 1024**3

6.06756591796875

In [17]:
a_cpu.dtype

torch.float32

In [18]:
a_gpu.dtype

torch.float32

In [25]:
%%timeit
sdtw(a_gpu, b_gpu)

25.5 ms ± 2.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
sdtw_cpu = SoftDTW(False)

In [24]:
%%timeit
sdtw_cpu(a_cpu, b_cpu)

346 ms ± 4.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# pairwise

In [28]:
def _euclidean_dist_func(x, y):
    """
    Calculates the Euclidean distance between each element in x and y per timestep
    """
    n = x.size(1)
    m = y.size(1)
    d = x.size(2)
    x = x.unsqueeze(2).expand(-1, n, m, d)
    y = y.unsqueeze(1).expand(-1, n, m, d)
    return torch.pow(x - y, 2).sum(3)

In [45]:
def pairwise_l2_squared(x, y, theta):
    '''
    https://discuss.pytorch.org/t/efficient-distance-matrix-computation/9065/2

    Input: x is an Nxd matrix
           y is an Mxd matrix
    Output: dist is a NxM matrix where dist[i,j] is the square norm between x[i,:] and y[j,:]
            if y is not given then use 'y=x'.
    i.e. dist[i,j] = ||x[i,:]-y[j,:]||^2
    '''
    x_norm = (theta * x**2).sum(1).view(-1, 1)
    y_t = torch.transpose(y, 0, 1)
    y_norm = (theta * y**2).sum(1).view(1, -1)
    dist = x_norm + y_norm - 2.0 * torch.mm(theta*x, y_t)
    # Ensure diagonal is zero if x=y
    # if y is None:
    #     dist = dist - torch.diag(dist.diag)
    return torch.clamp(dist, 0.0, np.inf)

In [46]:
_euclidean_dist_func(a_cpu, b_cpu).shape

torch.Size([10, 512, 1023])

In [47]:
theta = torch.ones(dims)

In [50]:
# pairwise_l2_squared(a_cpu, b_cpu, torch.ones(dims)).shape

In [52]:
import numpy as np

In [76]:
USE_64 = True

if USE_64:
    bits = 64
    np_type = np.float64
else:
    bits = 32
    np_type = np.float32

@cuda.jit("void(float{}[:, :], float{}[:], float{}[:, :])".format(bits, bits, bits))
def distance_matrix(mat, theta, out):
    m = mat.shape[0]
    n = mat.shape[1]
    i, j = cuda.grid(2)
    d = 0
    if i < m and j < m:
        for k in range(n):
            tmp = theta[k] * (mat[i, k] - mat[j, k])
            d += tmp * tmp
        out[i, j] = d

In [77]:
N = 15

In [80]:
theta = torch.ones(N, requires_grad=True)

In [81]:
theta

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       requires_grad=True)

In [87]:
def gpu_dist_matrix(mat, theta):
    rows = mat.shape[0]

    block_dim = (16, 16)
    grid_dim = (int(rows/block_dim[0] + 1), int(rows/block_dim[1] + 1))

    stream = cuda.stream()
    mat2 = cuda.to_device(np.asarray(mat, dtype=np_type), stream=stream)
    theta2 = cuda.to_device(np.asarray(theta, dtype=np_type), stream=stream)
    
    out2 = cuda.device_array((rows, rows))
    distance_matrix[grid_dim, block_dim](mat2, out2)
    out = out2.copy_to_host(stream=stream)

    return out

In [88]:
cuda

<module 'numba.cuda' from '/home/antoine/anaconda3/envs/ddtw/lib/python3.9/site-packages/numba/cuda/__init__.py'>

In [89]:
A = np.random.random((N, N))

In [90]:
B = gpu_dist_matrix(A, theta)

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.