In [1]:
 import numpy as np
 import numba
 from numba import cuda
 
 print(np.__version__)
 print(numba.__version__)
 
 cuda.detect()
 
 # 1.21.6
 # 0.55.2
 #
 # Found 1 CUDA devices
 # id 0             b'Tesla T4'                              [SUPPORTED]
 #                       Compute Capability: 7.5
 #                            PCI Device ID: 4
 #                               PCI Bus ID: 0
 #                                     UUID: GPU-e0b8547a-62e9-2ea2-44f6-9cd43bf7472d
 #                                 Watchdog: Disabled
 #              FP32/FP64 Performance Ratio: 32
 # Summary:
 # 1/1 devices are supported

1.23.4
0.56.4
Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 2070 SUPER'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-1fab6b59-6c9e-c97a-a845-a4abc6279485
                                Watchdog: Enabled
                            Compute Mode: WDDM
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [5]:
import cv2
from numba import cuda
import time
import math

# GPU function
@cuda.jit()
def process_gpu(img):
    tx = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    ty = cuda.blockIdx.y * cuda.blockDim.y + cuda.threadIdx.y
    for channel in range(3):
        color = img[tx, ty][channel] * 2.0 + 30
        if color > 255:
            img[tx, ty][channel] = 255
        elif color < 0:
            img[tx, ty][channel] = 0
        else:
            img[tx, ty][channel] = color


# CPU function
def process_cpu(img, dst):
    height, width, channels = img.shape
    for h in range(height):
        for w in range(width):
            for c in range(channels):
                color = img[h, w][c] * 2.0 + 30
                if color > 255:
                    dst[h, w][c] = 255
                elif color < 0:
                    dst[h, w][c] = 0
                else:
                    dst[h, w][c] = color


if __name__ == '__main__':
    img = cv2.imread("./Figure_1.png")
    height, width, channels = img.shape

    dst_cpu = img.copy()
    start_cpu = time.time()
    process_cpu(img, dst_cpu)
    end_cpu = time.time()
    time_cpu = (end_cpu - start_cpu)
    print("CPU process time: " + str(time_cpu))

    ##GPU function
    dImg = cuda.to_device(img)
    threadsperblock = (32, 32)
    blockspergrid_x = int(math.ceil(height / threadsperblock[0]))
    blockspergrid_y = int(math.ceil(width / threadsperblock[1]))
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    cuda.synchronize()
    start_gpu = time.time()
    process_gpu[blockspergrid, threadsperblock](dImg)
    end_gpu = time.time()
    cuda.synchronize()
    time_gpu = (end_gpu - start_gpu)
    print("GPU process time: " + str(time_gpu))
    dst_gpu = dImg.copy_to_host()

    # save
    cv2.imwrite("result_cpu.jpg", dst_cpu)
    cv2.imwrite("result_gpu.jpg", dst_gpu)
    print("Done.")


CPU process time: 1.9570248126983643
GPU process time: 0.13796734809875488
Done.


In [9]:
 import numpy as np
 import numba
 from numba import cuda
 
 print(np.__version__)
 print(numba.__version__)
 
 cuda.detect()
 
  # Example 1.1: Add scalars
 @cuda.jit
 def add_scalars(a, b, c):
     c[0] = a + b
 
 dev_c = cuda.device_array((1,), np.float32)
 
 add_scalars[1, 1024](2.0, 7.0, dev_c)
 
 c = dev_c.copy_to_host()
 print(f"2.0 + 7.0 = {c[0]}")
 #  2.0 + 7.0 = 9.0

1.23.4
0.56.4
Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 2070 SUPER'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-1fab6b59-6c9e-c97a-a845-a4abc6279485
                                Watchdog: Enabled
                            Compute Mode: WDDM
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported
2.0 + 7.0 = 9.0




In [12]:
cuda.gridDim.x
cuda.blockDim.x
cuda.blockIdx
cuda.

numba.cuda.stubs.blockIdx

In [13]:
# Example 1.2: Add arrays
@cuda.jit
def add_array(a, b, c):
    i = cuda.threadIdx.x + cuda.blockDim.x * cuda.blockIdx.x
    if i < a.size:
        c[i] = a[i] + b[i]

N = 20
a = np.arange(N, dtype=np.float32)
b = np.arange(N, dtype=np.float32)
dev_c = cuda.device_array_like(a)

add_array[4, 8](a, b, dev_c)
import time 
start_time = time.time()
c = dev_c.copy_to_host()
end_time = time.time()
print(c)
print("total time {}s".format(end_time - start_time))
#  [ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18. 20. 22. 24. 26. 28. 30. 32. 34. 36. 38.]

[ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18. 20. 22. 24. 26. 28. 30. 32. 34.
 36. 38.]


