In [1]:
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import time

In [2]:
# Define Kernel

func = SourceModule("""
__global__
void jacobiGPUBasic(float* x_new, float* A, float* x_current, float* b, const int Nx, const int Ny)
{
    float sum = 0.0;
    int idx = threadIdx.x;
    int j;
    for(j = 0; j < Ny; j++)
    {
        if(idx != j)
        {
            sum += A[idx * Ny + j] * x_current[j];
        }
        x_new[idx] = (b[idx] - sum) / A[idx * Ny + idx];
    }
}
  """).get_function("jacobiGPUBasic")

In [3]:
# Main body of the program

resolutions = [10, 100, 1000, 2000, 3000]
iterations = 1000

for res in resolutions:

    # Define data structures on CPU

    A = np.identity(res).astype(np.float32)
    x_current = np.ones((res,1)).astype(np.float32)
    x_next = np.ones((res,1)).astype(np.float32)
    b = np.ones_like(x_current).astype(np.float32)
    resolution = np.array([res]).astype(np.float32)

    # Allocate memory on GPU

    A_gpu = cuda.mem_alloc(A.nbytes)
    x_current_gpu = cuda.mem_alloc(x_current.nbytes)
    x_next_gpu = cuda.mem_alloc(x_next.nbytes)
    b_gpu = cuda.mem_alloc(b.nbytes)
    resolution_gpu = cuda.mem_alloc(resolution.nbytes)

    # Transfer data to GPU

    cuda.memcpy_htod(A_gpu, A)
    cuda.memcpy_htod(x_current_gpu, x_current)
    cuda.memcpy_htod(x_next_gpu, x_next)
    cuda.memcpy_htod(b_gpu, b)
    cuda.memcpy_htod(resolution_gpu, resolution)

    # Evaluate the kernel

    start = time.time()
    for i in range(iterations):
        func(x_next_gpu,
                A_gpu,
                x_current_gpu,
                b_gpu,
                resolution_gpu,
                resolution_gpu,
                block = (16,1,1)
            )
    end = time.time()

    print("Resolution : ", res, " ; Time (microseconds) : ", ((end-start) / iterations) * 1e6)

    A_gpu.free()
    x_current_gpu.free()
    x_next_gpu.free()
    b_gpu.free()


print(x_current)

Resolution :  10  ; Time (microseconds) :  10.53762435913086
Resolution :  100  ; Time (microseconds) :  8.234739303588867
Resolution :  1000  ; Time (microseconds) :  8.38160514831543
Resolution :  2000  ; Time (microseconds) :  8.464336395263672
Resolution :  3000  ; Time (microseconds) :  8.26716423034668
[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]
