In [None]:
!apt update
!apt purge *nvidia* -y
!apt install nvidia-driver-530 -y

!pip install pyopencl[pocl]
import sys
sys.path.append('/usr/local/lib/python3.6/site-packages')
import pyopencl as cl
import pyopencl.array as cl_array
import numpy as np

In [2]:
# Print the host CPU description (architecture, core count, flags).
!lscpu
# Query the NVIDIA driver for the GPUs visible to this machine.
!nvidia-smi

Architecture:            x86_64
  CPU op-mode(s):        32-bit, 64-bit
  Address sizes:         46 bits physical, 48 bits virtual
  Byte Order:            Little Endian
CPU(s):                  2
  On-line CPU(s) list:   0,1
Vendor ID:               GenuineIntel
  Model name:            Intel(R) Xeon(R) CPU @ 2.00GHz
    CPU family:          6
    Model:               85
    Thread(s) per core:  2
    Core(s) per socket:  1
    Socket(s):           1
    Stepping:            3
    BogoMIPS:            4000.34
    Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clf
                         lush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_
                         good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fm
                         a cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hyp
                         ervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd i

## Kernel

In [3]:
# OpenCL C source of the element-wise addition kernel `parallelSum`.
# It takes three global float buffers (a_g, b_g, res_g) and no scalar arguments.
# Each work-item reads its global id along dimension 0 and stores
# res_g[gid] = a_g[gid] + b_g[gid]. The kernel performs no bounds check, so the
# global work size must not exceed the number of elements in the buffers.
C_elem_KernelSource = """
__kernel void parallelSum(
    __global const float *a_g,
    __global const float *b_g,
    __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
"""

In [5]:
# Host program: add two float32 vectors of length N on an OpenCL GPU device
# with the `parallelSum` kernel and verify the result against NumPy.

# Vector length
N = 2048

# Discover the available OpenCL platforms
platform_list = cl.get_platforms()

# Select the GPU devices of the first platform that exposes any. The first
# platform in the list is not necessarily the one that owns the GPU (e.g. when
# a CPU-only implementation such as PoCL is installed as well).
devices = []
for candidate in platform_list:
    devices = candidate.get_devices(device_type=cl.device_type.GPU)
    if devices:
        break
if not devices:
    raise RuntimeError("No OpenCL GPU device found on any platform")

# Create Context
context = cl.Context(devices=devices)

# Create Command Queue
queue = cl.CommandQueue(context)

# Create and initialize input vectors (NumPy)
a_np = np.arange(N).astype(np.float32)
b_np = np.arange(N).astype(np.float32)

# Create output vector with the same size as the input one (NumPy)
res_np = np.empty_like(a_np)

# Create and initialize input buffers (OpenCL)
mf = cl.mem_flags
a_g = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
b_g = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)

# Create output buffer with the same size as the input one (OpenCL)
res_g = cl.Buffer(context, mf.WRITE_ONLY, a_np.nbytes)

# Create Program
program = cl.Program(context, C_elem_KernelSource).build()

# Create Kernel
kernel = program.parallelSum

# Set Kernel Arguments. `parallelSum` declares exactly three buffer arguments
# and no scalar ones, so the argument list must match that signature; passing
# an extra leading integer makes argument setup fail.
kernel.set_args(a_g, b_g, res_g)

# Define Global and Local Range Dimension and size: one work-item per element;
# localrange=None lets the OpenCL implementation choose the work-group size.
globalrange = res_np.shape
localrange = None

# Execute Kernel
ev = cl.enqueue_nd_range_kernel(queue, kernel, globalrange, localrange)

# Copy Result from Device to Host once the kernel event has completed
cl.enqueue_copy(queue, res_np, res_g, wait_for=[ev])

# Check on CPU with NumPy:
assert np.allclose(res_np, a_np + b_np)

In [6]:
# Show the vector copied back from the device (NumPy abbreviates long arrays).
print(res_np)

[0.000e+00 2.000e+00 4.000e+00 ... 4.090e+03 4.092e+03 4.094e+03]
