In [3]:
import numpy as np
import pyopencl as cl

# Constants
WIDTH = 10
LOCAL_SIZE = 2

In [4]:
# Kernels
def sum_kernel(A, B):
    """
    Sums the neighbors of each element in A.

    Args:
        A: The input array.
        B: The output array.
    """

    # Get the global and local IDs.
    i = get_global_id(0)
    j = get_local_id(0)

    # Read the element from the global array.
    a = A[i]

    # Write the element to the local array.
    B[j] = a

    # Synchronize the local work-group.
    barrier(CLK_LOCAL_MEM_FENCE)

    # Sum the neighbors.
    if j == 0:
        B[0] += A[i - 1]
    if j == 1:
        B[1] += A[i]
    if j == 2:
        B[2] += A[i + 1]

def sum_kernel_2(A, B):
    """
    Sums the neighbors of each element in B.

    Args:
        A: The input array.
        B: The output array.
    """

    # Get the global and local IDs.
    i = get_global_id(0)
    j = get_local_id(0)

    # Read the element from the global array.
    a = B[i]

    # Write the element to the local array.
    B[j] = a

    # Synchronize the local work-group.
    barrier(CLK_LOCAL_MEM_FENCE)

    # Sum the neighbors.
    if j == 0:
        B[0] += A[i - 1]
    if j == 1:
        B[1] += A[i]
    if j == 2:
        B[2] += A[i + 1]


In [6]:
# Main function
def main():

    # Create the OpenCL context and device.
    platforms = cl.get_platforms()
    device = platforms[0].get_devices(device_type=cl.device_type.GPU)[0]
    context = cl.Context(devices=[device])
    queue = cl.CommandQueue(context)

    # Create the OpenCL buffers.
    A = np.arange(WIDTH).astype(np.int32)
    B = np.zeros(LOCAL_SIZE).astype(np.int32)
    A_buffer = cl.Buffer(context, cl.mem_flags.READ_ONLY, A.size)
    B_buffer = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, B.size)

    # Compile the kernels.
    program = cl.Program(context, [sum_kernel, sum_kernel_2]).build()

    # Execute the first kernel.
    kernel = program.get_kernel("sum_kernel")
    kernel.set_arg(0, A_buffer)
    kernel.set_arg(1, B_buffer)
    cl.enqueue_nd_range_kernel(queue, kernel, (WIDTH,), None)

    # Execute the second kernel.
    kernel = program.get_kernel("sum_kernel_2")
    kernel.set_arg(0, B_buffer)
    kernel.set_arg(1, B_buffer)
    cl.enqueue_nd_range_kernel(queue, kernel, (LOCAL_SIZE,), None)

    # Copy the results from the OpenCL device to the host.
    B_out = np.empty_like(B)
    cl.enqueue_copy(queue, B_buffer, B_out)

    # Print the results.
    print(B_out)


In [7]:
# Call the main function.
if __name__ == "__main__":
    main()

IndexError: list index out of range