Import all the libraries needed.

In [None]:
!pip install pycuda
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

Define the kernel: one pass of an interleaved pairwise reduction.

In [None]:
# Compile the CUDA source for the `interleaved_reduction(double* xs, int stride)` kernel.
#
# Each thread computes a global index from its block and thread coordinates.
# A thread whose index has no bits set under the mask (2 * stride - 1) adds
# xs[index + stride] into xs[index]; every other thread does nothing.
# When stride is a power of two this selects the indices that are multiples
# of 2 * stride.
#
# NOTE(review): the kernel takes no length argument and performs no bounds
# check, so the launch configuration must guarantee that index + stride stays
# inside the buffer for every thread that passes the mask test.
modd = SourceModule ("""
__global__ void interleaved_reduction(double* xs, int stride)
{
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int index_result = index & (2 * stride - 1);
  if (index_result == 0)
  {
    xs[index] += xs[index + stride];
  }
}
""")

Set up data, threads and blocks

In [None]:
import math
import numpy as np

# Number of elements to reduce.
vector_size = 2**10
# Python's float is a 64-bit double, matching the kernel's `double*` argument.
value_type = float
numThreadsPerBlock = 1024

# The reduction doubles its stride on every pass and the kernel has no bounds
# check, so the scheme only produces the full sum (and only stays inside the
# buffer) when the element count is a power of two. Fail loudly instead of
# silently truncating the pass count.
if vector_size < 1 or vector_size & (vector_size - 1) != 0:
    raise ValueError(
        f"vector_size must be a positive power of two, got {vector_size}"
    )

# Ceiling division in integer arithmetic: enough blocks to cover every element.
numBlocks = (vector_size + numThreadsPerBlock - 1) // numThreadsPerBlock
# Exact log2 of a power of two, with no floating-point rounding involved.
num_iterations = vector_size.bit_length() - 1

Let's create the data.

In [None]:
# Host input: `vector_size` standard-normal samples converted to `value_type`.
a = np.random.randn(vector_size).astype(value_type)
# Second name bound to the same array (not a copy), used by the CPU-side check.
a_cpu = a

# Device buffer sized to hold the host array byte-for-byte, then filled from it.
a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

# Zero-filled host buffer, same shape and dtype as the input, for the copy-back.
result = np.zeros_like(a)

So now we can go ahead and call the kernel.

In [None]:
# Look the kernel up once; the handle is the same for every pass.
sum_kernel = modd.get_function("interleaved_reduction")

# The kernel reduces the device buffer in place. Re-upload the input first so
# that re-running this cell starts from the original data rather than from the
# already-reduced buffer left by a previous run.
cuda.memcpy_htod(a_gpu, a)

# One launch per pass, doubling the stride each time: 1, 2, 4, ...
for i in range(num_iterations):
  stride = 2**i
  # The stride is wrapped in np.int32 to match the kernel's `int` parameter.
  sum_kernel(a_gpu, np.int32(stride), block=(numThreadsPerBlock,1,1), grid=(numBlocks, 1, 1))

# Copy the result back from the GPU memory.
cuda.memcpy_dtoh(result, a_gpu)


Check the data

In [None]:
# CPU reference: total of the host array (rebinds `a_cpu` to that scalar).
a_cpu = a_cpu.sum()

# Verify the results: compare the reference against element 0 of the array
# copied back from the device, with relative and absolute tolerance of 1e-3.
np.allclose(a_cpu, result[0], rtol=0.001, atol=0.001)