In [None]:
!pip install pycuda

In [None]:
import pycuda.driver as drv
import pycuda.autoinit
import pycuda.gpuarray as gpuarray

# Explicitly initialise the CUDA driver API.
# NOTE(review): pycuda.autoinit is imported above and presumably already does
# this, in which case the call is redundant but harmless - confirm before removing.
drv.init()

# Show the version tuple reported by drv.get_version().
cuda_version = drv.get_version()
print(cuda_version)


In [None]:
import numpy as np

# Number of random elements: 2^12 = 4096.
# `1 << 12` is 2**12; note that `2 << 12` would be 8192 (2**13).
num_elements = 1 << 12

# Generate the two float32 input vectors on the CPU, plus the CPU reference sum.
a_host = np.random.rand(num_elements).astype(np.float32)
b_host = np.random.rand(num_elements).astype(np.float32)
c_host = a_host + b_host

# Print the shapes and a small slice of each input to confirm generation worked.
print("Array shape:", a_host.shape, b_host.shape)
print("First 10 elements:", a_host[:10], b_host[:10])

# Copy a_host and b_host to the GPU as GPUArray objects.
a_gpu = gpuarray.to_gpu(a_host)
b_gpu = gpuarray.to_gpu(b_host)

# Print the GPU array shapes to confirm the copies have the expected size.
print("GPU array shape:", a_gpu.shape, b_gpu.shape)

# Element-wise addition on the GPU, then copy the result back to the host.
c_gpu = a_gpu + b_gpu
c_gpu_host = c_gpu.get()
print("First 10 elements:", c_gpu_host[:10])

# Compare the CPU reference against the GPU result element-wise
# (np.allclose uses its default tolerances).
are_equal = np.allclose(c_host, c_gpu_host)

print("Are all elements equal?", are_equal)



In [None]:
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule

# Define the CUDA kernel: dest[i] = a[i] + b[i] for every i in [0, n).
# The `i < n` guard is required because the grid size below is rounded up,
# so the last block may contain threads whose index is past the end of the
# arrays whenever n is not a multiple of the block size.
mod = SourceModule("""
__global__ void add_vectors(float *dest, const float *a, const float *b, int n)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
    {
        dest[i] = a[i] + b[i];
    }
}
""")

# Fetch the compiled kernel function from the module.
add_vectors = mod.get_function("add_vectors")

# Number of random elements: 2^12 = 4096.
# `1 << 12` is 2**12; note that `2 << 12` would be 8192 (2**13).
num_elements = 1 << 12

# Generate the two float32 input vectors on the CPU, plus the CPU reference sum.
a_host = np.random.rand(num_elements).astype(np.float32)
b_host = np.random.rand(num_elements).astype(np.float32)
c_host = a_host + b_host

# Allocate raw device buffers for both inputs and for the result.
a_gpu = drv.mem_alloc(a_host.nbytes)
b_gpu = drv.mem_alloc(b_host.nbytes)
c_gpu = drv.mem_alloc(a_host.nbytes)  # output buffer, same size as the inputs

# Copy the inputs host -> device.
drv.memcpy_htod(a_gpu, a_host)
drv.memcpy_htod(b_gpu, b_host)

# Launch configuration: 256 threads per block, and enough blocks to cover
# every element (ceiling division).
block_size = (256, 1, 1)
num_blocks = (num_elements + block_size[0] - 1) // block_size[0]

# Launch the kernel. Scalar kernel arguments must be passed as sized numpy
# types, hence np.int32 for the element count.
add_vectors(
    c_gpu, a_gpu, b_gpu, np.int32(num_elements),
    block=block_size, grid=(num_blocks, 1),
)

# Copy the result device -> host.
c_host_gpu = np.empty_like(a_host)
drv.memcpy_dtoh(c_host_gpu, c_gpu)

# Compare the CPU reference against the GPU result.
are_equal = np.allclose(c_host, c_host_gpu)
print("Are all elements equal?", are_equal)
