@@ -15,15 +15,9 @@
 #
 # ################################################################################
 
-import importlib
 import string
 import sys
 
-try:
-    from cffi import FFI
-except ImportError:
-    print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
-    FFI = None
 try:
     import cupy as cp
 except ImportError:
@@ -52,51 +46,6 @@
 func_name = "inplace_plus_arange_N"
 func_sig = f"void {func_name}(int* data, size_t N)"
 
-# Here is a concrete (very naive!) implementation on CPU:
-if FFI:
-    cpu_code = string.Template(r"""
-    extern "C"
-    $func_sig {
-        for (size_t i = 0; i < N; i++) {
-            data[i] += i;
-        }
-    }
-    """).substitute(func_sig=func_sig)
-    # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an
-    # extension module that has the Python binding to the underlying C function.
-    # For more details, please refer to cffi's documentation.
-    cpu_prog = FFI()
-    cpu_prog.cdef(f"{func_sig};")
-    cpu_prog.set_source(
-        "_cpu_obj",
-        cpu_code,
-        source_extension=".cpp",
-        extra_compile_args=["-std=c++11"],
-    )
-    cpu_prog.compile()
-    cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
-
-# Here is a concrete (again, very naive!) implementation on GPU:
-if cp:
-    gpu_code = string.Template(r"""
-    extern "C"
-    __global__ $func_sig {
-        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-        const size_t stride_size = gridDim.x * blockDim.x;
-        for (size_t i = tid; i < N; i += stride_size) {
-            data[i] += i;
-        }
-    }
-    """).substitute(func_sig=func_sig)
-
-    # To know the GPU's compute capability, we need to identify which GPU to use.
-    dev = Device(0)
-    dev.set_current()
-    arch = "".join(f"{i}" for i in dev.compute_capability)
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
-
 # Now we are prepared to run the code from the user's perspective!
 #
 # ################################################################################
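Note on the removed CPU path: the deleted comments described cffi's out-of-line API mode, in which cffi JIT-compiles an extension module carrying the Python binding to the underlying C function. For reference, a minimal sketch of that flow, independent of this example (the module name "_example" and the function add_one are hypothetical):

    import importlib

    from cffi import FFI

    ffi = FFI()
    # Declare the C signature that should be bound into the extension module.
    ffi.cdef("int add_one(int x);")
    # Provide the implementation; compile() generates and builds a C extension
    # module named "_example" in the current directory.
    ffi.set_source("_example", "int add_one(int x) { return x + 1; }")
    ffi.compile()

    # The generated module exposes the bound function on its "lib" attribute.
    lib = importlib.import_module("_example").lib
    assert lib.add_one(41) == 42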
@@ -109,60 +58,72 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream):
+def my_func(arr, work_stream, gpu_ker):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
     view = arr.view(work_stream.handle if work_stream else -1)
     assert isinstance(view, StridedMemoryView)
     assert len(view.shape) == 1
     assert view.dtype == np.int32
+    assert view.is_device_accessible
 
     size = view.shape[0]
     # DLPack also supports host arrays. We want to know if the array data is
     # accessible from the GPU, and dispatch to the right routine accordingly.
-    if view.is_device_accessible:
-        block = 256
-        grid = (size + block - 1) // block
-        config = LaunchConfig(grid=grid, block=block)
-        launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
-        # Here we're being conservative and synchronize over our work stream,
-        # assuming we do not know the data stream; if we know then we could
-        # just order the data stream after the work stream here, e.g.
-        #
-        #   data_stream.wait(work_stream)
-        #
-        # without an expensive synchronization (with respect to the host).
-        work_stream.sync()
-    else:
-        cpu_func(cpu_prog.cast("int*", view.ptr), size)
-
-
-# This takes the CPU path
-if FFI:
-    # Create input array on CPU
-    arr_cpu = np.zeros(1024, dtype=np.int32)
-    print(f"before: {arr_cpu[:10]=}")
-
-    # Run the workload
-    my_func(arr_cpu, None)
-
-    # Check the result
-    print(f"after: {arr_cpu[:10]=}")
-    assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
-
-
-# This takes the GPU path
-if cp:
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
+    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    # Here we're being conservative and synchronize over our work stream,
+    # assuming we do not know the data stream; if we know then we could
+    # just order the data stream after the work stream here, e.g.
+    #
+    #   data_stream.wait(work_stream)
+    #
+    # without an expensive synchronization (with respect to the host).
+    work_stream.sync()
+
+
+def run():
+    global my_func
+    if not cp:
+        return None
+    # Here is a concrete (very naive!) implementation on GPU:
+    gpu_code = string.Template(r"""
+    extern "C"
+    __global__ $func_sig {
+        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+        const size_t stride_size = gridDim.x * blockDim.x;
+        for (size_t i = tid; i < N; i += stride_size) {
+            data[i] += i;
+        }
+    }
+    """).substitute(func_sig=func_sig)
+
+    # To know the GPU's compute capability, we need to identify which GPU to use.
+    dev = Device(0)
+    dev.set_current()
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
+    mod = gpu_prog.compile(target_type="cubin")
+    gpu_ker = mod.get_kernel(func_name)
+
     s = dev.create_stream()
-    # Create input array on GPU
-    arr_gpu = cp.ones(1024, dtype=cp.int32)
-    print(f"before: {arr_gpu[:10]=}")
+    try:
+        # Create input array on GPU
+        arr_gpu = cp.ones(1024, dtype=cp.int32)
+        print(f"before: {arr_gpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_gpu, s, gpu_ker)
+
+        # Check the result
+        print(f"after: {arr_gpu[:10]=}")
+        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
+    finally:
+        s.close()
 
-    # Run the workload
-    my_func(arr_gpu, s)
 
-    # Check the result
-    print(f"after: {arr_gpu[:10]=}")
-    assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
-    s.close()
+if __name__ == "__main__":
+    run()
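Note on the launch configuration: (size + block - 1) // block is ceiling division, so the grid always covers the array, and because the kernel body is a grid-stride loop the result would stay correct even with a smaller grid (each thread would simply process several elements). Worked numbers for the example's 1024-element array:

    size = 1024
    block = 256                          # threads per block
    grid = (size + block - 1) // block   # (1024 + 255) // 256 == 4
    assert grid * block >= size          # 4 blocks * 256 threads == 1024 >= 1024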
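Note on the conservative work_stream.sync(): as the kept comment says, if the consumer's stream were known, a device-side ordering would avoid blocking the host. A sketch under that assumption (data_stream is hypothetical here; Stream.wait orders one stream after another without a host synchronization):

    from cuda.core.experimental import Device

    dev = Device(0)
    dev.set_current()
    work_stream = dev.create_stream()
    data_stream = dev.create_stream()

    # ... kernel launched on work_stream ...

    # data_stream will not run past this point until work_stream's prior work
    # completes; the host thread is not blocked.
    data_stream.wait(work_stream)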