Commit 6c8ab73

Clean up cffi resources in file (#679)
* Remove clean_up_cffi_files fixture
* Update examples/strided_memory_view.py to clean up FFI resources
* Ensure the temporary directory is deleted if compilation fails. Make sure to del cpu_func and my_func, which references it.
* Consolidate the strided_memory_view_cpu/gpu examples into single run() functions
1 parent a8285b0 commit 6c8ab73
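
As context for the changes below, here is a minimal, hypothetical sketch of the cleanup pattern the commit message describes: build the cffi extension in a temporary directory instead of the current working directory, import it, and release every resource in a finally block. The module name _demo_obj and the one-line C++ body are illustrative; the committed example uses _cpu_obj and a string.Template.

import importlib
import shutil
import sys
import tempfile

from cffi import FFI

# Illustrative inputs; the committed example defines func_name/func_sig the same way.
func_name = "inplace_plus_arange_N"
func_sig = f"void {func_name}(int* data, size_t N)"
cpu_code = 'extern "C" ' + func_sig + " { for (size_t i = 0; i < N; i++) data[i] += i; }"

ffi = FFI()
ffi.cdef(f"{func_sig};")
ffi.set_source("_demo_obj", cpu_code, source_extension=".cpp", extra_compile_args=["-std=c++11"])

temp_dir = tempfile.mkdtemp()      # build artifacts go here, not the CWD
saved_sys_path = sys.path.copy()
cpu_func = None
try:
    ffi.compile(tmpdir=temp_dir)   # may raise; temp_dir is still removed below
    sys.path.append(temp_dir)
    cpu_func = getattr(importlib.import_module("_demo_obj.lib"), func_name)
    # ... call cpu_func here ...
finally:
    sys.path = saved_sys_path      # undo the import-path change
    del cpu_func                   # drop the reference so the extension module can unload
    shutil.rmtree(temp_dir)        # remove the temporary build directory

Because nothing is written to the current working directory anymore, the session-scoped conftest fixture that used to glob and delete _cpu_obj* files is no longer needed.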

File tree

3 files changed: +190 additions, -111 deletions

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo aims to illustrate two takeaways:
+#
+# 1. The similarity between CPU and GPU JIT-compilation with C++ sources
+# 2. How to use StridedMemoryView to interface with foreign C/C++ functions
+#
+# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU
+# path, which can be easily installed from pip or conda following their instructions.
+# We also use NumPy/CuPy as the CPU/GPU array container.
+#
+# ################################################################################
+
+import importlib
+import shutil
+import string
+import sys
+import tempfile
+
+try:
+    from cffi import FFI
+except ImportError:
+    print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
+    FFI = None
+import numpy as np
+
+from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
+
+# ################################################################################
+#
+# Usually this entire code block is in a separate file, built as a Python extension
+# module that can be imported by users at run time. For illustrative purposes we
+# use JIT compilation to make this demo self-contained.
+#
+# Here we assume an in-place operation, equivalent to the following NumPy code:
+#
+# >>> arr = ...
+# >>> assert arr.dtype == np.int32
+# >>> assert arr.ndim == 1
+# >>> arr += np.arange(arr.size, dtype=arr.dtype)
+#
+# is implemented for both CPU and GPU at low-level, with the following C function
+# signature:
+func_name = "inplace_plus_arange_N"
+func_sig = f"void {func_name}(int* data, size_t N)"
+
+
+# Now we are prepared to run the code from the user's perspective!
+#
+# ################################################################################
+
+
+# Below, as a user we want to perform the said in-place operation on a CPU
+# or GPU, by calling the corresponding function implemented "elsewhere"
+# (in the body of the run function).
+
+
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
+@args_viewable_as_strided_memory((0,))
+def my_func(arr):
+    global cpu_func
+    global cpu_prog
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
+    view = arr.view(-1)
+    assert isinstance(view, StridedMemoryView)
+    assert len(view.shape) == 1
+    assert view.dtype == np.int32
+    assert not view.is_device_accessible
+
+    size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
+    cpu_func(cpu_prog.cast("int*", view.ptr), size)
+
+
+def run():
+    global cpu_func, cpu_prog, my_func
+    if not FFI:
+        return
+    # Here is a concrete (very naive!) implementation on CPU:
+    cpu_code = string.Template(r"""
+    extern "C"
+    $func_sig {
+        for (size_t i = 0; i < N; i++) {
+            data[i] += i;
+        }
+    }
+    """).substitute(func_sig=func_sig)
+    # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an
+    # extension module that has the Python binding to the underlying C function.
+    # For more details, please refer to cffi's documentation.
+    cpu_prog = FFI()
+    cpu_prog.cdef(f"{func_sig};")
+    cpu_prog.set_source(
+        "_cpu_obj",
+        cpu_code,
+        source_extension=".cpp",
+        extra_compile_args=["-std=c++11"],
+    )
+    temp_dir = tempfile.mkdtemp()
+    saved_sys_path = sys.path.copy()
+    try:
+        cpu_prog.compile(tmpdir=temp_dir)
+
+        sys.path.append(temp_dir)
+        cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
+
+        # Create input array on CPU
+        arr_cpu = np.zeros(1024, dtype=np.int32)
+        print(f"before: {arr_cpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_cpu)
+
+        # Check the result
+        print(f"after: {arr_cpu[:10]=}")
+        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+    finally:
+        sys.path = saved_sys_path
+        # to allow the FFI module to unload, we delete references
+        # to cpu_func
+        del cpu_func, my_func
+        # clean up temp directory
+        shutil.rmtree(temp_dir)
+
+
+if __name__ == "__main__":
+    run()
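
As a quick, hypothetical illustration of the decorator the new example is built around (separate from the committed file; the function name describe is made up), this is what args_viewable_as_strided_memory provides to the callee:

import numpy as np

from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory


@args_viewable_as_strided_memory((0,))
def describe(arr):
    # The decorator wraps argument 0 so that .view() yields a StridedMemoryView,
    # no matter which array library produced the data (via DLPack or the
    # CUDA Array Interface).
    view = arr.view(-1)  # -1: no stream ordering needed for host data
    assert isinstance(view, StridedMemoryView)
    print(view.shape, view.dtype, hex(view.ptr), view.is_device_accessible)


describe(np.arange(8, dtype=np.int32))

The example above then hands view.ptr to the cffi-compiled C function, which is the second takeaway listed in the file header.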

cuda_core/examples/strided_memory_view.py renamed to cuda_core/examples/strided_memory_view_gpu.py

Lines changed: 55 additions & 94 deletions
@@ -15,15 +15,9 @@
 #
 # ################################################################################
 
-import importlib
 import string
 import sys
 
-try:
-    from cffi import FFI
-except ImportError:
-    print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
-    FFI = None
 try:
     import cupy as cp
 except ImportError:
@@ -52,51 +46,6 @@
 func_name = "inplace_plus_arange_N"
 func_sig = f"void {func_name}(int* data, size_t N)"
 
-# Here is a concrete (very naive!) implementation on CPU:
-if FFI:
-    cpu_code = string.Template(r"""
-    extern "C"
-    $func_sig {
-        for (size_t i = 0; i < N; i++) {
-            data[i] += i;
-        }
-    }
-    """).substitute(func_sig=func_sig)
-    # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an
-    # extension module that has the Python binding to the underlying C function.
-    # For more details, please refer to cffi's documentation.
-    cpu_prog = FFI()
-    cpu_prog.cdef(f"{func_sig};")
-    cpu_prog.set_source(
-        "_cpu_obj",
-        cpu_code,
-        source_extension=".cpp",
-        extra_compile_args=["-std=c++11"],
-    )
-    cpu_prog.compile()
-    cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
-
-# Here is a concrete (again, very naive!) implementation on GPU:
-if cp:
-    gpu_code = string.Template(r"""
-    extern "C"
-    __global__ $func_sig {
-        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-        const size_t stride_size = gridDim.x * blockDim.x;
-        for (size_t i = tid; i < N; i += stride_size) {
-            data[i] += i;
-        }
-    }
-    """).substitute(func_sig=func_sig)
-
-    # To know the GPU's compute capability, we need to identify which GPU to use.
-    dev = Device(0)
-    dev.set_current()
-    arch = "".join(f"{i}" for i in dev.compute_capability)
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
-
 # Now we are prepared to run the code from the user's perspective!
 #
 # ################################################################################
@@ -109,60 +58,72 @@
 # We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
 # of which are supported by StridedMemoryView).
 @args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream):
+def my_func(arr, work_stream, gpu_ker):
     # Create a memory view over arr (assumed to be a 1D array of int32). The stream
     # ordering is taken care of, so that arr can be safely accessed on our work
     # stream (ordered after a data stream on which arr is potentially prepared).
     view = arr.view(work_stream.handle if work_stream else -1)
     assert isinstance(view, StridedMemoryView)
     assert len(view.shape) == 1
     assert view.dtype == np.int32
+    assert view.is_device_accessible
 
     size = view.shape[0]
     # DLPack also supports host arrays. We want to know if the array data is
     # accessible from the GPU, and dispatch to the right routine accordingly.
-    if view.is_device_accessible:
-        block = 256
-        grid = (size + block - 1) // block
-        config = LaunchConfig(grid=grid, block=block)
-        launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
-        # Here we're being conservative and synchronize over our work stream,
-        # assuming we do not know the data stream; if we know then we could
-        # just order the data stream after the work stream here, e.g.
-        #
-        # data_stream.wait(work_stream)
-        #
-        # without an expensive synchronization (with respect to the host).
-        work_stream.sync()
-    else:
-        cpu_func(cpu_prog.cast("int*", view.ptr), size)
-
-
-# This takes the CPU path
-if FFI:
-    # Create input array on CPU
-    arr_cpu = np.zeros(1024, dtype=np.int32)
-    print(f"before: {arr_cpu[:10]=}")
-
-    # Run the workload
-    my_func(arr_cpu, None)
-
-    # Check the result
-    print(f"after: {arr_cpu[:10]=}")
-    assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
-
-
-# This takes the GPU path
-if cp:
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
+    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    # Here we're being conservative and synchronize over our work stream,
+    # assuming we do not know the data stream; if we know then we could
+    # just order the data stream after the work stream here, e.g.
+    #
+    # data_stream.wait(work_stream)
+    #
+    # without an expensive synchronization (with respect to the host).
+    work_stream.sync()
+
+
+def run():
+    global my_func
+    if not cp:
+        return None
+    # Here is a concrete (very naive!) implementation on GPU:
+    gpu_code = string.Template(r"""
+    extern "C"
+    __global__ $func_sig {
+        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+        const size_t stride_size = gridDim.x * blockDim.x;
+        for (size_t i = tid; i < N; i += stride_size) {
+            data[i] += i;
+        }
+    }
+    """).substitute(func_sig=func_sig)
+
+    # To know the GPU's compute capability, we need to identify which GPU to use.
+    dev = Device(0)
+    dev.set_current()
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
+    mod = gpu_prog.compile(target_type="cubin")
+    gpu_ker = mod.get_kernel(func_name)
+
     s = dev.create_stream()
-    # Create input array on GPU
-    arr_gpu = cp.ones(1024, dtype=cp.int32)
-    print(f"before: {arr_gpu[:10]=}")
+    try:
+        # Create input array on GPU
+        arr_gpu = cp.ones(1024, dtype=cp.int32)
+        print(f"before: {arr_gpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_gpu, s, gpu_ker)
+
+        # Check the result
+        print(f"after: {arr_gpu[:10]=}")
+        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
+    finally:
+        s.close()
 
-    # Run the workload
-    my_func(arr_gpu, s)
 
-    # Check the result
-    print(f"after: {arr_gpu[:10]=}")
-    assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
-    s.close()
+if __name__ == "__main__":
+    run()
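
For reference, a small worked example of the launch-geometry arithmetic used in my_func above, with the values from this example (the data_stream name echoes the hypothetical stream in the code comment):

size = 1024
block = 256                          # threads per block
grid = (size + block - 1) // block   # ceil(1024 / 256) = 4 blocks
assert grid * block >= size

# Inside the kernel each thread starts at
#     tid = threadIdx.x + blockIdx.x * blockDim.x
# and advances by gridDim.x * blockDim.x (the grid stride), so every index
# 0 <= i < N is visited and the i < N check prevents out-of-bounds accesses.

# If the producer's stream were known, the host-blocking work_stream.sync()
# could be replaced by stream ordering, e.g.:
#     data_stream.wait(work_stream)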

cuda_core/tests/conftest.py

Lines changed: 0 additions & 17 deletions
@@ -1,9 +1,7 @@
 # Copyright 2024 NVIDIA Corporation. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import glob
 import os
-import sys
 
 try:
     from cuda.bindings import driver
@@ -67,21 +65,6 @@ def pop_all_contexts():
     return pop_all_contexts
 
 
-# samples relying on cffi could fail as the modules cannot be imported
-sys.path.append(os.getcwd())
-
-
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_cffi_files():
-    yield
-    files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*"))
-    for f in files:
-        try:  # noqa: SIM105
-            os.remove(f)
-        except FileNotFoundError:
-            pass  # noqa: SIM105
-
-
 skipif_testing_with_compute_sanitizer = pytest.mark.skipif(
     os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1",
     reason="The compute-sanitizer is running, and this test causes an API error.",
