3
3
4
4
import logging
5
5
from functools import wraps , lru_cache
6
+ import os , sys
6
7
from typing import Dict , List , Optional , Collection
7
8
8
9
from parla import array
15
16
16
17
logger = logging.getLogger(__name__)

# Per-thread default stream configuration is only picked up by cupy at import
# time, so this module must run before cupy is imported.  Fail loudly if cupy
# (its backend package) is already loaded.
if 'cupy_backends' in sys.modules:
    # TODO: This should be dynamically configurable. That needs to be fixed upstream though.
    raise ImportError("cupy must be imported after parla.gpu for per-thread default stream configuration to work properly.")

# Request a per-thread default CUDA stream so work launched from different
# worker threads does not serialize on the single legacy default stream.
os.environ["CUPY_CUDA_PER_THREAD_DEFAULT_STREAM"] = "1"
# numba responds to this env var even if it has already been imported.
os.environ["NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM"] = "1"
18
27
try :
19
28
import cupy
20
29
import cupy .cuda
@@ -167,33 +176,19 @@ def get_array_module(self, a):
167
176
# Integration with parla.environments
168
177
169
178
class _GPUStacksLocal (threading .local ):
170
- _stream_stack : List [cupy .cuda .Stream ]
171
179
_device_stack : List [cupy .cuda .Device ]
172
180
173
181
def __init__ (self ):
174
182
super (_GPUStacksLocal , self ).__init__ ()
175
- self ._stream_stack = []
176
183
self ._device_stack = []
177
184
178
- def push_stream (self , stream ):
179
- self ._stream_stack .append (stream )
180
-
181
- def pop_stream (self ) -> cupy .cuda .Stream :
182
- return self ._stream_stack .pop ()
183
-
184
185
def push_device (self , dev ):
185
186
self ._device_stack .append (dev )
186
187
187
188
def pop_device (self ) -> cupy .cuda .Device :
188
189
return self ._device_stack .pop ()
189
190
190
191
@property
191
- def stream (self ):
192
- if self ._stream_stack :
193
- return self ._stream_stack [- 1 ]
194
- else :
195
- return None
196
- @property
197
192
def device (self ):
198
193
if self ._device_stack :
199
194
return self ._device_stack [- 1 ]
@@ -213,30 +208,20 @@ def __init__(self, descriptor: "GPUComponent", env: TaskEnvironment):
213
208
# Use a stack per thread per GPU component just in case.
214
209
self ._stack = _GPUStacksLocal ()
215
210
216
- def _make_stream (self ):
217
- with self .gpu .cupy_device :
218
- return cupy .cuda .Stream (null = False , non_blocking = True )
219
-
220
211
def __enter__(self):
    """Activate this GPU environment for the current thread.

    Publishes the component's GPU list to the thread-local state and enters
    the cupy device context so subsequent cupy calls target this GPU.  The
    device is recorded on a per-thread stack so that ``__exit__`` can restore
    it correctly even with nested environments.
    """
    _gpu_locals._gpus = self.gpus
    dev = self.gpu.cupy_device
    # Enter the cupy Device context manually; the matching __exit__ happens
    # in our own __exit__ after the current stream has been synchronized.
    dev.__enter__()
    self._stack.push_device(dev)
    return self
229
217
230
218
def __exit__(self, exc_type, exc_val, exc_tb):
    """Deactivate this GPU environment: drain outstanding work, restore state.

    Synchronizes the thread's current CUDA stream (with per-thread default
    streams enabled, that is where this thread's work was queued) before
    clearing the thread-local GPU list and leaving the cupy device context.
    """
    dev = self._stack.device
    try:
        cupy.cuda.get_current_stream().synchronize()
        _gpu_locals._gpus = None
        ret = dev.__exit__(exc_type, exc_val, exc_tb)
    finally:
        # Always pop the device stack, even if synchronization or the
        # device context exit raised, to keep push/pop balanced.
        self._stack.pop_device()
    return ret
242
227
@@ -245,15 +230,15 @@ def initialize_thread(self) -> None:
245
230
# Trigger cuBLAS/etc. initialization for this GPU in this thread.
246
231
with cupy .cuda .Device (gpu .index ) as device :
247
232
a = cupy .asarray ([2. ])
248
- cupy .cuda .get_current_stream (). synchronize ()
249
- with cupy . cuda . Stream ( False , True ) as stream :
250
- cupy .asnumpy (cupy .sqrt (a ))
251
- device .cublas_handle
252
- device .cusolver_handle
253
- device .cusolver_sp_handle
254
- device .cusparse_handle
255
- stream .synchronize ()
256
- device .synchronize ()
233
+ stream = cupy .cuda .get_current_stream ()
234
+ stream . synchronize ()
235
+ cupy .asnumpy (cupy .sqrt (a ))
236
+ device .cublas_handle
237
+ device .cusolver_handle
238
+ device .cusolver_sp_handle
239
+ device .cusparse_handle
240
+ stream .synchronize ()
241
+ device .synchronize ()
257
242
258
243
class GPUComponent (EnvironmentComponentDescriptor ):
259
244
"""A single GPU CUDA component which configures the environment to use the specific GPU using a single
@@ -312,15 +297,15 @@ def initialize_thread(self) -> None:
312
297
# Trigger cuBLAS/etc. initialization for this GPU in this thread.
313
298
with cupy .cuda .Device (gpu .index ) as device :
314
299
a = cupy .asarray ([2. ])
315
- cupy .cuda .get_current_stream (). synchronize ()
316
- with cupy . cuda . Stream ( False , True ) as stream :
317
- cupy .asnumpy (cupy .sqrt (a ))
318
- device .cublas_handle
319
- device .cusolver_handle
320
- device .cusolver_sp_handle
321
- device .cusparse_handle
322
- stream .synchronize ()
323
- device .synchronize ()
300
+ stream = cupy .cuda .get_current_stream ()
301
+ stream . synchronize ()
302
+ cupy .asnumpy (cupy .sqrt (a ))
303
+ device .cublas_handle
304
+ device .cusolver_handle
305
+ device .cusolver_sp_handle
306
+ device .cusparse_handle
307
+ stream .synchronize ()
308
+ device .synchronize ()
324
309
325
310
326
311
class MultiGPUComponent (EnvironmentComponentDescriptor ):
0 commit comments