diff --git a/xrspatial/slope.py b/xrspatial/slope.py index 56b41df3..17cd94af 100644 --- a/xrspatial/slope.py +++ b/xrspatial/slope.py @@ -271,11 +271,21 @@ def _run_dask_numpy_geodesic(data, lat_2d, lon_2d, a2, b2, z_factor, boundary='n return out[0] +def _to_cupy_f64(block): + # Only reached from the dask+cupy path, so `cupy` is the real module here, + # never the import-time fallback class. + return cupy.asarray(block, dtype=cupy.float64) + + def _run_dask_cupy_geodesic(data, lat_2d, lon_2d, a2, b2, z_factor, boundary='nan'): - lat_dask = da.from_array(cupy.asarray(lat_2d, dtype=cupy.float64), - chunks=data.chunksize) - lon_dask = da.from_array(cupy.asarray(lon_2d, dtype=cupy.float64), - chunks=data.chunksize) + # Keep lat/lon as dask-of-numpy on the (zero-stride) broadcast views, then + # convert each block to cupy lazily. Converting up front with + # cupy.asarray(lat_2d) would densify the full (H, W) grid onto a single GPU + # at graph-construction time and OOM on large rasters. + lat_dask = da.from_array(lat_2d, chunks=data.chunksize).map_blocks( + _to_cupy_f64, dtype=np.float64) + lon_dask = da.from_array(lon_2d, chunks=data.chunksize).map_blocks( + _to_cupy_f64, dtype=np.float64) stacked = da.stack([ data.astype(cupy.float64), lat_dask, diff --git a/xrspatial/tests/test_geodesic_slope.py b/xrspatial/tests/test_geodesic_slope.py index d22a379d..9f7ce422 100644 --- a/xrspatial/tests/test_geodesic_slope.py +++ b/xrspatial/tests/test_geodesic_slope.py @@ -296,7 +296,6 @@ def test_numpy_equals_dask(self): class TestGeodesicSlopeCupy: def test_numpy_equals_cupy(self): - import cupy elev = _east_tilted_surface(H=8, W=10, grade=100.0) r_np = _make_geo_raster(elev, 40.0, 41.0, 10.0, 11.0, backend='numpy') r_cu = _make_geo_raster(elev, 40.0, 41.0, 10.0, 11.0, backend='cupy') @@ -321,3 +320,38 @@ def test_numpy_equals_dask_cupy(self): np.testing.assert_allclose( s_np.values, s_dc.data.compute().get(), rtol=1e-5, equal_nan=True ) + + def test_latlon_not_materialized_on_gpu_at_graph_build(self): + """The dask+cupy geodesic path must keep lat/lon chunked. + + Building the graph (no compute) for a large raster must not densify + the full (H, W) lat/lon grids onto the GPU. Converting the broadcast + views with ``cupy.asarray`` up front would allocate ~2*H*W*8 bytes of + GPU memory at graph-construction time and OOM on large rasters. + """ + import cupy + + H = W = 2048 + elev = cupy.zeros((H, W), dtype=cupy.float64) + lat = np.linspace(40.0, 41.0, H) + lon = np.linspace(10.0, 11.0, W) + raster = xr.DataArray( + da.from_array(elev, chunks=(256, 256)), + dims=['lat', 'lon'], + coords={'lat': lat, 'lon': lon}, + ) + + pool = cupy.get_default_memory_pool() + pool.free_all_blocks() + before = pool.used_bytes() + out = slope(raster, method='geodesic') # graph construction only + out.data.__dask_graph__() + delta = pool.used_bytes() - before + + # A single full lat or lon grid is H*W*8 bytes. If either were + # densified eagerly the delta would be at least that large. + one_full_grid = H * W * 8 + assert delta < one_full_grid, ( + f"graph construction allocated {delta} GPU bytes; expected well " + f"under one full lat/lon grid ({one_full_grid} bytes)" + )