diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index fab5c26e..62d83ccb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,9 +1,9 @@
 repos:
   - repo: https://github.com/ambv/black
-    rev: 21.5b2
+    rev: 22.3.0
     hooks:
       - id: black
-        language_version: python3.8
+        language_version: python3
         exclude: test/testdata/syntax_error.py
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v1.2.3
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 46628de7..0054a5a0 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -45,9 +45,9 @@
 master_doc = "index"
 
 # General information about the project.
-project = u"Mapchete"
-copyright = u"2015, 2016, 2017, 2018, EOX IT Services"
-author = u"Joachim Ungar"
+project = "Mapchete"
+copyright = "2015, 2016, 2017, 2018, EOX IT Services"
+author = "Joachim Ungar"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -127,7 +127,7 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, "Mapchete.tex", u"Mapchete Documentation", u"Joachim Ungar", "manual"),
+    (master_doc, "Mapchete.tex", "Mapchete Documentation", "Joachim Ungar", "manual"),
 ]
 
 
@@ -135,7 +135,7 @@
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [(master_doc, "mapchete", u"Mapchete Documentation", [author], 1)]
+man_pages = [(master_doc, "mapchete", "Mapchete Documentation", [author], 1)]
 
 
 # -- Options for Texinfo output -------------------------------------------
@@ -147,7 +147,7 @@
     (
         master_doc,
         "Mapchete",
-        u"Mapchete Documentation",
+        "Mapchete Documentation",
         author,
         "Mapchete",
         "Mapchete processes raster and vector geodata.",
diff --git a/mapchete/formats/base.py b/mapchete/formats/base.py
index 2b51b499..cb5035c3 100644
--- a/mapchete/formats/base.py
+++ b/mapchete/formats/base.py
@@ -665,19 +665,22 @@ def _read_as_tiledir(
                 [path for _, path in tiles_paths],
                 out_tile,
                 validity_check=validity_check,
+                skip_missing_files=True,
             )
-        else:
+        else:  # pragma: no cover
            return []
     elif data_type == "raster":
         if tiles_paths:
             return read_raster_window(
                 [path for _, path in tiles_paths],
                 out_tile,
-                indexes=indexes,
+                indexes=indexes or list(range(1, profile["count"] + 1)),
                 resampling=resampling,
                 src_nodata=profile["nodata"],
                 dst_nodata=dst_nodata,
                 gdal_opts=gdal_opts,
+                skip_missing_files=True,
+                dst_dtype=profile["dtype"],
             )
         else:
             bands = len(indexes) if indexes else profile["count"]
diff --git a/mapchete/formats/default/gtiff.py b/mapchete/formats/default/gtiff.py
index 3f0b7fed..280279ad 100644
--- a/mapchete/formats/default/gtiff.py
+++ b/mapchete/formats/default/gtiff.py
@@ -476,7 +476,7 @@ def prepare(self, process_area=None, **kwargs):
         self.overviews_levels = self.output_params.get(
             "overviews_levels",
             [
-                2 ** i
+                2**i
                 for i in range(
                     1,
                     get_maximum_overview_level(
diff --git a/mapchete/formats/default/tile_directory.py b/mapchete/formats/default/tile_directory.py
index cb86e8b2..d712f30a 100644
--- a/mapchete/formats/default/tile_directory.py
+++ b/mapchete/formats/default/tile_directory.py
@@ -191,7 +191,9 @@ def bbox(self, out_crs=None):
     )
 
 
-def _get_tiles_paths(basepath=None, ext=None, pyramid=None, bounds=None, zoom=None):
+def _get_tiles_paths(
+    basepath=None, ext=None, pyramid=None, bounds=None, zoom=None, exists_check=False
+):
     return [
         (_tile, _path)
         for _tile, _path in [
@@ -205,7 +207,7 @@ def _get_tiles_paths(basepath=None, ext=None, pyramid=None, bounds=None, zoom=No
             )
             for t in pyramid.tiles_from_bounds(bounds, zoom)
         ]
-        if path_exists(_path)
+        if not exists_check or (exists_check and path_exists(_path))
     ]
 
 
@@ -405,10 +407,9 @@ def _get_tiles_paths(
                     pyramid=self._td_pyramid,
                     bounds=td_bounds,
                     zoom=zoom,
+                    exists_check=True,
                 )
-                logger.debug(
-                    "%s existing tiles found at zoom %s", len(tiles_paths), zoom
-                )
+                logger.debug("%s potential tiles at zoom %s", len(tiles_paths), zoom)
                 zoom -= 1
         else:
             tiles_paths = _get_tiles_paths(
@@ -418,6 +419,6 @@ def _get_tiles_paths(
                 bounds=td_bounds,
                 zoom=zoom,
             )
-            logger.debug("%s existing tiles found at zoom %s", len(tiles_paths), zoom)
+            logger.debug("%s potential tiles at zoom %s", len(tiles_paths), zoom)
         return tiles_paths
 
diff --git a/mapchete/io/_geometry_operations.py b/mapchete/io/_geometry_operations.py
index e31e5320..5f4a3f66 100644
--- a/mapchete/io/_geometry_operations.py
+++ b/mapchete/io/_geometry_operations.py
@@ -76,7 +76,6 @@ def reproject_geometry(
     -------
     geometry : ``shapely.geometry``
     """
-    logger.debug("reproject geometry from %s to %s", src_crs, dst_crs)
     src_crs = validate_crs(src_crs)
     dst_crs = validate_crs(dst_crs)
 
diff --git a/mapchete/io/_path.py b/mapchete/io/_path.py
index 0f5f8933..f11da7ff 100644
--- a/mapchete/io/_path.py
+++ b/mapchete/io/_path.py
@@ -43,6 +43,7 @@ def path_exists(path, fs=None, **kwargs):
     """
     fs = fs or fs_from_path(path, **kwargs)
     fs.invalidate_cache(path=path)
+    logger.debug("check if path exists: %s", path)
     return fs.exists(path)
 
 
diff --git a/mapchete/io/raster.py b/mapchete/io/raster.py
index ca0f9c26..6bf78631 100644
--- a/mapchete/io/raster.py
+++ b/mapchete/io/raster.py
@@ -32,6 +32,7 @@
 )
 from mapchete.io._misc import MAPCHETE_IO_RETRY_SETTINGS
 from mapchete.tile import BufferedTile
+from mapchete._timer import Timer
 from mapchete.validate import validate_write_window_params
 
 
@@ -47,7 +48,9 @@ def read_raster_window(
     resampling="nearest",
     src_nodata=None,
     dst_nodata=None,
+    dst_dtype=None,
     gdal_opts=None,
+    skip_missing_files=False,
 ):
     """
     Return NumPy arrays from an input raster.
@@ -91,7 +94,9 @@ def read_raster_window(
                 else False,
             )
         ) as env:
-            logger.debug("reading %s with GDAL options %s", input_files, env.options)
+            logger.debug(
+                "reading %s file(s) with GDAL options %s", len(input_files), env.options
+            )
             return _read_raster_window(
                 input_files,
                 tile,
@@ -99,6 +104,8 @@ def read_raster_window(
                 resampling=resampling,
                 src_nodata=src_nodata,
                 dst_nodata=dst_nodata,
+                dst_dtype=dst_dtype,
+                skip_missing_files=skip_missing_files,
             )
     except FileNotFoundError:  # pragma: no cover
         raise
@@ -113,74 +120,103 @@ def _read_raster_window(
     resampling="nearest",
     src_nodata=None,
     dst_nodata=None,
+    dst_dtype=None,
+    skip_missing_files=False,
 ):
+    def _empty_array():
+        if indexes is None:  # pragma: no cover
+            raise ValueError(
+                "output shape cannot be determined because no given input files "
+                "exist and no band indexes are given"
+            )
+        dst_shape = (
+            (len(indexes), tile.height, tile.width)
+            if len(indexes) > 1
+            else (tile.height, tile.width)
+        )
+        return ma.masked_array(
+            data=np.full(
+                dst_shape,
+                src_nodata if dst_nodata is None else dst_nodata,
+                dtype=dst_dtype,
+            ),
+            mask=True,
+        )
+
     if isinstance(input_files, list):
         # in case multiple input files are given, merge output into one array
-        # read first file
-        dst_array = _read_raster_window(
-            input_files[0],
-            tile=tile,
-            indexes=indexes,
-            resampling=resampling,
-            src_nodata=src_nodata,
-            dst_nodata=dst_nodata,
-        )
-        # read subsequent files and merge
-        for f in input_files[1:]:
-            f_array = _read_raster_window(
-                f,
-                tile=tile,
-                indexes=indexes,
-                resampling=resampling,
-                src_nodata=src_nodata,
-                dst_nodata=dst_nodata,
-            )
-            dst_array = ma.MaskedArray(
-                data=np.where(f_array.mask, dst_array, f_array).astype(
-                    dst_array.dtype, copy=False
-                ),
-                mask=np.where(f_array.mask, dst_array.mask, f_array.mask).astype(
-                    bool, copy=False
-                ),
-            )
+        # using the default rasterio behavior, create a 2D array if only one band
+        # is read and a 3D array if multiple bands are read
+        dst_array = None
+        # read files and add one by one to the output array
+        for f in input_files:
+            try:
+                f_array = _read_raster_window(
+                    f,
+                    tile=tile,
+                    indexes=indexes,
+                    resampling=resampling,
+                    src_nodata=src_nodata,
+                    dst_nodata=dst_nodata,
+                )
+                if dst_array is None:
+                    dst_array = f_array
+                else:
+                    dst_array[~f_array.mask] = f_array.data[~f_array.mask]
+                    dst_array.mask[~f_array.mask] = False
+                logger.debug("added to output array")
+            except FileNotFoundError:
+                if skip_missing_files:
+                    logger.debug("skip missing file %s", f)
+                else:  # pragma: no cover
+                    raise
+        if dst_array is None:
+            dst_array = _empty_array()
         return dst_array
     else:
-        input_file = input_files
-        dst_shape = tile.shape
-
-        if not isinstance(indexes, int):
-            if indexes is None:
-                dst_shape = (None,) + dst_shape
-            elif len(indexes) == 1:
-                indexes = indexes[0]
-            else:
-                dst_shape = (len(indexes),) + dst_shape
-        # Check if potentially tile boundaries exceed tile matrix boundaries on
-        # the antimeridian, the northern or the southern boundary.
-        if tile.tp.is_global and tile.pixelbuffer and tile.is_on_edge():
-            return _get_warped_edge_array(
-                tile=tile,
-                input_file=input_file,
-                indexes=indexes,
-                dst_shape=dst_shape,
-                resampling=resampling,
-                src_nodata=src_nodata,
-                dst_nodata=dst_nodata,
-            )
+        try:
+            input_file = input_files
+            dst_shape = tile.shape
+
+            if not isinstance(indexes, int):
+                if indexes is None:
+                    dst_shape = (None,) + dst_shape
+                elif len(indexes) == 1:
+                    indexes = indexes[0]
+                else:
+                    dst_shape = (len(indexes),) + dst_shape
+            # Check if potentially tile boundaries exceed tile matrix boundaries on
+            # the antimeridian, the northern or the southern boundary.
+            if tile.tp.is_global and tile.pixelbuffer and tile.is_on_edge():
+                return _get_warped_edge_array(
+                    tile=tile,
+                    input_file=input_file,
+                    indexes=indexes,
+                    dst_shape=dst_shape,
+                    resampling=resampling,
+                    src_nodata=src_nodata,
+                    dst_nodata=dst_nodata,
+                )
 
-        # If tile boundaries don't exceed pyramid boundaries, simply read window
-        # once.
-        else:
-            return _get_warped_array(
-                input_file=input_file,
-                indexes=indexes,
-                dst_bounds=tile.bounds,
-                dst_shape=dst_shape,
-                dst_crs=tile.crs,
-                resampling=resampling,
-                src_nodata=src_nodata,
-                dst_nodata=dst_nodata,
-            )
+            # If tile boundaries don't exceed pyramid boundaries, simply read window
+            # once.
+            else:
+                return _get_warped_array(
+                    input_file=input_file,
+                    indexes=indexes,
+                    dst_bounds=tile.bounds,
+                    dst_shape=dst_shape,
+                    dst_crs=tile.crs,
+                    resampling=resampling,
+                    src_nodata=src_nodata,
+                    dst_nodata=dst_nodata,
+                )
+        except FileNotFoundError:  # pragma: no cover
+            if skip_missing_files:
+                logger.debug("skip missing file %s", input_file)
+                return _empty_array()
+            else:
+                raise
 
 
 def _get_warped_edge_array(
@@ -268,6 +304,8 @@ def _get_warped_array(
             src_nodata=src_nodata,
             dst_nodata=dst_nodata,
         )
+    except FileNotFoundError:
+        raise
     except Exception as e:
         logger.exception("error while reading file %s: %s", input_file, e)
         raise
@@ -337,26 +375,49 @@ def _read(
     )
 
     if isinstance(input_file, str):
-        logger.debug("got file path %s", input_file)
         try:
-            with rasterio.open(input_file, "r") as src:
-                return _read(
-                    src,
-                    indexes,
-                    dst_bounds,
-                    dst_shape,
-                    dst_crs,
-                    resampling,
-                    src_nodata,
-                    dst_nodata,
-                )
+            with Timer() as t:
+                with rasterio.open(input_file, "r") as src:
+                    logger.debug("read from %s...", input_file)
+                    out = _read(
+                        src,
+                        indexes,
+                        dst_bounds,
+                        dst_shape,
+                        dst_crs,
+                        resampling,
+                        src_nodata,
+                        dst_nodata,
+                    )
+            logger.debug("read %s in %s", input_file, t)
+            return out
         except RasterioIOError as e:
-            try:
-                if path_exists(input_file):
+            # rasterio errors which indicate file does not exist
+            for i in (
+                "does not exist in the file system",
+                "No such file or directory",
+                "The specified key does not exist",
+            ):
+                if i in str(e):
+                    raise FileNotFoundError(
+                        "%s not found and cannot be opened with rasterio" % input_file
+                    )
+            else:
+                try:
+                    # NOTE: this can cause additional S3 requests
+                    exists = path_exists(input_file)
+                except Exception:  # pragma: no cover
+                    # in order not to mask the original rasterio exception, raise it
                     raise e
-            except Exception:
-                raise e
-            raise FileNotFoundError("%s not found" % input_file)
+                if exists:
+                    # raise rasterio exception
+                    raise e
+                else:
+                    # file does not exist
+                    raise FileNotFoundError(
+                        "%s not found and cannot be opened with rasterio" % input_file
+                    )
+
     else:  # pragma: no cover
         logger.debug("assuming file object %s", input_file)
         warnings.warn(
diff --git a/mapchete/io/vector.py b/mapchete/io/vector.py
index f9dd5f42..0ac21b14 100644
--- a/mapchete/io/vector.py
+++ b/mapchete/io/vector.py
@@ -39,7 +39,9 @@
 logger = logging.getLogger(__name__)
 
 
-def read_vector_window(inp, tile, validity_check=True, clip_to_crs_bounds=False):
+def read_vector_window(
+    inp, tile, validity_check=True, clip_to_crs_bounds=False, skip_missing_files=False
+):
     """
     Read a window of an input vector dataset.
 
@@ -62,21 +64,24 @@ def read_vector_window(inp, tile, validity_check=True, clip_to_crs_bounds=False)
     features : list
         a list of reprojected GeoJSON-like features
     """
+
+    def _gen_features():
+        for path in inp if isinstance(inp, list) else [inp]:
+            try:
+                yield from _read_vector_window(
+                    path,
+                    tile,
+                    validity_check=validity_check,
+                    clip_to_crs_bounds=clip_to_crs_bounds,
+                )
+            except FileNotFoundError:
+                if skip_missing_files:
+                    logger.debug("skip missing file %s", path)
+                else:
+                    raise
+
     try:
-        return [
-            feature
-            for feature in chain.from_iterable(
-                [
-                    _read_vector_window(
-                        path,
-                        tile,
-                        validity_check=validity_check,
-                        clip_to_crs_bounds=clip_to_crs_bounds,
-                    )
-                    for path in (inp if isinstance(inp, list) else [inp])
-                ]
-            )
-        ]
+        return list(_gen_features())
     except FileNotFoundError:  # pragma: no cover
         raise
     except Exception as e:  # pragma: no cover
@@ -256,11 +261,31 @@ def _get_reprojected_features(
             src = exit_stack.enter_context(fiona.open(inp, "r"))
             src_crs = CRS(src.crs)
         except Exception as e:
-            if path_exists(inp):
-                logger.error("error while reading file %s: %s", inp, e)
-                raise e
+            # fiona errors which indicate file does not exist
+            for i in (
+                "does not exist in the file system",
+                "No such file or directory",
+                "The specified key does not exist",
+            ):
+                if i in str(e):
+                    raise FileNotFoundError(
+                        "%s not found and cannot be opened with Fiona" % inp
+                    )
             else:
-                raise FileNotFoundError("%s not found" % inp)
+                try:
+                    # NOTE: this can cause additional S3 requests
+                    exists = path_exists(inp)
+                except Exception:  # pragma: no cover
+                    # in order not to mask the original fiona exception, raise it
+                    raise e
+                if exists:
+                    # raise fiona exception
+                    raise e
+                else:
+                    # file does not exist
+                    raise FileNotFoundError(
+                        "%s not found and cannot be opened with Fiona" % inp
+                    )
     else:
         src = inp
         src_crs = inp.crs
diff --git a/mapchete/stac.py b/mapchete/stac.py
index 0242f817..a13d649f 100644
--- a/mapchete/stac.py
+++ b/mapchete/stac.py
@@ -384,7 +384,7 @@ def tile_pyramid_from_item(item):
         matrix_set = matrix_sets[wkss]
 
     # find out metatiling
-    metatiling_opts = [2 ** x for x in range(10)]
+    metatiling_opts = [2**x for x in range(10)]
     matching_metatiling_opts = []
     for metatiling in metatiling_opts:
         tp = BufferedTilePyramid(grid, metatiling=metatiling)
diff --git a/mapchete/tile.py b/mapchete/tile.py
index 526f3340..f894bd27 100644
--- a/mapchete/tile.py
+++ b/mapchete/tile.py
@@ -453,7 +453,7 @@ def _count_tiles(tiles, geometry, minzoom, maxzoom):
         # sum up tiles for each remaining zoom level
         count += sum(
             [
-                4 ** z
+                4**z
                 for z in range(
                     # only count zoom levels which are greater than minzoom or
                     # count all zoom levels from tile zoom level to maxzoom
diff --git a/requirements.txt b/requirements.txt
index 71d930db..fbd90878 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,7 +4,7 @@ aioitertools>=0.7.0
 boto3>=1.14.44
 cachetools>=4.1.0
 cached-property>=1.5.1
-click>=7.1.1,<8
+click>=7.1.1
 click-plugins>=1.1.1
 click-spinner>=0.1.10
 fiona>=1.8.13.post1
diff --git a/setup.py b/setup.py
index 9e7be128..4fc8fd88 100644
--- a/setup.py
+++ b/setup.py
@@ -23,7 +23,7 @@
 install_requires = [
     "cachetools",
     "cached_property",
-    "click>=7.1.1,<8",
+    "click>=7.1.1",
     "click-plugins",
     "click-spinner",
     "fiona>=1.8.13.post1",
@@ -41,7 +41,7 @@
 ]
 req_contours = ["matplotlib"]
 req_dask = ["dask", "distributed"]
-req_geobuf = ["geobuf"]
+req_geobuf = ["geobuf", "protobuf<=3.20.1"]
 req_http = ["fsspec[http]", "aiohttp", "requests"]
 req_s3 = ["boto3", "fsspec[s3]", "s3fs>=0.5.1"]
 req_serve = ["flask", "werkzeug>=0.15"]
diff --git a/test/test_io_raster.py b/test/test_io_raster.py
index 69e0210e..ab46b008 100644
--- a/test/test_io_raster.py
+++ b/test/test_io_raster.py
@@ -169,10 +169,43 @@ def test_read_raster_window_retry(invalid_tif):
     tile = BufferedTilePyramid("geodetic").tile(zoom=13, row=1918, col=8905)
     with pytest.raises(MapcheteIOError):
         read_raster_window(invalid_tif, tile)
+
+
+def test_read_raster_window_filenotfound():
+    tile = BufferedTilePyramid("geodetic").tile(zoom=13, row=1918, col=8905)
     with pytest.raises(FileNotFoundError):
         read_raster_window("not_existing.tif", tile)
 
 
+def test_read_raster_window_s3_filenotfound(mp_s3_tmpdir):
+    tile = BufferedTilePyramid("geodetic").tile(zoom=13, row=1918, col=8905)
+    with pytest.raises(FileNotFoundError):
+        read_raster_window(f"{mp_s3_tmpdir}/not_existing.tif", tile)
+
+
+def test_read_raster_window_s3_filenotfound_gdalreaddir(mp_s3_tmpdir):
+    tile = BufferedTilePyramid("geodetic").tile(zoom=13, row=1918, col=8905)
+    with pytest.raises(FileNotFoundError):
+        read_raster_window(
+            f"{mp_s3_tmpdir}/not_existing.tif",
+            tile,
+            gdal_opts=dict(GDAL_DISABLE_READDIR_ON_OPEN=False),
+        )
+
+
+@pytest.mark.skip(
+    reason="this test should pass with a newer GDAL release: https://github.com/OSGeo/gdal/issues/1900"
+)
+def test_read_raster_window_s3_invalid_file():
+    tile = BufferedTilePyramid("geodetic").tile(zoom=13, row=1918, col=8905)
+    with pytest.raises(MapcheteIOError):
+        read_raster_window(
+            "s3://mapchete-test/landpoly.geojson",
+            tile,
+            gdal_opts=dict(GDAL_DISABLE_READDIR_ON_OPEN=False),
+        )
+
+
 def test_read_raster_no_crs_errors():
     with tempfile.NamedTemporaryFile() as tmpfile:
         with pytest.raises(MapcheteIOError):