# Split HydroSHEDS v1.1 Land Mask

In [1]:
from dotenv import load_dotenv

# Location of the env file that holds the waterbodies database credentials.
# Loading it is only required when running on the Sandbox.
dotenv_path = "/home/jovyan/.env"
load_dotenv(
    dotenv_path=dotenv_path,
    verbose=True,
    override=True,
)

True

In [18]:
import logging
import os

import numpy as np
import rioxarray
from datacube import Datacube
from datacube.utils.cog import to_cog
from tqdm import tqdm

from waterbodies.grid import WaterbodiesGrid
from waterbodies.hopper import create_tasks_from_datasets
from waterbodies.io import get_filesystem
from waterbodies.io import is_s3_path
from waterbodies.logs import logging_setup
from waterbodies.text import get_tile_index_str_from_tuple
from waterbodies.utils import rio_slurp_xarray

In [4]:
# Directory that receives the per-tile land mask rasters.
output_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/hydrosheds_v1.1_land_mask/"
# Source HydroSHEDS v1.1 land mask raster that is split into tiles.
hydrosheds_land_mask_file = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/hydrosheds_v1.1_land_mask/af_msk_3s.tif"

In [6]:
# Set up logging through the waterbodies helper, then create the logger
# used by the later cells of this notebook.
# NOTE(review): the argument 3 is presumably a verbosity level — confirm
# against waterbodies.logs.logging_setup.
logging_setup(3)
_log = logging.getLogger(__name__)

In [7]:
# Writing to s3 with rioxarray's rio.to_raster otherwise fails with:
#   GDAL signalled an error: err_no=1, msg='w+ not supported for /vsis3,
#   unless CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE is set to YES'
# so switch that GDAL option on whenever the output location is on s3.
if is_s3_path(output_directory):
    os.environ.update(CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE="YES")

In [8]:
%%time
# Collect the tile index and geobox of every WOfS All Time Summary tile.
dc = Datacube(app="tiles")
gridspec = WaterbodiesGrid().gridspec

# Query every dataset of the WOfS all-time summary product.
dc_query = {"product": "wofs_ls_summary_alltime"}
datasets = dc.find_datasets(**dc_query)

tasks = create_tasks_from_datasets(
    datasets=datasets,
    tile_index_filter=None,
    bin_solar_day=False,
)
# Each task is keyed by tile index; flatten those keys into a single list.
tile_indices = [tile_index for task in tasks for tile_index in task.keys()]
tile_geoboxes = [gridspec.tile_geobox(tile_index=tile_index) for tile_index in tile_indices]
# Pair every tile index with its geobox.
tiles = list(zip(tile_indices, tile_geoboxes))
_log.info(f"Found {len(tiles)} tiles")

Processing 4461 datasets: 100%|██████████| 4461/4461 [00:02<00:00, 1736.26it/s]

[2024-05-13 19:23:23,236] {<timed exec>:12} INFO - Found 4456 tiles
CPU times: user 3.29 s, sys: 60.6 ms, total: 3.35 s
Wall time: 3.56 s





In [50]:
# Split the HydroSHEDS land mask into one binary land raster per WOfS tile
# and write each raster to the output directory as COG bytes.

# Create the filesystem handle in this cell so it does not depend on state
# left over from an earlier kernel session.
# NOTE(review): assumes waterbodies.io.get_filesystem(path, anon=...) returns an
# fsspec-style filesystem and that anon=False is needed for authenticated
# writes — confirm against the waterbodies package.
fs = get_filesystem(output_directory, anon=False)

# The progress bar gets its own name: binding it to `tiles` would overwrite
# the list of tiles and make this cell impossible to re-run.
with tqdm(iterable=tiles, desc="Rasterizing tiles", total=len(tiles)) as progress_bar:
    for tile_index, tile_geobox in progress_bar:
        tile_index_str = get_tile_index_str_from_tuple(tile_index)
        # NOTE(review): the "goas_v01_" prefix does not match the HydroSHEDS
        # source; it is kept unchanged in case downstream readers expect it.
        tile_raster_fp = os.path.join(output_directory, f"goas_v01_{tile_index_str}.tif")
        # The mask holds categorical codes, so it must be resampled with
        # nearest neighbour: bilinear interpolation blends codes (for example
        # land 1 and inland sink 3 average to ocean sink 2, and mixes with the
        # 255 no-data value yield arbitrary numbers), misclassifying pixels.
        tile_raster = rio_slurp_xarray(
            fname=hydrosheds_land_mask_file,
            gbox=tile_geobox,
            resampling="nearest",
        )
        # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data.
        # Keep land and inland sinks as 1; ocean sinks and no data become 0.
        tile_raster = ((tile_raster != 255) & (tile_raster != 2)).astype(int)
        # Encode the tile in memory, then write the bytes through the filesystem.
        cog_bytes = to_cog(geo_im=tile_raster)
        with fs.open(tile_raster_fp, mode="wb") as f:
            f.write(cog_bytes)

Rasterizing tiles:   5%|▍         | 219/4456 [00:02<00:45, 93.94it/s]
