# Split HydroSHEDS v1.1 Land Mask

This notebook uses the following datasets:
- HydroSHEDS vesion 1.1 Land Mask

    License: The core products of HydroSHEDS v1 are freely available for scientific, educational and commercial use. The data are distributed under a specific license agreement that is included in the [HydroSHEDS Technical Documentation](https://data.hydrosheds.org/file/technical-documentation/HydroSHEDS_TechDoc_v1_4.pdf). For all regulations regarding license grants, copyright, redistribution restrictions, required attributions, disclaimer of warranty, indemnification, liability, and waiver of damages, please refer to the license agreement. By downloading and using the data the user agrees to the terms and conditions of the license agreement.

    Reference: Lehner, B., Verdin, K., Jarvis, A. (2008). New global hydrography derived from spaceborne elevation data. Eos, Transactions, American Geophysical Union, 89(10): 93–94. https://doi.org/10.1029/2008eo100001
    
- Global Oceans and Seas v01 (2021-12-14)
   
    License: The dataset is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) and has the following [Disclaimer](https://www.marineregions.org/disclaimer.php)
    
    Reference: Flanders Marine Institute (2021). Global Oceans and Seas, version 1. Available online at https://www.marineregions.org/. https://doi.org/10.14284/542


> **Prerequisites**:The Global Oceans and Seas v01 (2021-12-14) dataset shapefile was downloaded from [here](https://www.vliz.be/en/imis?dasid=7842&doiid=613) and converted into a GeoParquet file using QGIS then uploaded to s3. The HydroSHED version 1.1 Land Mask for Africa was downloaded from [here](https://data.hydrosheds.org/file/hydrosheds-v1-msk/af_msk_3s.zip), unzipped and the GeoTIFF file uploaded to s3.

In [1]:
from dotenv import load_dotenv

# Path to env file containing the waterbodies database credentials
# Only necessary on the Sandbox.
dotenv_path = "/home/jovyan/.env"
load_dotenv(dotenv_path=dotenv_path, verbose=True, override=True)

True

In [2]:
import logging
import os

import geopandas as gpd
import numpy as np
import rioxarray
from datacube import Datacube
from odc.geo.xr import to_cog
from tqdm import tqdm
from waterbodies.grid import WaterbodiesGrid
from waterbodies.hopper import create_tasks_from_datasets
from waterbodies.io import get_filesystem, is_s3_path, load_vector_file, check_directory_exists
from waterbodies.logs import logging_setup
from waterbodies.text import get_tile_index_str_from_tuple
from waterbodies.utils import rio_slurp_xarray

In [3]:
product_footprint_url = "https://explorer.digitalearth.africa/api/footprint/wofs_ls_summary_alltime"
goas_v01_url = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/land_sea_masks/goas_v01.parquet"
hydrosheds_land_mask_file = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/land_sea_masks/af_msk_3s.tif"
output_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/hydrosheds_v1_1_land_mask/"

In [4]:
# Set up logging.
logging_setup(3)
_log = logging.getLogger(__name__)

In [5]:
if not check_directory_exists(output_directory):
    fs = get_filesystem(output_directory, anon=False)
    fs.mkdirs(output_directory)
    _log.info(f"Created directory {output_directory}")

[2024-06-13 20:46:18,215] {credentials.py:567} INFO - Found credentials in environment variables.
[2024-06-13 20:46:18,389] {554847729.py:4} INFO - Created directory s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/hydrosheds_v1_1_land_mask/


In [6]:
if is_s3_path(output_directory):
    # To avoid the error GDAL signalled an error: err_no=1, msg='w+ not supported for /vsis3,
    # unless CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE is set to YES'
    # when writing to s3 using rioxarray's rio.to_raster
    os.environ["CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE"] = "YES"

In [7]:
%%time
# Find all the tiles that will be used to generate the Waterbodies
# historical extent polygons
dc = Datacube(app="tiles")
gridspec = WaterbodiesGrid().gridspec

dc_query = dict(product="wofs_ls_summary_alltime")
datasets = dc.find_datasets(**dc_query)

tasks = create_tasks_from_datasets(datasets=datasets, tile_index_filter=None, bin_solar_day=False)
tile_indices = [k for task in tasks for k,v in task.items()]

tile_extents = [
    gridspec.tile_geobox(tile_index=tile_index).extent.geom
    for tile_index in tile_indices
]

tile_extents_gdf = gpd.GeoDataFrame(
    data={"tile_index": tile_indices, "geometry": tile_extents}, crs=gridspec.crs
)

tile_extents_gdf.set_index("tile_index", inplace=True)

_log.info(f"Found {len(tile_extents_gdf)} tiles")

Processing 4461 datasets: 100%|██████████| 4461/4461 [00:02<00:00, 1693.76it/s]


[2024-06-13 20:46:22,270] {<timed exec>:23} INFO - Found 4456 tiles
CPU times: user 3.65 s, sys: 69.6 ms, total: 3.71 s
Wall time: 3.87 s


In [8]:
# Load the product footprint.
product_footprint_gdf = load_vector_file(product_footprint_url).to_crs(gridspec.crs)

In [9]:
# Load the Global Oceans and Seas dataset.
goas_v01_gdf = load_vector_file(goas_v01_url).to_crs(gridspec.crs)
goas_v01_gdf

Unnamed: 0,name,latitude,longitude,min_Y,min_X,max_Y,max_X,area_km2,geometry
0,Southern Ocean,-68.03985,-26.63275,-85.5625,-180.0,-60.0,180.0,6793589,"MULTIPOLYGON (((17367530.445 -6351419.997, 141..."
1,South Atlantic Ocean,-33.73758,-18.83411,-60.0,-69.60084,0.07511,20.0,42815540,"MULTIPOLYGON (((-4889932.764 0.000, -4898670.8..."
2,South Pacific Ocean,-30.09612,-143.06088,-60.0,130.11129,3.39114,-67.26667,90147400,"MULTIPOLYGON (((-6843963.864 -5987437.671, -68..."
3,North Pacific Ocean,26.95013,-169.38334,0.0,117.51622,66.56286,-76.98544,77124830,"MULTIPOLYGON (((15162668.182 6202856.081, 1516..."
4,South China and Easter Archipelagic Seas,5.62943,115.46548,-10.92259,95.43328,25.56728,134.03155,6822162,"MULTIPOLYGON (((11509135.769 3148036.787, 1150..."
5,Indian Ocean,-27.27272,79.60241,-60.0,20.0,31.18586,146.91671,78162363,"MULTIPOLYGON (((5411625.915 3333798.547, 54114..."
6,Mediterranean Region,38.13065,19.70067,30.06809,-6.03255,47.3764,42.35496,2988248,"MULTIPOLYGON (((3142866.248 3666334.372, 31428..."
7,Baltic Sea,58.78478,19.22115,52.65352,9.3656,67.08059,30.3471,415600,"MULTIPOLYGON (((2639423.703 6382686.936, 26395..."
8,North Atlantic Ocean,31.77621,-40.24758,-0.93603,-98.05392,68.63872,12.00594,41741693,"MULTIPOLYGON (((-2839226.735 6814318.697, -283..."
9,Arctic Ocean,79.14792,-3.28568,51.14359,-180.0,90.0,180.0,15571669,"MULTIPOLYGON (((17203129.832 6870188.502, 1720..."


In [10]:
# Clip the Global Oceans and Seas dataset to the product footprint.
goas_v01_gdf_clipped = gpd.clip(goas_v01_gdf, product_footprint_gdf)
goas_v01_gdf_clipped

Unnamed: 0,name,latitude,longitude,min_Y,min_X,max_Y,max_X,area_km2,geometry
1,South Atlantic Ocean,-33.73758,-18.83411,-60.0,-69.60084,0.07511,20.0,42815540,"POLYGON ((899352.608 -248048.382, 899273.347 -..."
5,Indian Ocean,-27.27272,79.60241,-60.0,20.0,31.18586,146.91671,78162363,"MULTIPOLYGON (((3158299.965 -3184931.808, 3158..."
8,North Atlantic Ocean,31.77621,-40.24758,-0.93603,-98.05392,68.63872,12.00594,41741693,"MULTIPOLYGON (((-1322294.947 1208979.004, -132..."
6,Mediterranean Region,38.13065,19.70067,30.06809,-6.03255,47.3764,42.35496,2988248,"MULTIPOLYGON (((3446541.900 4126666.361, 34462..."


In [11]:
# Identify all tiles that intersect with goas_v01_gdf_clipped
# This will be the coastal tiles.
coastal_tile_indices = tile_extents_gdf.sjoin(goas_v01_gdf_clipped, predicate="intersects", how="inner").index.to_list()
coastal_tile_geoboxes = [gridspec.tile_geobox(tile_index=tile_index) for tile_index in coastal_tile_indices]
coastal_tiles = list(zip(coastal_tile_indices, coastal_tile_geoboxes))

_log.info(f"Found {len(coastal_tiles)} coastal tiles")

[2024-06-13 20:47:25,536] {1713832489.py:7} INFO - Found 1415 coastal tiles


In [13]:
fs = get_filesystem(output_directory, anon=False)
with tqdm(iterable=coastal_tiles, desc="Rasterizing coastal HydroSHEDS Land Mask tiles", total=len(coastal_tiles)) as coastal_tiles:
    for tile  in coastal_tiles:
        tile_index, tile_geobox = tile
        tile_index_str = get_tile_index_str_from_tuple(tile_index)
        tile_raster_fp = os.path.join(output_directory, f"hydrosheds_v1_1_land_mask_{tile_index_str}.tif")
        tile_hydrosheds_land_mask = rio_slurp_xarray(fname=hydrosheds_land_mask_file, gbox=tile_geobox, resampling="bilinear")
        # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data.
        tile_raster = np.logical_or(tile_hydrosheds_land_mask == 1, tile_hydrosheds_land_mask == 3).astype(int)
        # Write to file
        cog_bytes = to_cog(geo_im=tile_raster)
        with fs.open(tile_raster_fp, 'wb') as f:
            f.write(cog_bytes)

Rasterizing coastal HydroSHEDS Land Mask tiles: 100%|██████████| 1415/1415 [26:38<00:00,  1.13s/it]
