# Split HydroSHEDS v1.1 Land Mask

This notebook uses the following datasets:
- HydroSHEDS version 1.1 Land Mask

    License: The core products of HydroSHEDS v1 are freely available for scientific, educational and commercial use. The data are distributed under a specific license agreement that is included in the [HydroSHEDS Technical Documentation](https://data.hydrosheds.org/file/technical-documentation/HydroSHEDS_TechDoc_v1_4.pdf). For all regulations regarding license grants, copyright, redistribution restrictions, required attributions, disclaimer of warranty, indemnification, liability, and waiver of damages, please refer to the license agreement. By downloading and using the data the user agrees to the terms and conditions of the license agreement.

    Reference: Lehner, B., Verdin, K., Jarvis, A. (2008). New global hydrography derived from spaceborne elevation data. Eos, Transactions, American Geophysical Union, 89(10): 93–94. https://doi.org/10.1029/2008eo100001
    
- Global Oceans and Seas v01 (2021-12-14)
   
    License: The dataset is licensed under a [Creative Commons Attribution 4.0 International License](https://creativecommons.org/licenses/by/4.0/) and has the following [Disclaimer](https://www.marineregions.org/disclaimer.php)
    
    Reference: Flanders Marine Institute (2021). Global Oceans and Seas, version 1. Available online at https://www.marineregions.org/. https://doi.org/10.14284/542


> **Prerequisites**: The Global Oceans and Seas v01 (2021-12-14) dataset shapefile was downloaded from [here](https://www.vliz.be/en/imis?dasid=7842&doiid=613), converted into a GeoParquet file using QGIS, and then uploaded to s3. The HydroSHEDS version 1.1 Land Mask for Africa was downloaded from [here](https://data.hydrosheds.org/file/hydrosheds-v1-msk/af_msk_3s.zip) and unzipped, and the GeoTIFF file was uploaded to s3.

In [None]:
from dotenv import load_dotenv

# Location of the env file that holds the waterbodies database credentials.
# Loading it is only required when running on the Sandbox.
dotenv_path = "/home/jovyan/.env"
load_dotenv(
    dotenv_path=dotenv_path,
    override=True,  # values from the env file replace variables already set
    verbose=True,
)

In [None]:
import logging
import os

import geopandas as gpd
import numpy as np
import rioxarray  # noqa F401
from datacube import Datacube
from tqdm import tqdm

from waterbodies.grid import WaterbodiesGrid
from waterbodies.hopper import create_tasks_from_datasets
from waterbodies.io import (
    check_directory_exists,
    get_filesystem,
    is_s3_path,
    load_vector_file,
)
from waterbodies.logs import logging_setup
from waterbodies.text import get_tile_index_str_from_tuple
from waterbodies.utils import rio_slurp_xarray

In [None]:
# Input: Global Oceans and Seas v01 dataset, stored as a GeoParquet file.
goas_file_path = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/land_sea_masks/goas_v01.parquet"
)
# Input: HydroSHEDS version 1.1 Land Mask GeoTIFF for Africa.
hydrosheds_land_mask_file_path = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/land_sea_masks/af_msk_3s.tif"
)
# Output: directory that receives one land mask GeoTIFF per coastal tile.
output_directory = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/hydrosheds_v1_1_land_mask/"
)

In [None]:
# Configure logging for the notebook using the waterbodies helper.
# NOTE(review): the argument 3 is presumably a verbosity level - confirm
# against waterbodies.logs.logging_setup.
logging_setup(3)
# Logger used by the remaining cells to report progress.
_log = logging.getLogger(__name__)

In [None]:
# Make sure the output directory exists before any tile is written to it.
directory_exists = check_directory_exists(output_directory)
if not directory_exists:
    fs = get_filesystem(output_directory, anon=False)
    fs.mkdirs(output_directory)
    _log.info(f"Created directory {output_directory}")

In [None]:
# Without this GDAL option, writing to s3 with rioxarray's rio.to_raster fails with:
#   GDAL signalled an error: err_no=1, msg='w+ not supported for /vsis3,
#   unless CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE is set to YES'
# The option is only set when the output directory is an s3 path.
writing_to_s3 = is_s3_path(output_directory)
if writing_to_s3:
    os.environ["CPL_VSIL_USE_TEMP_FILE_FOR_RANDOM_WRITE"] = "YES"

In [None]:
# Collect every WaterbodiesGrid tile that will be used to generate the
# Waterbodies historical extent polygons, together with its extent geometry.
dc = Datacube(app="tiles")
gridspec = WaterbodiesGrid().gridspec

# All datasets of the all-time WOfS summary product.
dc_query = {"product": "wofs_ls_summary_alltime"}
datasets = dc.find_datasets(**dc_query)

# Each task maps tile indices to values; only the tile indices are needed here.
tasks = create_tasks_from_datasets(
    datasets=datasets,
    tile_index_filter=None,
    bin_solar_day=False,
)
tile_indices = [tile_index for task in tasks for tile_index in task]

# Extent geometry of each tile, in the same order as `tile_indices`.
tile_extents = []
for tile_index in tile_indices:
    tile_geobox = gridspec.tile_geobox(tile_index=tile_index)
    tile_extents.append(tile_geobox.extent.geom)

# One row per tile, indexed by tile index, in the grid's CRS.
tile_extents_gdf = gpd.GeoDataFrame(
    data={"tile_index": tile_indices, "geometry": tile_extents},
    crs=gridspec.crs,
).set_index("tile_index")

_log.info(f"Found {len(tile_extents_gdf)} WaterBodiesGrid tiles")

In [None]:
# Read the Global Oceans and Seas dataset, then reproject it to the CRS of
# the WaterbodiesGrid so it can be compared with the tile extents.
goas_v01_gdf = load_vector_file(goas_file_path)
goas_v01_gdf = goas_v01_gdf.to_crs(gridspec.crs)
# Display the reprojected dataset as the cell output.
goas_v01_gdf

In [None]:
# Coastal tiles are the tiles whose extent intersects the Global Oceans and
# Seas dataset. A tile can match several features, so keep each index once.
tiles_intersecting_goas = tile_extents_gdf.sjoin(
    goas_v01_gdf, how="inner", predicate="intersects"
)
coastal_tile_indices = tiles_intersecting_goas.index.unique().to_list()

# Geobox of each coastal tile, in the same order as `coastal_tile_indices`.
coastal_tile_geoboxes = []
for tile_index in coastal_tile_indices:
    coastal_tile_geoboxes.append(gridspec.tile_geobox(tile_index=tile_index))

# (tile index, tile geobox) pairs.
coastal_tiles = list(zip(coastal_tile_indices, coastal_tile_geoboxes))

_log.info(f"Found {len(coastal_tiles)} coastal WaterBodiesGrid tiles")

In [None]:
# For every coastal tile: read the HydroSHEDS land mask onto the tile's geobox,
# reduce it to a binary land raster and write it to the output directory.

# `fs` is not used in this cell; the assignment is retained so that any other
# cell relying on it is unaffected.
fs = get_filesystem(output_directory, anon=False)

# The progress bar gets its own name so the `coastal_tiles` list is not
# replaced by a (closed) tqdm object once this cell has run.
with tqdm(
    iterable=coastal_tiles,
    desc="Rasterizing coastal HydroSHEDS version 1.1 Land Mask tiles",
    total=len(coastal_tiles),
) as progress:
    for tile_index, tile_geobox in progress:
        tile_index_str = get_tile_index_str_from_tuple(tile_index)
        tile_raster_fp = os.path.join(
            output_directory, f"hydrosheds_v1_1_land_mask_{tile_index_str}.tif"
        )
        # The mask holds categorical codes, so it must be resampled with
        # nearest neighbour: an interpolating method such as bilinear can blend
        # neighbouring codes into a value that stands for a different class
        # (e.g. 1 and 3 averaging to 2) before the comparisons below.
        # NOTE(review): assumes rio_slurp_xarray accepts the rasterio
        # resampling name "nearest" in the same way it accepted "bilinear".
        tile_hydrosheds_land_mask = rio_slurp_xarray(
            fname=hydrosheds_land_mask_file_path,
            gbox=tile_geobox,
            resampling="nearest",
        )
        # Indicator values: 1 = land, 2 = ocean sink, 3 = inland sink, 255 is no data.
        # The output is 1 where the mask is land or inland sink and 0 everywhere
        # else (ocean sink and no data included).
        tile_raster = np.logical_or(
            tile_hydrosheds_land_mask == 1, tile_hydrosheds_land_mask == 3
        ).astype(int)
        # Write to file
        tile_raster.rio.to_raster(tile_raster_fp)