This notebook converts tiled GeoTIFFs of the various datasets in the Zeno project to
Zarr format for efficient reads in large-scale zonal statistics analysis.

See `compare_zarr_to_cog.ipynb` for a performance benchmark comparing the tiled GeoTIFF, COG, and Zarr formats.

In [None]:
import coiled

import fsspec
import numpy as np
import rioxarray
import xarray as xr
import fsspec
import pandas as pd
import logging
from flox.xarray import xarray_reduce
import numpy as np

import dask
import zarr
import gcsfs

In [None]:
# S3 filesystem handle created with requester-pays enabled.
# NOTE(review): `fs` is not referenced again in the cells visible here —
# confirm it is still needed.
fs = fsspec.filesystem("s3", requester_pays=True)

In [None]:
# Raise the `distributed.client` logger threshold to ERROR so that
# lower-severity messages from it are not emitted in the notebook.
logging.getLogger("distributed.client").setLevel(logging.ERROR)

In [None]:
# Start a Coiled cluster named "tcl_dask" in us-east-1 with 20 workers.
# The scheduler and the workers use different VM types, and instances are
# requested with the "spot_with_fallback" purchase option (presumably spot
# instances with an on-demand fallback — confirm against the Coiled docs).
cluster = coiled.Cluster(
    name="tcl_dask",
    region="us-east-1",
    n_workers=20,
    tags={"project": "tcl_dask"},
    scheduler_vm_types="r7g.xlarge",
    worker_vm_types="r7g.2xlarge",
    compute_purchase_option="spot_with_fallback"
)

# Client bound to this cluster; `client.run` is called on it later in the notebook.
client = cluster.get_client()

In [None]:
# Let the cluster scale adaptively between 10 and 50 workers.
cluster.adapt(minimum=10, maximum=50)

In [None]:
# Load the `tiles.geojson` index of each tiled GeoTIFF dataset from S3.
# The `features` column of every frame is used below to derive per-tile URIs.

# DIST alerts (umd_glad_dist_alerts, v20250510).
dist_alerts_tiles = pd.read_json(
    "s3://gfw-data-lake/umd_glad_dist_alerts/v20250510/raster/epsg-4326/10/40000/default/gdal-geotiff/tiles.geojson"
)
# GADM administrative boundaries v4.1.85, level adm0.
adm0_tiles = pd.read_json(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/10/40000/adm0/gdal-geotiff/tiles.geojson'
)

# GADM administrative boundaries v4.1.85, level adm1.
adm1_tiles = pd.read_json(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/10/40000/adm1/gdal-geotiff/tiles.geojson'
)

# GADM administrative boundaries v4.1.85, level adm2.
adm2_tiles = pd.read_json(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/10/40000/adm2/gdal-geotiff/tiles.geojson'
)

# Pixel area (umd_area_2013 v1.10, `area_m` layer).
pixel_area_tiles = pd.read_json(
    's3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/gdal-geotiff/tiles.geojson'
)

def get_uri(feature):
    """Build an ``s3://`` URI from a tile feature's ``properties.name``.

    The name is split on ``/``, its first two segments are dropped, and the
    remaining segments are re-joined behind an ``s3:/`` prefix.
    (Presumably the name is a GDAL-style ``/vsis3/<bucket>/<key>`` path —
    confirm against the tiles.geojson contents.)

    Args:
        feature: Mapping with a ``properties`` mapping holding a ``name`` string.

    Returns:
        The rebuilt URI string.
    """
    segments = feature['properties']['name'].split('/')
    return '/'.join(['s3:/', *segments[2:]])

# Turn each tile index into a Series of tile URIs by applying `get_uri`
# to every entry of its `features` column.
dist_alerts_tile_uris = dist_alerts_tiles["features"].apply(get_uri)
adm0_tile_uris = adm0_tiles["features"].apply(get_uri)
adm1_tile_uris = adm1_tiles["features"].apply(get_uri)
adm2_tile_uris = adm2_tiles["features"].apply(get_uri)
pixel_area_uris = pixel_area_tiles["features"].apply(get_uri)


In [None]:
# Open all DIST alert tiles as one dataset in 10000 x 10000 (x, y) chunks
# and cast the values to int16.
# NOTE(review): int16 tops out at 32767. The notebook later decodes these
# values as `value // 10000` (confidence) and `value % 10000` (alert date),
# so confirm the packed values stay below that limit.
dist_alerts = xr.open_mfdataset(
    dist_alerts_tile_uris,
    parallel=True,
    chunks={'x': 10000, 'y': 10000},
).astype(np.int16)

In [None]:
# Cell output: inspect the `band_data` variable of the alerts dataset.
dist_alerts.band_data

In [None]:
# Destination Zarr store for the full DIST alerts array.
dist_zarr_name = "s3://gfw-data-lake/umd_glad_dist_alerts/v20250510/raster/epsg-4326/zarr/dist_alerts_full.zarr"

In [None]:
# Write the alerts `band_data` array to the Zarr store; mode='w' replaces
# any store that already exists at that path.
dist_alerts.band_data.to_zarr(dist_zarr_name, mode='w')

#### Save alert date and confidence as separate variables as well

In [None]:
# Split the packed alert value into two named variables:
#   confidence = value // 10000 (stored as uint8)
#   alert_date = value % 10000
alert_conf = (dist_alerts.band_data // 10000).astype(np.uint8).rename("confidence")
alert_date = (dist_alerts.band_data % 10000).rename("alert_date")

# Combine both variables into one dataset and write it to its own Zarr
# store, replacing any existing store at that path.
date_conf = xr.merge([alert_conf, alert_date])
date_conf.to_zarr(
    "s3://gfw-data-lake/umd_glad_dist_alerts/v20250510/raster/epsg-4326/zarr/date_conf.zarr",
    mode="w",
)

In [None]:

# Open every adm0 tile as one dataset in 10000 x 10000 (x, y) chunks ...
adm0 = xr.open_mfdataset(
    adm0_tile_uris,
    parallel=True,
    chunks=dict(x=10000, y=10000),
)
# ... then cast the values to uint16.
adm0 = adm0.astype(np.uint16)

In [None]:
# Cell output: inspect the `band_data` variable of the adm0 dataset.
adm0.band_data

In [None]:
# Open every adm1 tile as one dataset in 10000 x 10000 (x, y) chunks ...
adm1 = xr.open_mfdataset(
    adm1_tile_uris,
    parallel=True,
    chunks=dict(x=10000, y=10000),
)
# ... then cast the values to uint8.
# NOTE(review): adm0 and adm2 are cast to uint16 while adm1 uses uint8 —
# confirm every adm1 code fits in 0-255.
adm1 = adm1.astype(np.uint8)

In [None]:
# Cell output: inspect the `band_data` variable of the adm1 dataset.
adm1.band_data

In [None]:
# Open every adm2 tile as one dataset in 10000 x 10000 (x, y) chunks ...
adm2 = xr.open_mfdataset(
    adm2_tile_uris,
    parallel=True,
    chunks=dict(x=10000, y=10000),
)
# ... then cast the values to uint16.
adm2 = adm2.astype(np.uint16)

In [None]:
# Cell output: inspect the `band_data` variable of the adm2 dataset.
adm2.band_data

In [None]:
# Open every pixel-area tile as one dataset in 10000 x 10000 (x, y) chunks.
# Unlike the other layers in this notebook, no dtype cast is applied here.
pixel_area = xr.open_mfdataset(
    pixel_area_uris,
    parallel=True,
    chunks={'x': 10000, 'y':10000}
)
# Cell output: inspect the `band_data` variable.
pixel_area.band_data

In [None]:
# Align adm1 to dist_alerts. With join='left' the result uses the indexes of
# the first argument (dist_alerts); the aligned copy of dist_alerts is discarded.
_, adm1_aligned = xr.align(dist_alerts, adm1, join='left')

In [None]:
# Write the aligned adm1 array to Zarr. mode='w' replaces an existing store,
# so the cell can be re-run; this matches the other writes in this notebook.
adm1_aligned.band_data.to_zarr(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm1_clipped_to_dist.zarr',
    mode='w',
)

In [None]:
# Align adm2 to dist_alerts. With join='left' the result uses the indexes of
# the first argument (dist_alerts); the aligned copy of dist_alerts is discarded.
_, adm2_aligned = xr.align(dist_alerts, adm2, join='left')

In [None]:
# Write the aligned adm2 array to Zarr. mode='w' replaces an existing store,
# so the cell can be re-run; this matches the other writes in this notebook.
adm2_aligned.band_data.to_zarr(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm2_clipped_to_dist.zarr',
    mode='w',
)

In [None]:
# Align adm0 to dist_alerts. With join='left' the result uses the indexes of
# the first argument (dist_alerts); the aligned copy of dist_alerts is discarded.
_, adm0_aligned = xr.align(dist_alerts, adm0, join='left')

In [None]:
# Write the aligned adm0 array to Zarr, replacing any existing store (mode='w').
adm0_aligned.band_data.to_zarr(
    's3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm0_clipped_to_dist.zarr', mode='w'
)

In [None]:
# Align pixel_area to dist_alerts. With join='left' the result uses the indexes
# of the first argument (dist_alerts); the aligned copy of dist_alerts is discarded.
_, pixel_area_aligned = xr.align(dist_alerts, pixel_area, join='left')

In [None]:
# Cell output: inspect the aligned pixel-area `band_data` variable.
pixel_area_aligned.band_data

In [None]:
# Write the aligned pixel-area array to Zarr, replacing any existing store (mode='w').
pixel_area_aligned.band_data.to_zarr(
    's3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/zarr/pixel_area_clipped_to_dist.zarr', mode='w'
)

In [None]:
def set_env():
    """Set ``GS_NO_SIGN_REQUEST=YES`` in the environment of the calling process.

    ``os`` is imported inside the function so that the function is
    self-contained when it is handed to ``client.run`` elsewhere in this
    notebook. (Presumably the variable lets GDAL read the public GCS bucket
    without signed requests — confirm against the GDAL /vsigs/ docs.)
    """
    import os
    os.environ.update({'GS_NO_SIGN_REQUEST': 'YES'})

# Execute `set_env` through the Dask client so the environment variable is
# set in the cluster's remote processes rather than only in this notebook.
client.run(set_env)

In [None]:
# Google Cloud Storage filesystem handle, created with token=None.
gfs = gcsfs.GCSFileSystem(token=None)

# List the .tif files matching `<bucket_path>*.tif`.
bucket_path = 'lcl_public/SBTN_NaturalLands/v1_1/classification/'
file_list = gfs.glob(bucket_path + '*.tif')

# Prefix each listed path with the gs:// scheme to form a full URL.
natural_lands_urls_all_classes_urls = ['gs://' + f for f in file_list]

In [None]:
# Open every SBTN Natural Lands classification tile as one dataset in
# 10000 x 10000 (x, y) chunks ...
sbtn_natural_lands_all_classes = xr.open_mfdataset(
    natural_lands_urls_all_classes_urls,
    parallel=True,
    chunks=dict(x=10000, y=10000),
)
# ... then cast the values to uint8.
sbtn_natural_lands_all_classes = sbtn_natural_lands_all_classes.astype(np.uint8)

In [None]:
# Cell output: inspect the `band_data` variable of the natural-lands dataset.
sbtn_natural_lands_all_classes.band_data

In [None]:
# Align the natural-lands dataset to dist_alerts. With join='left' the result
# uses the indexes of the first argument (dist_alerts); the aligned copy of
# dist_alerts is discarded.
_, sbtn_natural_lands_all_classes_clipped = xr.align(dist_alerts, sbtn_natural_lands_all_classes, join='left')

In [None]:
# Write the aligned natural-lands data to Zarr, replacing any existing store.
# NOTE(review): this writes the whole Dataset, whereas the other layers in this
# notebook write only `.band_data` — confirm the difference is intended.
sbtn_natural_lands_all_classes_clipped.to_zarr(
    "s3://gfw-data-lake/sbtn_natural_lands/zarr/sbtn_natural_lands_all_classes_clipped_to_dist.zarr", mode="w"
)