### Rasterio-based validation of Zeno zonal statistics

In [1]:
import numpy as np
import pandas as pd
import rasterio as rio
import geopandas as gpd
import pandera.pandas as pa
from pandera.typing.pandas import Series
from rasterio.windows import from_bounds
from rasterio.features import geometry_mask
from dateutil.relativedelta import relativedelta
from datetime import date
from datetime import date
from pydantic import BaseModel
from typing import Dict, Optional, Literal

#from ..pipelines.disturbance.check_for_new_alerts import get_latest_version

In [2]:
isos = [
    'AFG', 'ALA', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATA', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE',
    'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BEN', 'BMU', 'BTN', 'BOL', 'BES', 'BIH', 'BWA', 'BVT', 'BRA',
    'IOT', 'BRN', 'BGR', 'BFA', 'BDI', 'CPV', 'KHM', 'CMR', 'CAN', 'CYM', 'CAF', 'TCD', 'CHL', 'CHN', 'CXR', 'CCK',
    'COL', 'COM', 'COG', 'COD', 'COK', 'CRI', 'CIV', 'HRV', 'CUB', 'CUW', 'CYP', 'CZE', 'DNK', 'DJI', 'DMA', 'DOM',
    'ECU', 'EGY', 'SLV', 'GNQ', 'ERI', 'EST', 'SWZ', 'ETH', 'FLK', 'FRO', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 'ATF',
    'GAB', 'GMB', 'GEO', 'DEU', 'GHA', 'GIB', 'GRC', 'GRL', 'GRD', 'GLP', 'GUM', 'GTM', 'GGY', 'GIN', 'GNB', 'GUY',
    'HTI', 'HMD', 'VAT', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'IMN', 'ISR', 'ITA', 'JAM',
    'JPN', 'JEY', 'JOR', 'KAZ', 'KEN', 'KIR', 'PRK', 'KOR', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LSO', 'LBR', 'LBY',
    'LIE', 'LTU', 'LUX', 'MAC', 'MDG', 'MWI', 'MYS', 'MDV', 'MLI', 'MLT', 'MHL', 'MTQ', 'MRT', 'MUS', 'MYT', 'MEX',
    'FSM', 'MDA', 'MCO', 'MNG', 'MNE', 'MSR', 'MAR', 'MOZ', 'MMR', 'NAM', 'NRU', 'NPL', 'NLD', 'NCL', 'NZL', 'NIC',
    'NER', 'NGA', 'NIU', 'NFK', 'MKD', 'MNP', 'NOR', 'OMN', 'PAK', 'PLW', 'PSE', 'PAN', 'PNG', 'PRY', 'PER', 'PHL',
    'PCN', 'POL', 'PRT', 'PRI', 'QAT', 'REU', 'ROU', 'RUS', 'RWA', 'BLM', 'SHN', 'KNA', 'LCA', 'MAF', 'SPM', 'VCT',
    'WSM', 'SMR', 'STP', 'SAU', 'SEN', 'SRB', 'SYC', 'SLE', 'SGP', 'SXM', 'SVK', 'SVN', 'SLB', 'SOM', 'ZAF', 'SGS',
    'SSD', 'ESP', 'LKA', 'SDN', 'SUR', 'SJM', 'SWE', 'CHE', 'SYR', 'TWN', 'TJK', 'TZA', 'THA', 'TLS', 'TGO', 'TKL',
    'TON', 'TTO', 'TUN', 'TUR', 'TKM', 'TCA', 'TUV', 'UGA', 'UKR', 'ARE', 'GBR', 'USA', 'UMI', 'URY', 'UZB', 'VUT',
    'VEN', 'VNM', 'VGB', 'VIR', 'WLF', 'ESH', 'YEM', 'ZMB', 'ZWE'
]

numeric_to_alpha3 = {
    4: 'AFG', 248: 'ALA', 8: 'ALB', 12: 'DZA', 16: 'ASM', 20: 'AND', 24: 'AGO', 660: 'AIA',
    10: 'ATA', 28: 'ATG', 32: 'ARG', 51: 'ARM', 533: 'ABW', 36: 'AUS', 40: 'AUT', 31: 'AZE',
    44: 'BHS', 48: 'BHR', 50: 'BGD', 52: 'BRB', 112: 'BLR', 56: 'BEL', 84: 'BLZ', 204: 'BEN',
    60: 'BMU', 64: 'BTN', 68: 'BOL', 535: 'BES', 70: 'BIH', 72: 'BWA', 74: 'BVT', 76: 'BRA',
    86: 'IOT', 96: 'BRN', 100: 'BGR', 854: 'BFA', 108: 'BDI', 132: 'CPV', 116: 'KHM', 120: 'CMR',
    124: 'CAN', 136: 'CYM', 140: 'CAF', 148: 'TCD', 152: 'CHL', 156: 'CHN', 162: 'CXR', 166: 'CCK',
    170: 'COL', 174: 'COM', 178: 'COG', 180: 'COD', 184: 'COK', 188: 'CRI', 384: 'CIV', 191: 'HRV',
    192: 'CUB', 531: 'CUW', 196: 'CYP', 203: 'CZE', 208: 'DNK', 262: 'DJI', 212: 'DMA', 214: 'DOM',
    218: 'ECU', 818: 'EGY', 222: 'SLV', 226: 'GNQ', 232: 'ERI', 233: 'EST', 748: 'SWZ', 231: 'ETH',
    238: 'FLK', 234: 'FRO', 242: 'FJI', 246: 'FIN', 250: 'FRA', 254: 'GUF', 258: 'PYF', 260: 'ATF',
    266: 'GAB', 270: 'GMB', 268: 'GEO', 276: 'DEU', 288: 'GHA', 292: 'GIB', 300: 'GRC', 304: 'GRL',
    308: 'GRD', 312: 'GLP', 316: 'GUM', 320: 'GTM', 831: 'GGY', 324: 'GIN', 624: 'GNB', 328: 'GUY',
    332: 'HTI', 334: 'HMD', 336: 'VAT', 340: 'HND', 344: 'HKG', 348: 'HUN', 352: 'ISL', 356: 'IND',
    360: 'IDN', 364: 'IRN', 368: 'IRQ', 372: 'IRL', 833: 'IMN', 376: 'ISR', 380: 'ITA', 388: 'JAM',
    392: 'JPN', 832: 'JEY', 400: 'JOR', 398: 'KAZ', 404: 'KEN', 296: 'KIR', 408: 'PRK', 410: 'KOR',
    414: 'KWT', 417: 'KGZ', 418: 'LAO', 428: 'LVA', 422: 'LBN', 426: 'LSO', 430: 'LBR', 434: 'LBY',
    438: 'LIE', 440: 'LTU', 442: 'LUX', 446: 'MAC', 450: 'MDG', 454: 'MWI', 458: 'MYS', 462: 'MDV',
    466: 'MLI', 470: 'MLT', 584: 'MHL', 474: 'MTQ', 478: 'MRT', 480: 'MUS', 175: 'MYT', 484: 'MEX',
    583: 'FSM', 498: 'MDA', 492: 'MCO', 496: 'MNG', 499: 'MNE', 500: 'MSR', 504: 'MAR', 508: 'MOZ',
    104: 'MMR', 516: 'NAM', 520: 'NRU', 524: 'NPL', 528: 'NLD', 540: 'NCL', 554: 'NZL', 558: 'NIC',
    562: 'NER', 566: 'NGA', 570: 'NIU', 574: 'NFK', 807: 'MKD', 580: 'MNP', 578: 'NOR', 512: 'OMN',
    586: 'PAK', 585: 'PLW', 275: 'PSE', 591: 'PAN', 598: 'PNG', 600: 'PRY', 604: 'PER', 608: 'PHL',
    612: 'PCN', 616: 'POL', 620: 'PRT', 630: 'PRI', 634: 'QAT', 638: 'REU', 642: 'ROU', 643: 'RUS',
    646: 'RWA', 652: 'BLM', 654: 'SHN', 659: 'KNA', 662: 'LCA', 663: 'MAF', 666: 'SPM', 670: 'VCT',
    882: 'WSM', 674: 'SMR', 678: 'STP', 682: 'SAU', 686: 'SEN', 688: 'SRB', 690: 'SYC', 694: 'SLE',
    702: 'SGP', 534: 'SXM', 703: 'SVK', 705: 'SVN', 90: 'SLB', 706: 'SOM', 710: 'ZAF', 239: 'SGS',
    728: 'SSD', 724: 'ESP', 144: 'LKA', 729: 'SDN', 740: 'SUR', 744: 'SJM', 752: 'SWE', 756: 'CHE',
    760: 'SYR', 158: 'TWN', 762: 'TJK', 834: 'TZA', 764: 'THA', 626: 'TLS', 768: 'TGO', 772: 'TKL',
    776: 'TON', 780: 'TTO', 788: 'TUN', 792: 'TUR', 795: 'TKM', 796: 'TCA', 798: 'TUV', 800: 'UGA',
    804: 'UKR', 784: 'ARE', 826: 'GBR', 840: 'USA', 581: 'UMI', 858: 'URY', 860: 'UZB', 548: 'VUT',
    862: 'VEN', 704: 'VNM', 92: 'VGB', 850: 'VIR', 876: 'WLF', 732: 'ESH', 887: 'YEM', 894: 'ZMB',
    716: 'ZWE'
}

Provide an admin area shapefile and associate 10/40000 tile to generate validation stats

In [None]:
gdf = gpd.read_file("../pipelines/validation_statistics/br_rn.json") # State of Rio Grande do Norte, Brazil
aoi = gdf.iloc[0]
aoi_tile = "00N_040W" # This AOI fits within a tile, but we should build VRTs so we can use any (resonably sized) AOI
version = "v20251004"
gadm41_iso = 76
gadm41_region = 20
gadm41_subregion = 150

In [None]:
prefect config set PREFECT_API_URL=http://127.0.0.1:4200/api
pytest api/test/unit/domain/compute_engines/test_tree_cover_loss_compute_engine.py::test_get_tree_cover_loss_precalc_handler_happy_path

In [18]:
class ContextualLayer(BaseModel):
    name: Literal["sbtn_natural_lands", "grasslands", "drivers", "land_cover"]
    source_uri: str
    column_name: str
    classes: Dict[int, str]

SBTN_NATURAL_LANDS = ContextualLayer(
    name="sbtn_natural_lands",
    source_uri="s3://gfw-data-lake/sbtn_natural_lands_classification/v1.1/raster/epsg-4326/10/40000/class/geotiff/00N_040W.tif",
    column_name="natural_land_class",
    classes= {
      2: "Natural forests",
      3: "Natural short vegetation",
      4: "Natural water",
      5: "Mangroves",
      6: "Bare",
      7: "Snow",
      8: "Wetland natural forests",
      9: "Natural peat forests",
      10: "Wetland natural short vegetation",
      11: "Natural peat short vegetation",
      12: "Cropland",
      13: "Built-up",
      14: "Non-natural tree cover",
      15: "Non-natural short vegetation",
      16: "Non-natural water",
      17: "Wetland non-natural tree cover",
      18: "Non-natural peat tree cover",
      19: "Wetland non-natural short vegetation",
      20: "Non-natural peat short vegetation",
      21: "Non-natural bare",
  }
)

In [19]:
contextual_layer = SBTN_NATURAL_LANDS

In [20]:
# read dist alerts for AOI
bounds = aoi.geometry.bounds
with rio.Env(AWS_REQUEST_PAYER="requester"):
    with rio.open(
        f"s3://gfw-data-lake/umd_glad_dist_alerts/{version}/raster/epsg-4326/10/40000/default/gdal-geotiff/{aoi_tile}.tif"
    ) as src:
        window = from_bounds(
            bounds[0], bounds[1], bounds[2], bounds[3], src.transform
        )
        dist_alerts = src.read(1, window=window)
        win_affine = src.window_transform(window)

# read area for AOI
with rio.Env(AWS_REQUEST_PAYER="requester"):
    with rio.open(
        f"s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/gdal-geotiff/{aoi_tile}.tif"
    ) as src:
        pixel_area__m = src.read(1, window=window)
        pixel_area_ha = pixel_area__m / 10000

In [21]:
# set no_data from -1 to 0
dist_alerts = np.where(dist_alerts == -1, 0, dist_alerts)

# Extract confidence level (first digit)
dist_confidence_levels = dist_alerts // 10000
dist_high_conf = np.where(dist_confidence_levels == 3, 1, 0)
dist_low_conf = np.where(dist_confidence_levels == 2, 1, 0)

# Extract Julian date (remaining digits)
dist_julian_date = dist_alerts % 10000 

# create geometry_mask to mask dist alerts by aoi geometry
aoi_mask = geometry_mask(
    [aoi.geometry], invert=True, transform=win_affine, out_shape=dist_alerts.shape
)

# dist_alert_confidence level maskings
# anything outside the AOI becomes zero
dist_high_conf_aoi = aoi_mask * dist_high_conf * pixel_area_ha
dist_low_conf_aoi = aoi_mask * dist_low_conf * pixel_area_ha
dist_julian_date_aoi = aoi_mask * dist_julian_date

# create a dataframe of analysis results
high_conf_flat = dist_high_conf_aoi.flatten()
low_conf_flat = dist_low_conf_aoi.flatten()
julian_date_flat = dist_julian_date_aoi.flatten()

In [22]:
# read and process contextual layer
if contextual_layer:
    with rio.Env(AWS_REQUEST_PAYER="requester"):
        with rio.open(contextual_layer.source_uri) as src:
            contextual_data = src.read(1, window=window)
    contextual_data_aoi = aoi_mask * contextual_data
    contextual_flat = contextual_data_aoi.flatten()

    df = pd.DataFrame({
        "dist_alert_date": julian_date_flat,
        contextual_layer.name: contextual_flat,
        "high_conf": high_conf_flat,
        "low_conf": low_conf_flat,
    })
    high_conf_results = df.groupby([contextual_layer.name, "dist_alert_date"])["high_conf"].sum().reset_index()
    low_conf_results = df.groupby([contextual_layer.name, "dist_alert_date"])["low_conf"].sum().reset_index()

    # map contextual layer names
    high_conf_results[contextual_layer.column_name] = high_conf_results[contextual_layer.name].map(contextual_layer.classes)
    low_conf_results[contextual_layer.column_name] = low_conf_results[contextual_layer.name].map(contextual_layer.classes)
else:
    df = pd.DataFrame(
        {
            "dist_alert_date": julian_date_flat,
            "high_conf": high_conf_flat,
            "low_conf": low_conf_flat,
        }
    )
    high_conf_results = df.groupby("dist_alert_date")["high_conf"].sum().reset_index()
    low_conf_results = df.groupby("dist_alert_date")["low_conf"].sum().reset_index()

In [23]:
# set dist_alert_confidence levels and GADM IDs
high_conf_results["dist_alert_confidence"] = "high"
high_conf_results["country"] = 76
high_conf_results["region"] = 20
high_conf_results["subregion"] = (
    150  # placeholder for subregion (adm2) since we are running on an adm1 AOI
)
low_conf_results["dist_alert_confidence"] = "low"
low_conf_results["country"] = 76
low_conf_results["region"] = 20
low_conf_results["subregion"] = (
    150  # placeholder for subregion (adm2) since we are running on an adm1 AOI
)

# rename high_conf to value
high_conf_results.rename(columns={"high_conf": "area_ha"}, inplace=True)
low_conf_results.rename(columns={"low_conf": "area_ha"}, inplace=True)


# reorder columns to country, region, subregion, contextual layer, dist_alert_date, confidence, value
if contextual_layer:
    column_order = ["country", "region", "subregion", "dist_alert_date", "dist_alert_confidence", "area_ha", contextual_layer.column_name]        
else:
    column_order = ["country", "region", "subregion", "dist_alert_date", "dist_alert_confidence", "area_ha"]
high_conf_results = high_conf_results[column_order]
low_conf_results = low_conf_results[column_order]

In [24]:
# concatenate dist_alert_confidence dfs into one validation df
results = pd.concat([high_conf_results, low_conf_results], ignore_index=True)

# drop rows where dist_alert_date is zero
results = results[results["dist_alert_date"] != 0]

results.rename(columns={"confidence": "dist_alert_confidence"}, inplace=True)
results["dist_alert_date"] = results.sort_values(
    by="dist_alert_date"
).dist_alert_date.apply(lambda x: date(2020, 12, 31) + relativedelta(days=x))
results["country"] = results["country"].apply(
    lambda x: numeric_to_alpha3.get(x, None)
)

result_df = pd.DataFrame(results)

In [25]:
result_df

Unnamed: 0,country,region,subregion,dist_alert_date,dist_alert_confidence,area_ha,natural_land_class
1,BRA,20,150,2023-10-27,high,0.153277,
2,BRA,20,150,2023-11-04,high,0.153277,
3,BRA,20,150,2023-12-30,high,0.153269,
4,BRA,20,150,2024-01-30,high,0.306555,
5,BRA,20,150,2024-04-28,high,0.459824,
...,...,...,...,...,...,...,...
10201,BRA,20,150,2025-09-13,low,0.229855,Non-natural bare
10202,BRA,20,150,2025-09-14,low,0.459460,Non-natural bare
10203,BRA,20,150,2025-09-16,low,0.229595,Non-natural bare
10204,BRA,20,150,2025-09-21,low,0.305974,Non-natural bare


In [26]:
result_df

Unnamed: 0,country,region,subregion,dist_alert_date,dist_alert_confidence,area_ha,natural_land_class
1,BRA,20,150,2023-10-27,high,0.153277,
2,BRA,20,150,2023-11-04,high,0.153277,
3,BRA,20,150,2023-12-30,high,0.153269,
4,BRA,20,150,2024-01-30,high,0.306555,
5,BRA,20,150,2024-04-28,high,0.459824,
...,...,...,...,...,...,...,...
10201,BRA,20,150,2025-09-13,low,0.229855,Non-natural bare
10202,BRA,20,150,2025-09-14,low,0.459460,Non-natural bare
10203,BRA,20,150,2025-09-16,low,0.229595,Non-natural bare
10204,BRA,20,150,2025-09-21,low,0.305974,Non-natural bare


In [27]:
zeno_df = pd.read_parquet("s3://gfw-data-lake/umd_glad_dist_alerts/v20251004/tabular/zonal_stats/gadm/gadm_adm2_dist_alerts_by_natural_lands.parquet")
zeno_aoi_df = zeno_df[(zeno_df["country"] == "BRA") & (zeno_df["region"] == 20)]

In [28]:
zeno_aoi_df

Unnamed: 0,country,region,subregion,natural_land_class,dist_alert_date,dist_alert_confidence,area_ha
6611432,BRA,20,1,Natural forests,2024-08-02,high,0.305861
6611433,BRA,20,1,Natural forests,2024-09-06,high,0.076475
6611434,BRA,20,1,Natural forests,2024-09-16,high,0.152923
6611435,BRA,20,1,Natural forests,2024-10-01,high,0.076483
6611436,BRA,20,1,Natural forests,2024-11-14,high,0.229410
...,...,...,...,...,...,...,...
6745632,BRA,20,167,Built-up,2025-04-01,high,0.076477
6745633,BRA,20,167,Built-up,2025-04-11,high,0.076477
6745634,BRA,20,167,Built-up,2025-04-13,low,0.076477
6745635,BRA,20,167,Built-up,2025-07-15,low,0.076477


In [29]:
zeno_aoi_df.groupby("natural_land_class")["area_ha"].sum()

natural_land_class
Bare                                   2915.571232
Built-up                               4339.134440
Cropland                             329905.747271
Mangroves                               323.093870
Natural forests                       81653.862067
Natural peat forests                     81.326020
Natural peat short vegetation          1335.820915
Natural short vegetation             787650.041175
Natural water                          6740.054994
Non-natural bare                       1425.149004
Non-natural peat short vegetation       351.882865
Non-natural short vegetation         576171.943287
Non-natural tree cover                 5554.398500
Non-natural water                       217.864062
Unclassified                             16.932374
Wetland natural short vegetation        252.376730
Name: area_ha, dtype: float64

In [30]:
result_df.groupby("natural_land_class")["area_ha"].sum()

natural_land_class
Bare                                   2900.094286
Built-up                               4332.086178
Cropland                             329889.143021
Mangroves                               323.017233
Natural forests                       81641.218992
Natural peat forests                     81.249466
Natural peat short vegetation          1335.820915
Natural short vegetation             787505.756328
Natural water                          6738.447311
Non-natural bare                       1424.689415
Non-natural peat short vegetation       351.882865
Non-natural short vegetation         576112.900480
Non-natural tree cover                 5552.867406
Non-natural water                       217.864062
Wetland natural short vegetation        252.376730
Name: area_ha, dtype: float64