### Rasterio-based validation of Zeno zonal statistics

In [28]:
import numpy as np
import pandas as pd
import rasterio as rio
import geopandas as gpd
import pandera.pandas as pa
from pandera.typing.pandas import Series
from rasterio.windows import from_bounds
from rasterio.features import geometry_mask

Provide an admin area shapefile and the associated 10/40000 tile to generate validation stats

In [2]:
# Load the AOI boundary and define the identifiers used to label the validation output.
gdf = gpd.read_file("br_rn.json") # State of Rio Grande do Norte, Brazil
# The first feature in the file is used as the AOI.
aoi = gdf.iloc[0]
aoi_tile = "00N_040W" # This AOI fits within a tile, but we should build VRTs so we can use any (reasonably sized) AOI
# Codes copied into the country / region / subregion columns of the results table
# (names suggest GADM 4.1 identifiers — TODO confirm).
gadm41_iso = 76
gadm41_region = 20
gadm41_subregion = 150

In [3]:
# Bounding box of the AOI geometry as (minx, miny, maxx, maxy); used to window the raster reads.
bounds = aoi.geometry.bounds

In [4]:
# Load the disturbance-alert pixels that fall inside the AOI bounding box
west, south, east, north = bounds
with rio.open(f"s3://gfw-data-lake/umd_glad_dist_alerts/v20250628/raster/epsg-4326/10/40000/currentweek/gdal-geotiff/{aoi_tile}.tif") as src:
    window = from_bounds(west, south, east, north, src.transform)
    # affine transform of the windowed subset, needed later to rasterize the AOI
    win_affine = src.window_transform(window)
    dist_alerts = src.read(1, window=window)

# Recode the -1 no-data value as 0 so it decodes to "no confidence, no date"
dist_alerts = np.where(dist_alerts == -1, 0, dist_alerts)

# Each pixel packs two fields: the leading digit is the confidence level and
# the remaining four digits are the alert date (Julian date)
dist_confidence_levels, dist_julian_date = np.divmod(dist_alerts, 10000)

# 0/1 indicator rasters per confidence class (3 = high, 2 = low)
dist_high_conf = (dist_confidence_levels == 3).astype(int)
dist_low_conf = (dist_confidence_levels == 2).astype(int)

In [5]:
# Load the natural-forest class raster over the same pixel window as the alerts
with rio.open(f"s3://gfw-data-lake/sbtn_natural_forests_map/v202504/raster/epsg-4326/10/40000/class/gdal-geotiff/{aoi_tile}.tif") as src:
    win_affine = src.window_transform(window)
    sbtn_data = src.read(1, window=window)

# 0/1 indicator rasters: class 1 is treated as natural forest and
# class 2 as non-natural forest
natural_forest = (sbtn_data == 1).astype(int)
nonnatural_forest = (sbtn_data == 2).astype(int)

In [6]:
# read area data
# NOTE(review): this read is commented out, so no per-pixel area raster is loaded and
# `area` is not defined by this cell; the statistics below are sums of 0/1 pixel flags.
#with rio.open(f"s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/gdal-geotiff/{aoi_tile}.tif") as src:
#    area = src.read(1, window=window)

In [7]:
# Rasterize the AOI polygon onto the alert window grid.
# invert=True makes pixels inside the AOI True and everything else False.
aoi_mask = geometry_mask(
    [aoi.geometry],
    out_shape=dist_alerts.shape,
    transform=win_affine,
    invert=True,
)

In [8]:
# Keep alert values only where a pixel is inside the AOI and is natural forest;
# every other pixel becomes 0.
natural_forest_in_aoi = aoi_mask * natural_forest
dist_high_conf_aoi = natural_forest_in_aoi * dist_high_conf
dist_low_conf_aoi = natural_forest_in_aoi * dist_low_conf
dist_julian_date_aoi = natural_forest_in_aoi * dist_julian_date

In [35]:
# Tabulate alert pixel counts per alert date and confidence level

# One row per pixel of the masked rasters
julian_date_flat = dist_julian_date_aoi.flatten()
high_conf_flat = dist_high_conf_aoi.flatten()
low_conf_flat = dist_low_conf_aoi.flatten()

df = pd.DataFrame({
    "alert_date": julian_date_flat,
    "high_conf": high_conf_flat,
    "low_conf": low_conf_flat,
})


def _count_alerts_by_date(flag_column, confidence_code):
    """Sum one 0/1 flag column of `df` per alert date and label the rows.

    Parameters
    ----------
    flag_column : str
        Name of the indicator column in `df` to sum ("high_conf" or "low_conf").
    confidence_code : int
        Value written to the "confidence" column of every returned row.

    Returns
    -------
    pandas.DataFrame
        Columns country, region, subregion, alert_date, confidence, value,
        with one row per distinct alert date.
    """
    counts = df.groupby("alert_date")[flag_column].sum().reset_index()
    counts = counts.rename(columns={flag_column: "value"})
    counts["confidence"] = confidence_code
    counts["country"] = gadm41_iso
    counts["region"] = gadm41_region
    counts["subregion"] = gadm41_subregion
    return counts[["country", "region", "subregion", "alert_date", "confidence", "value"]]


high_conf_results = _count_alerts_by_date("high_conf", 3)
low_conf_results = _count_alerts_by_date("low_conf", 2)

# Stack high-confidence rows above low-confidence rows under a fresh 0..n-1 index
results = pd.concat([high_conf_results, low_conf_results], ignore_index=True)

# Discard the alert_date == 0 rows (masked / no-alert pixels) and any
# date/confidence combination with no alert pixels
results = results[(results["alert_date"] != 0) & (results["value"] != 0)]

In [36]:
# Display the validation table (rows with alert_date == 0 or value == 0 were already removed)
results

Unnamed: 0,country,region,subregion,alert_date,confidence,value
1,76,20,150,1015,3,6
2,76,20,150,1087,3,1
3,76,20,150,1127,3,1
6,76,20,150,1174,3,1
7,76,20,150,1195,3,2
...,...,...,...,...,...,...
545,76,20,150,1627,2,459
546,76,20,150,1628,2,542
547,76,20,150,1629,2,15175
548,76,20,150,1633,2,96


In [17]:
# write validation results to CSV; index=False keeps the DataFrame index out of the file
results.to_csv("validation_stats.csv", index=False)

#### Load pre-calculated Zeno stats for this AOI and compare results

In [11]:
# Load the pre-calculated Zeno zonal statistics table for the same dist-alerts version
# (file name suggests one set of rows per adm2 unit — TODO confirm).
zeno_parquet = pd.read_parquet("s3://gfw-data-lake/umd_glad_dist_alerts/v20250628/tabular/epsg-4326/zonal_stats/dist_alerts_by_adm2.parquet")

In [12]:
# Keep only the Zeno rows for the AOI's country and region. The codes are taken from
# the AOI configuration cell (gadm41_iso / gadm41_region) rather than repeated as
# literals, so this filter cannot drift from the labels written to the validation table.
zeno_stats = zeno_parquet[
    (zeno_parquet["country"] == gadm41_iso) &
    (zeno_parquet["region"] == gadm41_region)
]
zeno_stats

Unnamed: 0,country,region,subregion,alert_date,confidence,value
622725,76,20,0,762,3,1
622726,76,20,0,807,2,2
622727,76,20,0,852,3,1
622728,76,20,0,887,2,2
622729,76,20,0,902,2,2
...,...,...,...,...,...,...
623824,76,20,156,1585,3,6
623825,76,20,156,1587,2,5
623826,76,20,156,1587,3,11
623827,76,20,156,1589,2,3
