### Rasterio-based validation of Zeno zonal statistics

In [3]:
import numpy as np
import pandas as pd
import rasterio as rio
import geopandas as gpd
import pandera.pandas as pa
from pandera.typing.pandas import Series
from rasterio.windows import from_bounds
from rasterio.features import geometry_mask

#from ..pipelines.disturbance.check_for_new_alerts import get_latest_version

Provide an admin area shapefile and its associated 10/40000 tile to generate validation stats

In [2]:
# GADM IDs of the AOI, reused when labelling the validation rows
gadm41_iso = 76
gadm41_region = 20
gadm41_subregion = 150

# Raster tile covering the AOI. This AOI fits within a single tile, but we should
# build VRTs so we can use any (reasonably sized) AOI.
aoi_tile = "00N_040W"
# version = get_latest_version("umd_glad_dist_alerts")  # disabled while the get_latest_version import is commented out

# State of Rio Grande do Norte, Brazil; the first feature in the file is used as the AOI
gdf = gpd.read_file("../test/validation_statistics/br_rn.json")
aoi = gdf.iloc[0]

In [3]:
# Bounding box of the AOI geometry; its four entries (bounds[0]..bounds[3]) are
# passed to from_bounds() below to build the raster read window.
bounds = aoi.geometry.bounds

In [None]:
# FIXME: unfinished cell. The original statement had an empty path and no colon or
# body, so it raised a SyntaxError and blocked Restart & Run All. It is kept here,
# commented out, until the intended raster path and read logic are filled in.
# with rio.open("") as src:
#     ...

In [5]:
# Read the DIST alert pixels that fall inside the AOI bounding box.
with rio.open(f"s3://gfw-data-lake/umd_glad_dist_alerts/v20250712/raster/epsg-4326/10/40000/currentweek/gdal-geotiff/{aoi_tile}.tif") as src:
    window = from_bounds(*bounds, transform=src.transform)
    dist_alerts = src.read(1, window=window)
    # affine transform of the windowed read, needed later to rasterize the AOI mask
    win_affine = src.window_transform(window)

# Treat -1 (no data) as 0 so it decodes to "no alert"
dist_alerts = np.where(dist_alerts == -1, 0, dist_alerts)

# Split each encoded value into its confidence level (leading digit)
# and its Julian date (remaining four digits)
dist_confidence_levels, dist_julian_date = np.divmod(dist_alerts, 10000)

# 0/1 flags per pixel: confidence 3 = high, confidence 2 = low
dist_high_conf = (dist_confidence_levels == 3).astype(int)
dist_low_conf = (dist_confidence_levels == 2).astype(int)

In [5]:
# Read the natural lands classes over the same window used for the dist alerts.
with rio.open(f"s3://gfw-data-lake/sbtn_natural_forests_map/v202504/raster/epsg-4326/10/40000/class/gdal-geotiff/{aoi_tile}.tif") as src:
    sbtn_data = src.read(1, window=window)
    win_affine = src.window_transform(window)

# 0/1 flags per pixel: class 1 = natural forest, class 2 = non-natural forest
natural_forest = (sbtn_data == 1).astype(int)
nonnatural_forest = (sbtn_data == 2).astype(int)

In [6]:
# read area data
#with rio.open(f"s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/gdal-geotiff/{aoi_tile}.tif") as src:
#    area = src.read(1, window=window)

In [8]:
# Rasterize the AOI polygon onto the windowed grid.
# invert=True makes the mask True inside the geometry.
aoi_mask = geometry_mask(
    [aoi.geometry],
    out_shape=dist_alerts.shape,
    transform=win_affine,
    invert=True,
)

In [9]:
# Zero out every pixel outside the AOI in each decoded layer.
# (Multiplying by natural_forest as well is an optional contextual filter, currently disabled.)
dist_high_conf_aoi = np.multiply(aoi_mask, dist_high_conf)
dist_low_conf_aoi = np.multiply(aoi_mask, dist_low_conf)
dist_julian_date_aoi = np.multiply(aoi_mask, dist_julian_date)

In [10]:
# Flatten each masked 2-D layer to 1-D so every pixel becomes one DataFrame row.
high_conf_flat, low_conf_flat, julian_date_flat = (
    layer.flatten()
    for layer in (dist_high_conf_aoi, dist_low_conf_aoi, dist_julian_date_aoi)
)

# One row per pixel: the alert date plus its high/low confidence flags
df = pd.DataFrame(
    {
        "alert_date": julian_date_flat,
        "high_conf": high_conf_flat,
        "low_conf": low_conf_flat,
    }
)

In [28]:
# Sum the per-pixel 0/1 confidence flags for each alert_date value.
# Both results get reset_index() so they are DataFrames with an "alert_date" column.
# Without it low_conf_results is a Series, and the column-style assignments that
# follow would append stray index labels instead of adding columns.
high_conf_results = df.groupby("alert_date")["high_conf"].sum().reset_index()
low_conf_results = df.groupby("alert_date")["low_conf"].sum().reset_index()

In [29]:
# set confidence levels and GADM IDs
# Confidence codes match the leading digit decoded from the raster (3 = high, 2 = low).
# The GADM IDs come from the gadm41_* constants defined with the AOI, so the values
# are stated in one place instead of being repeated as literals.
high_conf_results["confidence"] = 3
high_conf_results["country"] = gadm41_iso
high_conf_results["region"] = gadm41_region
high_conf_results["subregion"] = gadm41_subregion
low_conf_results["confidence"] = 2
low_conf_results["country"] = gadm41_iso
low_conf_results["region"] = gadm41_region
low_conf_results["subregion"] = gadm41_subregion

high_conf_results.head(10)

Unnamed: 0,alert_date,high_conf,confidence,country,region,subregion
0,0,0,3,76,20,150
1,887,1,3,76,20,150
2,895,1,3,76,20,150
3,902,0,3,76,20,150
4,967,1,3,76,20,150
5,975,0,3,76,20,150
6,998,1,3,76,20,150
7,1007,9,3,76,20,150
8,1015,7,3,76,20,150
9,1017,1,3,76,20,150


In [35]:


# Columns shared by both confidence tables, and the final column layout
gadm_ids = {"country": gadm41_iso, "region": gadm41_region, "subregion": gadm41_subregion}
column_order = ["country", "region", "subregion", "alert_date", "confidence", "value"]

alerts_by_date = df.groupby("alert_date")

# Per-date pixel counts for high-confidence alerts (confidence code 3)
high_conf_results = (
    alerts_by_date["high_conf"].sum().reset_index()
    .rename(columns={"high_conf": "value"})
    .assign(confidence=3, **gadm_ids)
    .loc[:, column_order]
)

# Per-date pixel counts for low-confidence alerts (confidence code 2)
low_conf_results = (
    alerts_by_date["low_conf"].sum().reset_index()
    .rename(columns={"low_conf": "value"})
    .assign(confidence=2, **gadm_ids)
    .loc[:, column_order]
)

# Stack both tables, then keep only rows with a real alert date and a non-zero count
results = pd.concat([high_conf_results, low_conf_results], ignore_index=True)
has_alert_date = results["alert_date"] != 0
has_alerts = results["value"] != 0
results = results[has_alert_date & has_alerts]

In [36]:
# Display the combined high/low confidence table; rows with alert_date == 0 or
# value == 0 were already dropped when `results` was built.
results

Unnamed: 0,country,region,subregion,alert_date,confidence,value
1,76,20,150,1015,3,6
2,76,20,150,1087,3,1
3,76,20,150,1127,3,1
6,76,20,150,1174,3,1
7,76,20,150,1195,3,2
...,...,...,...,...,...,...
545,76,20,150,1627,2,459
546,76,20,150,1628,2,542
547,76,20,150,1629,2,15175
548,76,20,150,1633,2,96


In [17]:
# write validation results to CSV
# The path is relative to the notebook's working directory; index=False leaves the
# DataFrame index out of the file.
results.to_csv("validation_stats.csv", index=False)

In [1]:
import pandas as pd
import geopandas as gpd
import pandera.pandas as pa
from pandera.typing.pandas import Series
from typing import List
#from prefect.logging import get_run_logger
import rasterio as rio
from rasterio.windows import from_bounds
from rasterio.features import geometry_mask
import numpy as np
from pipelines.disturbance.check_for_new_alerts import get_latest_version

ModuleNotFoundError: No module named 'pipelines'

In [9]:
# Generate zonal statistics for the admin area AOI.
#
# Self-contained version of the validation: it loads the AOI, reads the DIST alert
# tile over the AOI bounding box, decodes confidence and date per pixel, masks to
# the AOI polygon, and counts alert pixels per (alert_date, confidence).
gdf = gpd.read_file("../test/validation_statistics/br_rn.json")  # State of Rio Grande do Norte, Brazil
aoi = gdf.iloc[0]
aoi_tile = "00N_040W"  # This AOI fits within a tile, but we should build VRTs so we can use any (reasonably sized) AOI
version = "v20250712"

# GADM IDs of the AOI, named once instead of being repeated as literals below
gadm41_iso = 76
gadm41_region = 20
gadm41_subregion = 150

# read dist alerts for AOI
bounds = aoi.geometry.bounds
with rio.open(f"s3://gfw-data-lake/umd_glad_dist_alerts/{version}/raster/epsg-4326/10/40000/default/gdal-geotiff/{aoi_tile}.tif") as src:
    window = from_bounds(*bounds, transform=src.transform)
    dist_alerts = src.read(1, window=window)
    # affine transform of the windowed read, needed to rasterize the AOI mask
    win_affine = src.window_transform(window)

# Extract confidence level (first digit)
dist_confidence_levels = dist_alerts // 10000
dist_high_conf = np.where(dist_confidence_levels == 3, 1, 0)
dist_low_conf = np.where(dist_confidence_levels == 2, 1, 0)

# Extract Julian date (remaining digits)
dist_julian_date = dist_alerts % 10000

# mask dist alerts by aoi geometry (invert=True -> True inside the polygon)
aoi_mask = geometry_mask([aoi.geometry], invert=True, transform=win_affine, out_shape=dist_alerts.shape)

# confidence level maskings: pixels outside the AOI become 0
dist_high_conf_aoi = aoi_mask * dist_high_conf
dist_low_conf_aoi = aoi_mask * dist_low_conf
dist_julian_date_aoi = aoi_mask * dist_julian_date

# create a dataframe of analysis results, one row per pixel
high_conf_flat = dist_high_conf_aoi.flatten()
low_conf_flat = dist_low_conf_aoi.flatten()
julian_date_flat = dist_julian_date_aoi.flatten()
df = pd.DataFrame({
    "alert_date": julian_date_flat,
    "high_conf": high_conf_flat,
    "low_conf": low_conf_flat
})

# Final column layout: country, region, subregion, alert_date, confidence, value
result_columns = ["country", "region", "subregion", "alert_date", "confidence", "value"]

# Count alert pixels per date, label them with confidence level and GADM IDs,
# and rename the count column to "value". Chained calls return new frames, so
# nothing is mutated in place.
high_conf_results = (
    df.groupby("alert_date")["high_conf"].sum().reset_index()
    .rename(columns={"high_conf": "value"})
    .assign(confidence=3, country=gadm41_iso, region=gadm41_region, subregion=gadm41_subregion)
    .loc[:, result_columns]
)
low_conf_results = (
    df.groupby("alert_date")["low_conf"].sum().reset_index()
    .rename(columns={"low_conf": "value"})
    .assign(confidence=2, country=gadm41_iso, region=gadm41_region, subregion=gadm41_subregion)
    .loc[:, result_columns]
)

# concatenate confidence dfs into one validation df
results = pd.concat([high_conf_results, low_conf_results], ignore_index=True)

# drop rows where alert_date is zero (masked-out / no-alert pixels)
results = results[results["alert_date"] != 0]

# drop rows where alerts are zero
results = results[results["value"] != 0]

In [11]:
# Display the validation table computed above from the "default" DIST alert raster.
results

Unnamed: 0,country,region,subregion,alert_date,confidence,value
1,76,20,150,762,3,2
3,76,20,150,787,3,6
4,76,20,150,807,3,20
5,76,20,150,823,3,12
6,76,20,150,827,3,1
...,...,...,...,...,...,...
994,76,20,150,1645,2,372789
995,76,20,150,1646,2,24571
996,76,20,150,1647,2,9598
997,76,20,150,1648,2,388


In [8]:
# Load the zonal statistics table and keep only the rows for the AOI's
# country (76) and region (20); every subregion inside it is retained.
parquet_uri = "s3://gfw-data-lake/umd_glad_dist_alerts/v20250712/tabular/parquet/gadm_dist_alerts.parquet"
zeno_df = pd.read_parquet(parquet_uri)

in_country = zeno_df["country"].eq(76)
in_region = zeno_df["region"].eq(20)
zeno_aoi_df = zeno_df[in_country & in_region]
zeno_aoi_df

Unnamed: 0,country,region,subregion,alert_date,confidence,value
622485,76,20,0,762,3,1
622486,76,20,0,807,2,2
622487,76,20,0,852,3,1
622488,76,20,0,887,2,2
622489,76,20,0,902,2,2
...,...,...,...,...,...,...
623576,76,20,156,1585,3,8
623577,76,20,156,1587,2,5
623578,76,20,156,1587,3,14
623579,76,20,156,1589,2,3
