### Rasterio-based validation of Zeno zonal statistics

In [50]:
import os
import numpy as np
import pandas as pd
import rasterio as rio
import geopandas as gpd
import pandera.pandas as pa
from pandera.typing.pandas import Series
from rasterio.windows import from_bounds
from rasterio.features import geometry_mask
from dateutil.relativedelta import relativedelta
from datetime import date
from datetime import date
from pydantic import BaseModel
from typing import Dict, Optional, Literal, List

os.environ['AWS_PROFILE'] = 'gfw-production'

In [3]:
isos = [
    "AFG",
    "ALA",
    "ALB",
    "DZA",
    "ASM",
    "AND",
    "AGO",
    "AIA",
    "ATA",
    "ATG",
    "ARG",
    "ARM",
    "ABW",
    "AUS",
    "AUT",
    "AZE",
    "BHS",
    "BHR",
    "BGD",
    "BRB",
    "BLR",
    "BEL",
    "BLZ",
    "BEN",
    "BMU",
    "BTN",
    "BOL",
    "BES",
    "BIH",
    "BWA",
    "BVT",
    "BRA",
    "IOT",
    "BRN",
    "BGR",
    "BFA",
    "BDI",
    "CPV",
    "KHM",
    "CMR",
    "CAN",
    "CYM",
    "CAF",
    "TCD",
    "CHL",
    "CHN",
    "CXR",
    "CCK",
    "COL",
    "COM",
    "COG",
    "COD",
    "COK",
    "CRI",
    "CIV",
    "HRV",
    "CUB",
    "CUW",
    "CYP",
    "CZE",
    "DNK",
    "DJI",
    "DMA",
    "DOM",
    "ECU",
    "EGY",
    "SLV",
    "GNQ",
    "ERI",
    "EST",
    "SWZ",
    "ETH",
    "FLK",
    "FRO",
    "FJI",
    "FIN",
    "FRA",
    "GUF",
    "PYF",
    "ATF",
    "GAB",
    "GMB",
    "GEO",
    "DEU",
    "GHA",
    "GIB",
    "GRC",
    "GRL",
    "GRD",
    "GLP",
    "GUM",
    "GTM",
    "GGY",
    "GIN",
    "GNB",
    "GUY",
    "HTI",
    "HMD",
    "VAT",
    "HND",
    "HKG",
    "HUN",
    "ISL",
    "IND",
    "IDN",
    "IRN",
    "IRQ",
    "IRL",
    "IMN",
    "ISR",
    "ITA",
    "JAM",
    "JPN",
    "JEY",
    "JOR",
    "KAZ",
    "KEN",
    "KIR",
    "PRK",
    "KOR",
    "KWT",
    "KGZ",
    "LAO",
    "LVA",
    "LBN",
    "LSO",
    "LBR",
    "LBY",
    "LIE",
    "LTU",
    "LUX",
    "MAC",
    "MDG",
    "MWI",
    "MYS",
    "MDV",
    "MLI",
    "MLT",
    "MHL",
    "MTQ",
    "MRT",
    "MUS",
    "MYT",
    "MEX",
    "FSM",
    "MDA",
    "MCO",
    "MNG",
    "MNE",
    "MSR",
    "MAR",
    "MOZ",
    "MMR",
    "NAM",
    "NRU",
    "NPL",
    "NLD",
    "NCL",
    "NZL",
    "NIC",
    "NER",
    "NGA",
    "NIU",
    "NFK",
    "MKD",
    "MNP",
    "NOR",
    "OMN",
    "PAK",
    "PLW",
    "PSE",
    "PAN",
    "PNG",
    "PRY",
    "PER",
    "PHL",
    "PCN",
    "POL",
    "PRT",
    "PRI",
    "QAT",
    "REU",
    "ROU",
    "RUS",
    "RWA",
    "BLM",
    "SHN",
    "KNA",
    "LCA",
    "MAF",
    "SPM",
    "VCT",
    "WSM",
    "SMR",
    "STP",
    "SAU",
    "SEN",
    "SRB",
    "SYC",
    "SLE",
    "SGP",
    "SXM",
    "SVK",
    "SVN",
    "SLB",
    "SOM",
    "ZAF",
    "SGS",
    "SSD",
    "ESP",
    "LKA",
    "SDN",
    "SUR",
    "SJM",
    "SWE",
    "CHE",
    "SYR",
    "TWN",
    "TJK",
    "TZA",
    "THA",
    "TLS",
    "TGO",
    "TKL",
    "TON",
    "TTO",
    "TUN",
    "TUR",
    "TKM",
    "TCA",
    "TUV",
    "UGA",
    "UKR",
    "ARE",
    "GBR",
    "USA",
    "UMI",
    "URY",
    "UZB",
    "VUT",
    "VEN",
    "VNM",
    "VGB",
    "VIR",
    "WLF",
    "ESH",
    "YEM",
    "ZMB",
    "ZWE",
]

numeric_to_alpha3 = {
    4: "AFG",
    248: "ALA",
    8: "ALB",
    12: "DZA",
    16: "ASM",
    20: "AND",
    24: "AGO",
    660: "AIA",
    10: "ATA",
    28: "ATG",
    32: "ARG",
    51: "ARM",
    533: "ABW",
    36: "AUS",
    40: "AUT",
    31: "AZE",
    44: "BHS",
    48: "BHR",
    50: "BGD",
    52: "BRB",
    112: "BLR",
    56: "BEL",
    84: "BLZ",
    204: "BEN",
    60: "BMU",
    64: "BTN",
    68: "BOL",
    535: "BES",
    70: "BIH",
    72: "BWA",
    74: "BVT",
    76: "BRA",
    86: "IOT",
    96: "BRN",
    100: "BGR",
    854: "BFA",
    108: "BDI",
    132: "CPV",
    116: "KHM",
    120: "CMR",
    124: "CAN",
    136: "CYM",
    140: "CAF",
    148: "TCD",
    152: "CHL",
    156: "CHN",
    162: "CXR",
    166: "CCK",
    170: "COL",
    174: "COM",
    178: "COG",
    180: "COD",
    184: "COK",
    188: "CRI",
    384: "CIV",
    191: "HRV",
    192: "CUB",
    531: "CUW",
    196: "CYP",
    203: "CZE",
    208: "DNK",
    262: "DJI",
    212: "DMA",
    214: "DOM",
    218: "ECU",
    818: "EGY",
    222: "SLV",
    226: "GNQ",
    232: "ERI",
    233: "EST",
    748: "SWZ",
    231: "ETH",
    238: "FLK",
    234: "FRO",
    242: "FJI",
    246: "FIN",
    250: "FRA",
    254: "GUF",
    258: "PYF",
    260: "ATF",
    266: "GAB",
    270: "GMB",
    268: "GEO",
    276: "DEU",
    288: "GHA",
    292: "GIB",
    300: "GRC",
    304: "GRL",
    308: "GRD",
    312: "GLP",
    316: "GUM",
    320: "GTM",
    831: "GGY",
    324: "GIN",
    624: "GNB",
    328: "GUY",
    332: "HTI",
    334: "HMD",
    336: "VAT",
    340: "HND",
    344: "HKG",
    348: "HUN",
    352: "ISL",
    356: "IND",
    360: "IDN",
    364: "IRN",
    368: "IRQ",
    372: "IRL",
    833: "IMN",
    376: "ISR",
    380: "ITA",
    388: "JAM",
    392: "JPN",
    832: "JEY",
    400: "JOR",
    398: "KAZ",
    404: "KEN",
    296: "KIR",
    408: "PRK",
    410: "KOR",
    414: "KWT",
    417: "KGZ",
    418: "LAO",
    428: "LVA",
    422: "LBN",
    426: "LSO",
    430: "LBR",
    434: "LBY",
    438: "LIE",
    440: "LTU",
    442: "LUX",
    446: "MAC",
    450: "MDG",
    454: "MWI",
    458: "MYS",
    462: "MDV",
    466: "MLI",
    470: "MLT",
    584: "MHL",
    474: "MTQ",
    478: "MRT",
    480: "MUS",
    175: "MYT",
    484: "MEX",
    583: "FSM",
    498: "MDA",
    492: "MCO",
    496: "MNG",
    499: "MNE",
    500: "MSR",
    504: "MAR",
    508: "MOZ",
    104: "MMR",
    516: "NAM",
    520: "NRU",
    524: "NPL",
    528: "NLD",
    540: "NCL",
    554: "NZL",
    558: "NIC",
    562: "NER",
    566: "NGA",
    570: "NIU",
    574: "NFK",
    807: "MKD",
    580: "MNP",
    578: "NOR",
    512: "OMN",
    586: "PAK",
    585: "PLW",
    275: "PSE",
    591: "PAN",
    598: "PNG",
    600: "PRY",
    604: "PER",
    608: "PHL",
    612: "PCN",
    616: "POL",
    620: "PRT",
    630: "PRI",
    634: "QAT",
    638: "REU",
    642: "ROU",
    643: "RUS",
    646: "RWA",
    652: "BLM",
    654: "SHN",
    659: "KNA",
    662: "LCA",
    663: "MAF",
    666: "SPM",
    670: "VCT",
    882: "WSM",
    674: "SMR",
    678: "STP",
    682: "SAU",
    686: "SEN",
    688: "SRB",
    690: "SYC",
    694: "SLE",
    702: "SGP",
    534: "SXM",
    703: "SVK",
    705: "SVN",
    90: "SLB",
    706: "SOM",
    710: "ZAF",
    239: "SGS",
    728: "SSD",
    724: "ESP",
    144: "LKA",
    729: "SDN",
    740: "SUR",
    744: "SJM",
    752: "SWE",
    756: "CHE",
    760: "SYR",
    158: "TWN",
    762: "TJK",
    834: "TZA",
    764: "THA",
    626: "TLS",
    768: "TGO",
    772: "TKL",
    776: "TON",
    780: "TTO",
    788: "TUN",
    792: "TUR",
    795: "TKM",
    796: "TCA",
    798: "TUV",
    800: "UGA",
    804: "UKR",
    784: "ARE",
    826: "GBR",
    840: "USA",
    581: "UMI",
    858: "URY",
    860: "UZB",
    548: "VUT",
    862: "VEN",
    704: "VNM",
    92: "VGB",
    850: "VIR",
    876: "WLF",
    732: "ESH",
    887: "YEM",
    894: "ZMB",
    716: "ZWE",
}

In [9]:
gdf = gpd.read_file("../pipelines/validation_statistics/br_rn.json") # State of Rio Grande do Norte, Brazil
aoi = gdf.iloc[0]
aoi_tile = "00N_040W" # This AOI fits within a tile, but we should build VRTs so we can use any (resonably sized) AOI
version = "v20251004"
gadm41_iso = 76
gadm41_region = 20
gadm41_subregion = 150

In [45]:
class ContextualLayer(BaseModel):
    name: Literal["sbtn_natural_lands", "gfw_grasslands", "umd_drivers", "umd_land_cover"]
    source_uri: str
    output_uri: str
    column_name: str
    classes: Dict[int, str]

NATURAL_LANDS = ContextualLayer(
    name="sbtn_natural_lands",
    source_uri="s3://gfw-data-lake/sbtn_natural_lands_classification/v1.1/raster/epsg-4326/10/40000/class/geotiff/00N_040W.tif",
    output_uri="s3://lcl-analytics/zonal-statistics/dist-alerts/v20251004/admin-dist-alerts-by-natural-land-class.parquet",
    column_name="natural_land_class",
    classes= {
      2: "Natural forests",
      3: "Natural short vegetation",
      4: "Natural water",
      5: "Mangroves",
      6: "Bare",
      7: "Snow",
      8: "Wetland natural forests",
      9: "Natural peat forests",
      10: "Wetland natural short vegetation",
      11: "Natural peat short vegetation",
      12: "Cropland",
      13: "Built-up",
      14: "Non-natural tree cover",
      15: "Non-natural short vegetation",
      16: "Non-natural water",
      17: "Wetland non-natural tree cover",
      18: "Non-natural peat tree cover",
      19: "Wetland non-natural short vegetation",
      20: "Non-natural peat short vegetation",
      21: "Non-natural bare",
  }
)

DIST_DRIVERS = ContextualLayer(
    name="umd_drivers",
    source_uri="s3://gfw-data-lake/umd_glad_dist_alerts_driver/umd_dist_alerts_driver.tif",
    output_uri="s3://lcl-analytics/zonal-statistics/dist-alerts/v20251004/admin-dist-alerts-by-driver.parquet",
    column_name="driver",
    classes={
        1: "Wildfire",
        2: "Flooding",
        3: "Crop management",
        4: "Potential conversion",
        5: "Unclassified",
    }
)

GRASSLANDS = ContextualLayer(
    name="gfw_grasslands",
    source_uri="s3://gfw-data-lake/gfw_grasslands/v1/geotiff/grasslands_2022.tif",
    output_uri="s3://lcl-analytics/zonal-statistics/dist-alerts/v20251004/admin-dist-alerts-by-grassland-class.parquet",
    column_name="grasslands",
    classes={
        0: "non-grasslands",
        1: "grasslands"
    }
)

LAND_COVER = ContextualLayer(
    name="umd_land_cover",
    source_uri="s3://lcl-cogs/global-land-cover/global_land_cover_2024.tif",
    output_uri="s3://lcl-analytics/zonal-statistics/dist-alerts/v20251004/admin-dist-alerts-by-land-cover-class.parquet",
    column_name="land_cover",
    classes={
        0: "Short vegetation",
        1: "Bare and sparse vegetation",
        2: "Short vegetation",
        3: "Tree cover",
        4: "Wetland – short vegetation",
        5: "Water",
        6: "Snow-ice",
        7: "Cropland",
        8: "Built-up",
    }
)


unique_cols = [
    "country",
    "region",
    "subregion",
    "dist_alert_date",
    "dist_alert_confidence",
]

In [46]:
class DistZonalStats(pa.DataFrameModel):
    country: Series[str] = pa.Field(eq="BRA")
    region: Series[int] = pa.Field(eq=20)  # gadm id for adm1 AOI
    subregion: Series[int] = pa.Field(lt=170)  # placeholder adm2
    dist_alert_date: Series[date] = pa.Field(
        ge=date.fromisoformat("2023-01-01"), le=date.fromisoformat("2025-10-04")
    )  # julian date between 2023-01-01 to latest version
    dist_alert_confidence: Series[str] = pa.Field(
        isin=["low", "high"]
    )  # low confidence, high confidence
    area_ha: Series[float]
    aoi_type: str = pa.Field(eq="admin")

    class Config:
        coerce = True
        strict = True
        ordered = True
        unique = unique_cols

    @staticmethod
    def calculate_area_sums_by_confidence(df: pd.DataFrame) -> dict:
        """Calculate the total area by confidence level."""
        area_sums = df.groupby("dist_alert_confidence")["area_ha"].sum().to_dict()
        return {
            "low_confidence": area_sums.get("low", 0),
            "high_confidence": area_sums.get("high", 0),
        }

    @staticmethod
    def spot_check_julian_dates(
        df: pd.DataFrame, julian_dates: List[date]
    ) -> pd.DataFrame:
        filtered_by_date_df = df[df["dist_alert_date"].isin(julian_dates)]
        filtered_by_date_df = filtered_by_date_df.sort_values(
            by="dist_alert_date"
        ).reset_index(drop=True)

        # return all columns except admin columns
        exclude_cols = ["country", "region", "subregion"]
        columns = [col for col in filtered_by_date_df.columns if col not in exclude_cols]
        return filtered_by_date_df[columns]

class NaturalLandsZonalStats(DistZonalStats):
    natural_land_class: Series[str]

    class Config:
        coerce = True
        strict = True
        ordered = False
        unique = unique_cols + [NATURAL_LANDS.column_name]

class DriversZonalStats(DistZonalStats):
    driver: Series[str]

    class Config:
        coerce = True
        strict = True
        ordered = False
        unique = unique_cols + [DIST_DRIVERS.column_name]

class GrasslandsZonalStats(DistZonalStats):
    grasslands: Series[str]

    class Config:
        coerce = True
        strict = True
        ordered = False
        unique = unique_cols + [GRASSLANDS.column_name]

class LandCoverZonalStats(DistZonalStats):
    land_cover: Series[str]

    class Config:
        coerce = True
        strict = True
        ordered = False
        unique = unique_cols + [LAND_COVER.column_name]

In [47]:
contextual_layer = DIST_DRIVERS

In [13]:
def _read_raster_window(uri: str, bounds: tuple, requester_pays: bool = True):
    """Read a raster window for the given bounds and return an numpy array"""
    env = rio.Env(AWS_REQUEST_PAYER="requester") if requester_pays else rio.Env()
    with env:
        with rio.open(uri) as src:
            window = from_bounds(bounds[0], bounds[1], bounds[2], bounds[3], src.transform)
            data = src.read(1, window=window)
            win_affine = src.window_transform(window)

    return data, win_affine

def _add_metadata_to_df(conf_df: pd.DataFrame, conf_level: str) -> pd.DataFrame:
    """Read in df by confidence level and add metadata"""
    conf_df["aoi_type"] = "admin"
    conf_df["dist_alert_confidence"] = conf_level
    conf_df["country"] = 76
    conf_df["region"] = 20
    conf_df["subregion"] = 150  # placeholder for subregion (adm2) since we are running on an adm1 AOI
    conf_df.rename(columns={f"{conf_level}_conf": "area_ha"}, inplace=True)

    return conf_df

In [14]:
# read dist alerts for AOI
dist_latest_uri = f"s3://gfw-data-lake/umd_glad_dist_alerts/{version}/raster/epsg-4326/10/40000/default/gdal-geotiff/{aoi_tile}.tif"
bounds = aoi.geometry.bounds
dist_alerts, win_affine = _read_raster_window(dist_latest_uri, bounds)

# read area raster for AOI
area_uri = f"s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/gdal-geotiff/{aoi_tile}.tif"
pixel_area__m, win_affine = _read_raster_window(area_uri, bounds)
pixel_area_ha = pixel_area__m / 10000

In [15]:
# Extract confidence levels and julian dates as separate arrays
dist_confidence_levels = dist_alerts // 10000
dist_high_conf = np.where(dist_confidence_levels == 3, 1, 0)
dist_low_conf = np.where(dist_confidence_levels == 2, 1, 0)

# Extract Julian date (remaining digits)
dist_julian_date = dist_alerts % 10000

# create geometry_mask to mask dist alerts by aoi geometry
aoi_mask = geometry_mask(
    [aoi.geometry], invert=True, transform=win_affine, out_shape=dist_alerts.shape
)

# dist_alert_confidence level maskings
# anything outside the AOI becomes zero
dist_high_conf_aoi = aoi_mask * dist_high_conf * pixel_area_ha
dist_low_conf_aoi = aoi_mask * dist_low_conf * pixel_area_ha
dist_julian_date_aoi = aoi_mask * dist_julian_date

# flatten arrays to format for results dataframe
high_conf_flat = dist_high_conf_aoi.flatten()
low_conf_flat = dist_low_conf_aoi.flatten()
julian_date_flat = dist_julian_date_aoi.flatten()

In [16]:
# create results dataframe by confidence level and contextual layer classes
if contextual_layer is not None:
    contextual_data, win_affine = _read_raster_window(contextual_layer.source_uri, bounds)
    contextual_data_aoi = aoi_mask * contextual_data
    contextual_flat = contextual_data_aoi.flatten()

    df = pd.DataFrame({
        "dist_alert_date": julian_date_flat,
        contextual_layer.name: contextual_flat,
        "high_conf": high_conf_flat,
        "low_conf": low_conf_flat,
    })
    high_conf_results = df.groupby([contextual_layer.name, "dist_alert_date"])["high_conf"].sum().reset_index()
    low_conf_results = df.groupby([contextual_layer.name, "dist_alert_date"])["low_conf"].sum().reset_index()

    # map contextual layer names
    high_conf_results[contextual_layer.column_name] = high_conf_results[contextual_layer.name].map(contextual_layer.classes)
    low_conf_results[contextual_layer.column_name] = low_conf_results[contextual_layer.name].map(contextual_layer.classes)
else:
    df = pd.DataFrame(
        {
            "dist_alert_date": julian_date_flat,
            "high_conf": high_conf_flat,
            "low_conf": low_conf_flat,
        }
    )
    high_conf_results = df.groupby("dist_alert_date")["high_conf"].sum().reset_index()
    low_conf_results = df.groupby("dist_alert_date")["low_conf"].sum().reset_index()

In [17]:
# add metadata to match expected schema
high_conf_df = _add_metadata_to_df(high_conf_results, "high")
low_conf_df = _add_metadata_to_df(low_conf_results, "low")

# reorder columns to country, region, subregion, contextual layer, dist_alert_date, confidence, value
if contextual_layer:
    column_order = ["country", "region", "subregion", contextual_layer.name, "dist_alert_date", "dist_alert_confidence", "area_ha"]        
else:
    column_order = ["country", "region", "subregion", "dist_alert_date", "dist_alert_confidence", "area_ha"]
high_conf_df = high_conf_df[column_order]
low_conf_df = low_conf_df[column_order]

In [18]:
# concatenate dist_alert_confidence dfs into one validation df
results = pd.concat([high_conf_results, low_conf_results], ignore_index=True)

# drop rows where dist_alert_date is zero
results = results[results["dist_alert_date"] != 0]

results.rename(columns={"confidence": "dist_alert_confidence"}, inplace=True)
results["dist_alert_date"] = results.sort_values(
    by="dist_alert_date"
).dist_alert_date.apply(lambda x: date(2020, 12, 31) + relativedelta(days=x))
results["country"] = results["country"].apply(
    lambda x: numeric_to_alpha3.get(x, None)
)

result_df = pd.DataFrame(results)

In [19]:
result_df

Unnamed: 0,umd_drivers,dist_alert_date,area_ha,driver,aoi_type,dist_alert_confidence,country,region,subregion
1,0,2023-02-01,0.152937,,admin,high,BRA,20,150
2,0,2023-02-06,0.000000,,admin,high,BRA,20,150
3,0,2023-02-26,0.459089,,admin,high,BRA,20,150
4,0,2023-03-18,1.529302,,admin,high,BRA,20,150
5,0,2023-04-03,0.917827,,admin,high,BRA,20,150
...,...,...,...,...,...,...,...,...,...
3921,4,2025-09-13,0.306055,Potential conversion,admin,low,BRA,20,150
3922,4,2025-09-14,0.383035,Potential conversion,admin,low,BRA,20,150
3923,4,2025-09-16,0.382890,Potential conversion,admin,low,BRA,20,150
3924,4,2025-09-22,1.149494,Potential conversion,admin,low,BRA,20,150


In [51]:
os.environ['AWS_PROFILE'] = 'zeno'
zeno_df = pd.read_parquet(contextual_layer.output_uri)
zeno_aoi_df = zeno_df[(zeno_df["country"] == "BRA") & (zeno_df["region"] == 20)]

In [52]:
zeno_aoi_df

Unnamed: 0,country,region,subregion,driver,dist_alert_date,dist_alert_confidence,area_ha,aoi_type
2784440,BRA,20,1,Unclassified,2023-11-16,high,1.376165,admin
2784441,BRA,20,1,Unclassified,2023-11-26,high,0.458735,admin
2784442,BRA,20,1,Unclassified,2023-12-01,high,0.611589,admin
2784443,BRA,20,1,Unclassified,2023-12-06,high,0.382282,admin
2784444,BRA,20,1,Unclassified,2023-12-14,high,0.382236,admin
...,...,...,...,...,...,...,...,...
2840972,BRA,20,167,Crop management,2025-07-20,low,0.458853,admin
2840973,BRA,20,167,Crop management,2025-07-21,low,0.076476,admin
2840974,BRA,20,167,Crop management,2025-07-30,low,1.911892,admin
2840975,BRA,20,167,Crop management,2025-08-14,low,0.994183,admin


In [53]:
zeno_aoi_df.groupby(contextual_layer.column_name)["area_ha"].sum()

driver
Crop management         5.784960e+03
Flooding                5.354061e+03
Potential conversion    2.069251e+03
Unclassified            1.775586e+06
Wildfire                8.453124e+02
Name: area_ha, dtype: float64

In [54]:
result_df.groupby(contextual_layer.column_name)["area_ha"].sum()

driver
Crop management         5788.934980
Flooding                5353.985401
Potential conversion    2068.256770
Wildfire                 845.158829
Name: area_ha, dtype: float64

In [55]:
layer_name = contextual_layer.name if contextual_layer else "base alerts"
validation_df = result_df

In [56]:
match contextual_layer.name if contextual_layer else None:
    case None:
        schema = DistZonalStats
    case "sbtn_natural_lands":
        schema = NaturalLandsZonalStats
    case "umd_drivers":
        schema = DriversZonalStats
    case "gfw_grasslands":
        schema = GrasslandsZonalStats
    case "umd_land_cover":
        schema = LandCoverZonalStats

In [57]:
try:
    schema.validate(zeno_aoi_df)
    print("Zonal stats schema validation passed.")
except Exception as e:
    print(f"Schema validation failed: {e}")

Zonal stats schema validation passed.


In [58]:
# validate alert area sums with 2% tolerance
validation_areas = schema.calculate_area_sums_by_confidence(validation_df)
zeno_areas = schema.calculate_area_sums_by_confidence(zeno_aoi_df)
zeno_aoi_df["area_ha"]
tolerance_pct = 0.02  # 2% tolerance

low_conf_tolerance = validation_areas["low_confidence"] * tolerance_pct
high_conf_tolerance = validation_areas["high_confidence"] * tolerance_pct
low_conf_diff = abs(
    validation_areas["low_confidence"] - zeno_areas["low_confidence"]
)
high_conf_diff = abs(
    validation_areas["high_confidence"] - zeno_areas["high_confidence"]
)

if low_conf_diff > low_conf_tolerance or high_conf_diff > high_conf_tolerance:
    print("Area sums exceed 2% tolerance")
print("Area sums validation passed.")

Area sums validation passed.


In [59]:
# generate results for spot checking dates
validation_dates = [
    date.fromisoformat(dstr) for dstr in ["2023-06-06", "2023-06-21", "2023-09-27"]
]  # example julian dates (800, 900, ..., 1500)
validation_spot_check = schema.spot_check_julian_dates(
    validation_df, validation_dates
)
zeno_spot_check_raw = schema.spot_check_julian_dates(
    zeno_aoi_df, validation_dates
)

# group zeno results by dist_alert_date and dist_alert_confidence to aggregate subregions (since AOI is an adm1)
# include contextual layer column if present
group_cols = ["dist_alert_date", "dist_alert_confidence"]
if contextual_layer:
    group_cols.insert(1, contextual_layer.column_name)

zeno_spot_check = (
    zeno_spot_check_raw.groupby(group_cols)[
        "area_ha"
    ]
    .sum()
    .reset_index()
)

# confirm that both dataframes have results for the validation dates
validation_dates_set = set(validation_dates)
zeno_dates_set = set(zeno_spot_check["dist_alert_date"])
missing_in_zeno = validation_dates_set - zeno_dates_set
if missing_in_zeno:
    print(f"Parquet results are missing dates: {sorted(missing_in_zeno)}")
print("No missing dist_alert_dates in parquet")

# spot check alert area for random dates with 0.1% tolerance
tolerance_values = validation_spot_check["area_ha"] * tolerance_pct
area_diff = abs(validation_spot_check["area_ha"] - zeno_spot_check["area_ha"])
exceeds_tolerance = area_diff > tolerance_values
if exceeds_tolerance.any():
    print("Spot check area values exceed 0.1% tolerance")

print("Spot check validation passed.")

No missing dist_alert_dates in parquet
Spot check validation passed.
