### Alternate method QC

In [150]:
import numpy as np
import pandas as pd
import rasterio as rio
import geopandas as gpd
import pandera.pandas as pa
from rasterio.windows import from_bounds
from rasterio.features import geometry_mask

In [2]:
bra_gdf = gpd.read_file("gadm41_BRA_2.json")
rgn_gdf = bra_gdf[bra_gdf["NAME_1"] == "RioGrandedoNorte"]
aoi = rgn_gdf.iloc[156]

In [68]:
rgn_gdf = gpd.read_file("br_rn.json")
aoi = rgn_gdf.iloc[0]

In [69]:
bounds = aoi.geometry.bounds

In [70]:
# read dist alerts
with rio.open("s3://gfw-data-lake/umd_glad_dist_alerts/v20250628/raster/epsg-4326/10/40000/currentweek/gdal-geotiff/00N_040W.tif") as src:
    window = from_bounds(
        bounds[0],
        bounds[1],
        bounds[2],
        bounds[3],
        src.transform
    )
    dist_alerts = src.read(1, window=window)
    win_affine = src.window_transform(window)

# set no_data from -1 to 0
dist_alerts = np.where(dist_alerts == -1, 0, dist_alerts)

# Extract confidence level (first digit)
dist_confidence_levels = dist_alerts // 10000
dist_high_conf = np.where(dist_confidence_levels == 3, 1, 0)
dist_low_conf = np.where(dist_confidence_levels == 2, 1, 0)

# Extract Julian date (remaining digits)
dist_julian_date = dist_alerts % 10000 

In [71]:
with rio.open("s3://gfw-data-lake/sbtn_natural_forests_map/v202504/raster/epsg-4326/10/40000/class/gdal-geotiff/00N_040W.tif") as src:
    sbtn_data = src.read(1, window=window)
    win_affine = src.window_transform(window)

    # Extract natural forest and non-natural forest areas within the admin area
    natural_forest = np.where(sbtn_data == 1, 1, 0)
    nonnatural_forest = np.where(sbtn_data == 2, 1, 0)

In [72]:
with rio.open("s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/gdal-geotiff/00N_040W.tif") as src:
    area = src.read(1, window=window)

In [73]:
# mask dist alerts by aoi geometry
aoi_mask = geometry_mask([aoi.geometry], invert=True, transform=win_affine, out_shape=dist_alerts.shape)

In [81]:
dist_high_conf_aoi = aoi_mask * dist_high_conf * natural_forest
dist_low_conf_aoi = aoi_mask * dist_low_conf * natural_forest
dist_julian_date_aoi = aoi_mask * dist_julian_date * natural_forest

In [82]:
# Flatten the arrays
high_conf_flat = dist_high_conf_aoi.flatten()
low_conf_flat = dist_low_conf_aoi.flatten()
julian_date_flat = dist_julian_date_aoi.flatten()

In [126]:
# Create a DataFrame
df = pd.DataFrame({
    "alert_date": julian_date_flat,
    "high_conf": high_conf_flat,
    "low_conf": low_conf_flat
})

In [138]:
high_conf_results = df.groupby("alert_date")["high_conf"].sum().reset_index()
low_conf_results = df.groupby("alert_date")["low_conf"].sum().reset_index()

In [204]:
high_conf_results["confidence"] = 3
high_conf_results["country"] = 76
high_conf_results["region"] = 20
high_conf_results["subregion"] = 150

low_conf_results["confidence"] = 2
low_conf_results["country"] = 76
low_conf_results["region"] = 20
low_conf_results["subregion"] = 150


# rename high_conf to value
high_conf_results.rename(columns={"high_conf": "value"}, inplace=True)
low_conf_results.rename(columns={"low_conf": "value"}, inplace=True)

# reorder columns to country, region, subregion, alert_date, confidence, value
high_conf_results = high_conf_results[["country", "region", "subregion", "alert_date", "confidence", "value"]]
low_conf_results = low_conf_results[["country", "region", "subregion", "alert_date", "confidence", "value"]]

# concatenate the results
results = pd.concat([high_conf_results, low_conf_results], ignore_index=True)

# drop rows where alert_date is zero
results = results[results["alert_date"] != 0]


In [205]:
results

Unnamed: 0,country,region,subregion,alert_date,confidence,value
1,76,20,150,1015,3,6
2,76,20,150,1087,3,1
3,76,20,150,1127,3,1
4,76,20,150,1151,3,0
5,76,20,150,1157,3,0
...,...,...,...,...,...,...
545,76,20,150,1627,2,459
546,76,20,150,1628,2,542
547,76,20,150,1629,2,15175
548,76,20,150,1633,2,96


In [206]:
zeno_stats = pd.read_parquet("s3://gfw-data-lake/umd_glad_dist_alerts/v20250628/tabular/epsg-4326/zonal_stats/dist_alerts_by_adm2.parquet")

In [207]:
rgn_zeno_stats = zeno_stats[
    (zeno_stats["country"] == 76) &
    (zeno_stats["region"] == 20)
]
rgn_zeno_stats

Unnamed: 0,country,region,subregion,alert_date,confidence,value
622725,76,20,0,762,3,1
622726,76,20,0,807,2,2
622727,76,20,0,852,3,1
622728,76,20,0,887,2,2
622729,76,20,0,902,2,2
...,...,...,...,...,...,...
623824,76,20,156,1585,3,6
623825,76,20,156,1587,2,5
623826,76,20,156,1587,3,11
623827,76,20,156,1589,2,3


In [208]:
schema = pa.DataFrameSchema({
    "country": pa.Column(int, pa.Check.eq(76)),
    "region": pa.Column(int, pa.Check.eq(20)),
    "subregion": pa.Column(pa.Int, pa.Check.lt(170)),
    "alert_date": pa.Column(pa.Int, pa.Check.in_range(731, 1640)), # julian date between 2023-01-01 to latest version
    "confidence": pa.Column(int, pa.Check.isin([2, 3])),
    "value": pa.Column(pa.Int)
})

In [209]:
schema.validate(rgn_zeno_stats)

Unnamed: 0,country,region,subregion,alert_date,confidence,value
622725,76,20,0,762,3,1
622726,76,20,0,807,2,2
622727,76,20,0,852,3,1
622728,76,20,0,887,2,2
622729,76,20,0,902,2,2
...,...,...,...,...,...,...
623824,76,20,156,1585,3,6
623825,76,20,156,1587,2,5
623826,76,20,156,1587,3,11
623827,76,20,156,1589,2,3


In [210]:
schema.validate(results)

Unnamed: 0,country,region,subregion,alert_date,confidence,value
1,76,20,150,1015,3,6
2,76,20,150,1087,3,1
3,76,20,150,1127,3,1
4,76,20,150,1151,3,0
5,76,20,150,1157,3,0
...,...,...,...,...,...,...
545,76,20,150,1627,2,459
546,76,20,150,1628,2,542
547,76,20,150,1629,2,15175
548,76,20,150,1633,2,96


In [211]:
assert rgn_zeno_stats.shape == results.shape, "Shape mismatch between zonal stats and results"

AssertionError: Shape mismatch between zonal stats and results

In [216]:
# spot check specific dates
zeno_alerts_1127 = rgn_zeno_stats[rgn_zeno_stats["alert_date"] == 1127]["value"].sum()
local_results_1127 = results[results["alert_date"] == 1127]["value"].sum()

assert zeno_alerts_1127 == local_results_1127, "Mismatch in alert counts for julian date 1127"


AssertionError: Mismatch in alert counts for julian date 1127