# DIST Alerts Quarterly Country Aggregation

This notebook computes quarterly DIST alert areas for countries in two categories:
1. Areas where tree cover density is ≥30%
2. Areas where tree cover density is ≥30% and occurring in primary forests

To avoid counting alerts on land that had already lost its tree cover, alert area is excluded from the aggregation wherever tree cover loss was recorded in a calendar year before the year of the alert.

The computation uses zarr files generated by the GNW pipeline.

## How to Run

The notebook uses the Zeno infrastructure environment and runs in a Coiled account. Follow these steps:

### 1. Install Zeno Pipeline Dependencies
```bash
cd pipelines
uv sync
source .venv/bin/activate
```

### 2. Set Environment Variables

Configure Zeno account AWS credentials using either:
- `AWS_PROFILE`, or
- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`

### 3. Log in to Coiled
```bash
coiled login
```

### 4. Start the Coiled Notebook Server
```bash
coiled notebook start --region "us-east-1" --vm-type r7g.xlarge
```

This command starts the notebook server in the Coiled account (which runs in the Zeno account) and syncs the virtual environment dependencies.
The command will provide a URL to access the notebook interface and may open it in your browser.

Then upload the notebook to the Coiled notebook environment.

#### Alternative: Run Locally

Alternatively, you can run the notebook locally, though this is less efficient:
```bash
pip install jupyterlab
jupyter-lab
```

### 5. Configure and Run

1. Set the `COILED_CLUSTER_IMAGE` variable to the latest GNW pipeline Docker image in [ECR](https://us-east-1.console.aws.amazon.com/ecr/repositories/private/084375562450/analytics-api?region=us-east-1) (Zeno account); the Dask cluster runs this image
2. List the `(DIST version, quarter)` pairs to process in the `DIST_VERSIONS_QUARTERS` variable, one pair per target quarter
3. Execute the notebook cells

In [None]:
import xarray as xr
import coiled
import os

from flox.xarray import xarray_reduce
from flox import ReindexArrayType, ReindexStrategy
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from datetime import date
import logging


In [None]:
# Set requester-pays mode for this (notebook) process; the cluster workers get
# the same variable through the `environ` argument of coiled.Cluster.
os.environ["AWS_REQUEST_PAYER"] = "requester"

In [None]:
# Only let ERROR (and above) records from the "distributed.client" logger through.
logging.getLogger("distributed.client").setLevel(logging.ERROR)

In [None]:
# Epoch for alert dates: compute_quarterly_country_stats expresses quarter
# boundaries as a number of days after this date before grouping on alert_date.
DIST_DAY_0 = date(2020, 12, 31)


#### User input

In [None]:

# (DIST version, quarter) pairs to process. The version selects the dist-alerts
# zarr store that is read for that quarter; the quarter string is parsed with
# pd.Period(..., freq="Q").
DIST_VERSIONS_QUARTERS = [
    ("v20241228", "2024Q4"),
    ("v20250412", "2025Q1"),
    ("v20250708", "2025Q2"),
    ("v20251004", "2025Q3")
]

# Container image passed to the Coiled cluster.
# TODO: tag `latest` in ECR after pushing a new image and reference that tag here.
COILED_CLUSTER_IMAGE = "084375562450.dkr.ecr.us-east-1.amazonaws.com/analytics-api:2ff8b78_pipelines"

In [None]:
# Start (or attach to) the Coiled cluster that runs the aggregation, using the
# image selected in COILED_CLUSTER_IMAGE.
# NOTE(review): no_client_timeout="5 seconds" presumably shuts the cluster down
# almost immediately once the client disconnects — confirm against Coiled docs.
cluster = coiled.Cluster(
    name="dist_quarterly_country_stats",
    region="us-east-1",
    n_workers=50,
    tags={"project": "dist_quarterly_country_stats"},
    scheduler_vm_types=["r7g.xlarge"],
    worker_vm_types=["r7g.2xlarge"],
    compute_purchase_option="spot_with_fallback",
    no_client_timeout="5 seconds",
    container=COILED_CLUSTER_IMAGE,
    environ={
        "AWS_REQUEST_PAYER": "requester",  # for reading COGS from gfw account
    },
)

# Dask client bound to the cluster; the later .compute() calls run through it.
client = cluster.get_client()

In [None]:
# Lookup from integer country code to three-letter country code (the pairs
# follow ISO 3166-1 numeric -> alpha-3). postprocess_result uses it to relabel
# the `country` group values; codes that are not listed here map to None there.
# NOTE(review): assumes the adm0 raster stores these numeric codes — confirm.
numeric_to_alpha3 = {
    4: "AFG",
    248: "ALA",
    8: "ALB",
    12: "DZA",
    16: "ASM",
    20: "AND",
    24: "AGO",
    660: "AIA",
    10: "ATA",
    28: "ATG",
    32: "ARG",
    51: "ARM",
    533: "ABW",
    36: "AUS",
    40: "AUT",
    31: "AZE",
    44: "BHS",
    48: "BHR",
    50: "BGD",
    52: "BRB",
    112: "BLR",
    56: "BEL",
    84: "BLZ",
    204: "BEN",
    60: "BMU",
    64: "BTN",
    68: "BOL",
    535: "BES",
    70: "BIH",
    72: "BWA",
    74: "BVT",
    76: "BRA",
    86: "IOT",
    96: "BRN",
    100: "BGR",
    854: "BFA",
    108: "BDI",
    132: "CPV",
    116: "KHM",
    120: "CMR",
    124: "CAN",
    136: "CYM",
    140: "CAF",
    148: "TCD",
    152: "CHL",
    156: "CHN",
    162: "CXR",
    166: "CCK",
    170: "COL",
    174: "COM",
    178: "COG",
    180: "COD",
    184: "COK",
    188: "CRI",
    384: "CIV",
    191: "HRV",
    192: "CUB",
    531: "CUW",
    196: "CYP",
    203: "CZE",
    208: "DNK",
    262: "DJI",
    212: "DMA",
    214: "DOM",
    218: "ECU",
    818: "EGY",
    222: "SLV",
    226: "GNQ",
    232: "ERI",
    233: "EST",
    748: "SWZ",
    231: "ETH",
    238: "FLK",
    234: "FRO",
    242: "FJI",
    246: "FIN",
    250: "FRA",
    254: "GUF",
    258: "PYF",
    260: "ATF",
    266: "GAB",
    270: "GMB",
    268: "GEO",
    276: "DEU",
    288: "GHA",
    292: "GIB",
    300: "GRC",
    304: "GRL",
    308: "GRD",
    312: "GLP",
    316: "GUM",
    320: "GTM",
    831: "GGY",
    324: "GIN",
    624: "GNB",
    328: "GUY",
    332: "HTI",
    334: "HMD",
    336: "VAT",
    340: "HND",
    344: "HKG",
    348: "HUN",
    352: "ISL",
    356: "IND",
    360: "IDN",
    364: "IRN",
    368: "IRQ",
    372: "IRL",
    833: "IMN",
    376: "ISR",
    380: "ITA",
    388: "JAM",
    392: "JPN",
    832: "JEY",
    400: "JOR",
    398: "KAZ",
    404: "KEN",
    296: "KIR",
    408: "PRK",
    410: "KOR",
    414: "KWT",
    417: "KGZ",
    418: "LAO",
    428: "LVA",
    422: "LBN",
    426: "LSO",
    430: "LBR",
    434: "LBY",
    438: "LIE",
    440: "LTU",
    442: "LUX",
    446: "MAC",
    450: "MDG",
    454: "MWI",
    458: "MYS",
    462: "MDV",
    466: "MLI",
    470: "MLT",
    584: "MHL",
    474: "MTQ",
    478: "MRT",
    480: "MUS",
    175: "MYT",
    484: "MEX",
    583: "FSM",
    498: "MDA",
    492: "MCO",
    496: "MNG",
    499: "MNE",
    500: "MSR",
    504: "MAR",
    508: "MOZ",
    104: "MMR",
    516: "NAM",
    520: "NRU",
    524: "NPL",
    528: "NLD",
    540: "NCL",
    554: "NZL",
    558: "NIC",
    562: "NER",
    566: "NGA",
    570: "NIU",
    574: "NFK",
    807: "MKD",
    580: "MNP",
    578: "NOR",
    512: "OMN",
    586: "PAK",
    585: "PLW",
    275: "PSE",
    591: "PAN",
    598: "PNG",
    600: "PRY",
    604: "PER",
    608: "PHL",
    612: "PCN",
    616: "POL",
    620: "PRT",
    630: "PRI",
    634: "QAT",
    638: "REU",
    642: "ROU",
    643: "RUS",
    646: "RWA",
    652: "BLM",
    654: "SHN",
    659: "KNA",
    662: "LCA",
    663: "MAF",
    666: "SPM",
    670: "VCT",
    882: "WSM",
    674: "SMR",
    678: "STP",
    682: "SAU",
    686: "SEN",
    688: "SRB",
    690: "SYC",
    694: "SLE",
    702: "SGP",
    534: "SXM",
    703: "SVK",
    705: "SVN",
    90: "SLB",
    706: "SOM",
    710: "ZAF",
    239: "SGS",
    728: "SSD",
    724: "ESP",
    144: "LKA",
    729: "SDN",
    740: "SUR",
    744: "SJM",
    752: "SWE",
    756: "CHE",
    760: "SYR",
    158: "TWN",
    762: "TJK",
    834: "TZA",
    764: "THA",
    626: "TLS",
    768: "TGO",
    772: "TKL",
    776: "TON",
    780: "TTO",
    788: "TUN",
    792: "TUR",
    795: "TKM",
    796: "TCA",
    798: "TUV",
    800: "UGA",
    804: "UKR",
    784: "ARE",
    826: "GBR",
    840: "USA",
    581: "UMI",
    858: "URY",
    860: "UZB",
    548: "VUT",
    862: "VEN",
    704: "VNM",
    92: "VGB",
    850: "VIR",
    876: "WLF",
    732: "ESH",
    887: "YEM",
    894: "ZMB",
    716: "ZWE",
}

In [None]:
def _open_aligned_band(uri, like, name):
    """Open ``band_data`` from a requester-pays zarr store on the grid of ``like``.

    Args:
        uri: S3 location of the zarr store.
        like: Dataset whose coordinates the band is reindexed to.
        name: Variable name given to the returned DataArray.

    Returns:
        The store's ``band_data`` DataArray, reindexed to ``like`` and renamed.
    """
    band = xr.open_zarr(
        uri,
        storage_options={"requester_pays": True},
    ).band_data.reindex_like(
        # Nearest-neighbour match within 1e-5 coordinate units, so tiny
        # differences between the two grids do not produce missing cells.
        like, method="nearest", tolerance=1e-5
    )
    # Left join keeps exactly the coordinates of `like`.
    band = xr.align(like, band, join="left")[1]
    band.name = name
    return band


def load_datasets(dist_version):
    """Load the DIST alerts of one version together with its grouping layers.

    Args:
        dist_version: Version folder of the dist-alerts zarr store,
            e.g. ``"v20250708"``.

    Returns:
        A merged Dataset holding the DIST alert variables plus ``country``,
        ``canopy_cover``, ``is_primary_forest`` and ``tcl_year``, all on the
        coordinates of the DIST alerts.
    """
    dist_alerts = xr.open_zarr(
        f"s3://lcl-analytics/zarr/dist-alerts/{dist_version}/umd_glad_dist_alerts.zarr/"
    )

    canopy_cover = _open_aligned_band(
        "s3://gfw-data-lake/umd_tree_cover_density_2000/v1.8/raster/epsg-4326/zarr/threshold.zarr/",
        dist_alerts,
        "canopy_cover",
    )
    primary_forest = _open_aligned_band(
        "s3://gfw-data-lake/umd_regional_primary_forest_2001/v201901/raster/epsg-4326/zarr/is.zarr/",
        dist_alerts,
        "is_primary_forest",
    )
    tcl = _open_aligned_band(
        "s3://gfw-data-lake/umd_tree_cover_loss/v1.12/raster/epsg-4326/zarr/year.zarr/",
        dist_alerts,
        "tcl_year",
    )
    countries = _open_aligned_band(
        "s3://gfw-data-lake/gadm_administrative_boundaries/v4.1.85/raster/epsg-4326/zarr/adm0.zarr",
        dist_alerts,
        "country",
    )

    return xr.merge([countries, canopy_cover, primary_forest, tcl, dist_alerts])

    
def compute_quarterly_country_stats(pixel_area, dataset, quarter):
    """Sum pixel area per group for the alerts dated within one quarter.

    Args:
        pixel_area: DataArray of per-pixel areas; this is the quantity summed.
        dataset: Dataset providing the grouping variables ``country``,
            ``canopy_cover``, ``is_primary_forest``, ``tcl_year``,
            ``alert_date`` and ``confidence``.
        quarter: Quarter string accepted by ``pd.Period(..., freq="Q")``,
            e.g. ``"2025Q1"``.

    Returns:
        The computed result of the grouped sum, with one dimension per
        grouping variable and sparse COO storage requested for reindexing.
    """
    period = pd.Period(quarter, freq="Q")
    # alert_date is grouped as a day count relative to DIST_DAY_0, so both
    # quarter boundaries are converted to that same scale.
    first_day, last_day = (
        (boundary.date() - DIST_DAY_0).days
        for boundary in (period.start_time, period.end_time)
    )

    grouping_variables = (
        dataset.country,
        dataset.canopy_cover,
        dataset.is_primary_forest,
        dataset.tcl_year,
        dataset.alert_date,
        dataset.confidence,
    )
    # Group labels to keep, listed in the same order as grouping_variables.
    kept_labels = (
        np.arange(999),
        [5, 6, 7],  # only return canopy cover >= 30%
        [0, 1],
        np.arange(25),
        np.arange(first_day, last_day + 1),  # every day of the quarter, inclusive
        [3],
    )

    summed = xarray_reduce(
        pixel_area,
        *grouping_variables,
        func="sum",
        expected_groups=kept_labels,
        reindex=ReindexStrategy(
            blockwise=False, array_type=ReindexArrayType.SPARSE_COO
        ),
        fill_value=0,
    )
    return summed.compute()


def postprocess_result(result):
    """Collapse the sparse grouped sums into per-country alert areas.

    Args:
        result: Output of ``compute_quarterly_country_stats``: a DataArray whose
            ``.data`` is a sparse COO array (``.coords`` holds, per dimension,
            the positional index of every stored value and ``.data`` the
            values). It needs the dimensions ``country``, ``is_primary_forest``,
            ``tcl_year`` and ``alert_date``.

    Returns:
        DataFrame indexed by ``country`` (three-letter code) with the columns
        ``alert_area__ha`` and ``alert_area_in_primary_forest__ha``. Rows whose
        country code is not in ``numeric_to_alpha3`` get a null key and are
        left out by the groupby.
    """
    sparse_data = result.data
    indices = sparse_data.coords

    # One row per stored cell: turn each dimension's positional index back
    # into its coordinate label.
    columns = {
        dim: result.coords[dim].values[indices[i]]
        for i, dim in enumerate(result.dims)
    }
    columns["alert_area__ha"] = sparse_data.data
    df = pd.DataFrame(columns)

    # alert_date is a day offset from 2020-12-31 (the same epoch as DIST_DAY_0).
    df["alert_date"] = pd.Timestamp(date(2020, 12, 31)) + pd.to_timedelta(
        df["alert_date"], unit="D"
    )
    df["country"] = df["country"].map(numeric_to_alpha3.get)
    # Positive tcl_year values are offsets from the year 2000; values <= 0 are
    # kept unchanged and never count as earlier loss below.
    df["tcl_year"] = df["tcl_year"].where(
        df["tcl_year"] <= 0, df["tcl_year"] + 2000
    )

    # Vectorised on purpose: the frame can hold millions of rows, and a
    # row-wise DataFrame.apply(axis=1) is slow and breaks on an empty frame.
    lost_before_alert = (df["tcl_year"] > 0) & (
        df["tcl_year"] < df["alert_date"].dt.year
    )
    # Cast to float64 so the per-country sums accumulate in double precision.
    df["alert_area__ha"] = (
        df["alert_area__ha"].astype("float64").where(~lost_before_alert, 0.0)
    )
    df["alert_area_in_primary_forest__ha"] = df["alert_area__ha"].where(
        df["is_primary_forest"] == 1, 0.0
    )

    # groupby already returns `country` as the index.
    return df.groupby("country")[
        ["alert_area__ha", "alert_area_in_primary_forest__ha"]
    ].sum()

In [None]:
# Per-quarter tables, one per (DIST version, quarter) pair; combined after the loop.
results = []

for dist_version, quarter in DIST_VERSIONS_QUARTERS:
    datasets = load_datasets(dist_version)

    # Pixel areas (hectares, per the store name) put on the grid of this
    # version's alert data, so that every alert pixel has a matching area.
    area_store = xr.open_zarr(
        "s3://gfw-data-lake/umd_area_2013/v1.10/raster/epsg-4326/zarr/pixel_area_ha.zarr/",
        storage_options={"requester_pays": True},
    )
    pixel_area = area_store.band_data.reindex_like(
        datasets, method="nearest", tolerance=1e-5
    )
    _, pixel_area = xr.align(datasets.confidence, pixel_area, join="left")

    quarter_table = postprocess_result(
        compute_quarterly_country_stats(pixel_area, datasets, quarter)
    )
    # Label every row with the first timestamp of its quarter.
    quarter_table["quarter"] = pd.Period(quarter, freq="Q").start_time

    results.append(quarter_table)

In [None]:
# Stack the per-quarter tables row-wise; each keeps its `country` index and
# its own `quarter` value.
all_quarters = pd.concat(results, axis=0)

In [None]:
# Order rows by the index (named "country" by postprocess_result; "index" is
# only the fallback label when the index is unnamed) and then by quarter, and
# write the table to a CSV file in the working directory.
all_quarters.sort_values(by=[all_quarters.index.name or "index", "quarter"]).to_csv("./dist_alerts_quarterly_summary.csv")