## Monitoring agent data generation

This notebook documents how the KBA data was generated for the monitoring prototype.

The concept is that all data is static. It is pre-generated for all the Key
Biodiversity Areas (KBA). The statistics are then compiled into unified datasets that
will be accessed by the monitoring agent.

We have used the following files:

- KBA Shapefile containing the geometries and simple attributes like names
- Metadata for each KBA with detailed descriptions of each area
- Static data from Global Forest Watch, compiled without a time dimension. Most values are changes between 2001 and 2023.
- Time series data from Grasslands dataset, in yearly increments
- Time series data for tree cover loss and associated carbon emissions

### Static data preparation

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

# Directory holding all source files and receiving the generated outputs.
DATADIR = "zeno/agents/monitoring/kba_data_preparation"

# Read source data and unify sitecode column
kba = gpd.read_file(f"{DATADIR}/kba_global_2024_semptember.gpkg")
kba.rename(columns={"SitRecID": "sitecode"}, inplace=True)

# KBA static data; keep only the first row per sitecode
df_kba_static = pd.read_csv(f"{DATADIR}/KBA info scrape - static data.csv")
df_kba_static.drop_duplicates(subset="sitecode", inplace=True)

# Tree cover loss drivers data
df_kba_tcl = pd.read_csv(
    f"{DATADIR}/KBA info scrape - tree cover loss drivers data.csv"
)
df_kba_tcl.rename(columns={"SitRecID": "sitecode"}, inplace=True)

# KBA detailed descriptions
df_kba_meta = pd.read_csv(f"{DATADIR}/KBA info scrape - scraped kba metadata.csv")

# Merge the different source files into a single dataframe. Columns that
# already exist on the left side get the "_dup" suffix on the right side and
# are dropped right after each merge, so the first occurrence always wins.
merged = kba
for extra in (df_kba_static, df_kba_tcl, df_kba_meta):
    merged = merged.merge(extra, on="sitecode", suffixes=("", "_dup"), how="left")
    merged = merged.loc[:, ~merged.columns.str.endswith("_dup")]

# Check for consistency: for every table print the number of unique sitecodes
# next to the number of rows.
consistency_report = [
    ("Unique sitecodes in kba", kba),
    ("unique sitecodes in df_kba_meta", df_kba_meta),
    ("unique sitecodes in df_kba_static_data", df_kba_static),
    ("unique sitecodes in df_kba_tcl", df_kba_tcl),
    ("unique sitecodes in merged", merged),
]
for label, frame in consistency_report:
    print(f"{label}: {len(frame['sitecode'].unique())}, {frame.shape[0]}")

# Columns that are kept in the merged output file.
DESCRIBED_FIELDS = [
    "area__ha",
    "umd_tree_cover_extent_2000__ha",
    "umd_tree_cover_gain__ha",
    "gfw_forest_carbon_gross_removals_aboveground_2001_2023__Mg_CO2",
    "gfw_forest_carbon_gross_removals_belowground_2001_2023__Mg_CO2",
    "gfw_forest_carbon_gross_removals_2001_2023__Mg_CO2",
    "gfw_forest_carbon_gross_emissions_all_gases_2001_2023__Mg_CO2e",
    "gfw_forest_carbon_net_flux_2001_2023__Mg_CO2e",
    "gfw_aboveground_carbon_stock_2000__Mg_C",
    "gfw_belowground_carbon_stock_2000__Mg_C",
    "gfw_soil_carbon_stock_2000__Mg_C",
    "whrc_aboveground_biomass_stock_2000__Mg",
    "avg_whrc_aboveground_biomass_density_2000__Mg_ha-1",
    "permAg_tcl_2001-2023",
    "hardCommodities_tcl_2001-2023",
    "shifting_tcl_2001-2023",
    "forestMgmt_tcl_2001-2023",
    "wildfire_tcl_2001-2023",
    "settlements_tcl_2001-2023",
    "natural_tcl_2001-2023",
    "additionalBiodiversityValues",
    "calculatedProtectedArea",
    "country",
    "deliniationRationale",
    "ecosystems",
    "elevation(M)",
    "globalKbaCriteria",
    "habitatDescription",
    "howIsTheSiteManaged",
    "indigenousGroups",
    "irreplaceabilityAssessmentApproved",
    "kbaClassification",
    "landUseRegimesAtSite",
    "latitude",
    "longitude",
    "rationaleForSiteInformation",
    "references",
    "regions",
    "siteAreaCalculated",
    "sitecode",
    "siteDescription",
    "siteName",
    "siteNameNational",
    "threatsDescription",
    "updatedAt",
    "yearOfAssessment",
    "geometry",
]

# Report every expected column that did not survive the merge. The column
# selection below raises a KeyError if any of them is missing, so this output
# tells you which ones to look for.
for field in DESCRIBED_FIELDS:
    if field not in merged.columns:
        print(f"Missing field: {field}")


merged = merged[DESCRIBED_FIELDS]

# The number of unique sitecodes should equal the number of rows.
print("Should be same", np.unique(merged["sitecode"]).shape, merged.shape[0])

merged.to_file(f"{DATADIR}/kba_merged.gpkg", driver="GPKG")

### Time series data preparation

The time series data is prepared in a "melted" format. That means that the
table will have a multi-index where the combination of `sitecode` and `year`
is unique. For each of these combinations, we compile different values.

The result outputs two tables, one for grasslands, and one for tree cover loss.

In [None]:
import pandas as pd

# Load the yearly source tables. DATADIR is defined in the static data
# preparation cell above.
gpp_data = pd.read_csv(f"{DATADIR}/GPW_grass_GPP.csv")
area_data = pd.read_csv(f"{DATADIR}/GPW_grass_class_area.csv")
kba_data = pd.read_csv(f"{DATADIR}/KBA info scrape - annual data.csv")

# Keep only the annual tree cover loss and emissions columns, indexed by
# (sitecode, year) so they can be joined with the grassland tables.
kba_data = kba_data[
    [
        "sitecode",
        "year",
        "gfw_forest_carbon_gross_emissions_all_gases",
        "umd_tree_cover_loss",
    ]
]
kba_data.set_index(["sitecode", "year"], inplace=True)

result = None
for key in ["GPP", "cultivated", "nsn"]:
    print(key)
    # GPP values come from their own file; the other keys come from the
    # class area file.
    data = gpp_data if key == "GPP" else area_data

    # Select the wide columns for this key; the year is the part of the
    # column name before the first underscore.
    colnames = [name for name in data.columns if f"_{key}_" in name]
    print(colnames)
    years = [name.split("_")[0] for name in colnames]

    # Build a wide table with one column per year plus the sitecode.
    wide = {year: data[colname] for year, colname in zip(years, colnames)}
    wide["sitecode"] = data["SitRecID"]

    # Melt to one row per (sitecode, year) with the value stored under `key`.
    df = pd.DataFrame(wide).melt(
        id_vars=["sitecode"], var_name="year", value_name=key
    )
    df["year"] = df["year"].astype(int)
    df.set_index(["sitecode", "year"], inplace=True)

    if result is None:
        result = df
    else:
        result = result.merge(df, left_index=True, right_index=True, how="outer")

# Outer join so that (sitecode, year) pairs present in only one source are kept.
final = result.merge(kba_data, left_index=True, right_index=True, how="outer")

final.reset_index(inplace=True)

# Check consistency
print(final[final["sitecode"] == 8010])

final.to_parquet(f"{DATADIR}/kba_timeseries_data.parquet")