In [12]:
# importing dependencies for notebook
import sys
import os
import rioxarray
import rasterio
import xarray as xr
import numpy as np
import pandas as pd
from xrspatial import zonal_stats
from xrspatial.zonal import _stats_count as xr_count
from pathlib import Path 

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print
import distributed
import concurrent.futures

# Dynamically creating the utilities module using a relative path
import importlib.util
spec = importlib.util.spec_from_file_location("utilities", "../utilities/__init__.py")
utilities = importlib.util.module_from_spec(spec)
sys.modules["utilities"] = utilities
spec.loader.exec_module(utilities)

# Importing utilities from utilities module 
from utilities import constants_and_names as cn
from utilities import universal_utilities as uu
from utilities import log_utilities as lu
from utilities import numba_utilities as nu

<font size="5">Set notebook options</font> 

In [16]:
#create cloud vs local cluster (options = 'full', 'test', 'local' )
cluster_type = 'local'

#date of ipcc landcover class rasters
ipcc_class_date = '20240205'

#tile list 
tile_id = "00N_000W"  #this is a placeholder for the data dictionaries, overwritten by tile_id
tile_id_list = ["00N_060W"]

#years list (options: '2000', '2000_2005', '2005_2010', '2010_2015', '2015_2020')
#'2000': Get carbon densities in 2000
year_list = ['2000', '2000_2005', '2005_2010', '2010_2015', '2015_2020'] 

#zone list (options: 'ipcc_class', 'climate', 'ecozone', 'ifl_primary', 'drivers')
zone_list = ['ipcc_class', 'climate', 'ecozone', 'ifl_primary', 'drivers']
#TODO: Add GADM and protected areas as contextual layers

#stats list (options: 'agc', 'bgc', 'deadwood', 'litter')
stat_list = ['agc', 'bgc', 'deadwood', 'litter']

#flux list (options: 'emissions', 'removals', 'net_flux')
flux_list = ['emissions', 'removals', 'net_flux']

<font size="5">Creating clusters</font> 

In [3]:
if cluster_type == 'full': 
    # Full cluster with 40 workers
    coiled_cluster = coiled.Cluster(
        n_workers=40,  
        use_best_zone=True, 
        compute_purchase_option="spot_with_fallback",
        idle_timeout="10 minutes",
        region="us-east-1",
        name="AFOLU_zonal_stats", 
        workspace='wri-forest-research', 
        worker_cpu=4,
        worker_memory = "16GiB" 
    )
    client = coiled_cluster.get_client()
    
elif cluster_type == 'test': 
    # Test cluster with 1 worker
    coiled_cluster = coiled.Cluster(
        n_workers=1,  
        use_best_zone=True, 
        compute_purchase_option="spot_with_fallback",
        idle_timeout="10 minutes",
        region="us-east-1",
        name="AFOLU_zonal_stats", 
        workspace='wri-forest-research', 
        worker_cpu=4,
        worker_memory = "16GiB" 
    )
    client = coiled_cluster.get_client()
elif cluster_type == 'local':
    # Local cluster with multiple workers
    local_cluster = LocalCluster()  
    client = Client(local_cluster)
else: 
    print("set cluster_type to one of the following: 'full', 'test', 'local'")

client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33867 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:33867/status,

0,1
Dashboard: http://127.0.0.1:33867/status,Workers: 4
Total threads: 12,Total memory: 15.47 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33663,Workers: 4
Dashboard: http://127.0.0.1:33867/status,Total threads: 12
Started: Just now,Total memory: 15.47 GiB

0,1
Comm: tcp://127.0.0.1:33745,Total threads: 3
Dashboard: http://127.0.0.1:34135/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:40185,
Local directory: /tmp/dask-scratch-space/worker-pfod9bc6,Local directory: /tmp/dask-scratch-space/worker-pfod9bc6

0,1
Comm: tcp://127.0.0.1:39043,Total threads: 3
Dashboard: http://127.0.0.1:34367/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:46345,
Local directory: /tmp/dask-scratch-space/worker-5ksw3lmg,Local directory: /tmp/dask-scratch-space/worker-5ksw3lmg

0,1
Comm: tcp://127.0.0.1:33983,Total threads: 3
Dashboard: http://127.0.0.1:35057/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:43205,
Local directory: /tmp/dask-scratch-space/worker-e0o00bq3,Local directory: /tmp/dask-scratch-space/worker-e0o00bq3

0,1
Comm: tcp://127.0.0.1:37543,Total threads: 3
Dashboard: http://127.0.0.1:33007/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:42155,
Local directory: /tmp/dask-scratch-space/worker-iq_n1mkj,Local directory: /tmp/dask-scratch-space/worker-iq_n1mkj


<font size="5">Utilities and Variables</font> 

In [4]:
##########################################################################################################################################
# Optional Zone Inputs (Categorical)
##########################################################################################################################################


#ecozone
continent_ecozone_path = "s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/"
continent_ecozone_pattern = "fao_ecozones_continents_processed"

#intact forest landscapes/ primary forest
ifl_primary_path = "s3://gfw2-data/climate/carbon_model/ifl_primary_merged/processed/20200724/"
ifl_primary_pattern = "ifl_2000_primary_2001_merged"

#drivers 
drivers_path = "s3://gfw2-data/drivers_of_loss/1_km/processed/20241004/"
drivers_pattern = "drivers_of_TCL_1_km_20241004"

#protected areas

#----------------------------------------------------------------------------------------------------------------------------------------

#IPCC basic class [for single years only (i.e. 2000, 2005, 2010, 2015, 2020)]
ipcc_class_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/"
ipcc_class_2000_pattern = "IPCC_classes_2000"

ipcc_class_2005_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2005/40000_pixels/20240205/"
ipcc_class_2005_pattern = "IPCC_classes_2005"

ipcc_class_2010_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2010/40000_pixels/20240205/"
ipcc_class_2010_pattern = "IPCC_classes_2010"

ipcc_class_2015_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2015/40000_pixels/20240205/"
ipcc_class_2015_pattern = "IPCC_classes_2015"

ipcc_class_2020_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2020/40000_pixels/20240205/"
ipcc_class_2020_pattern = "IPCC_classes_2020"

#----------------------------------------------------------------------------------------------------------------------------------------

#IPCC change class [for intervals (i.e. 2000_2005, 2005_2010, 2010_2015, 2015_2020)]
ipcc_change_2000_2005_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/40000_pixels/20240205/"
ipcc_change_2000_2005_pattern = "IPCC_change_2000_2005"

ipcc_change_2005_2010_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/40000_pixels/20240205/"
ipcc_change_2005_2010_pattern = "IPCC_change_2005_2010"

ipcc_change_2010_2015_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2010_2015/40000_pixels/20240205/"
ipcc_change_2010_2015_pattern = "IPCC_change_2010_2015"

ipcc_change_2015_2020_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/40000_pixels/20240205/"
ipcc_change_2015_2020_pattern = "IPCC_change_2015_2020"

#----------------------------------------------------------------------------------------------------------------------------------------

#node codes  [for intervals (i.e. 2000_2005, 2005_2010, 2010_2015, 2015_2020)]
#Note: a tile will have multiple tiffs (40N_010E__10_31_11_32__land_state_node_2000_2005.tif)
# land_state_node_2000_2005_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/land_state_node/2000_2005/4000_pixels/20240930/"
# land_state_node_2000_2005_pattern = "__land_state_node_2000_2005"

# land_state_node_2005_2010_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/land_state_node/2005_2010/4000_pixels/20240930/"
# land_state_node_2005_2010_pattern = "__land_state_node_2005_2010"

# land_state_node_2010_2015_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/land_state_node/2010_2015/4000_pixels/20240930/"
# land_state_node_2010_2015_pattern = "__land_state_node_2010_2015"

# land_state_node_2015_2020_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/land_state_node/2015_2020/4000_pixels/20240930/"
# land_state_node_2015_2020_pattern = "__land_state_node_2015_2020"



##########################################################################################################################################
#Optional Stats Inputs (Quantitiative)
##########################################################################################################################################
#pixel area (m2)
pixel_area_path = "s3://gfw2-data/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/geotiff/"

#----------------------------------------------------------------------------------------------------------------------------------------

#AGC density 2000 (MgC per ha)
agc_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240821/"
agc_density_2000_pattern = "AGC_density_MgC_ha_2000"
#Note 2005 to 2020 only have 1000 pixels not 40000 pixels like 2000

#----------------------------------------------------------------------------------------------------------------------------------------

#BGC density 2000 (MgC per ha)
bgc_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240821/"
bgc_density_2000_pattern = "BGC_density_MgC_ha_2000"

#----------------------------------------------------------------------------------------------------------------------------------------

#Deadwood density 2000 (MgC per ha)
deadwood_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240821/"
deadwood_density_2000_pattern = "deadwood_C_density_MgC_ha_2000"

#----------------------------------------------------------------------------------------------------------------------------------------

#Litter density 2000 (MgC per ha)
litter_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240821/"
litter_density_2000_pattern = "litter_C_density_MgC_ha_2000"

#----------------------------------------------------------------------------------------------------------------------------------------


#TODO: Update with constants and names variables
#TODO: Update final dates for all zone/stat inputs

# gadm boundary tiles
# iso_gadm_path = 's3://gfw-data-lake/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/geotiff/'
# iso_gadm_pattern = ''
#TODO: Why is iso_gadm showing up as none?

<font size="4">Zone Inputs (Categorical)</font> 

In [17]:
zone_layers = {}

for year in year_list: 
    if year == '2000': 
        zone_layers["ipcc_class_2000"] = f"{cn.outputs_path}{cn.IPCC_class_path}/{year}/40000_pixels/{ipcc_class_date}/{tile_id}__{cn.IPCC_class_pattern}_{year}.tif"
    else:
        zone_layers[f"ipcc_change_{year}"] = f"{cn.outputs_path}{cn.IPCC_change_path}/{year}/40000_pixels/{ipcc_class_date}/{tile_id}__{cn.IPCC_change_pattern}_{year}.tif"
if 'climate' in zone_list: 
    zone_layers["climate_domain"] = f"{cn.climate_domain_path}{tile_id}_{cn.climate_domain_pattern}.tif"
if 'ecozone' in zone_list: 
    zone_layers["continent_ecozone"] = f"{cn.continent_ecozone_path}{tile_id}_{cn.continent_ecozone_pattern}.tif"
if 'ifl_primary'in zone_list: 
    zone_layers["ifl_primary"] = f"{cn.ifl_primary_path}{tile_id}_{cn.ifl_primary_pattern}.tif"
if 'drivers'in zone_list: 
    zone_layers["drivers"] = f"{cn.drivers_path}{tile_id}_{cn.drivers_pattern}.tif"

zone_layers

AttributeError: module 'utilities.constants_and_names' has no attribute 'ifl_primary_path'

<font size="5">Calculate Densities 2000</font> 

In [134]:
if '2000' in year_list:
    for tile_id in tile_id_list: 
        zone_dict = {}
        
        # Adding contextual layers to zone data dictionary 
        zone_dict["ipcc_class_2000"] = f"{ipcc_class_2000_path}{tile_id}__{ipcc_class_2000_pattern}.tif"
        zone_dict["climate_domain"] = f"{climate_domain_path}{tile_id}_{climate_domain_pattern}.tif"
        zone_dict["continent_ecozone"] = f"{continent_ecozone_path}{tile_id}_{continent_ecozone_pattern}.tif"
        
        
        zone_first_tiles = uu.first_file_name_in_s3_folder(zone_dict)
        zone_dict_with_data_types = uu.add_file_type_to_dict(zone_first_tiles)
        #NOTE: uu.check_for_tile() doesn't work if there is not a second item (ie data_type), in the download dictionary
        #NOTE: uu.zone_first_tiles() overwrites tile_id w/ first tile so you have to update it again

<font size="4">Stat Inputs (Quantitative)</font> 

In [135]:
stat_dict = {}
stat_dict["pixel_area"] = f"{pixel_area_path}{tile_id}.tif"
stat_dict["agc_density_2000"] = f"{agc_density_2000_path}{tile_id}__{agc_density_2000_pattern}.tif"
stat_dict["bgc_density_2000"] = f"{bgc_density_2000_path}{tile_id}__{bgc_density_2000_pattern}.tif"
stat_dict["deadwood_density_2000"] = f"{deadwood_density_2000_path}{tile_id}__{deadwood_density_2000_pattern}.tif"
stat_dict["litter_density_2000"] = f"{litter_density_2000_path}{tile_id}__{litter_density_2000_pattern}.tif"

stat_first_tiles = uu.first_file_name_in_s3_folder(stat_dict)
stat_dict_with_data_types = uu.add_file_type_to_dict(stat_first_tiles)
#NOTE: uu.check_for_tile() doesn't work if there is not a second item (ie data_type), in the download dictionary
#NOTE: uu.zone_first_tiles() overwrites tile_id w/ first tile so you have to update it again

<font size="4">Download Zone and Stat Inputs Layers</font> 

In [136]:
is_final = False
logger = lu.setup_logging()

#bounds = uu.get_10x10_tile_bounds(tile_id)
bounds = (-60, -10, -59.75, -9.75)
chunk_length_pixels = uu.calc_chunk_length_pixels(bounds)

# Replace tile_id in dictionaries with data_types
updated_zone_dict = uu.replace_tile_id_in_dict(zone_dict_with_data_types, tile_id)
print(updated_zone_dict)
updated_stat_dict = uu.replace_tile_id_in_dict(stat_dict_with_data_types, tile_id)
print(updated_stat_dict)

# Download zone Layers
tile_exists = uu.check_for_tile(updated_zone_dict, is_final, logger)
print(tile_exists)
# if not tile_exists:
#     return f"Skipped chunk {bounds_str} because {tile_id} does not exist for any inputs: {timestr()}"

zone_futures = uu.prepare_to_download_chunk(bounds, updated_zone_dict, chunk_length_pixels, is_final, logger)
print(zone_futures)

# Download Stat Layers
tile_exists = uu.check_for_tile(updated_stat_dict, is_final, logger)
print(tile_exists)
# if not tile_exists:
#     return f"Skipped chunk {bounds_str} because {tile_id} does not exist for any inputs: {timestr()}"

stat_futures = uu.prepare_to_download_chunk(bounds, updated_stat_dict, chunk_length_pixels, is_final, logger)
print(stat_futures)

{'climate_domain': ['s3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/00N_060W_fao_ecozones_bor_tem_tro_processed.tif', 'Int16'], 'continent_ecozone': ['s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/00N_060W_fao_ecozones_continents_processed.tif', 'Int16'], 'ipcc_class_2000': ['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/00N_060W__IPCC_classes_2000.tif', 'Byte']}
{'pixel_area': ['s3://gfw2-data/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/geotiff/00N_060W.tif', 'Float32'], 'agc_density_2000': ['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240821/00N_060W__AGC_density_MgC_ha_2000.tif', 'Float32'], 'bgc_density_2000': ['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240821/00N_060W__BGC_density_MgC_ha_2000.tif', 'Float32'], 'deadwood_de

In [137]:
zone_layers = {}
zone_layers_unique_values = {}

# Waits for requests to come back with data from S3
for zone_future in concurrent.futures.as_completed(zone_futures):
    zone_layer = zone_futures[zone_future]
    zone_layers[zone_layer] = zone_future.result()

    # Get unique values for each layer and compute them
    unique_values = dask.array.unique(zone_layers[zone_layer])
    zone_layers_unique_values[zone_layer] = unique_values.compute()

# Print data type and dimensions for each zone array
for layer, array in zone_layers.items():
    print(f"Data type in layer '{layer}': {array.dtype}")
    print(f"Dimensions in layer '{layer}': {array.shape}")

# Print unique values for each zone array
for layer, unique_vals in zone_layers_unique_values.items():
    print(f"Unique values in layer {layer}: {unique_vals}")

Data type in layer 'continent_ecozone': int16
Dimensions in layer 'continent_ecozone': (1000, 1000)
Data type in layer 'climate_domain': int16
Dimensions in layer 'climate_domain': (1000, 1000)
Data type in layer 'ipcc_class_2000': uint8
Dimensions in layer 'ipcc_class_2000': (1000, 1000)
Unique values in layer continent_ecozone: [2020]
Unique values in layer climate_domain: [1]
Unique values in layer ipcc_class_2000: [1 3 5]


In [138]:
stat_layers = {}

# Waits for requests to come back with data from S3
for stat_future in concurrent.futures.as_completed(stat_futures):
    stat_layer = stat_futures[stat_future]
    layer = stat_future.result()

    # Mask nodata values using rioxarray's rio.nodata attribute
    if hasattr(layer, 'rio'):
        nodata_value = layer.rio.nodata
        if nodata_value is not None:
            # Mask nodata values in the array
            layer = layer.where(layer != nodata_value)

    # Store the masked layer
    stat_layers[stat_layer] = layer
    

# Print data type and dimensions for each zone array
for layer, array in stat_layers.items():
    print(f"Data type in layer '{layer}': {array.dtype}")
    print(f"Dimensions in layer '{layer}': {array.shape}")

Data type in layer 'litter_density_2000': float32
Dimensions in layer 'litter_density_2000': (1000, 1000)
Data type in layer 'agc_density_2000': float32
Dimensions in layer 'agc_density_2000': (1000, 1000)
Data type in layer 'pixel_area': float32
Dimensions in layer 'pixel_area': (1000, 1000)
Data type in layer 'deadwood_density_2000': float32
Dimensions in layer 'deadwood_density_2000': (1000, 1000)
Data type in layer 'bgc_density_2000': float32
Dimensions in layer 'bgc_density_2000': (1000, 1000)


<font size="4">Bit shifting to get unique zone id for all zone_layers inputs</font> 

In [139]:
import numpy as np
import dask.array as da

# Function to calculate the number of bits needed to represent the maximum value in the array
def calculate_bits_needed(max_value):
    return int(np.ceil(np.log2(max_value + 1)))

# Convert numpy arrays to dask arrays if needed
def ensure_dask_array(array, chunks="auto"):
    if isinstance(array, np.ndarray):
        # Convert numpy array to dask array
        return da.from_array(array, chunks=chunks)
    return array  # Already a dask array

# Ensure all layers have a consistent data type (int16) for bit-shifting
def ensure_dtype(layer_array, dtype=np.int16):
    if layer_array.dtype != dtype:
        return layer_array.astype(dtype)
    return layer_array

# Prepare to dynamically combine layers using bit-shifting
def combine_zone_layers(sorted_layers):
    combined_array = None
    total_shift = 0

    # Loop through each layer
    for layer_name, layer_array in sorted_layers:
        # Convert to dask.array if it's a numpy array
        layer_array = ensure_dask_array(layer_array)

        # Convert layer to int16 if necessary for safe bit-shifting
        layer_array = ensure_dtype(layer_array)

        # Find the maximum value in the layer (using Dask's max function)
        max_value = da.max(layer_array).compute()  # Compute to get the actual maximum value

        # Determine the number of bits needed to represent this layer
        bits_needed = calculate_bits_needed(max_value)

        # Print unique values in the current layer before shifting
        #print(f"Unique values in layer '{layer_name}' before shifting: {np.unique(layer_array.compute())}")

        # Shift the layer by the cumulative number of bits (based on previous layers)
        shifted_layer = layer_array << total_shift

        # Print unique values in the current layer after shifting
        #print(f"Unique values in layer '{layer_name}' after shifting: {np.unique(shifted_layer.compute())}")

        # If this is the first layer, initialize the combined array
        if combined_array is None:
            combined_array = shifted_layer
        else:
            # Use bitwise OR to combine the shifted layer with the previous layers
            combined_array = combined_array | shifted_layer

        # Update the total bit shift for the next layer
        total_shift += bits_needed

    return combined_array

# Sort zone_layers items to ensure consistent layer order
zone_layers_sorted = sorted(zone_layers.items())
#TODO: Overwrite zone layers as 1 object

# Combine the zone layers dynamically
combined_zone_array = combine_zone_layers(zone_layers_sorted)

# Compute the final result from the combined Dask array
combined_zone_result = combined_zone_array.compute()

# Print the unique values in the combined result
unique_values_combined_zones = np.unique(combined_zone_result)
print("Unique values in combined result:")
print(unique_values_combined_zones)

Unique values in combined result:
[ 8137 16329 24521]


In [140]:
#OUTPUT = COMBINED ZONE RESULTS

<font size="4">Multiply densities/ fluxes by area to calculate gross values</font> 

In [141]:
import numpy as np
from numba import jit

# JIT compiled function
@jit(nopython=True)
def calculate_total_mgc(agc, bgc, deadwood, litter, pixel_area):
    # Get the shape of the arrays (assuming they are the same shape)
    total_mgc_agc = np.zeros_like(agc)
    total_mgc_bgc = np.zeros_like(bgc)
    total_mgc_deadwood = np.zeros_like(deadwood)
    total_mgc_litter = np.zeros_like(litter)
    area_ha = np.zeros_like(litter)
    
    # Loop over each pixel
    for i in range(agc.shape[0]):
        for j in range(agc.shape[1]):
            # Convert pixel_area from square meters to hectares
            square_meters_to_hectares = 10000.0
            area_in_hectares = pixel_area[i, j] / square_meters_to_hectares
            
            # Calculate total MgC for each density type
            total_mgc_agc[i, j] = agc[i, j] * area_in_hectares
            total_mgc_bgc[i, j] = bgc[i, j] * area_in_hectares
            total_mgc_deadwood[i, j] = deadwood[i, j] * area_in_hectares
            total_mgc_litter[i, j] = litter[i, j] * area_in_hectares
            area_ha[i,j] = area_in_hectares
    
    # Return as a dictionary
    return {
        'agc_total_MgC': total_mgc_agc,
        'bgc_total_MgC': total_mgc_bgc,
        'deadwood_total_MgC': total_mgc_deadwood,
        'litter_total_MgC': total_mgc_litter, 
        'area_ha': area_ha
    }

# Call the JIT-compiled function and return a dictionary
total_mgc_dict = calculate_total_mgc(
    stat_layers['agc_density_2000'], stat_layers['bgc_density_2000'], stat_layers['deadwood_density_2000'], stat_layers['litter_density_2000'], stat_layers['pixel_area'])

total_mgc_dict

DictType[unicode_type,array(float32, 2d, C)]<iv=None>({agc_total_MgC: [[12.334657  12.691151  12.833748  ... 12.085113  12.19206   12.334657 ]
 [12.976335  12.726789  12.833737  ...  8.021086  12.156402  12.441595 ]
 [12.798079  13.154571  13.796258  ... 12.370288  12.013796  12.156393 ]
 ...
 [12.111825  11.292496  11.577479  ... 11.862463  12.147448  11.577479 ]
 [11.898077  11.755586  11.648716  ... 11.68434   11.57747   11.862453 ]
 [11.755577  11.506216  11.684331  ... 11.4705925 11.933691  12.076183 ]], bgc_total_MgC: [[3.7690954 3.878029  3.921602  ... 3.326815  3.356256  3.3955102]
 [3.9651725 3.888919  3.921599  ... 2.2080612 3.3464396 3.4249485]
 [3.910703  4.019636  4.215716  ... 3.4053187 3.3071826 3.3464372]
 ...
 [3.8250296 3.5662777 3.6562781 ... 2.494807  2.5547423 2.4348714]
 [3.7575257 3.7125258 3.6787755 ... 2.4573452 2.4348695 2.4948049]
 [3.7125232 3.6337724 3.690023  ... 2.4123921 2.5097868 2.5397546]], deadwood_total_MgC: [[0.7400794  0.76146907 0.77002484 ... 0.

<font size="4">Calculate Zonal Stats</font> 

In [144]:
import pandas as pd
import numpy as np
from dask import delayed
from dask.distributed import Client
from xrspatial import zonal_stats


# #Function to compute zonal stats using Dask futures and return a pandas DataFrame
# def compute_zonal_stats_to_dataframe(layers, zone):
#     zonal_stats_futures = {}

#     # Iterate over the layers in the total_mgc_dict
#     for layer_name, layer_data in layers.items():
        
#         # Use Dask futures to compute zonal statistics in parallel
#         future = client.submit(zonal_stats, zones=zone, values=layer_data, stats_funcs=['mean', 'sum', 'min', 'max'])
#         zonal_stats_futures[layer_name] = future

#     # Create an empty list to hold all data frames for each layer
#     all_layer_dfs = []

#     # Gather the results from the futures and convert them to DataFrames
#     for layer_name, future in zonal_stats_futures.items():
#         stats = future.result()

#         # Convert the stats dictionary to a DataFrame for this layer
#         df = pd.DataFrame(stats)
#         df['layer'] = layer_name  # Add a column for the layer name

#         # Append this DataFrame to the list
#         all_layer_dfs.append(df)

#     # Concatenate all DataFrames into one DataFrame
#     final_df = pd.concat(all_layer_dfs)

#     return final_df

def compute_zonal_stats_to_dataframe(total_mgc_dict, combined_zone_result):
    zonal_stats_futures = {}

    # Ensure that the combined_zone_result is an xr DataArray for zonal_stats function
    combined_zone_result = xr.DataArray(combined_zone_result)  

    # Iterate over the layers in the total_mgc_dict
    for layer_name, layer_data in total_mgc_dict.items():
        # Ensure layer_data is an xr DataArray for zonal_stats function
        layer_data = xr.DataArray(layer_data) 

        # Use Dask futures to compute zonal statistics in parallel
        future = client.submit(zonal_stats, zones=combined_zone_result, values=layer_data, stats_funcs=['count', 'sum', 'min', 'max'])
        zonal_stats_futures[layer_name] = future

    # Create an empty list to hold all data frames for each layer
    all_layer_dfs = []

    # Gather the results from the futures and convert them to DataFrames
    for layer_name, future in zonal_stats_futures.items():
        stats = future.result()

        # Convert the stats dictionary to a DataFrame for this layer
        df = pd.DataFrame(stats)
        df['layer'] = layer_name  # Add a column for the layer name

        # Append this DataFrame to the list
        all_layer_dfs.append(df)

    # Concatenate all DataFrames into one DataFrame
    final_df = pd.concat(all_layer_dfs)

    return final_df

#Make sure zone is a numpy array
#combined_zone_result = np.array(combined_zone_result, dtype=np.int16, copy=True)

# Call the function and get the zonal stats as a DataFrame
zonal_stats_df = compute_zonal_stats_to_dataframe(total_mgc_dict, combined_zone_result)

# Display the first few rows of the resulting DataFrame
print(zonal_stats_df.head())

RuntimeError: Error during deserialization of the task graph. This frequently
occurs if the Scheduler and Client have different environments.
For more information, see
https://docs.dask.org/en/stable/deployment-considerations.html#consistent-software-environments


In [123]:
import xarray as xr
import dask.array as da
from xrspatial import zonal_stats  

# Ensure combined_zone_result is converted to a Dask array (if it's not already)
combined_zone_result = da.from_array(combined_zone_result)

# Wrap combined_zone_result in an xarray.DataArray
combined_zone_da = xr.DataArray(combined_zone_result)

# Function to process all zonal stats calls for a tile
def run_all_zonal_stats(zone, layer, layer_name):
    
    # Convert Dask array to xarray.DataArray
    layer_da = xr.DataArray(layer)
    
    # Apply zonal stats
    result = zonal_stats(
        zones=zone,
        values=layer_da,
        stats_funcs=["sum", "count", "min", "max"]
    ).set_index("zone").rename(columns={
        "count": f"{layer_name}_count",
        "sum": f"{layer_name}_sum",
        "min": f"{layer_name}_min",
        "max": f"{layer_name}_max"
    })
    return result

# For loop to run all zonal stats per tile
futures = []

for layer, array in stat_layers.items():
    # Ensure each layer is a Dask array and wrap it in an xarray.DataArray
    array = da.from_array(array)
    result = run_all_zonal_stats(combined_zone_da, array, layer)
    results.append(result)

print(results)
#TODO change to use future instead

We're assuming that the indices of each dataframes are 
 aligned. This assumption is not generally safe.


RuntimeError: Error during deserialization of the task graph. This frequently
occurs if the Scheduler and Client have different environments.
For more information, see
https://docs.dask.org/en/stable/deployment-considerations.html#consistent-software-environments


In [None]:
print(combined_zone_result)
print(combined_zone_result.dtype)
print(combined_zone_result.shape)

for layer, array in stat_layers.items():
    print(layer)
    print(array)
    print(array.dtype)
    print(array.shape)

<font size="4">Reverse bit shifting to get original layer values for combined zone inputs</font> 

In [None]:
#TEST Example dask data frame with zonal stats values by bit-shifted zone
import pandas as pd

# Create a Pandas DataFrame
zone_df = pd.DataFrame({'bit_shifted_values': unique_values_combined_zones})

# Print the Dask DataFrame to verify
print(zone_df)
print(zone_df.columns)

In [None]:
# Function to reverse the bit-shifting process
def reverse_bit_shifting(df, column_name, sorted_layers):
    """
    Reverse the bit-shifting operation and extract the original values for each layer.

    Parameters:
    - df: the Pandas DataFrame containing the combined bit-shifted values.
    - column_name: the name of the column that contains the bit-shifted values.
    - sorted_layers: a dictionary containing the original Dask arrays (for each layer) sorted alpahbaetically.

    Returns:
    - df: a Pandas DataFrame with new columns for each original layer.
    """
    
    # Calculate bits_needed_per_layer based on max values from Dask arrays in sorted_layers
    bits_needed_per_layer = []
    
    for layer_name, layer_array in sorted_layers:
        # Ensure the layer is a Dask array and calculate max value
        layer_array = ensure_dask_array(layer_array)
        max_value = da.max(layer_array).compute()  # Compute the maximum value
        
        # Determine the number of bits needed to represent this layer
        bits_needed = calculate_bits_needed(max_value)
        bits_needed_per_layer.append(bits_needed)

    total_shift = sum(bits_needed_per_layer)  # Start with the total bits used

    # Reverse bit-shifting: loop through each layer in reverse order
    layers = [layer_name for layer_name, _ in sorted_layers]  # Get the sorted layer names
    for i in range(len(layers)-1, -1, -1):
        layer = layers[i]
        bits_needed = bits_needed_per_layer[i]
        total_shift -= bits_needed
        # Create a mask for extracting the current layer
        mask = (1 << bits_needed) - 1
        # Shift right and apply the mask to extract the current layer's values
        df[layer] = df[column_name].apply(lambda x: (x >> total_shift) & mask)

    return df

# Reverse the bit-shifting
parse_zone_data = reverse_bit_shifting(zone_df, 'bit_shifted_values', zone_layers_sorted)

# Print the Pandas DataFrame to verify the results
print(parse_zone_data)

<font size="5">Shutting down cloud and local clusters</font> 

In [105]:
if cluster_type in ['full', 'test']: 
    coiled_cluster.shutdown()
    print('coiled cluster shut down')
elif cluster_type == 'local':
    client.shutdown()
    print('local client shut down')
else: 
    print("No clusters were shut down. Check that cluster_type is set to one of the following: 'full', 'test', 'local'")

# coiled_client.restart() 

local client shut down


In [None]:
######################################################################

<font size="5">Zone Inputs (Categorical)</font> 

In [None]:
# # gadm boundary tiles
# iso_gadm_uri = 's3://gfw-data-lake/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/geotiff/'
# iso_gadm_pattern = ''

# climate_domain_path = "s3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/"
# climate_domain_pattern = "fao_ecozones_bor_tem_tro_processed"

# continent_ecozone_path = "s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/"
# continent_ecozone_pattern = "fao_ecozones_continents_processed"



# # MODEL OUTPUT SHAPEFILES
# # # IPCC reporting classes
# # ipcc_class_2000 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2000__10x10.shp'
# # ipcc_class_2005 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2005/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2005__10x10.shp'
# # ipcc_class_2010 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2010/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2010__10x10.shp'
# # ipcc_class_2015 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2015/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2015__10x10.shp'
# # ipcc_class_2020 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2020/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2020__10x10.shp'

# # # IPCC change classes
# # ipcc_change_2005 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2000_2005__10x105.shp'
# # ipcc_change_2010 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2005_2010__10x10.shp'
# # ipcc_change_2015 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2010_2015/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2010_2015__10x10.shp'
# # ipcc_change_2020 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2015_2020__10x10.shp'

# # 2000 carbon densities 
# AGC_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_AGC_2000__global__10x10.shp'
# BGC_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_BGC_2000__global__10x10.shp'
# deadwood_C_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_deadwood_C_2000__global__10x10.shp'
# litter_C_density = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_litter_C_2000__global__10x10.shp'

#TODO: Include land state nodes? --> no shapefiles

In [None]:
# #rasterize vector data
# name path column

# shapefile = gpd.read_file(path)
# utm = shapefile.to_crs("EPSG:4326")
# geom = utm[['geometry', column]].values.tolist()

In [None]:
# # lazily "computing" tile sets
# iso_gadm = get_tile_dataset(iso_gadm_uri, tile, "iso")
# iso_gadm = iso_gadm.where(iso_gadm != 9999)