In [1]:
# importing dependencies for notebook
import sys
import os
import rioxarray
import rasterio
import xarray as xr
import numpy as np
import pandas as pd
import dask.array as da
import pandas as pd
from xrspatial import zonal_stats
from xrspatial.zonal import _stats_count as xr_count
from pathlib import Path 

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print
import distributed
import concurrent.futures

# Dynamically creating the utilities module using a relative path.
# The package is loaded directly from its __init__.py file instead of being found on sys.path.
# The relative path is resolved against the current working directory
# (presumably the folder this notebook lives in -- confirm if the kernel is started elsewhere).
import importlib.util
spec = importlib.util.spec_from_file_location("utilities", "../utilities/__init__.py")
utilities = importlib.util.module_from_spec(spec)
# Register the module under the name "utilities" before executing it so that the
# `from utilities import ...` statements below can resolve it.
sys.modules["utilities"] = utilities
spec.loader.exec_module(utilities)

# Importing utilities from utilities module 
from utilities import constants_and_names as cn
from utilities import universal_utilities as uu
from utilities import log_utilities as lu
from utilities import numba_utilities as nu



<font size="5">Set notebook options</font> 

In [2]:
# Cluster to run on. Options: 'full' (cloud), 'test' (cloud, single worker), 'local'
cluster_type = "local"

# Tiles to process.
# tile_id is only a placeholder used while building the data dictionaries; it is overwritten later.
tile_id = "00N_000W"
tile_id_list = [
    "00N_060W",
]

# Years / intervals to process.
# Options: '2000', '2000_2005', '2005_2010', '2010_2015', '2015_2020'
# '2000' means: get carbon densities in 2000
year_list = [
    "2000",
]

# Categorical layers used as zones.
# Options: 'ipcc_class', 'climate', 'ecozone', 'ifl_primary', 'drivers'
# TODO: Add GADM and protected areas as contextual layers
zone_list = [
    "ipcc_class",
    "climate",
    "ecozone",
]

# Carbon pools to summarise.
# Options: 'agc', 'bgc', 'deadwood', 'litter', 'soil'
stat_list = [
    "agc",
    "bgc",
    "deadwood",
    "litter",
    "soil",
]

# Flux types to summarise.
# Options: 'emissions', 'removals', 'net_flux'
flux_list = [
    "emissions",
    "removals",
    "net_flux",
]

# Date folder of the IPCC land cover class rasters
ipcc_class_date = "20240205"

# Date folder of the emission, removal, and net flux rasters
# TODO: update flux_calc_date
flux_calc_date = "20241231"

<font size="5">Creating clusters</font> 

In [3]:
# Create the Dask client used by the rest of the notebook.
# 'full' and 'test' both start a Coiled cloud cluster and differ only in the number of workers;
# 'local' starts a LocalCluster on this machine.
coiled_worker_counts = {'full': 40, 'test': 1}

if cluster_type in coiled_worker_counts:
    coiled_cluster = coiled.Cluster(
        n_workers=coiled_worker_counts[cluster_type],
        use_best_zone=True,
        compute_purchase_option="spot_with_fallback",
        idle_timeout="10 minutes",
        region="us-east-1",
        name="AFOLU_zonal_stats",
        workspace='wri-forest-research',
        worker_cpu=4,
        worker_memory="16GiB"
    )
    client = coiled_cluster.get_client()
elif cluster_type == 'local':
    # Local cluster with multiple workers
    local_cluster = LocalCluster()
    client = Client(local_cluster)
else:
    # Fail here with a clear message; otherwise `client` would be undefined and the
    # next line would raise an unrelated NameError.
    raise ValueError(
        f"set cluster_type to one of the following: 'full', 'test', 'local' (got {cluster_type!r})"
    )

client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 43411 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:43411/status,

0,1
Dashboard: http://127.0.0.1:43411/status,Workers: 4
Total threads: 12,Total memory: 15.47 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:46563,Workers: 4
Dashboard: http://127.0.0.1:43411/status,Total threads: 12
Started: Just now,Total memory: 15.47 GiB

0,1
Comm: tcp://127.0.0.1:35003,Total threads: 3
Dashboard: http://127.0.0.1:37067/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:33113,
Local directory: /tmp/dask-scratch-space/worker-d2i8q20i,Local directory: /tmp/dask-scratch-space/worker-d2i8q20i

0,1
Comm: tcp://127.0.0.1:40495,Total threads: 3
Dashboard: http://127.0.0.1:42607/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:38257,
Local directory: /tmp/dask-scratch-space/worker-mg3gqzao,Local directory: /tmp/dask-scratch-space/worker-mg3gqzao

0,1
Comm: tcp://127.0.0.1:40359,Total threads: 3
Dashboard: http://127.0.0.1:32775/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:37935,
Local directory: /tmp/dask-scratch-space/worker-zax9slvj,Local directory: /tmp/dask-scratch-space/worker-zax9slvj

0,1
Comm: tcp://127.0.0.1:42421,Total threads: 3
Dashboard: http://127.0.0.1:35895/status,Memory: 3.87 GiB
Nanny: tcp://127.0.0.1:39523,
Local directory: /tmp/dask-scratch-space/worker-w2oh9ewg,Local directory: /tmp/dask-scratch-space/worker-w2oh9ewg


<font size="5">Creating zone and stats data dictionaries</font> 

<font size="4">Zone Inputs (Categorical)</font> 

In [4]:
# Maps each requested zone (categorical) layer to the S3 path of its placeholder tile.
zone_dict = {}

# IPCC land cover layers: classes for 2000, change classes for every other interval.
# Only added when 'ipcc_class' is requested in zone_list, like every other zone option.
if 'ipcc_class' in zone_list:
    for year in year_list:
        if year == '2000':
            zone_dict["ipcc_class_2000"] = f"{cn.outputs_path}{cn.IPCC_class_path}/{year}/40000_pixels/{ipcc_class_date}/{tile_id}__{cn.IPCC_class_pattern}_{year}.tif"
        else:
            zone_dict[f"ipcc_change_{year}"] = f"{cn.outputs_path}{cn.IPCC_change_path}/{year}/40000_pixels/{ipcc_class_date}/{tile_id}__{cn.IPCC_change_pattern}_{year}.tif"

# Contextual layers that do not depend on the year
if 'climate' in zone_list:
    zone_dict["climate_domain"] = f"{cn.climate_domain_path}{tile_id}_{cn.climate_domain_pattern}.tif"
if 'ecozone' in zone_list:
    zone_dict["continent_ecozone"] = f"{cn.continent_ecozone_path}{tile_id}_{cn.continent_ecozone_pattern}.tif"
if 'ifl_primary' in zone_list:
    zone_dict["ifl_primary"] = f"{cn.ifl_primary_path}{tile_id}_{cn.ifl_primary_pattern}.tif"
if 'drivers' in zone_list:
    zone_dict["drivers"] = f"{cn.drivers_path}{tile_id}_{cn.drivers_pattern}.tif"

zone_dict

{'ipcc_class_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/00N_000W__IPCC_classes_2000.tif',
 'climate_domain': 's3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/00N_000W_fao_ecozones_bor_tem_tro_processed.tif',
 'continent_ecozone': 's3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/00N_000W_fao_ecozones_continents_processed.tif'}

<font size="4">Stat Inputs (Quantitative)</font> 

In [5]:
# Maps each requested stat (quantitative) layer to the S3 path of its placeholder tile.
stat_dict = {}

#Adding pixel area so total carbon (MgC) can be calculated by multiplying pixel values (MgC per ha) by pixel areas (square meters later converted to hectares)
stat_dict['pixel_area_m'] = "s3://gfw2-data/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/geotiff/"


def _afolu_output_tile_path(layer_pattern, year, date):
    """Return the S3 path of the placeholder tile for one model output layer, year and date folder."""
    return f"{cn.outputs_path}{layer_pattern}/{year}/40000_pixels/{date}/{tile_id}__{layer_pattern}_{year}.tif"


# Names of the constants_and_names (cn) attributes behind each layer.
# They are looked up with getattr only when a layer is actually requested, so attributes
# for layers that are not requested are never touched.
density_2000_cn_names = {
    'agc': 'AGC_density_path_part',
    'bgc': 'BGC_density_path_part',
    'deadwood': 'deadwood_c_density_path_part',
    'litter': 'litter_c_density_path_part',
}
pool_cn_prefixes = {'agc': 'agc', 'bgc': 'bgc', 'deadwood': 'deadwood_c', 'litter': 'litter_c'}
flux_cn_suffixes = {'emissions': 'gross_emis', 'removals': 'gross_removals', 'net_flux': 'net_flux'}

for year in year_list:
    #Adding 2000 carbon densities for user-specified carbon pools
    if year == '2000':
        for pool, cn_name in density_2000_cn_names.items():
            if pool in stat_list:
                stat_dict[f"{pool}_density_MgC_ha_2000"] = _afolu_output_tile_path(getattr(cn, cn_name), year, cn.carbon_pool_2000_date)
        if 'soil' in stat_list:
            # rstrip avoids a double slash when soil_c_2000_path already ends with "/"
            stat_dict["soil_density_MgC_ha_2000"] = f"{cn.soil_c_2000_path.rstrip('/')}/{tile_id}__{cn.soil_c_2000_pattern}.tif"
    #Adding fluxes for each time interval for user-specified carbon pools
    else:
        for flux, cn_suffix in flux_cn_suffixes.items():
            if flux not in flux_list:
                continue
            for pool, cn_prefix in pool_cn_prefixes.items():
                if pool in stat_list:
                    # e.g. cn.agc_gross_emis_pattern, cn.litter_c_net_flux_pattern
                    layer_pattern = getattr(cn, f"{cn_prefix}_{cn_suffix}_pattern")
                    stat_dict[f"{pool}_{flux}_MgC_ha_{year}"] = _afolu_output_tile_path(layer_pattern, year, flux_calc_date)
            #TODO: Add soil for each flux type

#Note: All flux outputs are invalid: currently 1000 pixels (not 40000), multiple tiffs per tile, additional time string in tile_id+pattern 
stat_dict

{'pixel_area_m': 's3://gfw2-data/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/geotiff/',
 'agc_density_MgC_ha_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240821/00N_000W__AGC_density_MgC_ha_2000.tif',
 'bgc_density_MgC_ha_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240821/00N_000W__BGC_density_MgC_ha_2000.tif',
 'deadwood_density_MgC_ha_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240821/00N_000W__deadwood_C_density_MgC_ha_2000.tif',
 'litter_density_MgC_ha_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240821/00N_000W__litter_C_density_MgC_ha_2000.tif',
 'soil_density_MgC_ha_2000': 's3://gfw2-data/climate/carbon_model/carbon_pools/soil_carbon/intermediate_full_extent/standard/20231108//00N_000W__soil_C_full_extent_2000_Mg_C_ha.tif'}

<font size="4">Adding datatype to download dictionaries</font> 

In [6]:
# Look up the first tile found in S3 for every input in both data dictionaries,
# so that the data type of each input can be determined from a tile that exists
zone_first_tiles, stat_first_tiles = (
    uu.first_file_name_in_s3_folder(input_dict) for input_dict in (zone_dict, stat_dict)
)

# Build download dictionaries whose values also carry the data type of each input
zone_dict_with_data_types, stat_dict_with_data_types = (
    uu.add_file_type_to_dict(first_tiles) for first_tiles in (zone_first_tiles, stat_first_tiles)
)

# Print both data dictionaries with their data types, separated by a blank line
data_type_reports = (
    ("Zone data dictionary with data types:", zone_dict_with_data_types),
    ("Stat data dictionary with data types:", stat_dict_with_data_types),
)
for report_number, (heading, typed_dict) in enumerate(data_type_reports):
    if report_number > 0:
        print("")
    print(heading)
    for key, value in typed_dict.items():
        print(f"{key}: {value}")

#NOTE: uu.check_for_tile() doesn't work if there is not a second item (ie data_type) in the download dictionary so you have to use uu.add_file_type_to_dict
#NOTE: uu.first_file_name_in_s3_folder overwrites test tile_id in the notebook options chunk w/ first tile in s3

Zone data dictionary with data types:
ipcc_class_2000: ['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/00N_010E__IPCC_classes_2000.tif', 'Byte']
climate_domain: ['s3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/00N_010E_fao_ecozones_bor_tem_tro_processed.tif', 'Int16']
continent_ecozone: ['s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/00N_010E_fao_ecozones_continents_processed.tif', 'Int16']

Stat data dictionary with data types:
pixel_area_m: ['s3://gfw2-data/umd_area_2013/v1.10/raster/epsg-4326/10/40000/area_m/geotiff/00N_000E.tif', 'Float32']
agc_density_MgC_ha_2000: ['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240821/00N_010E__AGC_density_MgC_ha_2000.tif', 'Float32']
bgc_density_MgC_ha_2000: ['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels

<font size="5">Calculate zonal stats</font> 

In [9]:
# Run flags
no_upload = True
is_final = False   # switches between print and log

# Master list that collects the zonal stats data frame of each layer in each tile_id
zonal_stats_dfs = list()

# Hard-coded test chunk. The full-tile alternative (currently disabled) would loop over
# tile_id_list and take the bounds from uu.get_10x10_tile_bounds(tile_id).
bounds = (-60, -10, -59.75, -9.75)
chunk_length_pixels = uu.calc_chunk_length_pixels(bounds)

In [17]:
def calculate_tile_zonal_stats(bounds, zone_dict_with_data_types, stat_dict_with_data_types, zonal_stats_dfs, is_final, no_upload):
    """
    Download the zone and stat layers for one chunk and combine the zone layers into one zone array.

    Work in progress: the per-pixel carbon calculation (step 3) and the zonal stats themselves
    are not implemented in this function yet, so zonal_stats_dfs and no_upload are accepted
    but not used.

    Parameters:
    - bounds: bounds of the chunk to process; bounds[0] and bounds[3] are used to look up the tile_id.
    - zone_dict_with_data_types: download dictionary of the zone layers (with data types).
    - stat_dict_with_data_types: download dictionary of the stat layers (with data types).
    - zonal_stats_dfs: master list of zonal stats data frames (currently unused here).
    - is_final: passed through to the utility functions (print vs log).
    - no_upload: currently unused here.

    Returns:
    - the combined zone array returned by uu.combine_zone_layers, or a "Skipped ..." message
      string when the tile exists for neither the zone nor the stat inputs.
    """
    import time  # local import: only needed to timestamp the skip message

    logger = lu.setup_logging()

    tile_id = uu.xy_to_tile_id(bounds[0], bounds[3])
    chunk_length_pixels = uu.calc_chunk_length_pixels(bounds)

    #STEP 1: DOWNLOAD DATA
    # Replace tile_id in dictionaries with data_types
    updated_zone_dict = uu.replace_tile_id_in_dict(zone_dict_with_data_types, tile_id)
    print(f"Zone data dictionary for {tile_id}:")
    for key, value in updated_zone_dict.items():
        print(f"{key}: {value}")
    print("")

    updated_stat_dict = uu.replace_tile_id_in_dict(stat_dict_with_data_types, tile_id)
    print(f"Stat data dictionary for {tile_id}:")
    for key, value in updated_stat_dict.items():
        print(f"{key}: {value}")
    print("")

    # Check that tile exists in zone and stat data dictionaries
    zone_tile_exists = uu.check_for_tile(updated_zone_dict, is_final, logger)
    stat_tile_exists = uu.check_for_tile(updated_stat_dict, is_final, logger)

    if not zone_tile_exists and not stat_tile_exists:
        # Timestamp uses the same layout as the timestamps in the notebook's log lines
        # (e.g. 20241024_02_42_29); the original call to timestr() was not defined in this notebook.
        return f"Skipped {tile_id}: data does not exist for any inputs: {time.strftime('%Y%m%d_%H_%M_%S')}"

    # Download zone and stat data
    zone_futures = uu.prepare_to_download_chunk(bounds, updated_zone_dict, chunk_length_pixels, is_final, logger)
    print(f"Dask futures for zone data: {zone_futures}")

    stat_futures = uu.prepare_to_download_chunk(bounds, updated_stat_dict, chunk_length_pixels, is_final, logger)
    print(f"Dask futures for stat data: {stat_futures}")
    print("")

    # Dictionaries that store the downloaded data, keyed by layer name
    tile_zone_dict = {}
    tile_stat_dict = {}

    # Waits for requests to come back with zone data from S3
    # (each futures dictionary maps a future to its layer name)
    for zone_future in concurrent.futures.as_completed(zone_futures):
        zone_layer = zone_futures[zone_future]
        tile_zone_dict[zone_layer] = zone_future.result()

    # Waits for requests to come back with stat data from S3
    for stat_future in concurrent.futures.as_completed(stat_futures):
        stat_layer = stat_futures[stat_future]
        layer = stat_future.result()

        # Mask nodata values using rioxarray's rio.nodata attribute.
        # Results without a `rio` attribute are stored unmasked.
        if hasattr(layer, 'rio'):
            nodata_value = layer.rio.nodata
            if nodata_value is not None:
                # Mask nodata values in the array
                layer = layer.where(layer != nodata_value)

        # Store the (possibly masked) layer
        tile_stat_dict[stat_layer] = layer

    # Print data type and dimensions for each zone array
    print(f"Zone layers in {tile_id}:" )
    for layer, array in tile_zone_dict.items():
        print(f"Data type in layer '{layer}': {array.dtype}")
        print(f"Dimensions in layer '{layer}': {array.shape}")
    print("")

    # Print data type and dimensions for each stat array
    print(f"Stat layers in {tile_id}:" )
    for layer, array in tile_stat_dict.items():
        print(f"Data type in layer '{layer}': {array.dtype}")
        print(f"Dimensions in layer '{layer}': {array.shape}")
    print("")

    #STEP 2: BIT SHIFTING TO GET UNIQUE ZONE ID FOR ALL ZONE LAYERS

    # Sort zone layers by name to ensure consistent layer order.
    # This is a list of (layer name, array) pairs, not a dictionary.
    sorted_zone_layers = sorted(tile_zone_dict.items())

    # Combine the zone layers dynamically
    combined_zone_array = uu.combine_zone_layers(sorted_zone_layers)

    #STEP 3: CALCULATE PER-PIXEL CARBON VALUES BY MULTIPLYING DENSITIES/ FLUXES (MGC PER HA) BY PIXEL AREAS (SQUARE METERS CONVERTED TO HA)
    #TODO: not implemented in this function yet

    return combined_zone_array

In [None]:
# calculate_tile_zonal_stats takes zonal_stats_dfs as its fourth positional argument;
# leaving it out shifts is_final/no_upload by one position and raises a TypeError.
combined_zone_array = calculate_tile_zonal_stats(bounds, zone_dict_with_data_types, stat_dict_with_data_types, zonal_stats_dfs, is_final, no_upload)
combined_zone_array

In [18]:
# Unrolled (cell-level) version of calculate_tile_zonal_stats, kept at top level so that
# intermediate variables can be inspected while the function is being developed.
# It leaves tile_stat_dict, total_mgc_dict, combined_zone_result and the loop variable `df`
# in the notebook namespace, and appends to zonal_stats_dfs each time the cell is run.
#def calculate_tile_zonal_stats(bounds, zone_dict_with_data_types, stat_dict_with_data_types, is_final, no_upload):

logger = lu.setup_logging()

tile_id = uu.xy_to_tile_id(bounds[0], bounds[3])
chunk_length_pixels = uu.calc_chunk_length_pixels(bounds)

#STEP 1: DOWNLOAD DATA
# Replace tile_id in dictionaries with data_types
updated_zone_dict = uu.replace_tile_id_in_dict(zone_dict_with_data_types, tile_id)
# print(f"Zone data dictionary for {tile_id}:")
# for key, value in updated_zone_dict.items():
#     print(f"{key}: {value}")
# print("")

updated_stat_dict = uu.replace_tile_id_in_dict(stat_dict_with_data_types, tile_id)
# print(f"Stat data dictionary for {tile_id}:")
# for key, value in updated_stat_dict.items():
#     print(f"{key}: {value}")
# print("")


# Check that tile exists in zone and stat data dictionaries
# (the results are not acted on in this cell because the early return below is disabled)
zone_tile_exists = uu.check_for_tile(updated_zone_dict, is_final, logger)
stat_tile_exists = uu.check_for_tile(updated_stat_dict, is_final, logger)

# if not zone_tile_exists and not stat_tile_exists:
#     return f"Skipped {tile_id}: data does not exist for any inputs: {timestr()}"


# Download zone and stat data
# Each call returns a dictionary that maps a future to its layer name
zone_futures = uu.prepare_to_download_chunk(bounds, updated_zone_dict, chunk_length_pixels, is_final, logger)
print(f"Dask futures for zone data: {zone_futures}")

stat_futures = uu.prepare_to_download_chunk(bounds, updated_stat_dict, chunk_length_pixels, is_final, logger)
print(f"Dask futures for stat data: {stat_futures}")
print("")

# Dictionaries that store the downloaded data, keyed by layer name
# (tile_zone_dict_unique_values is not used further in this cell)
tile_zone_dict = {}
tile_zone_dict_unique_values = {}
tile_stat_dict = {}

# Waits for requests to come back with zone data from S3
for zone_future in concurrent.futures.as_completed(zone_futures):
    zone_layer = zone_futures[zone_future]
    tile_zone_dict[zone_layer] = zone_future.result()

# Waits for requests to come back with stat data from S3
for stat_future in concurrent.futures.as_completed(stat_futures):
    stat_layer = stat_futures[stat_future]
    layer = stat_future.result()

    # Mask nodata values using rioxarray's rio.nodata attribute
    # Results without a `rio` attribute are stored unmasked
    # NOTE(review): the printed futures report "returned ndarray", so this branch may never run -- confirm
    if hasattr(layer, 'rio'):
        nodata_value = layer.rio.nodata
        if nodata_value is not None:
            # Mask nodata values in the array
            layer = layer.where(layer != nodata_value)

    # Store the (possibly masked) layer
    tile_stat_dict[stat_layer] = layer

# # Print data type and dimensions for each zone array
# print(f"Zone layers in {tile_id}:" )
# for layer, array in tile_zone_dict.items():
#     print(f"Data type in layer '{layer}': {array.dtype}")
#     print(f"Dimensions in layer '{layer}': {array.shape}")
# print("")

# # Print data type and dimensions for each stat array
# print(f"Stat layers in {tile_id}:" )
# for layer, array in tile_stat_dict.items():
#     print(f"Data type in layer '{layer}': {array.dtype}")
#     print(f"Dimensions in layer '{layer}': {array.shape}")
# print("")

#STEP 2: BIT SHIFTING TO GET UNIQUE ZONE ID FOR ALL ZONE LAYERS

# Sort zone_dict items to ensure consistent layer order
# From here on tile_zone_dict is a sorted list of (layer name, array) pairs, not a dictionary
tile_zone_dict = sorted(tile_zone_dict.items())

# Combine the zone layers dynamically
combined_zone_array = uu.combine_zone_layers(tile_zone_dict)

#STEP 3: CALCULATE PER-PIXEL CARBON VALUES BY MULTIPLYING DENSITIES/ FLUXES (MGC PER HA) BY PIXEL AREAS (SQUARE METERS CONVERTED TO HA)
# Convert all inputs in stats layers to float32, convert to a numba dictionary,
# and calculate the total MgC layers using the pixel area layer
tile_stat_dict= uu.to_numpy_type(tile_stat_dict, check_type=np.float32)
numba_stat_dict = uu.to_numba_dict(tile_stat_dict)
total_mgc_dict = uu.calculate_total_mgc(numba_stat_dict, pixel_area_name="pixel_area_m")

#STEP 4: CALCULATE ZONAL STATS
# Call .compute() on combined_zone_array and convert the result to an xr DataArray for the zonal_stats function
combined_zone_result =combined_zone_array.compute()
combined_zone_result = xr.DataArray(combined_zone_result)  

# Submit zonal stats futures for each layer in the total_mgc_dict
zonal_stats_futures = {}

for layer_name, layer_data in total_mgc_dict.items():

    # Ensure layer_data is an xr DataArray for zonal_stats function
    layer_data = xr.DataArray(layer_data) 

    # Submit to the Dask client so the layers are processed in parallel
    zonal_stats_future = client.submit(zonal_stats, zones=combined_zone_result, values=layer_data, stats_funcs=['count', 'sum', 'min', 'max'])
    zonal_stats_futures[layer_name] = zonal_stats_future

# Gather the results from the futures and convert them to DataFrames
for layer_name, future in zonal_stats_futures.items():
    stats = future.result()

    # Convert the stats result to a DataFrame for this layer
    df = pd.DataFrame(stats)
    df['tile_id'] = tile_id  # Add a column for the tile_id
    df['layer'] = layer_name  # Add a column for the layer name

    # Append this DataFrame to the master list
    zonal_stats_dfs.append(df)


#return combined_zone_array

flm: Tile id 00N_060W exists for some inputs. Proceeding: 20241024_02_42_29 
flm: Tile id 00N_060W exists for some inputs. Proceeding: 20241024_02_42_29 
flm: Requesting data in chunk -60_-10_-60_-10 in 00N_060W: 20241024_02_42_29
Dask futures for zone data: {<Future at 0x7fe893569150 state=finished returned ndarray>: 'ipcc_class_2000', <Future at 0x7fe8a169e920 state=finished returned ndarray>: 'climate_domain', <Future at 0x7fe8a1026c50 state=finished returned ndarray>: 'continent_ecozone'}
flm: Requesting data in chunk -60_-10_-60_-10 in 00N_060W: 20241024_02_42_41
Dask futures for stat data: {<Future at 0x7fe89356b250 state=finished returned ndarray>: 'pixel_area_m', <Future at 0x7fe8a01e3ac0 state=finished returned ndarray>: 'agc_density_MgC_ha_2000', <Future at 0x7fe8a1f381f0 state=finished returned ndarray>: 'bgc_density_MgC_ha_2000', <Future at 0x7fe893aee920 state=finished returned ndarray>: 'deadwood_density_MgC_ha_2000', <Future at 0x7fe893900280 state=finished returned ndar

In [19]:
# Inspect the list of per-layer zonal stats data frames collected so far
zonal_stats_dfs

[    zone     count           sum  min       max   tile_id  \
 0   8137  985587.0  3.208284e+06  0.0  9.837257  00N_060W   
 1  16329    2787.0  6.130190e+03  0.0  8.907089  00N_060W   
 2  24521   11626.0  7.859305e+03  0.0  6.413132  00N_060W   
 
                   layer  
 0  bgc_density_MgC_2000  
 1  bgc_density_MgC_2000  
 2  bgc_density_MgC_2000  ,
     zone     count           sum       min       max   tile_id  \
 0   8137  985587.0  2.683588e+06  2.047649  4.168651  00N_060W   
 1  16329    2787.0  7.788658e+03  2.350703  3.260094  00N_060W   
 2  24521   11626.0  3.277230e+04  2.275152  3.866907  00N_060W   
 
                    layer  
 0  soil_density_MgC_2000  
 1  soil_density_MgC_2000  
 2  soil_density_MgC_2000  ,
     zone     count            sum  min       max   tile_id  \
 0   8137  985587.0  692457.187500  0.0  0.876956  00N_060W   
 1  16329    2787.0    1242.380615  0.0  0.825530  00N_060W   
 2  24521   11626.0    1711.706543  0.0  0.780613  00N_060W   
 
    

In [None]:
# Every layer's data frame is already appended to zonal_stats_dfs inside the zonal stats loop.
# `df` at this point is only the last data frame left over from that loop, so appending it
# again would duplicate that layer's rows in the final table.

# Concatenate all data frames into one dataframe
final_df = pd.concat(zonal_stats_dfs)

In [None]:
# Scratch cell: repeat the per-pixel carbon calculation on its own to inspect the result.
# 1) convert the stat layers to float32, 2) convert to a numba dictionary,
# 3) calculate the total MgC layers using the pixel area layer
tile_stat_dict = uu.to_numpy_type(
    tile_stat_dict,
    check_type=np.float32,
)
numba_stat_dict = uu.to_numba_dict(tile_stat_dict)
total_mgc_dict = uu.calculate_total_mgc(
    numba_stat_dict,
    pixel_area_name="pixel_area_m",
)
total_mgc_dict

In [None]:
# # Compute the final result from the combined Dask array
# combined_zone_result = combined_zone_array.compute()

# # Print the unique values in the combined result
# unique_values_combined_zones = np.unique(combined_zone_result)
# print("Unique values in combined result:")
# print(unique_values_combined_zones)

In [None]:
import numpy as np
from numba import jit

# JIT compiled function (numba nopython mode)
@jit(nopython=True)
def calculate_total_mgc(agc, bgc, deadwood, litter, pixel_area):
    """
    Multiply each carbon density layer by the pixel area (converted to hectares), pixel by pixel.

    Parameters:
    - agc, bgc, deadwood, litter: 2-D carbon density arrays (indexed as [i, j]).
    - pixel_area: 2-D array of pixel areas in square meters (divided by 10000.0 to get hectares).

    All inputs are assumed to have the same shape as agc; only agc's shape drives the loops.

    Returns:
    - a dictionary with the keys 'agc_total_MgC', 'bgc_total_MgC', 'deadwood_total_MgC',
      'litter_total_MgC' and 'area_ha'.
    """
    # Allocate the outputs: each one takes the shape and dtype of the array passed to zeros_like
    total_mgc_agc = np.zeros_like(agc)
    total_mgc_bgc = np.zeros_like(bgc)
    total_mgc_deadwood = np.zeros_like(deadwood)
    total_mgc_litter = np.zeros_like(litter)
    # NOTE(review): area_ha is allocated like litter, not like pixel_area -- confirm this is intended
    area_ha = np.zeros_like(litter)
    
    # Loop over each pixel
    for i in range(agc.shape[0]):
        for j in range(agc.shape[1]):
            # Convert pixel_area from square meters to hectares
            square_meters_to_hectares = 10000.0
            area_in_hectares = pixel_area[i, j] / square_meters_to_hectares
            
            # Calculate total MgC for each density type
            total_mgc_agc[i, j] = agc[i, j] * area_in_hectares
            total_mgc_bgc[i, j] = bgc[i, j] * area_in_hectares
            total_mgc_deadwood[i, j] = deadwood[i, j] * area_in_hectares
            total_mgc_litter[i, j] = litter[i, j] * area_in_hectares
            area_ha[i,j] = area_in_hectares
    
    # Return as a dictionary
    return {
        'agc_total_MgC': total_mgc_agc,
        'bgc_total_MgC': total_mgc_bgc,
        'deadwood_total_MgC': total_mgc_deadwood,
        'litter_total_MgC': total_mgc_litter, 
        'area_ha': area_ha
    }

# Call the JIT-compiled function and return a dictionary.
# The keys must match the layer names used in the stat data dictionary
# (e.g. 'agc_density_MgC_ha_2000', 'pixel_area_m'); the shorter names used before raise a KeyError.
total_mgc_dict = calculate_total_mgc(
    tile_stat_dict['agc_density_MgC_ha_2000'],
    tile_stat_dict['bgc_density_MgC_ha_2000'],
    tile_stat_dict['deadwood_density_MgC_ha_2000'],
    tile_stat_dict['litter_density_MgC_ha_2000'],
    tile_stat_dict['pixel_area_m'],
)

total_mgc_dict

<font size="4">Calculate Zonal Stats</font> 

In [None]:
import pandas as pd
import numpy as np
from dask import delayed
from dask.distributed import Client
from xrspatial import zonal_stats


# #Function to compute zonal stats using Dask futures and return a pandas DataFrame
# def compute_zonal_stats_to_dataframe(layers, zone):
#     zonal_stats_futures = {}

#     # Iterate over the layers in the total_mgc_dict
#     for layer_name, layer_data in layers.items():
        
#         # Use Dask futures to compute zonal statistics in parallel
#         future = client.submit(zonal_stats, zones=zone, values=layer_data, stats_funcs=['mean', 'sum', 'min', 'max'])
#         zonal_stats_futures[layer_name] = future

#     # Create an empty list to hold all data frames for each layer
#     all_layer_dfs = []

#     # Gather the results from the futures and convert them to DataFrames
#     for layer_name, future in zonal_stats_futures.items():
#         stats = future.result()

#         # Convert the stats dictionary to a DataFrame for this layer
#         df = pd.DataFrame(stats)
#         df['layer'] = layer_name  # Add a column for the layer name

#         # Append this DataFrame to the list
#         all_layer_dfs.append(df)

#     # Concatenate all DataFrames into one DataFrame
#     final_df = pd.concat(all_layer_dfs)

#     return final_df

def compute_zonal_stats_to_dataframe(total_mgc_dict, combined_zone_result):
    """
    Compute zonal statistics for every layer and return them as one pandas DataFrame.

    One zonal_stats task per layer is submitted to the notebook's Dask `client`, so the
    layers are processed in parallel.

    Parameters:
    - total_mgc_dict: dictionary mapping layer names to the value arrays to summarise.
    - combined_zone_result: array of zone ids; wrapped in an xr DataArray for zonal_stats.

    Returns:
    - a pandas DataFrame with the 'count', 'sum', 'min' and 'max' statistics of every layer
      plus a 'layer' column holding the layer name. An empty DataFrame is returned when
      total_mgc_dict has no layers.
    """
    # pd.concat raises a ValueError on an empty list, so handle "no layers" explicitly
    if len(total_mgc_dict) == 0:
        return pd.DataFrame()

    # Ensure that the zones are an xr DataArray for zonal_stats function
    zones = xr.DataArray(combined_zone_result)

    # Submit all layers first so they run in parallel; results are collected afterwards
    zonal_stats_futures = {}
    for layer_name, layer_data in total_mgc_dict.items():
        zonal_stats_futures[layer_name] = client.submit(
            zonal_stats,
            zones=zones,
            values=xr.DataArray(layer_data),  # zonal_stats expects an xr DataArray
            stats_funcs=['count', 'sum', 'min', 'max'],
        )

    # Gather the results from the futures and convert them to DataFrames
    all_layer_dfs = []
    for layer_name, future in zonal_stats_futures.items():
        layer_df = pd.DataFrame(future.result())
        layer_df['layer'] = layer_name  # Add a column for the layer name
        all_layer_dfs.append(layer_df)

    # Concatenate all DataFrames into one DataFrame
    return pd.concat(all_layer_dfs)

# Optional pre-step, currently disabled: force the zones into an int16 numpy array
#combined_zone_result = np.array(combined_zone_result, dtype=np.int16, copy=True)

# Build the zonal stats table for all layers
zonal_stats_df = compute_zonal_stats_to_dataframe(
    total_mgc_dict,
    combined_zone_result,
)

# Preview the first five rows of the table
print(zonal_stats_df.head(n=5))

In [None]:
import xarray as xr
import dask.array as da
from xrspatial import zonal_stats  

# Wrap the zones in a Dask-backed xarray.DataArray under its own name.
# combined_zone_result itself is left untouched: rebinding it to its own Dask version made
# the cell fail on a second run, because da.from_array does not accept a Dask array.
combined_zone_da = xr.DataArray(da.from_array(combined_zone_result))

# Function to process all zonal stats calls for a tile
def run_all_zonal_stats(zone, layer, layer_name):
    """
    Run zonal_stats for one layer and label the statistics with the layer name.

    Parameters:
    - zone: the zones passed to zonal_stats.
    - layer: the value array of the layer; wrapped in an xarray.DataArray before use.
    - layer_name: prefix for the statistic columns.

    Returns:
    - the zonal_stats table indexed by "zone", with the columns renamed to
      "<layer_name>_sum", "<layer_name>_count", "<layer_name>_min" and "<layer_name>_max".
    """
    stat_names = ["sum", "count", "min", "max"]

    # Prefix every statistic column with the layer name
    prefixed_columns = {stat_name: f"{layer_name}_{stat_name}" for stat_name in stat_names}

    # zonal_stats is given the layer as an xarray.DataArray
    stats_table = zonal_stats(zones=zone, values=xr.DataArray(layer), stats_funcs=stat_names)
    stats_table = stats_table.set_index("zone")
    return stats_table.rename(columns=prefixed_columns)

# For loop to run all zonal stats per tile.
# The list has to be named `results`: it was created as `futures` before, so the
# append below raised a NameError.
results = []

for layer, array in tile_stat_dict.items():
    # Ensure each layer is a Dask array; run_all_zonal_stats wraps it in an xarray.DataArray
    layer_dask_array = da.from_array(array)
    results.append(run_all_zonal_stats(combined_zone_da, layer_dask_array, layer))

print(results)
#TODO change to use future instead

In [None]:
# Debug inspection of the zone raster: its repr, dtype and shape
# NOTE(review): presumably used to confirm the zone and value arrays line up before zonal stats - confirm
print(combined_zone_result)
print(combined_zone_result.dtype)
print(combined_zone_result.shape)

# Same inspection for every value layer in tile_stat_dict (name, repr, dtype, shape)
for layer, array in tile_stat_dict.items():
    print(layer)
    print(array)
    print(array.dtype)
    print(array.shape)

<font size="4">Reverse bit shifting to get original layer values for combined zone inputs</font> 

In [None]:
#TEST Example pandas DataFrame holding the bit-shifted (combined) zone values, one per row
import pandas as pd

# Create a pandas DataFrame with a single column built from unique_values_combined_zones
zone_df = pd.DataFrame({'bit_shifted_values': unique_values_combined_zones})

# Print the pandas DataFrame and its column index to verify
print(zone_df)
print(zone_df.columns)

In [None]:
# Function to reverse the bit-shifting process
def reverse_bit_shifting(df, column_name, sorted_layers):
    """
    Reverse the bit-shifting operation and extract the original values for each layer.

    The bit width of each layer is recomputed from that layer's maximum value
    (via calculate_bits_needed). The first layer is read from the least-significant
    bits and each following layer from the bits directly above the previous one.
    NOTE(review): this layout and the layer order must match the cell that packed the
    values - confirm against the forward bit-shifting code.

    Parameters:
    - df: the Pandas DataFrame containing the combined bit-shifted values.
      It is modified in place (one column is added per layer).
    - column_name: the name of the column that contains the bit-shifted values.
    - sorted_layers: the original layers in packing order (sorted alphabetically by the
      caller), given either as a mapping {layer_name: array} or as an iterable of
      (layer_name, array) pairs. One-shot iterators are accepted.

    Returns:
    - df: the same Pandas DataFrame, with a new column for each original layer.
      Columns are added from the last layer to the first.
    """

    # Normalise the input to a concrete list of (name, array) pairs exactly once.
    # Iterating a dict directly yields only its keys, and a one-shot iterator would be
    # exhausted by the first pass below, leaving no layer names for the decoding pass.
    if hasattr(sorted_layers, "items"):
        layer_pairs = list(sorted_layers.items())
    else:
        layer_pairs = list(sorted_layers)

    # Calculate bits_needed_per_layer based on the max value of each layer's array
    bits_needed_per_layer = []

    for _, layer_array in layer_pairs:
        # Ensure the layer is a Dask array and calculate max value
        layer_array = ensure_dask_array(layer_array)
        max_value = da.max(layer_array).compute()  # Compute the maximum value

        # Determine the number of bits needed to represent this layer
        bits_needed_per_layer.append(calculate_bits_needed(max_value))

    total_shift = sum(bits_needed_per_layer)  # Start with the total bits used

    # Reverse bit-shifting: walk the layers from last to first, peeling off each
    # layer's bit field from the top of the remaining range
    for (layer, _), bits_needed in zip(reversed(layer_pairs), reversed(bits_needed_per_layer)):
        total_shift -= bits_needed
        # Create a mask for extracting the current layer
        mask = (1 << bits_needed) - 1
        # Shift right and apply the mask to extract the current layer's values.
        # shift/mask are bound as defaults so the lambda does not depend on loop state.
        df[layer] = df[column_name].apply(
            lambda x, shift=total_shift, field_mask=mask: (x >> shift) & field_mask
        )

    return df

# Decode the packed zone values back into one column per zone layer
parse_zone_data = reverse_bit_shifting(
    df=zone_df,
    column_name='bit_shifted_values',
    sorted_layers=tile_zone_dict_sorted,
)

# Show the decoded DataFrame to check the per-layer columns
print(parse_zone_data)

<font size="5">Shutting down cloud and local clusters</font> 

In [None]:
# Tear down whichever cluster matches the cluster_type chosen in the options cell
if cluster_type == 'local':
    client.shutdown()
    print('local client shut down')
elif cluster_type in ('full', 'test'):
    coiled_cluster.shutdown()
    print('coiled cluster shut down')
else:
    # Unrecognised option: nothing is shut down, so tell the user what the valid values are
    print("No clusters were shut down. Check that cluster_type is set to one of the following: 'full', 'test', 'local'")

# coiled_client.restart() 

In [None]:
######################################################################

<font size="5">Zone Inputs (Categorical)</font> 

In [None]:
# # gadm boundary tiles
# iso_gadm_uri = 's3://gfw-data-lake/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/geotiff/'
# iso_gadm_pattern = ''

# climate_domain_path = "s3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/"
# climate_domain_pattern = "fao_ecozones_bor_tem_tro_processed"

# continent_ecozone_path = "s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/"
# continent_ecozone_pattern = "fao_ecozones_continents_processed"



# # MODEL OUTPUT SHAPEFILES
# # # IPCC reporting classes
# # ipcc_class_2000 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2000__10x10.shp'
# # ipcc_class_2005 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2005/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2005__10x10.shp'
# # ipcc_class_2010 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2010/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2010__10x10.shp'
# # ipcc_class_2015 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2015/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2015__10x10.shp'
# # ipcc_class_2020 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2020/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2020__10x10.shp'

# # # IPCC change classes
# # ipcc_change_2005 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2000_2005__10x10.shp'
# # ipcc_change_2010 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2005_2010__10x10.shp'
# # ipcc_change_2015 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2010_2015/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2010_2015__10x10.shp'
# # ipcc_change_2020 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2015_2020__10x10.shp'

# # 2000 carbon densities 
# AGC_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_AGC_2000__global__10x10.shp'
# BGC_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_BGC_2000__global__10x10.shp'
# deadwood_C_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_deadwood_C_2000__global__10x10.shp'
# litter_C_density = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_litter_C_2000__global__10x10.shp'

#TODO: Include land state nodes? --> no shapefiles

In [None]:
# #rasterize vector data
# name path column

# shapefile = gpd.read_file(path)
# utm = shapefile.to_crs("EPSG:4326")
# geom = utm[['geometry', column]].values.tolist()

In [None]:
# # lazily "computing" tile sets
# iso_gadm = get_tile_dataset(iso_gadm_uri, tile, "iso")
# iso_gadm = iso_gadm.where(iso_gadm != 9999)