In [34]:
# importing dependencies for notebook
import concurrent.futures
from pathlib import Path

import rioxarray
import rasterio
import xarray as xr
import numpy as np
import pandas as pd
import pycountry
from xrspatial import zonal_stats
from xrspatial.zonal import _stats_count as xr_count
#from wri_notebooks_utils.wri_notebooks_utils import get_dask_ecs_client

# dask/parallelization libraries
import coiled
import dask
import dask.array as da
import dask.multiprocessing
import distributed
from dask.distributed import Client
from dask.distributed import Client, LocalCluster
from dask.distributed import print



<font size="5">Creating clusters</font> 

In [None]:
# # create a dask cluster and make this notebook its client
# client = get_dask_ecs_client(n_workers=100) # options: n_workers, worker_mem, worker_cpu, scheduler_mem
# client

In [None]:
# # Full cluster
# coiled_cluster = coiled.Cluster(
#     n_workers=40,
#     use_best_zone=True, 
#     compute_purchase_option="spot_with_fallback",
#     idle_timeout="10 minutes",
#     region="us-east-1",
#     name="AFOLU_flux_model", 
#     account='wri-forest-research', 
#     worker_cpu=4,
#     worker_memory = "32GiB" 
# )

# # Coiled cluster (cloud run)
# coiled_client = coiled_cluster.get_client()
# coiled_client

In [None]:
# # Test cluster
# coiled_cluster = coiled.Cluster(
#     n_workers=1,
#     use_best_zone=True, 
#     compute_purchase_option="spot_with_fallback",
#     idle_timeout="20 minutes",
#     region="us-east-1",
#     name="AFOLU_flux_model", 
#     account='wri-forest-research', 
#     worker_cpu=4,
#     worker_memory = "32GiB" 
# )

# # Coiled cluster (cloud run)
# coiled_client = coiled_cluster.get_client()
# coiled_client

In [35]:
# Local in-process client (local run). With processes=False the worker runs as threads
# inside this notebook's own process instead of in separate worker processes, so
# .compute() executes on this machine only, not on a cloud cluster.
local_client = Client(processes=False)
# Last expression of the cell: renders the client summary (dashboard link, workers, threads).
local_client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://172.17.68.74:8787/status,

0,1
Dashboard: http://172.17.68.74:8787/status,Workers: 1
Total threads: 12,Total memory: 15.47 GiB
Status: running,Using processes: False

0,1
Comm: inproc://172.17.68.74/390527/1,Workers: 1
Dashboard: http://172.17.68.74:8787/status,Total threads: 12
Started: Just now,Total memory: 15.47 GiB

0,1
Comm: inproc://172.17.68.74/390527/4,Total threads: 12
Dashboard: http://172.17.68.74:38113/status,Memory: 15.47 GiB
Nanny: None,
Local directory: /tmp/dask-scratch-space/worker-5o0935rl,Local directory: /tmp/dask-scratch-space/worker-5o0935rl


In [None]:
# # Local cluster with multiple workers
# local_cluster = LocalCluster()  
# local_client = Client(local_cluster)
# local_client

<font size="5">Shutting down cloud and local clusters</font> 

In [39]:
# coiled_client.restart() 

In [39]:
# coiled_cluster.shutdown()

In [47]:
# NOTE: run this cell by hand, and only once the analysis is finished. It sits above the
# zonal-stats cells, so a top-to-bottom "Run All" reaches it first and shuts down the
# local client created earlier before any of those computations run.
local_client.shutdown()

<font size="5">Utilities and Variables</font> 

<font size="4">Run/source utilities and variables</font> 

In [36]:
# Tile to process. Currently a single tile id rather than a list of tiles;
# it is interpolated into every input file name further down.
tile_id = "00N_060W"

In [37]:
# year list 

In [38]:
# Optional Zone Inputs
# Each input is described by two strings: the S3 folder (*_path) and the part of the
# file name that follows the tile id (*_pattern). They are joined with tile_id when
# zone_dict is built.

# GADM boundary tiles (the pattern is empty for this input)
iso_gadm_path = 's3://gfw-data-lake/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/geotiff/'
iso_gadm_pattern = ''

# Climate domain
climate_domain_path = "s3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/"
climate_domain_pattern = "fao_ecozones_bor_tem_tro_processed"

# Continent-ecozone
continent_ecozone_path = "s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/"
continent_ecozone_pattern = "fao_ecozones_continents_processed"

# IPCC basic class [for single years only (i.e. 2000, 2005, 2010, 2015, 2020)]
ipcc_class_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/"
ipcc_class_2000_pattern = "IPCC_classes_2000"

In [39]:
# Optional Stats Inputs
# Same convention as the zone inputs: an S3 folder (*_path) plus the file-name part that
# follows the tile id (*_pattern); they are joined with tile_id when stat_dict is built.

# AGC density 2000 (MgC per ha)
agc_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240821/"
agc_density_2000_pattern = "AGC_density_MgC_ha_2000"

# BGC density 2000 (MgC per ha)
bgc_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240821/"
bgc_density_2000_pattern = "BGC_density_MgC_ha_2000"

# Deadwood density 2000 (MgC per ha)
deadwood_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240821/"
deadwood_density_2000_pattern = "deadwood_C_density_MgC_ha_2000"

# Litter density 2000 (MgC per ha)
litter_density_2000_path = "s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240821/"
litter_density_2000_pattern = "litter_C_density_MgC_ha_2000"

<font size="5">Calculate Zonal Stats 2000</font> 

<font size="4">Zone Inputs (Categorical)</font> 

In [40]:
# Map every categorical zone layer name to the S3 URI of its raster for this tile.
zone_dict = {
    # "iso_gadm": f"{iso_gadm_path}{tile_id}{iso_gadm_pattern}.tif",  # disabled, see TODO below
    "climate_domain": f"{climate_domain_path}{tile_id}_{climate_domain_pattern}.tif",
    "continent_ecozone": f"{continent_ecozone_path}{tile_id}_{continent_ecozone_pattern}.tif",
    "ipcc_class_2000": f"{ipcc_class_2000_path}{tile_id}__{ipcc_class_2000_pattern}.tif",
}

print(zone_dict)
# TODO: Why is iso_gadm showing up as none?
# 9.71Gib

{'climate_domain': 's3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/00N_060W_fao_ecozones_bor_tem_tro_processed.tif', 'continent_ecozone': 's3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/00N_060W_fao_ecozones_continents_processed.tif', 'ipcc_class_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/00N_060W__IPCC_classes_2000.tif'}


<font size="4">Stat Inputs (Quantitative)</font> 

In [41]:
# Map every quantitative (stat) layer name to the S3 URI of its raster for this tile.
stat_dict = {
    "agc_density_2000": f"{agc_density_2000_path}{tile_id}__{agc_density_2000_pattern}.tif",
    "bgc_density_2000": f"{bgc_density_2000_path}{tile_id}__{bgc_density_2000_pattern}.tif",
    "deadwood_density_2000": f"{deadwood_density_2000_path}{tile_id}__{deadwood_density_2000_pattern}.tif",
    "litter_density_2000": f"{litter_density_2000_path}{tile_id}__{litter_density_2000_pattern}.tif",
}

print(stat_dict)

{'agc_density_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240821/00N_060W__AGC_density_MgC_ha_2000.tif', 'bgc_density_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240821/00N_060W__BGC_density_MgC_ha_2000.tif', 'deadwood_density_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240821/00N_060W__deadwood_C_density_MgC_ha_2000.tif', 'litter_density_2000': 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240821/00N_060W__litter_C_density_MgC_ha_2000.tif'}


<font size="4">Download Zone Inputs Layers</font> 

In [42]:
# NOTE(review): setup_logging, check_for_tile and prepare_to_download_chunk are not defined
# or imported in any visible cell of this notebook. They presumably come from the project's
# utilities and must be loaded into the kernel before this cell runs -- confirm.

# Flag handed to the helpers below; its meaning is defined by those helpers.
is_final = False
logger = setup_logging()
# A hard-coded chunk is used here in place of the bounds derived from the tile id.
# Presumably (min_x, min_y, max_x, max_y) in degrees -- TODO confirm against the helper.
#bounds = get_10x10_tile_bounds(tile_id)
bounds = (-60, -10, -59, -9)

# Check the tile against the zone inputs before requesting any data.
tile_exists = check_for_tile(zone_dict, is_final, logger)
print(tile_exists)
# if not tile_exists:
#     return f"Skipped chunk {bounds_str} because {tile_id} does not exist for any inputs: {timestr()}"

# Request the chunk for every zone layer; the result is a dict keyed by future whose
# values are the layer names (see the printed output of this cell).
zone_futures = prepare_to_download_chunk(bounds, zone_dict, is_final, logger)
print(zone_futures)

flm: Tile id 00N_060W exists for some inputs. Proceeding: 20240920_19_02_49 
True
flm: Requesting data in chunk -60_-10_-59_-9 in 00N_060W: 20240920_19_02_49
{<Future at 0x7f13180d9d80 state=finished returned ndarray>: 'climate_domain', <Future at 0x7f1303f41ff0 state=finished returned ndarray>: 'continent_ecozone', <Future at 0x7f1303fb4a60 state=finished returned ndarray>: 'ipcc_class_2000'}


In [111]:
zone_layers = {}
zone_layers_unique_values = {}

# Wait for the S3 requests to come back; futures are handled in completion order,
# so the insertion order of zone_layers is not fixed between runs.
for zone_future in concurrent.futures.as_completed(zone_futures):
    zone_layer = zone_futures[zone_future]
    zone_layers[zone_layer] = zone_future.result()

    # The futures resolve to in-memory NumPy arrays, so take the unique values with NumPy
    # directly. Wrapping the array in dask just to call .compute() embeds the whole array
    # in the task graph, which is the likely source of the "Consider scattering data ahead
    # of time" warnings. np.asarray keeps this working if a lazy array is ever returned.
    zone_layers_unique_values[zone_layer] = np.unique(np.asarray(zone_layers[zone_layer]))

# Print data type and dimensions for each zone array
for layer, array in zone_layers.items():
    print(f"Data type in layer '{layer}': {array.dtype}")
    print(f"Dimensions in layer '{layer}': {array.shape}")

# Print unique values for each zone array
for layer, unique_vals in zone_layers_unique_values.items():
    print(f"Unique values in layer {layer}: {unique_vals}")

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Data type in layer 'continent_ecozone': int16
Dimensions in layer 'continent_ecozone': (4000, 4000)
Data type in layer 'ipcc_class_2000': uint8
Dimensions in layer 'ipcc_class_2000': (4000, 4000)
Data type in layer 'climate_domain': int16
Dimensions in layer 'climate_domain': (4000, 4000)
Unique values in layer continent_ecozone: [2020]
Unique values in layer ipcc_class_2000: [1 2 3 4 5 6]
Unique values in layer climate_domain: [1]


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


<font size="4">Bit shifting to get unique zone id for all zone_layers inputs</font> 

In [113]:
import numpy as np
import dask.array as da

# Function to calculate the number of bits needed to represent the maximum value in the array
def calculate_bits_needed(max_value):
    """Return the number of bits needed to store every integer in 0..max_value.

    Parameters:
    - max_value: the largest value that must be representable. Python ints and
      NumPy integer scalars are both accepted; it must not be negative.

    Returns:
    - int: the bit width, e.g. 0 for 0, 1 for 1, 3 for 6 and 11 for 2020.

    Raises:
    - ValueError: if max_value is negative (a negative value cannot be packed
      into an unsigned bit field).
    """
    # Convert to a Python int first so that fixed-width NumPy scalars cannot overflow
    # (e.g. np.int16(32767) + 1 does not fit in int16).
    max_value = int(max_value)
    if max_value < 0:
        raise ValueError(f"max_value must be non-negative, got {max_value}")

    # int.bit_length() is exact for integers of any size. ceil(log2(max_value + 1))
    # goes through float64 and under-counts once max_value + 1 is no longer exactly
    # representable (for example 2**53).
    return max_value.bit_length()

# Convert numpy arrays to dask arrays if needed
def ensure_dask_array(array, chunks="auto"):
    """Wrap a NumPy array in a Dask array; return any other object untouched.

    Parameters:
    - array: the object to check.
    - chunks: chunk specification forwarded to da.from_array when wrapping.

    Returns:
    - a Dask array built from `array` if it is a numpy.ndarray, otherwise `array` itself.
    """
    # Guard clause: only NumPy arrays are converted
    if not isinstance(array, np.ndarray):
        return array
    return da.from_array(array, chunks=chunks)

# Ensure all layers have a consistent data type (int16) for bit-shifting
def ensure_dtype(layer_array, dtype=np.int16):
    """Return `layer_array` with the requested dtype.

    Parameters:
    - layer_array: an array exposing `.dtype` and `.astype`.
    - dtype: the target data type (int16 by default).

    Returns:
    - `layer_array` itself when it already has `dtype`, otherwise a converted copy.
    """
    # Nothing to do when the dtype already matches; hand back the same object
    if layer_array.dtype == dtype:
        return layer_array
    return layer_array.astype(dtype)

# Prepare to dynamically combine layers using bit-shifting
def combine_zone_layers(sorted_layers):
    """Pack several categorical zone layers into one array of combined zone ids.

    Every layer gets its own bit field: the first layer occupies the lowest bits and
    each following layer is shifted left past the bits used by the layers before it.
    The shifted layers are merged with bitwise OR.

    Parameters:
    - sorted_layers: an iterable of (layer_name, layer_array) pairs in the order the
      layers should be packed (lowest bits first). Arrays may be NumPy or Dask arrays.
      Values must be non-negative integers: a negative value would sign-extend when
      shifted and overwrite the fields of the other layers.

    Returns:
    - a lazy int64 Dask array holding the packed zone ids, or None if sorted_layers
      is empty.

    Raises:
    - ValueError: if the layers together need more than 63 bits and therefore cannot
      be packed into a signed 64-bit integer.
    """
    # Usable bits of a signed 64-bit integer (the sign bit is left untouched)
    max_total_bits = 63

    combined_array = None
    total_shift = 0

    for layer_name, layer_array in sorted_layers:
        # Convert to dask.array if it's a numpy array
        layer_array = ensure_dask_array(layer_array)

        # Widen to int64 before measuring and shifting. int16 leaves only 15 usable bits,
        # so a few layers are enough to overflow it silently, and casting a wider input
        # (e.g. uint16 values above 32767) down to int16 would wrap the values.
        layer_array = ensure_dtype(layer_array, dtype=np.int64)

        # The maximum value fixes how many bits this layer's field needs
        max_value = da.max(layer_array).compute()
        bits_needed = calculate_bits_needed(max_value)

        # Refuse to build an id that does not fit instead of silently corrupting it
        if total_shift + bits_needed > max_total_bits:
            raise ValueError(
                f"Cannot pack layer '{layer_name}': {total_shift + bits_needed} bits are "
                f"needed in total but only {max_total_bits} are available"
            )

        # Move the layer past the bits already taken by the previous layers
        shifted_layer = layer_array << total_shift

        # The first layer starts the result; later layers are OR-ed into it
        if combined_array is None:
            combined_array = shifted_layer
        else:
            combined_array = combined_array | shifted_layer

        # The next layer starts right after this layer's field
        total_shift += bits_needed

    return combined_array

# Order the layers by name so the bit layout is the same on every run
zone_layers_sorted = sorted(zone_layers.items())

# Pack all zone layers into one lazy array, then materialize it
combined_zone_array = combine_zone_layers(zone_layers_sorted)
combined_zone_result = combined_zone_array.compute()

# Report the distinct packed zone ids found in the result
unique_values_combined_zones = np.unique(combined_zone_result)
print("Unique values in combined result:", unique_values_combined_zones, sep="\n")

This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


Unique values in combined result:
[ 8137 12233 16329 20425 24521 28617]


<font size="4">Reverse bit shifting to get original layer values for combined zone inputs</font> 

In [107]:
# TEST: example pandas DataFrame standing in for a table of zonal stats keyed by
# bit-shifted zone id. It holds only the packed ids; no statistics are attached yet.
import pandas as pd

# Create a pandas DataFrame with one row per unique packed zone id
zone_df = pd.DataFrame({'bit_shifted_values': unique_values_combined_zones})

# Print the pandas DataFrame and its column index to verify
print(zone_df)
print(zone_df.columns)

   bit_shifted_values
0                8137
1               12233
2               16329
3               20425
4               24521
5               28617
Index(['bit_shifted_values'], dtype='object')


In [108]:
# Function to reverse the bit-shifting process
def reverse_bit_shifting(df, column_name, sorted_layers):
    """
    Reverse the bit-shifting operation and extract the original values for each layer.

    Parameters:
    - df: the Pandas DataFrame containing the combined bit-shifted values. It is modified
      in place: one column per layer is added (or overwritten if it already exists).
    - column_name: the name of the column that contains the bit-shifted values.
    - sorted_layers: an iterable of (layer_name, layer_array) pairs, in the same order that
      was used when the layers were combined (sorted alphabetically by name in this
      notebook). The arrays may be NumPy or Dask arrays; only their maximum is used.

    Returns:
    - df: the same Pandas DataFrame object, with a new column for each original layer.
    """
    layer_names = []
    bits_needed_per_layer = []

    # Recompute the width of every layer's bit field from its maximum value
    for layer_name, layer_array in sorted_layers:
        if isinstance(layer_array, np.ndarray):
            # In-memory array: take the max directly rather than shipping the whole
            # array through a dask graph just to reduce it.
            max_value = layer_array.max()
        else:
            max_value = da.max(layer_array).compute()

        layer_names.append(layer_name)
        bits_needed_per_layer.append(calculate_bits_needed(max_value))

    # Snapshot the packed ids as int64 so that wide masks and shifts cannot overflow a
    # narrow column dtype, and so that adding layer columns cannot disturb the input.
    packed_values = df[column_name].astype("int64")

    total_shift = sum(bits_needed_per_layer)  # Start with the total bits used

    # Walk the layers last-to-first, peeling each field off the top of the used bits
    for layer_name, bits_needed in zip(reversed(layer_names), reversed(bits_needed_per_layer)):
        total_shift -= bits_needed
        # Mask with `bits_needed` low bits set, isolating this layer's field
        mask = (1 << bits_needed) - 1
        # Vectorised shift-and-mask over the whole column
        df[layer_name] = (packed_values >> total_shift) & mask

    return df

# Reverse the bit-shifting. zone_layers_sorted must be the same ordered list that was passed
# to combine_zone_layers, otherwise the bit fields will not line up. The function adds the
# layer columns to zone_df itself and returns that same DataFrame.
parse_zone_data = reverse_bit_shifting(zone_df, 'bit_shifted_values', zone_layers_sorted)

# Print the Pandas DataFrame to verify the results
print(parse_zone_data)

   bit_shifted_values  ipcc_class_2000  continent_ecozone  climate_domain
0                8137                1               2020               1
1               12233                2               2020               1
2               16329                3               2020               1
3               20425                4               2020               1
4               24521                5               2020               1
5               28617                6               2020               1


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.
This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [None]:
######################################################################

<font size="5">Zone Inputs (Categorical)</font> 

In [None]:
# # gadm boundary tiles
# iso_gadm_uri = 's3://gfw-data-lake/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/geotiff/'
# iso_gadm_pattern = ''

# climate_domain_path = "s3://gfw2-data/climate/carbon_model/inputs_for_carbon_pools/processed/fao_ecozones_bor_tem_tro/20190418/"
# climate_domain_pattern = "fao_ecozones_bor_tem_tro_processed"

# continent_ecozone_path = "s3://gfw2-data/climate/carbon_model/fao_ecozones/ecozone_continent/20190116/processed/"
# continent_ecozone_pattern = "fao_ecozones_continents_processed"



# # MODEL OUTPUT SHAPEFILES
# # # IPCC reporting classes
# # ipcc_class_2000 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2000__10x10.shp'
# # ipcc_class_2005 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2005/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2005__10x10.shp'
# # ipcc_class_2010 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2010/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2010__10x10.shp'
# # ipcc_class_2015 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2015/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2015__10x10.shp'
# # ipcc_class_2020 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2020/40000_pixels/20240205/raster_footprints_IPCC_basic_classes_2020__10x10.shp'

# # # IPCC change classes
# # ipcc_change_2005 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2000_2005__10x105.shp'
# # ipcc_change_2010 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2005_2010__10x10.shp'
# # ipcc_change_2015 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2010_2015/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2010_2015__10x10.shp'
# # ipcc_change_2020 = 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/40000_pixels/20240205/raster_footprints_IPCC_basic_change_2015_2020__10x10.shp'

# # 2000 carbon densities 
# AGC_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/AGC_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_AGC_2000__global__10x10.shp'
# BGC_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/BGC_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_BGC_2000__global__10x10.shp'
# deadwood_C_density_2000 = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/deadwood_C_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_deadwood_C_2000__global__10x10.shp'
# litter_C_density = 'gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/litter_C_density_MgC_ha/2000/40000_pixels/20240729/raster_footprints_litter_C_2000__global__10x10.shp'

#TODO: Include land state nodes? --> no shapefiles

In [None]:
# #rasterize vector data
# name path column

# shapefile = gpd.read_file(path)
# utm = shapefile.to_crs("EPSG:4326")
# geom = utm[['geometry', column]].values.tolist()

In [None]:
# # lazily "computing" tile sets
# iso_gadm = get_tile_dataset(iso_gadm_uri, tile, "iso")
# iso_gadm = iso_gadm.where(iso_gadm != 9999)