In [1]:
import os

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import dask.config
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows

from numba import jit
import concurrent.futures

import boto3
import time
import math
import ctypes
import pandas as pd

<font size="6">Making cloud and local clusters</font> 

In [2]:
coiled_cluster = coiled.Cluster(
    n_workers=20,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="next_gen_forest_carbon_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    worker_memory = "32GiB" 
)

Output()

Output()

In [3]:
# Coiled cluster (cloud run)
coiled_client = coiled_cluster.get_client()
coiled_client

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-jkwdd.dask.host/Frnicu7tDR-w44To/status,

0,1
Dashboard: https://cluster-jkwdd.dask.host/Frnicu7tDR-w44To/status,Workers: 20
Total threads: 160,Total memory: 603.29 GiB

0,1
Comm: tls://10.0.94.30:8786,Workers: 20
Dashboard: http://10.0.94.30:8787/status,Total threads: 160
Started: Just now,Total memory: 603.29 GiB

0,1
Comm: tls://10.0.93.108:39973,Total threads: 8
Dashboard: http://10.0.93.108:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.93.108:39197,
Local directory: /scratch/dask-scratch-space/worker-8avx1dln,Local directory: /scratch/dask-scratch-space/worker-8avx1dln

0,1
Comm: tls://10.0.81.193:38037,Total threads: 8
Dashboard: http://10.0.81.193:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.81.193:35077,
Local directory: /scratch/dask-scratch-space/worker-rn9y9zm4,Local directory: /scratch/dask-scratch-space/worker-rn9y9zm4

0,1
Comm: tls://10.0.80.192:40469,Total threads: 8
Dashboard: http://10.0.80.192:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.80.192:38627,
Local directory: /scratch/dask-scratch-space/worker-ahfetfjo,Local directory: /scratch/dask-scratch-space/worker-ahfetfjo

0,1
Comm: tls://10.0.91.188:40745,Total threads: 8
Dashboard: http://10.0.91.188:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.91.188:34125,
Local directory: /scratch/dask-scratch-space/worker-wkwq90e4,Local directory: /scratch/dask-scratch-space/worker-wkwq90e4

0,1
Comm: tls://10.0.95.42:35759,Total threads: 8
Dashboard: http://10.0.95.42:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.95.42:44529,
Local directory: /scratch/dask-scratch-space/worker-vvddmx7q,Local directory: /scratch/dask-scratch-space/worker-vvddmx7q

0,1
Comm: tls://10.0.95.59:43755,Total threads: 8
Dashboard: http://10.0.95.59:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.95.59:39445,
Local directory: /scratch/dask-scratch-space/worker-g16wffau,Local directory: /scratch/dask-scratch-space/worker-g16wffau

0,1
Comm: tls://10.0.88.181:40791,Total threads: 8
Dashboard: http://10.0.88.181:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.88.181:42397,
Local directory: /scratch/dask-scratch-space/worker-myhiwicm,Local directory: /scratch/dask-scratch-space/worker-myhiwicm

0,1
Comm: tls://10.0.88.229:35361,Total threads: 8
Dashboard: http://10.0.88.229:8787/status,Memory: 30.15 GiB
Nanny: tls://10.0.88.229:38305,
Local directory: /scratch/dask-scratch-space/worker-fwsk82id,Local directory: /scratch/dask-scratch-space/worker-fwsk82id

0,1
Comm: tls://10.0.89.55:39525,Total threads: 8
Dashboard: http://10.0.89.55:8787/status,Memory: 30.14 GiB
Nanny: tls://10.0.89.55:37743,
Local directory: /scratch/dask-scratch-space/worker-pqedk5r0,Local directory: /scratch/dask-scratch-space/worker-pqedk5r0

0,1
Comm: tls://10.0.87.95:41623,Total threads: 8
Dashboard: http://10.0.87.95:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.87.95:38925,
Local directory: /scratch/dask-scratch-space/worker-7qedqw6y,Local directory: /scratch/dask-scratch-space/worker-7qedqw6y

0,1
Comm: tls://10.0.90.139:36723,Total threads: 8
Dashboard: http://10.0.90.139:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.90.139:41759,
Local directory: /scratch/dask-scratch-space/worker-g2g8xiqs,Local directory: /scratch/dask-scratch-space/worker-g2g8xiqs

0,1
Comm: tls://10.0.81.30:37877,Total threads: 8
Dashboard: http://10.0.81.30:8787/status,Memory: 30.15 GiB
Nanny: tls://10.0.81.30:36285,
Local directory: /scratch/dask-scratch-space/worker-cptaj_8q,Local directory: /scratch/dask-scratch-space/worker-cptaj_8q

0,1
Comm: tls://10.0.90.193:34521,Total threads: 8
Dashboard: http://10.0.90.193:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.90.193:37575,
Local directory: /scratch/dask-scratch-space/worker-sh6f544z,Local directory: /scratch/dask-scratch-space/worker-sh6f544z

0,1
Comm: tls://10.0.83.229:34045,Total threads: 8
Dashboard: http://10.0.83.229:8787/status,Memory: 30.15 GiB
Nanny: tls://10.0.83.229:34083,
Local directory: /scratch/dask-scratch-space/worker-oazkpdpb,Local directory: /scratch/dask-scratch-space/worker-oazkpdpb

0,1
Comm: tls://10.0.93.145:39789,Total threads: 8
Dashboard: http://10.0.93.145:8787/status,Memory: 30.15 GiB
Nanny: tls://10.0.93.145:35311,
Local directory: /scratch/dask-scratch-space/worker-46rh4350,Local directory: /scratch/dask-scratch-space/worker-46rh4350

0,1
Comm: tls://10.0.93.103:37421,Total threads: 8
Dashboard: http://10.0.93.103:8787/status,Memory: 30.18 GiB
Nanny: tls://10.0.93.103:42069,
Local directory: /scratch/dask-scratch-space/worker-3phty40z,Local directory: /scratch/dask-scratch-space/worker-3phty40z

0,1
Comm: tls://10.0.95.64:39767,Total threads: 8
Dashboard: http://10.0.95.64:8787/status,Memory: 30.16 GiB
Nanny: tls://10.0.95.64:38435,
Local directory: /scratch/dask-scratch-space/worker-i6dgiuen,Local directory: /scratch/dask-scratch-space/worker-i6dgiuen

0,1
Comm: tls://10.0.87.108:37251,Total threads: 8
Dashboard: http://10.0.87.108:8787/status,Memory: 30.15 GiB
Nanny: tls://10.0.87.108:32991,
Local directory: /scratch/dask-scratch-space/worker-llhf4dap,Local directory: /scratch/dask-scratch-space/worker-llhf4dap

0,1
Comm: tls://10.0.86.83:44169,Total threads: 8
Dashboard: http://10.0.86.83:8787/status,Memory: 30.17 GiB
Nanny: tls://10.0.86.83:40633,
Local directory: /scratch/dask-scratch-space/worker-_6o02yb9,Local directory: /scratch/dask-scratch-space/worker-_6o02yb9

0,1
Comm: tls://10.0.80.247:33827,Total threads: 8
Dashboard: http://10.0.80.247:8787/status,Memory: 30.16 GiB
Nanny: tls://10.0.80.247:39939,
Local directory: /scratch/dask-scratch-space/worker-xrbbi75n,Local directory: /scratch/dask-scratch-space/worker-xrbbi75n


In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
local_client = Client(processes=False)
local_client

In [None]:
local_client = Client()
local_client

In [None]:
# Local cluster with multiple workers
local_cluster = LocalCluster()  
local_client = Client(local_cluster)
local_client

<font size="6">Shutting down cloud and local clusters</font> 

In [None]:
coiled_cluster.shutdown()

In [None]:
local_client.shutdown()

<font size="6">Analysis</font> 

<font size="4">Paths and functions</font>

In [4]:
# General paths and constants

general_uri = 's3://gfw2-data/forest_change/GLAD_Europe_height_data/'

s3_out_dir = 'climate/European_height_carbon_model/outputs'

regrowth_spreadsheet = 'regrowth_data_flattened_v2_all__from_Viola_Heinrich_20231219.xlsx'
regrowth_tab = 'processed'

def timestr():
    return time.strftime("%Y%m%d_%H_%M_%S")

In [5]:
# Returns list of all chunk boundaries within a bounding box for chunks of a given size
def get_chunk_bounds(chunk_params):

    min_x = chunk_params[0]
    min_y = chunk_params[1]
    max_x = chunk_params[2]
    max_y = chunk_params[3]
    chunk_size = chunk_params[4]
    
    x, y = (min_x, min_y)
    chunks = []

    # Polygon Size
    while y < max_y:
        while x < max_x:
            bounds = [
                x,
                y,
                x + chunk_size,
                y + chunk_size,
            ]
            chunks.append(bounds)
            x += chunk_size
        x = min_x
        y += chunk_size

    return chunks

# Returns the encompassing tile_id string in the form YYN/S_XXXE/W based on a coordinate
def xy_to_tile_id(top_left_x, top_left_y):

    lat_ceil = math.ceil(top_left_y/10.0) * 10
    lng_floor = math.floor(top_left_x/10.0) * 10
    
    lng: str = f"{str(lng_floor).zfill(3)}E" if (lng_floor >= 0) else f"{str(-lng_floor).zfill(3)}W"
    lat: str = f"{str(lat_ceil).zfill(2)}N" if (lat_ceil >= 0) else f"{str(-lat_ceil).zfill(2)}S"

    return f"{lat}_{lng}"

In [6]:
# Lazily opens tile within provided bounds (i.e. one chunk) and returns as a numpy array
# If it can't open the chunk (no data in it), it returns an array of all 0s
def get_tile_dataset_rio(uri, bounds, chunk_length):

    try:
        with rasterio.open(uri) as ds:
            window = rasterio.windows.from_bounds(*bounds, ds.transform)
            data = ds.read(1, window=window)
    except:
        data = np.zeros((chunk_length, chunk_length))

    if data.size==0:
        # dask_print("No data in chunk")
        return np.zeros((chunk_length, chunk_length))
    else:
        # dask_print("Data in chunk")
        return data

In [7]:
# Creates numpy array of aboveground carbon removal factors that can be used in a numba-decorated function
def prepare_regrowth_array(spreadsheet, tab):

    regrowth_df = pd.read_excel(open(spreadsheet, 'rb'), sheet_name=tab)
    # regrowth_df = regrowth_df.loc[:, ['ecozone_code', 'iso_code', 'forest_code', 'startH_code', 'Slope']]  # Full version
    regrowth_df = regrowth_df.loc[:, ['ecozone_code', 'iso_code', 'startH_code', 'Slope_Mg_AGC_ha_yr']]  # TODO: This drops the forest_code until we get broadleaf/coniferous map
    regrowth_array = regrowth_df.to_numpy().astype(float)  # Need to convery Pandas dataframe to numpy array because Numba jit-decorated function can't use dataframes. 
    regrowth_array = regrowth_array.astype(float)  # Convert from object dtype to float dtype-- necessary for numba to use it
    
    return regrowth_array

<font size="4">Model steps</font>

In [8]:
# Operates on all pixels in each chunk for a given year
# Inputs are forest heights, various contextual layers used to assign forest state, and current carbon stocks for different pools.
# Outputs are forest state, carbon fluxes, and carbon stocks for different pools.

@jit(nopython=True)    # numba decorator that compiles Python to C++ to accelerate processing
def classify(regrowth_array,
             forest_height_previous_block, forest_height_current_block, forest_loss_detection_block, driver_block, planted_forest_type_block, peat_block, 
             agc_current_block, bgc_current_block, deadwood_c_current_block, litter_c_current_block, soil_c_current_block, r_s_ratio_block, ecozone_block, iso_block, 
             land_cover_previous_block, land_cover_next_block, burned_area_two_before_block, burned_area_one_before_block):

    # Outputs
    forest_states = np.zeros(forest_height_previous_block.shape)
    emission_factor = np.zeros(forest_height_previous_block.shape)
    removal_factor = np.zeros(forest_height_previous_block.shape)
    
    agc_flux = np.zeros(forest_height_previous_block.shape)
    bgc_flux = np.zeros(forest_height_previous_block.shape)
    deadwood_c_flux = np.zeros(forest_height_previous_block.shape)
    litter_c_flux = np.zeros(forest_height_previous_block.shape)
    soil_c_flux = np.zeros(forest_height_previous_block.shape)

    # Iterates through all pixels in the chunk
    for row in range(forest_height_previous_block.shape[0]):
        for col in range(forest_height_previous_block.shape[1]):

            # Pixel for each input
            forest_height_previous = forest_height_previous_block[row, col]
            forest_height_current = forest_height_current_block[row, col]
            forest_loss_detection = forest_loss_detection_block[row, col]
            driver = driver_block[row, col]
            planted_forest_type = planted_forest_type_block[row, col]
            peat = peat_block[row, col]

            LC_previous = land_cover_previous_block[row, col]
            LC_next = land_cover_next_block[row, col]
            burned_area_two_before = burned_area_two_before_block[row, col]
            burned_area_one_before = burned_area_one_before_block[row, col]
            
            agc_current = agc_current_block[row, col]
            bgc_current = bgc_current_block[row, col]
            deadwood_c_current = deadwood_c_current_block[row, col]
            litter_c_current = litter_c_current_block[row, col]
            soil_c_current = soil_c_current_block[row, col]
            
            r_s_ratio = r_s_ratio_block[row, col]
            ecozone = ecozone_block[row, col]
            iso = iso_block[row, col]

            # Various definitions used in decision tree
            grassland_forest_previous = (((LC_previous >= 2) & (LC_previous <= 48)) | ((LC_previous >= 102) & (LC_previous <= 148)))
            grassland_forest_next = (((LC_next >= 2) & (LC_next <= 48)) | ((LC_next >= 102) & (LC_next <= 148)))  
            cropland_previous = (LC_previous == 244)
            cropland_next = (LC_next == 244)
            forestry = (driver == 3)
            non_sdpt_forestry = (forestry & (grassland_forest_previous | grassland_forest_next) & (cropland_previous == 0) & (cropland_next == 0))
            burned_area_recent = ((burned_area_two_before != 0) or (burned_area_one_before != 0))

            # The decision tree that produces all the outputs for a pixel for a given year
            if forest_height_previous >= 5 and forest_height_current >= 5:                                     # maintained
                forest_states[row, col] = 1
                agc_rf = regrowth_array[np.where((regrowth_array[:,0] == ecozone) * (regrowth_array[:,1] == iso) * (regrowth_array[:,2] == 2))][0,3]   # TODO: replace [:,2] == 2 with actual assignment
                removal_factor[row, col] = agc_rf
                agc_flux[row, col] = agc_rf
                bgc_flux[row, col] = agc_rf * r_s_ratio
                deadwood_c_flux[row, col] = agc_rf * 0.06
                litter_c_flux[row, col] = agc_rf * 0.06
                soil_c_flux[row, col] = agc_rf * 0.01
                agc_current_block[row, col] = agc_current + agc_flux[row, col]
                bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
            elif forest_height_previous < 5 and forest_height_current >= 5:                                    # gain
                forest_states[row, col] = 2
                agc_rf = regrowth_array[np.where((regrowth_array[:,0] == ecozone) * (regrowth_array[:,1] == iso) * (regrowth_array[:,2] == 1))][0,3]   # TODO: replace [:,2] == 2 with actual assignment
                removal_factor[row, col] = agc_rf
                agc_flux[row, col] = agc_rf
                bgc_flux[row, col] = agc_rf * r_s_ratio
                deadwood_c_flux[row, col] = agc_rf * 0.09
                litter_c_flux[row, col] = agc_rf * 0.09
                soil_c_flux[row, col] = agc_rf * 0.04
                agc_current_block[row, col] = agc_current + agc_flux[row, col]
                bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
            elif ((forest_height_previous >= 5 and forest_height_current < 5) or forest_loss_detection == 1):  # loss
                if planted_forest_type == 0:                                                                   # loss:no SDPT                
                    if non_sdpt_forestry == 0:                                                                 # loss:no SDPT:no non-SDPT forestry                    
                        forest_states[row, col] = 311
                        biomass_ef = 0.9
                        dead_litter_ef = 0.3
                        soil_ef = 0.1
                        emission_factor[row, col] = biomass_ef
                        agc_flux[row, col] = (agc_current * biomass_ef) * -1
                        bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                        deadwood_c_flux[row, col] = (deadwood_c_current * dead_litter_ef) * -1
                        litter_c_flux[row, col] = (litter_c_current * dead_litter_ef) * -1
                        soil_c_flux[row, col] = (soil_c_current * soil_ef) * -1
                        agc_current_block[row, col] = agc_current + agc_flux[row, col]
                        bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                        deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                        litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                        soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
                    else:                                                                                      # loss:no SDPT:non-SDPT forestry         
                        forest_states[row, col] = 312
                        biomass_ef = 0.7
                        dead_litter_ef = 0.3
                        soil_ef = 0.1
                        emission_factor[row, col] = biomass_ef
                        agc_flux[row, col] = (agc_current * biomass_ef) * -1
                        bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                        deadwood_c_flux[row, col] = (deadwood_c_current * dead_litter_ef) * -1
                        litter_c_flux[row, col] = (litter_c_current * dead_litter_ef) * -1
                        soil_c_flux[row, col] = (soil_c_current * soil_ef) * -1
                        agc_current_block[row, col] = agc_current + agc_flux[row, col]
                        bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                        deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                        litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                        soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
                else:                                                                                          # loss:SDPT
                    if burned_area_recent == 0:                                                                # loss:SDPT:not burned recent
                        if peat == 0:                                                                          # loss:SDPT:not burned recent:not peat
                            forest_states[row, col] = 3211
                            biomass_ef = 0.6
                            dead_litter_ef = 0.5
                            soil_ef = 0.2
                            emission_factor[row, col] = biomass_ef
                            agc_flux[row, col] = (agc_current * biomass_ef) * -1
                            bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                            deadwood_c_flux[row, col] = (deadwood_c_current * dead_litter_ef) * -1
                            litter_c_flux[row, col] = (litter_c_current * dead_litter_ef) * -1
                            soil_c_flux[row, col] = (soil_c_current * soil_ef) * -1
                            agc_current_block[row, col] = agc_current + agc_flux[row, col]
                            bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                            deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                            litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                            soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
                        else:                                                                                  # loss:SDPT:not burned recent:peat
                            forest_states[row, col] = 3212
                            biomass_ef = 0.75
                            dead_litter_ef = 0.4
                            soil_ef = 0.1
                            emission_factor[row, col] = biomass_ef
                            agc_flux[row, col] = (agc_current * biomass_ef) * -1
                            bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                            deadwood_c_flux[row, col] = (deadwood_c_current * dead_litter_ef) * -1
                            litter_c_flux[row, col] = (litter_c_current * dead_litter_ef) * -1
                            soil_c_flux[row, col] = (soil_c_current * soil_ef) * -1   
                            agc_current_block[row, col] = agc_current + agc_flux[row, col]
                            bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                            deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                            litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                            soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
                    else:                                                                                      # loss:SDPT:burned recent
                        if peat == 0:                                                                          # loss:SDPT:burned recent:not peat
                            forest_states[row, col] = 3221                            
                            biomass_ef = 0.65
                            dead_litter_ef = 0.1
                            soil_ef = 0.3
                            emission_factor[row, col] = biomass_ef
                            agc_flux[row, col] = (agc_current * biomass_ef) * -1
                            bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                            deadwood_c_flux[row, col] = (deadwood_c_current * dead_litter_ef) * -1
                            litter_c_flux[row, col] = (litter_c_current * dead_litter_ef) * -1
                            soil_c_flux[row, col] = (soil_c_current * soil_ef) * -1
                            agc_current_block[row, col] = agc_current + agc_flux[row, col]
                            bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                            deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                            litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                            soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
                        else:                                                                                  # loss:SDPT:burned recent:peat
                            forest_states[row, col] = 3222
                            biomass_ef = 0.9
                            dead_litter_ef = 0.1
                            soil_ef = 0.4
                            emission_factor[row, col] = biomass_ef
                            agc_flux[row, col] = (agc_current * biomass_ef) * -1
                            bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                            deadwood_c_flux[row, col] = (deadwood_c_current * dead_litter_ef) * -1
                            litter_c_flux[row, col] = (litter_c_current * dead_litter_ef) * -1
                            soil_c_flux[row, col] = (soil_c_current * soil_ef) * -1
                            agc_current_block[row, col] = agc_current + agc_flux[row, col]
                            bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]
                            deadwood_c_current_block[row, col] = deadwood_c_current + deadwood_c_flux[row, col]
                            litter_c_current_block[row, col] = litter_c_current + litter_c_flux[row, col]
                            soil_c_current_block[row, col] = soil_c_current + soil_c_flux[row, col]
            else:                                                                                              # no forest
                forest_states[row, col] = 0
                emission_factor[row, col] = 0
                removal_factor[row, col] = 0
                
                agc_flux[row, col] = 0
                bgc_flux[row, col] = 0                 
                deadwood_c_flux[row, col] = 0
                litter_c_flux[row, col] = 0
                soil_c_flux[row, col] = 0
                
                agc_current_block[row, col] = agc_current
                bgc_current_block[row, col] = bgc_current
                deadwood_c_current_block[row, col] = deadwood_c_current
                litter_c_current_block[row, col] = litter_c_current
                soil_c_current_block[row, col] = soil_c_current

    return forest_states, emission_factor, removal_factor, agc_flux, bgc_flux, deadwood_c_flux, litter_c_flux, soil_c_flux, agc_current_block, bgc_current_block, deadwood_c_current_block, litter_c_current_block, soil_c_current_block

In [9]:
# Runs model on each chunk for all years.
# Chunks are defined by a bounding box and a starting year for iteration

# TODO: is chunk_length_deg really needed for this? It could be calculated from bounds and passed to get_tile_dataset_rio that way. 
def process_chunk(bounds, chunk_length_deg, start_year, regrowth_array):
 
    futures = {}
    layers = {}

    bounds_str = "_".join([str(round(x)) for x in bounds])
    chunk_length_pixels = int(chunk_length_deg * (40000/10))

    # try:
    # Submit requests to S3 for input chunks but dont' actually download them yet. This queueing of the requests before downloading them speeds up the downloading
    # Approach is to download all the input chunks up front for every year to make downloading more efficient, even though it means storing more upfront
    with concurrent.futures.ThreadPoolExecutor() as executor:
        tile_id = xy_to_tile_id(bounds[0], bounds[3])

        # Paths to download from
        drivers_uri = f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407/{tile_id}_tree_cover_loss_driver_processed.tif"
        planted_forest_type_uri = f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/planted_forest_type/SDPT_v1/standard/20200730/{tile_id}_plantation_type_oilpalm_woodfiber_other_unmasked.tif"
        peat_uri = f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/peatlands/processed/20230315/{tile_id}_peat_mask_processed.tif"
        
        agc_2000_uri = f"s3://gfw2-data/climate/carbon_model/carbon_pools/aboveground_carbon/extent_2000/standard/20230222/{tile_id}_Mg_AGC_ha_2000.tif"
        bgc_2000_uri = f"s3://gfw2-data/climate/carbon_model/carbon_pools/belowground_carbon/extent_2000/standard/20230222/{tile_id}_Mg_BGC_ha_2000.tif"
        deadwood_c_2000_uri = f"s3://gfw2-data/climate/carbon_model/carbon_pools/deadwood_carbon/extent_2000/standard/20230222/{tile_id}_Mg_deadwood_C_ha_2000.tif"
        litter_c_2000_uri = f"s3://gfw2-data/climate/carbon_model/carbon_pools/litter_carbon/extent_2000/standard/20230222/{tile_id}_Mg_litter_C_ha_2000.tif"
        soil_c_2000_uri = f"s3://gfw2-data/climate/carbon_model/carbon_pools/soil_carbon/intermediate_full_extent/standard/20231108/{tile_id}_soil_C_full_extent_2000_Mg_C_ha.tif"

        r_s_ratio_uri = f"s3://gfw2-data/climate/carbon_model/BGB_AGB_ratio/processed/20230216/{tile_id}_BGB_AGB_ratio.tif"
        ecozone_uri = f"s3://gfw2-data/fao_ecozones/v2000/raster/epsg-4326/10/40000/class/gdal-geotiff/{tile_id}.tif"   # Originally from gfw-data-lake, so it's in 400x400 windows 
        iso_uri = f"s3://gfw2-data/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/gdal-geotiff/{tile_id}.tif"  # Originally from gfw-data-lake, so it's in 400x400 windows

        # Save downloads as dictionary from futures -> name so we can know what layer it is on completion
        futures[executor.submit(get_tile_dataset_rio, drivers_uri, bounds, chunk_length_pixels)] = "drivers"
        futures[executor.submit(get_tile_dataset_rio, planted_forest_type_uri, bounds, chunk_length_pixels)] = "planted_forest_type"
        futures[executor.submit(get_tile_dataset_rio, peat_uri, bounds, chunk_length_pixels)] = "peat"
        
        futures[executor.submit(get_tile_dataset_rio, agc_2000_uri, bounds, chunk_length_pixels)] = "agc_2000"
        futures[executor.submit(get_tile_dataset_rio, bgc_2000_uri, bounds, chunk_length_pixels)] = "bgc_2000"
        futures[executor.submit(get_tile_dataset_rio, deadwood_c_2000_uri, bounds, chunk_length_pixels)] = "deadwood_c_2000"
        futures[executor.submit(get_tile_dataset_rio, litter_c_2000_uri, bounds, chunk_length_pixels)] = "litter_c_2000"
        futures[executor.submit(get_tile_dataset_rio, soil_c_2000_uri, bounds, chunk_length_pixels)] = "soil_c_2000"

        futures[executor.submit(get_tile_dataset_rio, r_s_ratio_uri, bounds, chunk_length_pixels)] = "r_s_ratio"
        futures[executor.submit(get_tile_dataset_rio, ecozone_uri, bounds, chunk_length_pixels)] = "ecozone"
        futures[executor.submit(get_tile_dataset_rio, iso_uri, bounds, chunk_length_pixels)] = "iso"

        # Faster to just get every year of the timeseries inputs up front unless we're running into memory issues
        for year in range(start_year-1, 2022):
            # TODO Make sure I'm matching the FH and DFL correctly (e.g., should they be the same year)?
            forest_height_uri = f'{general_uri}202307_revision/FH_{year}.tif'
            forest_loss_detection_uri = f'{general_uri}202307_revision/DFL_{year}.tif'

            #TODO These are placeholder years. Replace with system for getting correct composite landcover years. 
            preceding_land_cover_uri = f's3://gfw2-data/landcover/composite/2000/{tile_id}_composite_landcover_2015.tif'
            next_land_cover_uri = f's3://gfw2-data/landcover/composite/2000/{tile_id}_composite_landcover_2020.tif'

            futures[executor.submit(get_tile_dataset_rio, forest_height_uri, bounds, chunk_length_pixels)] = f"forest_height_{year}"
            futures[executor.submit(get_tile_dataset_rio, forest_loss_detection_uri, bounds, chunk_length_pixels)] = f"forest_loss_detection_{year}"
            
            futures[executor.submit(get_tile_dataset_rio, preceding_land_cover_uri, bounds, chunk_length_pixels)] = f"preceding_land_cover_for_{year}"
            futures[executor.submit(get_tile_dataset_rio, next_land_cover_uri, bounds, chunk_length_pixels)] = f"next_land_cover_for_{year}"

        for year in range(start_year-2, 2022):
            burned_area_uri = f's3://gfw2-data/climate/carbon_model/other_emissions_inputs/burn_year/burn_year_10x10_clip/ba_{year}_50N_010E.tif'

            futures[executor.submit(get_tile_dataset_rio, burned_area_uri, bounds, chunk_length_pixels)] = f"burned_area_{year}"


    # Wait for requests to come back with data from S3
    for future in concurrent.futures.as_completed(futures):
        layer = futures[future]
        layers[layer] = future.result()

    # TODO: Better way to skip chunks without forest in them. This way is: 1) inaccurate because it uses just one year, 
    # and 2) is inefficient because everything has already been downloaded.
    # Skips chunk if it has no forest extent in it
    if not np.any(layers[f"forest_height_2021"]):
        dask_print(f"No data in chunk {bounds_str}. Skipping: {timestr()}")
        return f"No data in chunk {bounds_str}. Skipping: {timestr()}"
        
    dask_print(f"Data in chunk {bounds_str}. Proceeding.")

    # Sets carbon pool rasters that will be iterated on as the densities in 2000
    agc_current = layers["agc_2000"]  
    bgc_current = layers["bgc_2000"]
    deadwood_c_current = layers["deadwood_c_2000"]
    litter_c_current = layers["litter_c_2000"]
    soil_c_current = layers["soil_c_2000"]
    
    # Run forest state classifier, fluxes, and stocks one year at a time
    for year in range(start_year, 2022):
        
        dask_print(f"Classifying/calculating {bounds_str} in {tile_id} for {year}: {timestr()}")
        
        forest_states, emission_factor, removal_factor, agc_flux, bgc_flux, deadwood_c_flux, litter_c_flux, soil_c_flux, agc_current, bgc_current, deadwood_c_current, litter_c_current, soil_c_current = classify(
            regrowth_array,
            layers[f"forest_height_{year - 1}"], 
            layers[f"forest_height_{year}"], 
            layers[f"forest_loss_detection_{year}"], 
            layers["drivers"],
            layers["planted_forest_type"],
            layers["peat"],
            
            agc_current,
            bgc_current,
            deadwood_c_current,
            litter_c_current,
            soil_c_current,
            layers["r_s_ratio"],
            layers["ecozone"],
            layers["iso"],

            layers[f"preceding_land_cover_for_{year}"],
            layers[f"next_land_cover_for_{year}"],
            layers[f"burned_area_{year-2}"],
            layers[f"burned_area_{year-1}"]
        )
        
        transform = rasterio.transform.from_bounds(*bounds, width=chunk_length_pixels, height=chunk_length_pixels)

        file_info = f'{tile_id}__{bounds_str}__{year}'

        # Output files to upload to s3
        output_dict = {
                "forest_states": [forest_states, "uint16"], 
                "emission_factor": [emission_factor, "float32"],
                "removal_factor": [removal_factor, "float32"],
                "agc_flux": [agc_flux, "float32"],
                "bgc_flux": [bgc_flux, "float32"],
                "deadwood_c_flux": [deadwood_c_flux, "float32"],
                "litter_c_flux": [litter_c_flux, "float32"],
                "soil_c_flux": [soil_c_flux, "float32"],
                "agc_density": [agc_current, "float32"],
                "bgc_density": [bgc_current, "float32"],
                "deadwood_c_density": [deadwood_c_current, "float32"],
                "litter_c_density": [litter_c_current, "float32"],
                "soil_c_density": [soil_c_current, "float32"]                   
               }

        s3_client = boto3.client("s3")

        dask_print(f"Saving {bounds_str} in {tile_id} for {year}: {timestr()}")

        # For every output file, saves from array to local raster, then to s3.
        # Can't save directly to s3, unfortunately, so need to save locally first.
        for key, value in output_dict.items():

            file_name = f"{key}__{file_info}__{timestr()}"

            if value[1] == "float32":
                with rasterio.open(f"/tmp/{file_name}.tif", 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype='float32', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                    dst.write(value[0].astype(rasterio.float32), 1)
            else:
                with rasterio.open(f"/tmp/{file_name}.tif", 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype='uint16', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                    dst.write(value[0].astype(rasterio.uint16), 1)

            s3_client.upload_file(f"/tmp/{file_name}.tif", "gfw2-data", Key=f"{s3_out_dir}/{file_name}.tif")

            # Deletes the local raster. It won't be used again.
            os.remove(f"/tmp/{file_name}.tif")

        # Clear memory of unneeded arrays
        del forest_states
        del emission_factor
        del removal_factor
        del agc_flux
        del bgc_flux
        del deadwood_c_flux
        del litter_c_flux
        del soil_c_flux
        del layers[f"forest_height_{year - 1}"]
        del layers[f"forest_loss_detection_{year}"]
        del layers[f"burned_area_{year-2}"]
    
    # except Exception as e:
    #     return f"error: {e}"

    return f"success for {bounds_str}: {timestr()}"

In [10]:
%%time

# Year to start the analysis
# start_year = 2002   # full run
# start_year = 2012  # last few years
start_year = 2020  # last two years
# start_year = 2021  # final year

# Area to analyze
# chunk_params arguments: W, S, E, N, chunk size (degrees)
chunk_params = [-12, 34, 32, 72, 1]  # all of Europe
# chunk_params = [10, 40, 20, 50, 1]    # 10x10 deg (50N_010E), 100 chunks
# chunk_params = [10, 46, 14, 50, 2]   # 4x4 deg, 4 chunks
# chunk_params = [10, 48, 12, 50, 1]   # 2x2 deg, 4 chunks
# chunk_params = [10, 49, 11, 50, 1]   # 1x1 deg, 1 chunk
# chunk_params = [10, 49, 11, 50, 0.5] # 1x1 deg, 4 chunks
# chunk_params = [10, 49.5, 10.5, 50, 0.25] # 0.5x0.5 deg, 4 chunks
# chunk_params = [10, 49.75, 10.25, 50, 0.25] # 0.25x0.25 deg, 1 chunk

# Creates numpy array of carbon regrowth rates
regrowth_array = prepare_regrowth_array(regrowth_spreadsheet, regrowth_tab)

# Makes list of chunks to analyze
chunks = get_chunk_bounds(chunk_params)  
print("Processing", len(chunks), "chunks")

# Creates list of tasks to run (1 task = 1 chunk for all years)
delayed = [dask.delayed(process_chunk)(chunk, chunk_params[4], start_year, regrowth_array) for chunk in chunks]

# Actually runs analysis
results = dask.compute(*delayed)
results

Processing 1672 chunks
No data in chunk -1_70_0_71. Skipping: 20231222_20_44_48
No data in chunk 10_71_11_72. Skipping: 20231222_20_44_52
No data in chunk -11_70_-10_71. Skipping: 20231222_20_44_53
No data in chunk 11_70_12_71. Skipping: 20231222_20_44_55
No data in chunk 7_34_8_35. Skipping: 20231222_20_44_55
No data in chunk 16_35_17_36. Skipping: 20231222_20_44_55
Data in chunk 19_70_20_71. Proceeding.
Classifying/calculating 19_70_20_71 in 80N_010E for 2020: 20231222_20_44_56
No data in chunk -12_70_-11_71. Skipping: 20231222_20_44_52
Data in chunk 23_71_24_72. Proceeding.
Classifying/calculating 23_71_24_72 in 80N_020E for 2020: 20231222_20_44_59
No data in chunk 21_34_22_35. Skipping: 20231222_20_44_59
No data in chunk -9_69_-8_70. Skipping: 20231222_20_45_00
Data in chunk 18_70_19_71. Proceeding.
Classifying/calculating 18_70_19_71 in 80N_010E for 2020: 20231222_20_44_59
No data in chunk 20_71_21_72. Skipping: 20231222_20_44_59
No data in chunk 5_34_6_35. Skipping: 20231222_20_4

('No data in chunk -12_34_-11_35. Skipping: 20231222_20_57_54',
 'No data in chunk -11_34_-10_35. Skipping: 20231222_20_46_25',
 'No data in chunk -10_34_-9_35. Skipping: 20231222_21_01_09',
 'No data in chunk -9_34_-8_35. Skipping: 20231222_21_00_42',
 'No data in chunk -8_34_-7_35. Skipping: 20231222_20_56_36',
 'No data in chunk -7_34_-6_35. Skipping: 20231222_20_55_05',
 'No data in chunk -6_34_-5_35. Skipping: 20231222_20_58_56',
 'No data in chunk -5_34_-4_35. Skipping: 20231222_20_55_20',
 'No data in chunk -4_34_-3_35. Skipping: 20231222_20_51_38',
 'No data in chunk -3_34_-2_35. Skipping: 20231222_20_51_23',
 'No data in chunk -2_34_-1_35. Skipping: 20231222_20_58_58',
 'No data in chunk -1_34_0_35. Skipping: 20231222_20_52_48',
 'No data in chunk 0_34_1_35. Skipping: 20231222_20_57_20',
 'No data in chunk 1_34_2_35. Skipping: 20231222_20_46_13',
 'No data in chunk 2_34_3_35. Skipping: 20231222_20_59_47',
 'No data in chunk 3_34_4_35. Skipping: 20231222_20_53_23',
 'No data in

In [None]:
regrowth_df=pd.read_excel(open('regrowth_data_flattened_v2_all__from_Viola_Heinrich_20231219.xlsx', 'rb'), 
              sheet_name='processed')  
regrowth_df
# regrowth_df[(regrowth_df['ecozone_code']==7) & (regrowth_df['iso_code']==8) 
# & (regrowth_df['forest'] == 'broadleaf') & (regrowth_df['startH'] == '<5m')]['Slope']
np_df = regrowth_df.to_numpy()
np_df
np_df = np_df[np.where((np_df[:,6] == 1))]
np_df
np_df = np_df[:, [4,5,7,9]].astype(float)
np_df
# np_df[np.where((np_df[:,0] == 7) * (np_df[:,1] == 8))]
# # np_df[np.where((np_df[:,0] == 7) * (np_df[:,1] == 8))][0,2]

In [None]:
# To run without dask at all
process_chunk([10, 49, 11, 50], 1, start_year)

In [None]:
# Download test-- checks that uri is found and recognized
tile_id = "50N_010E"
# uri = f"s3://gfw2-data/climate/carbon_model/BGB_AGB_ratio/processed/20230216/{tile_id}_BGB_AGB_ratio.tif"
uri = f"s3://gfw2-data/gadm_administrative_boundaries/v3.6/raster/epsg-4326/10/40000/adm0/gdal-geotiff/{tile_id}.tif"  # Originally from gfw-data-lake, so it's in 400x400 windows
# uri = f"s3://gfw2-data/fao_ecozones/v2000/raster/epsg-4326/10/40000/class/gdal-geotiff/{tile_id}.tif"   # Originally from gfw-data-lake, so it's in 400x400 windows 
bounds = [10, 49.75, 10.25, 50]

get_tile_dataset_rio(uri, bounds, 1000)

In [None]:
coiled_client.restart() 

In [None]:
client.cancel(future) # per https://github.com/dask/distributed/issues/3898#issuecomment-645590511

In [None]:
# aws s3 cp s3://gfw2-data/climate/European_height_carbon_model/outputs/ . --recursive --exclude "*" --include "*10_49_11_50*"
# aws s3 cp s3://gfw2-data/climate/European_height_carbon_model/outputs/ . --recursive --exclude "*" --include "*2002*10_49_11_50*"