In [1]:
import os

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import dask.config
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows

from numba import jit
import concurrent.futures

import boto3
import time
import math
import ctypes

<font size="6">Making cloud and local clusters</font> 

In [2]:
# Cloud cluster on Coiled. Keyword arguments are grouped by purpose: identity, placement, sizing, lifecycle.
coiled_cluster = coiled.Cluster(
    name="next_gen_forest_carbon_flux_model",
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    region="us-east-1",
    use_best_zone=True,
    n_workers=40,
    worker_memory="32GiB",
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
)

Output()

Output()

In [3]:
# Coiled cluster (cloud run)
# Gets a client from the coiled_cluster object; the bare name on the last line makes the notebook display it.
coiled_client = coiled_cluster.get_client()
coiled_client

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-xmyeb.dask.host/FaAyIpqqz_54hyfT/status,

0,1
Dashboard: https://cluster-xmyeb.dask.host/FaAyIpqqz_54hyfT/status,Workers: 16
Total threads: 128,Total memory: 481.37 GiB

0,1
Comm: tls://10.0.25.45:8786,Workers: 16
Dashboard: http://10.0.25.45:8787/status,Total threads: 128
Started: Just now,Total memory: 481.37 GiB

0,1
Comm: tls://10.0.17.249:44971,Total threads: 8
Dashboard: http://10.0.17.249:8787/status,Memory: 29.90 GiB
Nanny: tls://10.0.17.249:45587,
Local directory: /scratch/dask-scratch-space/worker-cgxzwmrz,Local directory: /scratch/dask-scratch-space/worker-cgxzwmrz

0,1
Comm: tls://10.0.29.55:41715,Total threads: 8
Dashboard: http://10.0.29.55:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.29.55:44827,
Local directory: /scratch/dask-scratch-space/worker-6cmurkca,Local directory: /scratch/dask-scratch-space/worker-6cmurkca

0,1
Comm: tls://10.0.23.210:42803,Total threads: 8
Dashboard: http://10.0.23.210:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.23.210:33735,
Local directory: /scratch/dask-scratch-space/worker-v7a8qky3,Local directory: /scratch/dask-scratch-space/worker-v7a8qky3

0,1
Comm: tls://10.0.28.236:32893,Total threads: 8
Dashboard: http://10.0.28.236:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.28.236:43951,
Local directory: /scratch/dask-scratch-space/worker-78w8sd7_,Local directory: /scratch/dask-scratch-space/worker-78w8sd7_

0,1
Comm: tls://10.0.24.70:34429,Total threads: 8
Dashboard: http://10.0.24.70:8787/status,Memory: 29.90 GiB
Nanny: tls://10.0.24.70:45051,
Local directory: /scratch/dask-scratch-space/worker-atm7yoxa,Local directory: /scratch/dask-scratch-space/worker-atm7yoxa

0,1
Comm: tls://10.0.25.42:37683,Total threads: 8
Dashboard: http://10.0.25.42:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.25.42:45705,
Local directory: /scratch/dask-scratch-space/worker-l462qunj,Local directory: /scratch/dask-scratch-space/worker-l462qunj

0,1
Comm: tls://10.0.25.126:39579,Total threads: 8
Dashboard: http://10.0.25.126:8787/status,Memory: 29.90 GiB
Nanny: tls://10.0.25.126:40681,
Local directory: /scratch/dask-scratch-space/worker-a146nsh5,Local directory: /scratch/dask-scratch-space/worker-a146nsh5

0,1
Comm: tls://10.0.20.169:33129,Total threads: 8
Dashboard: http://10.0.20.169:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.20.169:38543,
Local directory: /scratch/dask-scratch-space/worker-wfi7x8ey,Local directory: /scratch/dask-scratch-space/worker-wfi7x8ey

0,1
Comm: tls://10.0.27.56:34387,Total threads: 8
Dashboard: http://10.0.27.56:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.27.56:33791,
Local directory: /scratch/dask-scratch-space/worker-65p6t_wy,Local directory: /scratch/dask-scratch-space/worker-65p6t_wy

0,1
Comm: tls://10.0.30.54:43249,Total threads: 8
Dashboard: http://10.0.30.54:8787/status,Memory: 29.91 GiB
Nanny: tls://10.0.30.54:42177,
Local directory: /scratch/dask-scratch-space/worker-w7epdk1f,Local directory: /scratch/dask-scratch-space/worker-w7epdk1f

0,1
Comm: tls://10.0.28.0:40677,Total threads: 8
Dashboard: http://10.0.28.0:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.28.0:37249,
Local directory: /scratch/dask-scratch-space/worker-6qngw7k2,Local directory: /scratch/dask-scratch-space/worker-6qngw7k2

0,1
Comm: tls://10.0.28.105:45471,Total threads: 8
Dashboard: http://10.0.28.105:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.28.105:36847,
Local directory: /scratch/dask-scratch-space/worker-ejml5h0j,Local directory: /scratch/dask-scratch-space/worker-ejml5h0j

0,1
Comm: tls://10.0.23.247:44335,Total threads: 8
Dashboard: http://10.0.23.247:8787/status,Memory: 30.23 GiB
Nanny: tls://10.0.23.247:40559,
Local directory: /scratch/dask-scratch-space/worker-lzass71i,Local directory: /scratch/dask-scratch-space/worker-lzass71i

0,1
Comm: tls://10.0.20.162:35033,Total threads: 8
Dashboard: http://10.0.20.162:8787/status,Memory: 29.90 GiB
Nanny: tls://10.0.20.162:39215,
Local directory: /scratch/dask-scratch-space/worker-zix6o4qh,Local directory: /scratch/dask-scratch-space/worker-zix6o4qh

0,1
Comm: tls://10.0.22.50:35203,Total threads: 8
Dashboard: http://10.0.22.50:8787/status,Memory: 29.89 GiB
Nanny: tls://10.0.22.50:46161,
Local directory: /scratch/dask-scratch-space/worker-otrtq7a9,Local directory: /scratch/dask-scratch-space/worker-otrtq7a9

0,1
Comm: tls://10.0.20.32:37063,Total threads: 8
Dashboard: http://10.0.20.32:8787/status,Memory: 29.91 GiB
Nanny: tls://10.0.20.32:43447,
Local directory: /scratch/dask-scratch-space/worker-8pmjwxka,Local directory: /scratch/dask-scratch-space/worker-8pmjwxka


In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
# Alternative 1 of 3 for local_client: run only one of the three local-client cells, since each rebinds the same name.
local_client = Client(processes=False)
local_client

In [None]:
# Local client created with all of Client's defaults.
# Alternative 2 of 3 for local_client: run only one of the three local-client cells, since each rebinds the same name.
local_client = Client()
local_client

In [None]:
# Local cluster with multiple workers
# Alternative 3 of 3 for local_client: run only one of the three local-client cells, since each rebinds the same name.
local_cluster = LocalCluster()  
local_client = Client(local_cluster)
local_client

<font size="6">Shutting down cloud and local clusters</font> 

In [None]:
# Shuts down the Coiled (cloud) cluster. Run only when the cloud run is finished.
coiled_cluster.shutdown()

In [None]:
# Shuts down whichever local client was created last (all three local-client cells bind the name local_client).
local_client.shutdown()

<font size="6">Analysis</font> 

<font size="4">Paths and functions</font>

In [4]:
# General paths and constants

# S3 location under which the FH_{year} and DFL_{year} rasters are read
general_uri = 's3://gfw2-data/forest_change/GLAD_Europe_height_data/'

# Key prefix under which output rasters are uploaded
s3_out_dir = 'climate/European_height_carbon_model/outputs'

def timestr():
    """Returns the current local time formatted as YYYYMMDD_HH_MM_SS."""
    stamp_format = "%Y%m%d_%H_%M_%S"
    return time.strftime(stamp_format, time.localtime())

In [5]:
# Returns list of all chunk boundaries within a bounding box for chunks of a given size
def get_chunk_bounds(chunk_params):
    """Tiles a bounding box with square chunks.

    Args:
        chunk_params: sequence whose first five items are min_x, min_y, max_x, max_y, chunk_size.

    Returns:
        List of [x, y, x + chunk_size, y + chunk_size] lists, ordered row by row
        (y ascending, and x ascending within each row). Chunks are not clipped, so the last
        column/row extends past max_x/max_y when the box is not a multiple of chunk_size.
    """
    min_x, min_y, max_x, max_y, chunk_size = (chunk_params[i] for i in range(5))

    def origins(start, stop):
        # Steps by repeated addition so origins match an accumulating loop exactly
        found = []
        position = start
        while position < stop:
            found.append(position)
            position += chunk_size
        return found

    y_origins = origins(min_y, max_y)
    if not y_origins:
        return []

    x_origins = origins(min_x, max_x)

    return [
        [x, y, x + chunk_size, y + chunk_size]
        for y in y_origins
        for x in x_origins
    ]

# Returns the encompassing tile_id string in the form YYN/S_XXXE/W based on a coordinate
def xy_to_tile_id(top_left_x, top_left_y):
    """Builds the id of the 10x10 degree tile for a top-left coordinate.

    The latitude is rounded up and the longitude rounded down to a multiple of 10,
    e.g. (25, 35) -> "40N_020E".
    """
    tile_top = math.ceil(top_left_y / 10.0) * 10
    tile_left = math.floor(top_left_x / 10.0) * 10

    if tile_top >= 0:
        lat_part = f"{tile_top:02d}N"
    else:
        lat_part = f"{-tile_top:02d}S"

    if tile_left >= 0:
        lng_part = f"{tile_left:03d}E"
    else:
        lng_part = f"{-tile_left:03d}W"

    return "_".join([lat_part, lng_part])

In [6]:
# Reads the part of a raster that falls within the provided bounds (one chunk)
def get_tile_dataset_rio(uri, bounds, chunk_length):
    """Reads band 1 of the raster at uri for the window covering bounds.

    Args:
        uri: path or URI that rasterio can open.
        bounds: window bounds, unpacked positionally into rasterio.windows.from_bounds.
        chunk_length: side length in pixels of the zero-filled array returned as a fallback.

    Returns:
        The window's data as read by rasterio, or a (chunk_length, chunk_length) array of
        zeros when the raster cannot be read or the window contains no pixels.
    """

    try:
        with rasterio.open(uri) as ds:
            window = rasterio.windows.from_bounds(*bounds, ds.transform)
            data = ds.read(1, window=window)
    except Exception as e:
        # Best-effort read: a raster that cannot be opened or read is treated as all zeros.
        # Catching Exception (not a bare except) lets KeyboardInterrupt/SystemExit propagate,
        # and reporting the failure here avoids logging a failed read as "Data in chunk".
        dask_print(f"Could not read {uri}: {e}. Using zeros.")
        return np.zeros((chunk_length, chunk_length))

    if data.size == 0:
        dask_print("No data in chunk")
        return np.zeros((chunk_length, chunk_length))

    dask_print("Data in chunk")
    return data

<font size="4">Model steps</font>

In [7]:
# Operates on pixels in each chunk for a given year: forest state, emission factor, removal factor, emissions, removals, carbon stocks
@jit(nopython=True)
def classify(forest_height_previous_block, forest_height_current_block, forest_loss_detection_block, driver_block, planted_forest_type_block, peat_block,
             agc_current_block, bgc_current_block, deadwood_c_current_block, litter_c_current_block, soil_c_current_block, r_s_ratio_block,
             land_cover_previous_block, land_cover_next_block, burned_area_two_before_block, burned_area_one_before_block):
    """Runs the per-pixel decision tree for one year over one chunk.

    Every *_block argument is indexed as [row, col] over the shape of forest_height_previous_block.

    agc_current_block and bgc_current_block are updated IN PLACE with this year's flux and returned.
    deadwood_c_current_block, litter_c_current_block and soil_c_current_block are returned unchanged.

    Returns:
        forest_states, emission_factor, removal_factor, agc_flux, bgc_flux, soil_c_flux (new arrays),
        followed by the five carbon pool blocks that were passed in.
    """

    out_shape = forest_height_previous_block.shape

    # Output chunks (all start at zero, which is also the result for non-forest pixels)
    forest_states = np.zeros(out_shape)
    emission_factor = np.zeros(out_shape)
    removal_factor = np.zeros(out_shape)

    agc_flux = np.zeros(out_shape)
    bgc_flux = np.zeros(out_shape)
    deadwood_c_flux = np.zeros(out_shape)
    litter_c_flux = np.zeros(out_shape)
    soil_c_flux = np.zeros(out_shape)

    # Iterates through all pixels in the chunk
    for row in range(out_shape[0]):
        for col in range(out_shape[1]):

            height_before = forest_height_previous_block[row, col]
            height_now = forest_height_current_block[row, col]

            agc_current = agc_current_block[row, col]
            bgc_current = bgc_current_block[row, col]

            is_maintained = height_before >= 5 and height_now >= 5
            is_gain = height_before < 5 and height_now >= 5

            if is_maintained or is_gain:
                # Removals: maintained forest (state 1) or forest gain (state 2)
                if is_maintained:
                    state = 1
                    rf = 5
                else:
                    state = 2
                    rf = 10

                forest_states[row, col] = state
                removal_factor[row, col] = rf
                agc_flux[row, col] = rf
                bgc_flux[row, col] = rf * r_s_ratio_block[row, col]
                agc_current_block[row, col] = agc_current + agc_flux[row, col]
                bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]

            elif (height_before >= 5 and height_now < 5) or forest_loss_detection_block[row, col] == 1:
                # Emissions: first pick the state code and emission factors, then apply them once below

                if planted_forest_type_block[row, col] == 0:                      # loss:no SDPT
                    lc_before = land_cover_previous_block[row, col]
                    lc_after = land_cover_next_block[row, col]

                    grassland_forest_before = (lc_before >= 2 and lc_before <= 48) or (lc_before >= 102 and lc_before <= 148)
                    grassland_forest_after = (lc_after >= 2 and lc_after <= 48) or (lc_after >= 102 and lc_after <= 148)
                    cropland_either = (lc_before == 244) or (lc_after == 244)
                    is_forestry = driver_block[row, col] == 3

                    non_sdpt_forestry = is_forestry and (grassland_forest_before or grassland_forest_after) and not cropland_either

                    if not non_sdpt_forestry:                                     # loss:no SDPT:no non-SDPT forestry
                        state = 311
                        biomass_ef = 0.9
                        soil_ef = 0.1
                    else:                                                         # loss:no SDPT:non-SDPT forestry
                        state = 312
                        biomass_ef = 0.7
                        soil_ef = 0.1
                else:                                                             # loss:SDPT
                    burned_recently = (burned_area_two_before_block[row, col] != 0) or (burned_area_one_before_block[row, col] != 0)
                    not_peat = peat_block[row, col] == 0

                    if not burned_recently:
                        if not_peat:                                              # loss:SDPT:not burned recent:not peat
                            state = 3211
                            biomass_ef = 0.6
                            soil_ef = 0.2
                        else:                                                     # loss:SDPT:not burned recent:peat
                            state = 3212
                            biomass_ef = 0.75
                            soil_ef = 0.1
                    else:
                        if not_peat:                                              # loss:SDPT:burned recent:not peat
                            state = 3221
                            biomass_ef = 0.65
                            soil_ef = 0.3
                        else:                                                     # loss:SDPT:burned recent:peat
                            state = 3222
                            biomass_ef = 0.9
                            soil_ef = 0.4

                forest_states[row, col] = state
                emission_factor[row, col] = biomass_ef
                agc_flux[row, col] = (agc_current * biomass_ef) * -1
                bgc_flux[row, col] = (bgc_current * biomass_ef) * -1
                soil_c_flux[row, col] = (soil_c_current_block[row, col] * soil_ef) * -1
                agc_current_block[row, col] = agc_current + agc_flux[row, col]
                bgc_current_block[row, col] = bgc_current + bgc_flux[row, col]

            # else: no forest. The outputs keep their initial zeros and the carbon pools keep their current values.

    return forest_states, emission_factor, removal_factor, agc_flux, bgc_flux, soil_c_flux, agc_current_block, bgc_current_block, deadwood_c_current_block, litter_c_current_block, soil_c_current_block

In [12]:
def process_chunk(bounds, chunk_length_deg, start_year):
    """Runs the model on one chunk for each year from start_year through 2021 and uploads the outputs to S3.

    Args:
        bounds: chunk bounding box as [min_x, min_y, max_x, max_y].
        chunk_length_deg: chunk side length in degrees; converted to pixels at 4000 pixels per degree.
        start_year: first year to classify. Forest height and land cover are also requested for
            start_year - 1, and burned area for start_year - 2.

    Returns:
        A status string: a "No data in chunk ..." message when the chunk is skipped,
        "error: ..." when any step raises, otherwise "success for ...".
    """

    futures = {}
    layers = {}

    # ":g" prints whole-number bounds exactly as before ("10", "-12") but keeps fractional bounds distinct
    # ("49.5", "10.25"). Rounding gave the same name to different sub-degree chunks.
    bounds_str = "_".join([f"{x:g}" for x in bounds])
    chunk_length_pixels = int(chunk_length_deg * (40000/10))

    try:
        tile_id = xy_to_tile_id(bounds[0], bounds[3])

        # Inputs that are the same for every year, keyed by layer name
        static_layer_uris = {
            "drivers": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407/{tile_id}_tree_cover_loss_driver_processed.tif",
            "planted_forest_type": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/planted_forest_type/SDPT_v1/standard/20200730/{tile_id}_plantation_type_oilpalm_woodfiber_other_unmasked.tif",
            "peat": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/peatlands/processed/20230315/{tile_id}_peat_mask_processed.tif",

            "agc_2000": f"s3://gfw2-data/climate/carbon_model/carbon_pools/aboveground_carbon/extent_2000/standard/20230222/{tile_id}_Mg_AGC_ha_2000.tif",
            "bgc_2000": f"s3://gfw2-data/climate/carbon_model/carbon_pools/belowground_carbon/extent_2000/standard/20230222/{tile_id}_Mg_BGC_ha_2000.tif",
            "deadwood_c_2000": f"s3://gfw2-data/climate/carbon_model/carbon_pools/deadwood_carbon/extent_2000/standard/20230222/{tile_id}_Mg_deadwood_C_ha_2000.tif",
            "litter_c_2000": f"s3://gfw2-data/climate/carbon_model/carbon_pools/litter_carbon/extent_2000/standard/20230222/{tile_id}_Mg_litter_C_ha_2000.tif",
            "soil_c_2000": f"s3://gfw2-data/climate/carbon_model/carbon_pools/soil_carbon/intermediate_full_extent/standard/20231108/{tile_id}_soil_C_full_extent_2000_Mg_C_ha.tif",

            "r_s_ratio": f"s3://gfw2-data/climate/carbon_model/BGB_AGB_ratio/processed/20230216/{tile_id}_BGB_AGB_ratio.tif",
        }

        # submit requests to S3 for layers
        with concurrent.futures.ThreadPoolExecutor() as executor:

            # save as dictionary from future -> name so we can know what layer it is on completion
            def request_layer(layer_name, uri):
                futures[executor.submit(get_tile_dataset_rio, uri, bounds, chunk_length_pixels)] = layer_name

            for layer_name, uri in static_layer_uris.items():
                request_layer(layer_name, uri)

            # faster to just get every year up front unless we're running into memory issues
            for year in range(start_year-1, 2022):
                forest_height_uri = f'{general_uri}202307_revision/FH_{year}.tif'
                forest_loss_detection_uri = f'{general_uri}202307_revision/DFL_{year}.tif'

                #TODO These are placeholder years. Replace with actual landcover years.
                preceding_land_cover_uri = f's3://gfw2-data/landcover/composite/2000/{tile_id}_composite_landcover_2015.tif'
                next_land_cover_uri = f's3://gfw2-data/landcover/composite/2000/{tile_id}_composite_landcover_2020.tif'

                request_layer(f"forest_height_{year}", forest_height_uri)
                request_layer(f"forest_loss_detection_{year}", forest_loss_detection_uri)

                request_layer(f"preceding_land_cover_for_{year}", preceding_land_cover_uri)
                request_layer(f"next_land_cover_for_{year}", next_land_cover_uri)

            for year in range(start_year-2, 2022):
                # Uses the chunk's own tile rather than a fixed 50N_010E, so chunks in other tiles get their own burned area.
                # NOTE(review): assumes the other burned area tiles follow the same ba_{year}_{tile_id}.tif naming. Confirm.
                burned_area_uri = f's3://gfw2-data/climate/carbon_model/other_emissions_inputs/burn_year/burn_year_10x10_clip/ba_{year}_{tile_id}.tif'

                request_layer(f"burned_area_{year}", burned_area_uri)

        # collect the data from S3 (leaving the executor's with-block has already waited for every request)
        for future in concurrent.futures.as_completed(futures):
            layers[futures[future]] = future.result()

        # TODO: Better way to skip chunks without forest in them. This way is: 1) inaccurate because it uses just one year,
        # and 2) is inefficient because everything has already been downloaded.
        # Checks the earliest forest height year that was requested; a fixed 2001 is only present when start_year is 2002.
        if not np.any(layers[f"forest_height_{start_year - 1}"]):
            skip_message = f"No data in chunk {bounds_str}. Skipping: {timestr()}"
            dask_print(skip_message)
            return skip_message

        dask_print(f"Data in chunk {bounds_str}. Proceeding.")
        agc_current = layers["agc_2000"]  # .astype('float') # This seems unnecessary now. TODO Verify and delete
        bgc_current = layers["bgc_2000"]
        deadwood_c_current = layers["deadwood_c_2000"]
        litter_c_current = layers["litter_c_2000"]
        soil_c_current = layers["soil_c_2000"]

        # Same for every year and every output, so created once
        transform = rasterio.transform.from_bounds(*bounds, width=chunk_length_pixels, height=chunk_length_pixels)
        s3_client = boto3.client("s3")

        # run classifier one year at a time
        for year in range(start_year, 2022):

            dask_print(f"Classifying {bounds_str} in {tile_id} for {year}: {timestr()}")

            # The parentheses keep all eleven names in a single unpacking target.
            # The carbon pools returned for this year are the inputs for the next year.
            (forest_states, emission_factor, removal_factor, agc_flux, bgc_flux, soil_c_flux,
             agc_current, bgc_current, deadwood_c_current, litter_c_current, soil_c_current) = classify(
                layers[f"forest_height_{year - 1}"],
                layers[f"forest_height_{year}"],
                layers[f"forest_loss_detection_{year}"],
                layers["drivers"],
                layers["planted_forest_type"],
                layers["peat"],

                agc_current,
                bgc_current,
                deadwood_c_current,
                litter_c_current,
                soil_c_current,
                layers["r_s_ratio"],

                layers[f"preceding_land_cover_for_{year}"],
                layers[f"next_land_cover_for_{year}"],
                layers[f"burned_area_{year-2}"],
                layers[f"burned_area_{year-1}"],
            )

            file_info = f'{tile_id}__{bounds_str}__{year}'

            # output name -> [array, dtype written to the GeoTIFF]
            output_dict = {
                    "forest_states": [forest_states, "uint16"],
                    "emission_factor": [emission_factor, "float32"],
                    "removal_factor": [removal_factor, "float32"],
                    "agc_flux": [agc_flux, "float32"],
                    "bgc_flux": [bgc_flux, "float32"],
                    "soil_c_flux": [soil_c_flux, "float32"],
                    "agc_density": [agc_current, "float32"],
                    "bgc_density": [bgc_current, "float32"],
                    "deadwood_c_density": [deadwood_c_current, "float32"],
                    "litter_c_density": [litter_c_current, "float32"],
                    "soil_c_density": [soil_c_current, "float32"]
                   }

            dask_print(f"Saving {bounds_str} in {tile_id} for {year}: {timestr()}")

            for key, (array, dtype) in output_dict.items():

                file_name = f"{key}__{file_info}__{timestr()}"
                local_path = f"/tmp/{file_name}.tif"

                try:
                    with rasterio.open(local_path, 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype=dtype, crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                        dst.write(array.astype(dtype), 1)

                    s3_client.upload_file(local_path, "gfw2-data", Key=f"{s3_out_dir}/{file_name}.tif")
                finally:
                    # Remove the temporary file even when the write or upload fails, so it does not accumulate on disk
                    if os.path.exists(local_path):
                        os.remove(local_path)

            # clear memory of unneeded arrays
            del forest_states
            del emission_factor
            del removal_factor
            del agc_flux
            del bgc_flux
            del soil_c_flux
            del layers[f"forest_height_{year - 1}"]
            del layers[f"forest_loss_detection_{year}"]
            del layers[f"burned_area_{year-2}"]

    except Exception as e:
        # Returned rather than raised, so a single failing chunk is reported as a string instead of raising
        return f"error: {e}"

    return f"success for {bounds_str}: {timestr()}"

In [None]:
#Cell to build decision tree

In [11]:
%%time

# First year to classify. process_chunk runs every year from start_year through 2021.
start_year = 2002   # full run
# start_year = 2012  # last few years
# start_year = 2021  # final year

# chunk_params arguments: W, S, E, N, chunk size (degrees)
chunk_params = [-12, 34, 32, 72, 1]  # all of Europe
# chunk_params = [10, 40, 20, 50, 1]    # 10x10 deg (50N_010E), 100 chunks
# chunk_params = [10, 46, 14, 50, 2]   # 4x4 deg, 4 chunks
# chunk_params = [10, 48, 12, 50, 1]   # 2x2 deg, 4 chunks
# chunk_params = [10, 49, 11, 50, 1]   # 1x1 deg, 1 chunk
# chunk_params = [10, 49, 11, 50, 0.5] # 1x1 deg, 4 chunks
# chunk_params = [10, 49.5, 10.5, 50, 0.25] # 0.5x0.5 deg, 4 chunks
# chunk_params = [10, 49.75, 10.25, 50, 0.25] # 0.25x0.25 deg, 1 chunk

chunks = get_chunk_bounds(chunk_params)  
print("Processing", len(chunks), "chunks")

# One delayed process_chunk call per chunk; nothing runs until dask.compute below.
delayed = [dask.delayed(process_chunk)(chunk, chunk_params[4], start_year) for chunk in chunks]

# NOTE(review): no client is passed here; presumably this runs on whichever distributed Client was created most recently. Confirm.
# Each result is the status string returned by process_chunk for the chunk at the same position in chunks.
results = dask.compute(*delayed)
results

Processing 1672 chunks
No data in chunk 5_34_6_35. Skipping.
No data in chunk 21_35_22_36. Skipping.
No data in chunk 14_71_15_72. Skipping.
No data in chunk -7_70_-6_71. Skipping.
No data in chunk 28_35_29_36. Skipping.
No data in chunk 30_34_31_35. Skipping.
No data in chunk 29_34_30_35. Skipping.
Data in chunk 25_34_26_35. Proceeding.
Classifying 25_34_26_35 in 40N_020E for 2002: 20231219_04_03_36
No data in chunk 10_70_11_71. Skipping.
No data in chunk 4_34_5_35. Skipping.
No data in chunk -6_71_-5_72. Skipping.
No data in chunk -8_70_-7_71. Skipping.
No data in chunk 12_70_13_71. Skipping.
No data in chunk 12_71_13_72. Skipping.
No data in chunk -2_35_-1_36. Skipping.
No data in chunk -10_71_-9_72. Skipping.
No data in chunk 3_69_4_70. Skipping.
No data in chunk 8_34_9_35. Skipping.
No data in chunk -11_70_-10_71. Skipping.
No data in chunk -10_34_-9_35. Skipping.
No data in chunk 17_35_18_36. Skipping.
No data in chunk 9_35_10_36. Skipping.
No data in chunk 5_71_6_72. Skipping.
N

(None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'success for 11_34_12_35: 20231219_05_00_23',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'success for 23_34_24_35: 20231219_04_18_22',
 'success for 24_34_25_35: 20231219_05_01_00',
 'success for 25_34_26_35: 20231219_04_31_22',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'success for -6_35_-5_36: 20231219_04_32_36',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'success for 11_35_12_36: 20231219_04_18_10',
 'success for 12_35_13_36: 20231219_04_49_16',
 None,
 'success for 14_35_15_36: 20231219_04_26_29',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 'success for 23_35_24_36: 20231219_04_18_49',
 'success for 24_35_25_36: 20231219_04_57_27',
 'success for 25_35_26_36: 202312

In [None]:
# To run without dask at all
# Processes the single chunk [10, 49, 11, 50] with a chunk size of 1 degree. start_year must already be defined.
process_chunk([10, 49, 11, 50], 1, start_year)

In [None]:
# Restarts the workers of the Coiled cluster through its client.
coiled_client.restart() 

In [None]:
# Cancels a future, per https://github.com/dask/distributed/issues/3898#issuecomment-645590511
# Uses coiled_client, the client name used throughout this notebook (no variable named `client` is defined).
# NOTE(review): `future` is not created in this cell; it must already refer to a future submitted to this client.
coiled_client.cancel(future)

In [None]:
# Manually trim memory, per https://distributed.dask.org/en/latest/worker-memory.html#memory-not-released-back-to-the-os

def trim_memory() -> int:
    """Calls malloc_trim(0) from libc.so.6 in the current process and returns its result."""
    c_library = ctypes.CDLL("libc.so.6")
    trim_status = c_library.malloc_trim(0)
    return trim_status

# Runs trim_memory through the Coiled client so that the trimming happens on the cluster rather than in this notebook's process.
coiled_client.run(trim_memory)

In [None]:
# aws s3 cp s3://gfw2-data/climate/European_height_carbon_model/outputs/ . --recursive --exclude "*" --include "*10_49_11_50*"
# aws s3 cp s3://gfw2-data/climate/European_height_carbon_model/outputs/ . --recursive --exclude "*" --include "*2002*10_49_11_50*"

In [None]:
# Tests rasterio with concurrent futures but not with dask
uri = "s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407/50N_010E_tree_cover_loss_driver_processed.tif"
bounds = [10, 48, 12, 50]

# get_tile_dataset_rio requires the chunk side length in pixels (used for its zero-filled fallback).
# Same conversion as process_chunk: 4000 pixels per degree.
chunk_length_pixels = int((bounds[2] - bounds[0]) * (40000/10))

futures = {}
layers = {}
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures[executor.submit(get_tile_dataset_rio, uri, bounds, chunk_length_pixels)] = "drivers"

for future in concurrent.futures.as_completed(futures):
    layer = futures[future]
    layers[layer] = future.result()

layers["drivers"]