In [45]:
# standard library
import concurrent.futures
import math
import os
import tempfile
import time

# scipy basics
import numpy as np
from osgeo import gdal      # Necessary to do this import to get rasterio to import
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows

# dask/parallelization libraries
import coiled
import dask
import dask.array as dar
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import rioxarray
import xarray as xr

from numba import jit

import boto3

<font size="6">Making cloud and local clusters</font> 

In [2]:
# Create the Coiled cluster object used for cloud runs: 10 workers requested,
# a 20 minute idle timeout, and the 'jterry64' Coiled account.
# NOTE(review): the client summary displayed further down reports 5 workers,
# not 10 - presumably not all workers had joined when it was rendered; confirm.
coiled_cluster = coiled.Cluster(
    n_workers=10,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="20 minutes",
    # name="DGibbs Europe height flux model", 
    account='jterry64'   # Necessary to use the AWS environment that Justin set up in Coiled
)

Output()

Output()

In [3]:
# Coiled cluster (cloud run)
# Get a Dask client from the Coiled cluster object; the bare name on the last
# line makes the notebook display the client summary as the cell output.
coiled_client = coiled_cluster.get_client()
coiled_client

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-vxsiu.dask.host/nIl-UeRt1sLG9gAT/status,

0,1
Dashboard: https://cluster-vxsiu.dask.host/nIl-UeRt1sLG9gAT/status,Workers: 5
Total threads: 20,Total memory: 74.17 GiB

0,1
Comm: tls://10.1.11.30:8786,Workers: 5
Dashboard: http://10.1.11.30:8787/status,Total threads: 20
Started: Just now,Total memory: 74.17 GiB

0,1
Comm: tls://10.1.11.240:42297,Total threads: 4
Dashboard: http://10.1.11.240:8787/status,Memory: 14.83 GiB
Nanny: tls://10.1.11.240:41737,
Local directory: /scratch/dask-scratch-space/worker-rsla34nn,Local directory: /scratch/dask-scratch-space/worker-rsla34nn

0,1
Comm: tls://10.1.5.78:46043,Total threads: 4
Dashboard: http://10.1.5.78:8787/status,Memory: 14.83 GiB
Nanny: tls://10.1.5.78:35389,
Local directory: /scratch/dask-scratch-space/worker-_1jlyc8j,Local directory: /scratch/dask-scratch-space/worker-_1jlyc8j

0,1
Comm: tls://10.1.6.119:34143,Total threads: 4
Dashboard: http://10.1.6.119:8787/status,Memory: 14.84 GiB
Nanny: tls://10.1.6.119:38165,
Local directory: /scratch/dask-scratch-space/worker-to1swhcd,Local directory: /scratch/dask-scratch-space/worker-to1swhcd

0,1
Comm: tls://10.1.11.106:41213,Total threads: 4
Dashboard: http://10.1.11.106:8787/status,Memory: 14.84 GiB
Nanny: tls://10.1.11.106:37253,
Local directory: /scratch/dask-scratch-space/worker-11xw4swg,Local directory: /scratch/dask-scratch-space/worker-11xw4swg

0,1
Comm: tls://10.1.8.14:34141,Total threads: 4
Dashboard: http://10.1.8.14:8787/status,Memory: 14.83 GiB
Nanny: tls://10.1.8.14:33197,
Local directory: /scratch/dask-scratch-space/worker-yo6xqhi8,Local directory: /scratch/dask-scratch-space/worker-yo6xqhi8


In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
# NOTE(review): Client() is called with no arguments, so worker/process counts
# come from dask.distributed defaults - confirm that this really is single-process.
# NOTE(review): presumably only one of coiled_client / local_client should be
# created per session, since the analysis cell does not pick a client explicitly.
local_client = Client()
local_client

<font size="6">Shutting down cloud and local clusters</font> 

In [None]:
# Tear down the Coiled cluster. This cell is placed before the analysis cells,
# so run it manually when finished rather than as part of a top-to-bottom run.
coiled_cluster.shutdown()

In [None]:
# Tear down the local client. This cell is placed before the analysis cells,
# so run it manually when finished rather than as part of a top-to-bottom run.
local_client.shutdown()

<font size="6">Analysis</font> 

<font size="4">Paths and functions</font>

In [4]:
# General paths and constants

# Root S3 prefix of the height data; process_chunk builds the yearly
# '202307_revision/FH_{year}.tif' and '202307_revision/DFL_{year}.tif' URIs from it.
general_uri = 's3://gfw2-data/forest_change/GLAD_Europe_height_data/'

# NOTE(review): not referenced by any code visible in this notebook - presumably
# test data from an earlier iteration; confirm before relying on or removing it.
random_data_uri = 's3://gfw2-data/forest_change/GLAD_Europe_height_data/dummy_random_data__20230901/'

# NOTE(review): absolute, machine-specific Windows path that is also not referenced
# by any code visible in this notebook; confirm whether it is still needed.
local_out_dir = 'C:\\GIS\\Carbon_model_Europe\\outputs\\'

def timestr():
    """Return the current time formatted as 'YYYYMMDD_HH_MM_SS' (used in log lines and output file names)."""
    timestamp_format = "%Y%m%d_%H_%M_%S"
    return time.strftime(timestamp_format)

# NOTE(review): tile_size is not referenced by any code visible in this notebook.
tile_size = 10      # Tile size in degrees is from the top left of the tile. 10 is a full tile. Anything smaller is a subset of that.

# Edge length of one processing chunk in pixels: it is the shape of the zero-filled
# fallback array in get_tile_dataset_rio, and chunk_length / 4000 is the chunk size
# handed to get_chunk_bounds in the run cell (8000 / 4000 = 2).
chunk_length = 8000

In [46]:
def get_chunk_bounds(min_x, min_y, max_x, max_y, cell_size):
    """
    Split a rectangular extent into square chunks with side length `cell_size`.

    The grid is anchored at (min_x, min_y). Chunks are produced row by row:
    x advances fastest, then y. Chunks are not clipped, so when the extent is not
    an exact multiple of `cell_size` the last column/row extends past max_x/max_y.

    Args:
        min_x, min_y: lower-left corner of the extent.
        max_x, max_y: upper-right corner of the extent (exclusive loop limits).
        cell_size: side length of each chunk, in the same units as the extent.

    Returns:
        List of [x_min, y_min, x_max, y_max] lists, one per chunk. Empty when
        min_x >= max_x or min_y >= max_y.

    Raises:
        ValueError: if `cell_size` is not greater than zero. Without this check a
            zero or negative step never reaches the loop limits, so the loops
            would run forever while the result list keeps growing.
    """
    # `not (cell_size > 0)` also rejects NaN, which would loop forever as well.
    if not cell_size > 0:
        raise ValueError(f"cell_size must be greater than 0, got {cell_size}")

    chunks = []
    y = min_y
    while y < max_y:
        x = min_x   # start every row at the left edge again
        while x < max_x:
            chunks.append([x, y, x + cell_size, y + cell_size])
            x += cell_size
        y += cell_size

    return chunks


def xy_to_tile_id(top_left_x, top_left_y):
    """
    Build the ID of the 10x10 tile that a top-left corner coordinate falls in.

    The y coordinate is rounded up and the x coordinate is rounded down to the
    nearest multiple of 10, giving the tile's own top-left corner. The ID is
    '<lat>_<lng>': latitude zero-padded to 2 digits with an N/S suffix, longitude
    zero-padded to 3 digits with an E/W suffix (zero counts as N and E),
    e.g. (10, 50) -> '50N_010E'.

    Args:
        top_left_x: x (longitude) of the top-left corner.
        top_left_y: y (latitude) of the top-left corner.

    Returns:
        The tile ID string.
    """
    # Snap the corner outwards onto the 10-unit grid: latitude up, longitude down.
    tile_top = math.ceil(top_left_y / 10.0) * 10
    tile_left = math.floor(top_left_x / 10.0) * 10

    lat_suffix = "N" if tile_top >= 0 else "S"
    lng_suffix = "E" if tile_left >= 0 else "W"

    # The sign is carried by the suffix, so only the magnitude is padded.
    lat_label = str(abs(tile_top)).zfill(2) + lat_suffix
    lng_label = str(abs(tile_left)).zfill(3) + lng_suffix

    return f"{lat_label}_{lng_label}"

In [6]:
def get_tile_dataset_rio(uri, bounds):
    """
    Read band 1 of the raster at `uri`, restricted to the window covering `bounds`.

    Args:
        uri: path/URI of the raster, passed straight to rasterio.open.
        bounds: sequence unpacked into rasterio.windows.from_bounds ahead of the
            dataset transform (the callers pass [min_x, min_y, max_x, max_y]).

    Returns:
        The array read from the window. If rasterio raises RasterioIOError while
        opening or reading, a (chunk_length, chunk_length) array of zeros is
        returned instead, so a failed read is indistinguishable from all-zero data.
    """
    try:
        with rasterio.open(uri) as src:
            window = rasterio.windows.from_bounds(*bounds, src.transform)
            return src.read(1, window=window)
    except rasterio.errors.RasterioIOError:
        # Fall back to an all-zero chunk rather than failing the whole task.
        return np.zeros((chunk_length, chunk_length))
    

<font size="4">Model steps</font>

In [61]:
@jit(nopython=True)
def classify(forest_height_previous_block, forest_height_current_block, forest_loss_detection_block, 
             driver_block, planted_forest_type_block, peat_block, tclf_block, agb_2000_block, soil_c_2000_block):
    """
    Assign a forest state and emission values to every pixel of a block.

    The first matching rule wins for each pixel:
        1 (maintained): previous height >= 5 and current height >= 5 -> factor 0.0
        2 (gain):       previous height <  5 and current height >= 5 -> factor 0.0
        3 (loss):       (previous height >= 5 and current height < 5)
                        or loss detection == 1                       -> factor 0.9
        0 (no forest):  anything else; all four outputs stay 0

    For states 1-3 the AGB and soil emissions are the 2000 AGB / soil carbon value
    multiplied by the emission factor.

    The driver, planted forest type, peat and tclf blocks are read per pixel but
    no active rule uses them yet. The original code sketched follow-up states for
    them (5 peat, 6 driver, 7 tclf > 0, 8 planted forest type > 0) and assigned
    unused `rf` values (0.1 for maintained, 0.3 for gain; presumably removal
    factors - confirm).

    Args:
        All arguments are 2-D arrays indexed [row, col] over the shape of
        forest_height_previous_block.

    Returns:
        Tuple (forest_states, emissions_factor, emissions_agb, emissions_soil) of
        arrays created with np.zeros in the shape of forest_height_previous_block.
    """
    block_shape = forest_height_previous_block.shape
    forest_states = np.zeros(block_shape)
    emissions_factor = np.zeros(block_shape)
    emissions_agb = np.zeros(block_shape)
    emissions_soil = np.zeros(block_shape)

    n_rows = block_shape[0]
    n_cols = block_shape[1]

    for row in range(n_rows):
        for col in range(n_cols):
            height_before = forest_height_previous_block[row, col]
            height_after = forest_height_current_block[row, col]
            loss_flag = forest_loss_detection_block[row, col]
            biomass_2000 = agb_2000_block[row, col]
            soil_carbon_2000 = soil_c_2000_block[row, col]

            # Read for every pixel, but not consumed by any active rule yet.
            driver = driver_block[row, col]
            planted_forest_type = planted_forest_type_block[row, col]
            peat = peat_block[row, col]
            tclf = tclf_block[row, col]

            # Defaults describe "no forest"; the rules below overwrite them.
            state = 0
            ef = 0.0

            # Both `>= 5` and `< 5` are spelled out rather than negated so that NaN
            # heights (for which every comparison is False) fall through unchanged.
            if height_before >= 5 and height_after >= 5:
                state = 1      # maintained
            elif height_before < 5 and height_after >= 5:
                state = 2      # gain
            elif (height_before >= 5 and height_after < 5) or loss_flag == 1:
                state = 3      # loss
                ef = 0.9

            # "No forest" pixels keep the zeros the outputs were initialised with.
            if state != 0:
                forest_states[row, col] = state
                emissions_factor[row, col] = ef
                emissions_agb[row, col] = biomass_2000 * ef
                emissions_soil[row, col] = soil_carbon_2000 * ef

    return forest_states, emissions_factor, emissions_agb, emissions_soil

In [63]:
def _write_single_band_geotiff(path, array, dtype, transform):
    """Write `array` to `path` as a one-band, LZW-compressed EPSG:4326 GeoTIFF of chunk_length x chunk_length pixels."""
    with rasterio.open(
        path, 'w', driver='GTiff', width=chunk_length, height=chunk_length, count=1, dtype=dtype,
        crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400
    ) as dst:
        dst.write(array.astype(dtype), 1)


def process_chunk(bounds):
    """
    Download the input layers for one chunk, classify each year, and upload the results.

    Steps:
        1. Derive the tile ID from the chunk's top-left corner (bounds[0], bounds[3]).
        2. Read all input layers for `bounds` in parallel threads (one
           get_tile_dataset_rio call per layer).
        3. For each model year, run `classify`, write forest states (uint8), emission
           factor (float32) and AGB emissions (float32) as GeoTIFFs into a temporary
           directory, and upload them to the 'gfw2-data' bucket.

    Soil emissions are computed by `classify` but are not written or uploaded.

    Args:
        bounds: [min_x, min_y, max_x, max_y] of the chunk, as produced by
            get_chunk_bounds.

    Returns:
        'success: <timestamp>' when every year was processed, or 'error: <message>'
        if any exception was raised. Exceptions are deliberately turned into a
        status string so one failing chunk does not abort the whole dask.compute call.
    """
    start_year = 2021
    end_year = 2022   # exclusive: years start_year .. end_year - 1 are classified

    try:
        tile_id = xy_to_tile_id(bounds[0], bounds[3])
        bounds_str = "_".join([str(round(x)) for x in bounds])

        # layer name -> URI for everything that has to be fetched for this chunk
        layer_uris = {
            "drivers": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407/{tile_id}_tree_cover_loss_driver_processed.tif",
            "planted_forest_type": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/planted_forest_type/SDPT_v1/standard/20200730/{tile_id}_plantation_type_oilpalm_woodfiber_other_unmasked.tif",
            "peat": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/peatlands/processed/20230315/{tile_id}_peat_mask_processed.tif",
            "tclf": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_fires/20230315/processed/{tile_id}_tree_cover_loss_fire_processed.tif",
            "agb_2000": f"s3://gfw2-data/climate/WHRC_biomass/WHRC_V4/Processed/{tile_id}_t_aboveground_biomass_ha_2000.tif",
            "soil_c_2000": f"s3://gfw2-data/climate/carbon_model/carbon_pools/soil_carbon/intermediate_full_extent/standard/20230222/{tile_id}_t_soil_C_ha_full_extent_2000.tif",
        }

        # faster to just get every year up front unless we're running into memory issues
        # (starts one year early because each year is compared with the previous one)
        for year in range(start_year - 1, end_year):
            layer_uris[f"forest_height_{year}"] = f'{general_uri}202307_revision/FH_{year}.tif'
            layer_uris[f"forest_loss_detection_{year}"] = f'{general_uri}202307_revision/DFL_{year}.tif'

        # submit all reads at once, then wait for every one of them to finish
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = {
                name: executor.submit(get_tile_dataset_rio, uri, bounds)
                for name, uri in layer_uris.items()
            }
            layers = {name: future.result() for name, future in futures.items()}

        # Each Future keeps a reference to its result, so the futures must be dropped
        # or the `del layers[...]` calls below would not release any memory.
        del futures

        # Loop-invariant setup: same georeferencing and same S3 client for every year.
        transform = rasterio.transform.from_bounds(*bounds, width=chunk_length, height=chunk_length)
        s3_client = boto3.client("s3")

        # Rasters are staged in a temporary directory that is removed on exit, so
        # workers do not accumulate files and no fixed '/tmp' path is required.
        with tempfile.TemporaryDirectory() as tmp_dir:

            # run classifier one year at a time
            for year in range(start_year, end_year):
                dask_print(f"Classifying {bounds_str} in {tile_id} for {year}: {timestr()}")
                forest_states, emissions_factor, emissions_agb, emissions_soil = classify(
                    layers[f"forest_height_{year - 1}"],
                    layers[f"forest_height_{year}"],
                    layers[f"forest_loss_detection_{year}"],
                    layers["drivers"],
                    layers["planted_forest_type"],
                    layers["peat"],
                    layers["tclf"],
                    layers["agb_2000"],
                    layers["soil_c_2000"]
                )

                states_file_name = f"forest_states__{year}__{bounds_str}__{timestr()}"
                states_path = os.path.join(tmp_dir, f"{states_file_name}.tif")
                _write_single_band_geotiff(states_path, forest_states, 'uint8', transform)

                ef_file_name = f"emission_factor__{year}__{bounds_str}__{timestr()}"
                ef_path = os.path.join(tmp_dir, f"{ef_file_name}.tif")
                _write_single_band_geotiff(ef_path, emissions_factor, 'float32', transform)

                agb_emis_file_name = f"agb_emis_{year}__{bounds_str}__{timestr()}"
                agb_emis_path = os.path.join(tmp_dir, f"{agb_emis_file_name}.tif")
                _write_single_band_geotiff(agb_emis_path, emissions_agb, 'float32', transform)

                dask_print(f"Uploading {bounds_str} in {tile_id} for {year}: {timestr()}")
                s3_client.upload_file(states_path, "gfw2-data", Key=f"climate/forest_states/{states_file_name}.tif")
                s3_client.upload_file(ef_path, "gfw2-data", Key=f"climate/forest_states/{ef_file_name}.tif")
                # NOTE(review): AGB emissions go under 'david_test/' while the other two
                # outputs go directly under 'climate/forest_states/' - confirm this is intended.
                s3_client.upload_file(agb_emis_path, "gfw2-data", Key=f"climate/forest_states/david_test/{agb_emis_file_name}.tif")

                # clear memory of arrays we don't need anymore
                del forest_states
                del emissions_factor
                del emissions_agb
                del emissions_soil
                del layers[f"forest_height_{year - 1}"]
                del layers[f"forest_loss_detection_{year}"]
    except Exception as e:
        return f"error: {e}"

    return f"success: {timestr()}"

In [None]:
%%time


"""
Some code that applies the decision tree to decision_tree_ds to make an xarray of forest_states for the previous and current years
"""
# NOTE(review): the string above looks stale - the code below builds chunk bounds
# and runs process_chunk on each of them; it never references decision_tree_ds
# and does not build an xarray.

# chunk_length / 4000 is 2.0 for chunk_length = 8000, so this extent is cut into
# 22 x 19 = 418 square chunks of side 2.
chunks = get_chunk_bounds(-12, 34, 32, 72, chunk_length / 4000)  # all of Europe
# chunks = get_chunk_bounds(6, 40, 16, 50, chunk_length / 4000)  # smaller area that includes 50N_010E
# One lazy task per chunk; nothing runs until dask.compute is called below.
delayed = [dask.delayed(process_chunk)(chunk) for chunk in chunks]

# Each result is the status string returned by process_chunk ('success: ...' or
# 'error: ...'); failures do not raise here, so inspect `results` for 'error:' entries.
# NOTE(review): no client is passed explicitly - presumably this runs on whichever
# distributed client is currently the default; confirm which one is active.
results = dask.compute(*delayed)
results

Classifying 80N_000E, 2021: 20231103_17_52_58
Classifying 80N_000E, 2021: 20231103_17_52_58
Classifying 40N_010E, 2021: 20231103_17_52_58
Classifying 80N_010W, 2021: 20231103_17_52_59
Classifying 80N_020E, 2021: 20231103_17_52_59
Classifying 80N_000E, 2021: 20231103_17_53_00
Classifying 80N_020E, 2021: 20231103_17_53_02
Uploading 80N_000E, 2021: 20231103_17_53_02
Uploading 40N_010E, 2021: 20231103_17_53_02
Uploading 80N_000E, 2021: 20231103_17_53_02
Uploading 80N_020E, 2021: 20231103_17_53_03
Uploading 80N_010W, 2021: 20231103_17_53_03
Uploading 80N_000E, 2021: 20231103_17_53_06
Classifying 40N_000E, 2021: 20231103_17_53_06
Classifying 40N_010E, 2021: 20231103_17_53_08
Uploading 80N_020E, 2021: 20231103_17_53_08
Uploading 40N_000E, 2021: 20231103_17_53_10
Classifying 40N_000E, 2021: 20231103_17_53_11
Classifying 70N_020E, 2021: 20231103_17_53_12
Uploading 40N_010E, 2021: 20231103_17_53_12
Uploading 40N_000E, 2021: 20231103_17_53_15
Uploading 70N_020E, 2021: 20231103_17_53_17
Classifyin

In [None]:
# Restart via the Coiled client, e.g. between runs of the analysis cell.
# NOTE(review): presumably used to clear worker memory/state - confirm intent.
coiled_client.restart() 