In [1]:
import os
import tempfile
import time

os.environ['USE_PYGEOS'] = '0'   # Suppresses some warning about geopandas
import geopandas as gpd

# scipy basics
import numpy as np
import boto3
import botocore
#from osgeo import gdal      # Necessary to do this import to get rasterio to import
import rasterio as rio
import rasterio.features

# dask/parallelization libraries
import coiled
import dask
import dask.array as dar
from dask.distributed import Client, LocalCluster
import rioxarray
import xarray as xr

## Making cloud and local clusters

In [27]:
# Cloud (Coiled) Dask cluster. Arguments are grouped as: identity, placement, sizing/cost.
coiled_cluster = coiled.Cluster(
    # --- identity ---
    account='jterry64',   # Necessary to use the AWS environment that Justin set up in Coiled
    name="flux_model_test_justin",
    # name="DGibbs Europe height flux model",
    # --- placement ---
    region="us-east-2",
    use_best_zone=True,
    # --- sizing / cost ---
    n_workers=25,
    compute_purchase_option="spot_with_fallback",
    idle_timeout="20 minutes",
)

Output()

Output()

In [28]:
# Coiled cluster (cloud run): get a Dask client connected to coiled_cluster.
coiled_client = coiled_cluster.get_client()
coiled_client   # bare last expression so the notebook renders the client summary

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-rmhqn.dask.host/a5Zn2-YUI8TC-KAo/status,

0,1
Dashboard: https://cluster-rmhqn.dask.host/a5Zn2-YUI8TC-KAo/status,Workers: 14
Total threads: 56,Total memory: 207.12 GiB

0,1
Comm: tls://10.1.9.46:8786,Workers: 14
Dashboard: http://10.1.9.46:8787/status,Total threads: 56
Started: Just now,Total memory: 207.12 GiB

0,1
Comm: tls://10.1.15.176:43591,Total threads: 4
Dashboard: http://10.1.15.176:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.15.176:41571,
Local directory: /scratch/dask-scratch-space/worker-gbr5nt5t,Local directory: /scratch/dask-scratch-space/worker-gbr5nt5t

0,1
Comm: tls://10.1.5.25:39443,Total threads: 4
Dashboard: http://10.1.5.25:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.5.25:39761,
Local directory: /scratch/dask-scratch-space/worker-dfzaxhk9,Local directory: /scratch/dask-scratch-space/worker-dfzaxhk9

0,1
Comm: tls://10.1.3.137:33903,Total threads: 4
Dashboard: http://10.1.3.137:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.3.137:45217,
Local directory: /scratch/dask-scratch-space/worker-wljyy6rk,Local directory: /scratch/dask-scratch-space/worker-wljyy6rk

0,1
Comm: tls://10.1.15.207:41661,Total threads: 4
Dashboard: http://10.1.15.207:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.15.207:38595,
Local directory: /scratch/dask-scratch-space/worker-5zxz89hj,Local directory: /scratch/dask-scratch-space/worker-5zxz89hj

0,1
Comm: tls://10.1.13.107:44695,Total threads: 4
Dashboard: http://10.1.13.107:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.13.107:45615,
Local directory: /scratch/dask-scratch-space/worker-3xdtczck,Local directory: /scratch/dask-scratch-space/worker-3xdtczck

0,1
Comm: tls://10.1.0.109:36349,Total threads: 4
Dashboard: http://10.1.0.109:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.0.109:33851,
Local directory: /scratch/dask-scratch-space/worker-endr_vzs,Local directory: /scratch/dask-scratch-space/worker-endr_vzs

0,1
Comm: tls://10.1.9.32:32941,Total threads: 4
Dashboard: http://10.1.9.32:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.9.32:42919,
Local directory: /scratch/dask-scratch-space/worker-1tvz99jv,Local directory: /scratch/dask-scratch-space/worker-1tvz99jv

0,1
Comm: tls://10.1.13.159:46511,Total threads: 4
Dashboard: http://10.1.13.159:8787/status,Memory: 14.81 GiB
Nanny: tls://10.1.13.159:33371,
Local directory: /scratch/dask-scratch-space/worker-n1ivfoc_,Local directory: /scratch/dask-scratch-space/worker-n1ivfoc_

0,1
Comm: tls://10.1.12.146:36809,Total threads: 4
Dashboard: http://10.1.12.146:8787/status,Memory: 14.65 GiB
Nanny: tls://10.1.12.146:40281,
Local directory: /scratch/dask-scratch-space/worker-p8i3n0zd,Local directory: /scratch/dask-scratch-space/worker-p8i3n0zd

0,1
Comm: tls://10.1.0.51:40235,Total threads: 4
Dashboard: http://10.1.0.51:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.0.51:35633,
Local directory: /scratch/dask-scratch-space/worker-4qp7xv4y,Local directory: /scratch/dask-scratch-space/worker-4qp7xv4y

0,1
Comm: tls://10.1.9.81:33435,Total threads: 4
Dashboard: http://10.1.9.81:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.9.81:33057,
Local directory: /scratch/dask-scratch-space/worker-2ovcxksy,Local directory: /scratch/dask-scratch-space/worker-2ovcxksy

0,1
Comm: tls://10.1.10.212:36759,Total threads: 4
Dashboard: http://10.1.10.212:8787/status,Memory: 14.66 GiB
Nanny: tls://10.1.10.212:46271,
Local directory: /scratch/dask-scratch-space/worker-0kc6bqsh,Local directory: /scratch/dask-scratch-space/worker-0kc6bqsh

0,1
Comm: tls://10.1.5.54:43537,Total threads: 4
Dashboard: http://10.1.5.54:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.5.54:35353,
Local directory: /scratch/dask-scratch-space/worker-hjngtvyj,Local directory: /scratch/dask-scratch-space/worker-hjngtvyj

0,1
Comm: tls://10.1.9.59:43769,Total threads: 4
Dashboard: http://10.1.9.59:8787/status,Memory: 14.82 GiB
Nanny: tls://10.1.9.59:39027,
Local directory: /scratch/dask-scratch-space/worker-9nbjtcu1,Local directory: /scratch/dask-scratch-space/worker-9nbjtcu1


In [2]:
# Local cluster (local run). Client() is called with no arguments; the repr recorded with this cell
# shows a distributed.LocalCluster with 4 worker processes (3 threads each), so .compute() is spread
# over several local processes rather than a cloud cluster.
local_client = Client()
local_client   # bare last expression so the notebook renders the client summary

Perhaps you already have a cluster running?
Hosting the HTTP server on port 57504 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:57504/status,

0,1
Dashboard: http://127.0.0.1:57504/status,Workers: 4
Total threads: 12,Total memory: 16.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:57505,Workers: 4
Dashboard: http://127.0.0.1:57504/status,Total threads: 12
Started: Just now,Total memory: 16.00 GiB

0,1
Comm: tcp://127.0.0.1:57516,Total threads: 3
Dashboard: http://127.0.0.1:57520/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57508,
Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-j5086dpj,Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-j5086dpj

0,1
Comm: tcp://127.0.0.1:57517,Total threads: 3
Dashboard: http://127.0.0.1:57522/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57510,
Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-ggtlq3xe,Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-ggtlq3xe

0,1
Comm: tcp://127.0.0.1:57519,Total threads: 3
Dashboard: http://127.0.0.1:57526/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57512,
Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-sk0vxn7u,Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-sk0vxn7u

0,1
Comm: tcp://127.0.0.1:57518,Total threads: 3
Dashboard: http://127.0.0.1:57524/status,Memory: 4.00 GiB
Nanny: tcp://127.0.0.1:57514,
Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-fpe02nu7,Local directory: /var/folders/9r/q_gyrww12ys5_myvxzxdvl5m0000gn/T/dask-scratch-space/worker-fpe02nu7


## Shutting down cloud and local clusters

In [47]:
# Shut down the Coiled cloud cluster. Requires the cell that creates coiled_cluster to have been run.
coiled_cluster.shutdown()

In [12]:
# Shut down the local Dask client. Requires the cell that creates local_client to have been run in
# this kernel session; otherwise this raises NameError (as in the output recorded with this cell).
local_client.shutdown()

NameError: name 'local_client' is not defined

## Analysis

### Paths and functions

In [29]:
# Shared S3 locations, local output folder and sizing constants

# Base S3 prefix under which the forest height / loss detection rasters are read
general_uri = "s3://gfw2-data/forest_change/GLAD_Europe_height_data/"

# Sub-folder of the base prefix (named as dummy random data)
random_data_uri = general_uri + "dummy_random_data__20230901/"

# Local output folder (absolute Windows path)
local_out_dir = "C:\\GIS\\Carbon_model_Europe\\outputs\\"

# Date stamp (YYYYMMDD) taken when this cell is run
timestr = time.strftime("%Y%m%d")

# Tile size in degrees is from the top left of the tile. 10 is a full tile. Anything smaller is a subset of that.
tile_size = 10

# Side length of one chunk array; also used to derive the chunk size in degrees
chunk_length = 8000



In [30]:
import shapely.geometry as geometry
import shapely.wkt as wkt

def get_chunk_bounds(min_x, min_y, max_x, max_y, cell_size):
    """Split a bounding box into a grid of square chunks.

    Chunks are generated row by row: starting at (min_x, min_y), stepping by
    cell_size along x until max_x is reached, then moving up by cell_size in y.
    Chunks are not clipped to the extent, so the last column/row may extend
    past max_x / max_y when the extent is not a multiple of cell_size.

    Args:
        min_x, min_y: lower-left corner of the extent.
        max_x, max_y: upper-right corner of the extent (exclusive start limit).
        cell_size: side length of each chunk, in the same units as the extent.

    Returns:
        A list of [x, y, x + cell_size, y + cell_size] lists; empty when the
        extent has no width or height.

    Raises:
        ValueError: if cell_size is not a positive number. Without this check
            a zero, negative or NaN cell_size would loop forever.
    """
    # "not > 0" (rather than "<= 0") also rejects NaN
    if not cell_size > 0:
        raise ValueError(f"cell_size must be positive, got {cell_size!r}")

    chunks = []
    y = min_y
    while y < max_y:
        x = min_x   # restart at the left edge for every row
        while x < max_x:
            chunks.append([x, y, x + cell_size, y + cell_size])
            x += cell_size
        y += cell_size

    return chunks


def xy_to_tile_id(top_left_x, top_left_y):
    """Build a tile ID string such as "40N_020W" from a top-left corner.

    Longitude is zero-padded to 3 characters and suffixed E (>= 0) or W (< 0);
    latitude is zero-padded to 2 characters and suffixed N (>= 0) or S (< 0).

    Args:
        top_left_x: longitude of the top-left corner.
        top_left_y: latitude of the top-left corner.

    Returns:
        The ID formatted as "<lat>_<lng>".
    """
    def _whole_degrees(value):
        # Whole-number floats (e.g. 36.0, which is what chunk bounds contain when the
        # cell size is a float) must render as "36", not "36.0". Anything else is
        # passed through unchanged.
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    x = _whole_degrees(top_left_x)
    y = _whole_degrees(top_left_y)

    lng: str = f"{str(x).zfill(3)}E" if (x >= 0) else f"{str(-x).zfill(3)}W"
    lat: str = f"{str(y).zfill(2)}N" if (y >= 0) else f"{str(-y).zfill(2)}S"

    return f"{lat}_{lng}"

In [31]:
import rasterio.windows
import rasterio
import numpy as np

def get_tile_dataset_rio(uri, bounds):
    """Read band 1 of a raster for the window that covers `bounds`.

    Args:
        uri: path/URI passed to rasterio.open.
        bounds: (left, bottom, right, top) sequence, unpacked into
            rasterio.windows.from_bounds together with the dataset transform.

    Returns:
        The 2-D array read from the window. If rasterio raises RasterioIOError,
        a zero-filled uint8 array of shape (chunk_length, chunk_length) is
        returned instead, so the failure is treated as "no data" for this chunk.
    """
    try:
        with rasterio.open(uri) as ds:
            window = rasterio.windows.from_bounds(*bounds, ds.transform)
            return ds.read(1, window=window)
    except rasterio.errors.RasterioIOError:
        # Best-effort fallback: the error is swallowed on purpose and the caller cannot
        # tell a failed read from real zeros.
        # uint8 keeps the placeholder 8x smaller than np.zeros' float64 default
        # (64 MB instead of 512 MB for an 8000 x 8000 chunk).
        return np.zeros((chunk_length, chunk_length), dtype=np.uint8)
    

### Model steps

In [33]:
from numba import jit
import numpy as np
import concurrent.futures

@jit(nopython=True)
def classify(forest_height_previous_block, forest_height_current_block, forest_loss_detection_block, driver_block, planted_forest_type_block, peat_block, tclf_block):
    """Assign a forest-state code to every pixel of a block.

    All blocks are indexed with the same [row, col] as forest_height_previous_block,
    whose shape defines the loop extent and the shape of the result.

    Decision tree (height threshold is 5):
        previous >= 5 and current >= 5 (maintained): 4 if peat else 6
        previous >= 5 and current <  5 (loss):       2
        previous <  5 and current >= 5 (gain):       3
        otherwise, first match of:
            peat                    -> 5
            driver                  -> 6
            tclf > 0                -> 7
            planted_forest_type > 0 -> 8
            none of the above       -> 0 (no forest)

    NOTE(review): code 6 is produced by two different branches (maintained/non-peat
    and driver) -- confirm this is intended.

    Args:
        forest_height_previous_block: forest height for the previous year.
        forest_height_current_block: forest height for the current year.
        forest_loss_detection_block: accepted for interface compatibility; not
            used by the current decision tree.
        driver_block: tested for truthiness.
        planted_forest_type_block: tested for > 0.
        peat_block: tested for truthiness.
        tclf_block: tested for > 0.

    Returns:
        float64 array of state codes with the shape of forest_height_previous_block.
    """
    forest_states = np.zeros(forest_height_previous_block.shape)

    n_rows, n_cols = forest_height_previous_block.shape
    for row in range(n_rows):
        for col in range(n_cols):
            forest_height_previous = forest_height_previous_block[row, col]
            forest_height_current = forest_height_current_block[row, col]
            driver = driver_block[row, col]
            planted_forest_type = planted_forest_type_block[row, col]
            peat = peat_block[row, col]
            tclf = tclf_block[row, col]

            if forest_height_previous >= 5 and forest_height_current >= 5:   # maintained
                if peat:
                    forest_states[row, col] = 4
                else:
                    forest_states[row, col] = 6
            elif forest_height_previous >= 5 and forest_height_current < 5:  # loss
                forest_states[row, col] = 2
            elif forest_height_previous < 5 and forest_height_current >= 5:  # gain
                forest_states[row, col] = 3
            elif peat:
                forest_states[row, col] = 5
            elif driver:
                forest_states[row, col] = 6
            elif tclf > 0:
                forest_states[row, col] = 7
            elif planted_forest_type > 0:
                forest_states[row, col] = 8
            else:                                                            # no forest
                forest_states[row, col] = 0

    return forest_states

In [43]:
import rasterio.transform
from rasterio.profiles import DefaultGTiffProfile

def process_chunk(bounds):
    """Classify forest states for one chunk and upload one GeoTIFF per year to S3.

    Steps:
      1. Read, concurrently, the four ancillary layers (drivers, planted forest
         type, peat, tree cover loss fires), forest height for 2001-2021 and
         forest loss detection for 2002-2021, each windowed to `bounds`.
      2. For each year 2002-2021, call classify() on the previous/current
         forest height pair plus the other layers.
      3. Write the result as a uint8 GeoTIFF in a temporary directory, upload it
         to the gfw2-data bucket under climate/forest_states/justin_test/, and
         delete the local copy.

    Args:
        bounds: [min_x, min_y, max_x, max_y] of the chunk (the layout produced
            by get_chunk_bounds).

    Returns:
        "success", or "error: <message>" if any exception was raised. Exceptions
        are not propagated, so one bad chunk does not abort the whole run.
    """
    first_year, last_year = 2001, 2021

    futures = {}
    layers = {}

    try:
        # NOTE(review): tile_id is built from this chunk's own corner (bounds[0], bounds[3]).
        # If the ancillary rasters are stored per larger tile -- TODO confirm -- these URIs
        # will not exist and get_tile_dataset_rio will silently substitute zeros.
        tile_id = xy_to_tile_id(bounds[0], bounds[3])
        ancillary_uris = {
            "drivers": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_drivers/processed/drivers_2022/20230407/{tile_id}_tree_cover_loss_driver_processed.tif",
            "planted_forest_type": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/planted_forest_type/SDPT_v1/standard/20200730/{tile_id}_plantation_type_oilpalm_woodfiber_other_unmasked.tif",
            "peat": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/peatlands/processed/20230315/{tile_id}_peat_mask_processed.tif",
            "tclf": f"s3://gfw2-data/climate/carbon_model/other_emissions_inputs/tree_cover_loss_fires/20230315/processed/{tile_id}_tree_cover_loss_fire_processed.tif",
        }

        # submit requests to S3 for layers; leaving the `with` block waits for all of them
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # save as dictionary from future -> name so we can know what layer it is on completion
            for layer_name, uri in ancillary_uris.items():
                futures[executor.submit(get_tile_dataset_rio, uri, bounds)] = layer_name

            # faster to just get every year up front unless we're running into memory issues
            for year in range(first_year, last_year + 1):
                forest_height_uri = f'{general_uri}202307_revision/FH_{year}.tif'
                futures[executor.submit(get_tile_dataset_rio, forest_height_uri, bounds)] = f"forest_height_{year}"

                # loss detection for the first year is never read below, so don't fetch it
                if year > first_year:
                    forest_loss_detection_uri = f'{general_uri}202307_revision/DFL_{year}.tif'
                    futures[executor.submit(get_tile_dataset_rio, forest_loss_detection_uri, bounds)] = f"forest_loss_detection_{year}"

        # collect the data that came back from S3 (re-raises any exception from a read)
        for future in concurrent.futures.as_completed(futures):
            layers[futures[future]] = future.result()

        # year-independent values: create once instead of once per year
        s3_client = boto3.client("s3")
        transform = rasterio.transform.from_bounds(*bounds, width=chunk_length, height=chunk_length)
        bounds_label = "_".join([str(round(x)) for x in bounds])

        # temporary directory is removed on exit, so workers do not accumulate output files
        with tempfile.TemporaryDirectory() as tmp_dir:
            # run classifier one year at a time
            for year in range(first_year + 1, last_year + 1):
                forest_states = classify(
                    layers[f"forest_height_{year - 1}"],
                    layers[f"forest_height_{year}"],
                    layers[f"forest_loss_detection_{year}"],
                    layers["drivers"],
                    layers["planted_forest_type"],
                    layers["peat"],
                    layers["tclf"]
                )

                file_name = f"{bounds_label}_{year}"
                local_path = os.path.join(tmp_dir, f"{file_name}.tif")

                # NOTE(review): GDAL's GTiff driver documents BLOCKXSIZE/BLOCKYSIZE as tile
                # dimensions; without tiled=True they may have no effect -- confirm.
                with rasterio.open(local_path, 'w', driver='GTiff', width=chunk_length, height=chunk_length, count=1, dtype='uint8', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                    dst.write(forest_states.astype(rasterio.uint8), 1)

                s3_client.upload_file(local_path, "gfw2-data", Key=f"climate/forest_states/justin_test/{file_name}.tif")
                os.remove(local_path)   # free worker disk as soon as the upload is done

                # clear memory of arrays we don't need anymore
                del forest_states
                del layers[f"forest_height_{year - 1}"]
                del layers[f"forest_loss_detection_{year}"]
    except Exception as e:
        return f"error: {e}"

    return "success"

In [44]:
%%time
from numba import jit
import numpy as np
import concurrent.futures
import boto3

"""
Some code that applies the decision tree to decision_tree_ds to make an xarray of forest_states for the previous and current years
"""

chunks = get_chunk_bounds(-11, 34, 32, 72, chunk_length / 4000)
delayed = [dask.delayed(process_chunk)(chunk) for chunk in chunks]

results = dask.compute(*delayed)
results



CPU times: user 2.11 s, sys: 413 ms, total: 2.52 s
Wall time: 17min 34s


('success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'success',
 'su

2023-10-23 15:13:38,111 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
