In [2]:
import os

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import dask.config
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows

from numba import jit
import concurrent.futures

import boto3
import time
import math
import ctypes
import pandas as pd

import geopandas as gpd
import pandas as pd
import rioxarray
from shapely.geometry import Polygon
import subprocess

<font size="6">Cluster management</font> 

<font size="5">Creating clusters</font> 

In [None]:
# Full cluster: requests 40 Coiled workers with 32GiB of memory each in us-east-1.
# NOTE: the "Test cluster" cell uses the same cluster name and rebinds the same
# `coiled_cluster`/`coiled_client` variables, so run only one of the two cells.
coiled_cluster = coiled.Cluster(
    n_workers=40,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="AFOLU_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    worker_memory = "32GiB" 
)

# Coiled cluster (cloud run): client connected to the cluster created above
coiled_client = coiled_cluster.get_client()
# Last expression of the cell, so the client summary is displayed as the cell output
coiled_client

In [5]:
# Test cluster: same settings as the full cluster but with only 4 workers.
# NOTE: uses the same cluster name and rebinds the same `coiled_cluster`/`coiled_client`
# variables as the "Full cluster" cell, so run only one of the two cells.
coiled_cluster = coiled.Cluster(
    n_workers=4,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="AFOLU_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    worker_memory = "32GiB" 
)

# Coiled cluster (cloud run): client connected to the cluster created above
coiled_client = coiled_cluster.get_client()
# Last expression of the cell, so the client summary is displayed as the cell output
coiled_client

Output()

Output()

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-ocolo.dask.host/_zjN9OlyLUEUcsie/status,

0,1
Dashboard: https://cluster-ocolo.dask.host/_zjN9OlyLUEUcsie/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tls://10.0.92.205:8786,Workers: 0
Dashboard: http://10.0.92.205:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
# NOTE: this cell and the two local-client cells below all bind `local_client`; run only one of them.
local_client = Client(processes=False)
local_client

In [None]:
# Local client created with dask.distributed's default settings (no arguments passed).
# NOTE: rebinds `local_client`, replacing any client bound by another local-client cell.
local_client = Client()
local_client

In [70]:
# Local cluster with multiple workers
# `local_cluster` is kept as its own variable; `local_client` is the client attached to it.
local_cluster = LocalCluster()  
local_client = Client(local_cluster)
# Last expression of the cell, so the client/worker summary is displayed as the cell output
local_client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 24.91 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:40473,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 24.91 GiB

0,1
Comm: tcp://127.0.0.1:39331,Total threads: 2
Dashboard: http://127.0.0.1:46409/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:45349,
Local directory: /tmp/dask-scratch-space/worker-4dycb09z,Local directory: /tmp/dask-scratch-space/worker-4dycb09z

0,1
Comm: tcp://127.0.0.1:39793,Total threads: 2
Dashboard: http://127.0.0.1:40327/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:40539,
Local directory: /tmp/dask-scratch-space/worker-siuv56nv,Local directory: /tmp/dask-scratch-space/worker-siuv56nv

0,1
Comm: tcp://127.0.0.1:37081,Total threads: 2
Dashboard: http://127.0.0.1:38877/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:35341,
Local directory: /tmp/dask-scratch-space/worker-azkxf7o5,Local directory: /tmp/dask-scratch-space/worker-azkxf7o5

0,1
Comm: tcp://127.0.0.1:37577,Total threads: 2
Dashboard: http://127.0.0.1:33323/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:35073,
Local directory: /tmp/dask-scratch-space/worker-kq9axii9,Local directory: /tmp/dask-scratch-space/worker-kq9axii9


<font size="5">Shutting down cloud and local clusters</font> 

In [46]:
# Shuts down the Coiled (cloud) cluster bound to `coiled_cluster` by the full/test cluster cell
coiled_cluster.shutdown()

In [None]:
# Shuts down the local run via whichever client is currently bound to `local_client`
local_client.shutdown()

<font size="6">Variables</font> 

In [27]:
# General paths and constants

# s3 URI of the composite land cover data (not referenced by the functions visible in this notebook section)
composite_LC_uri = 's3://gfw2-data/landcover/composite'

# Key prefix (inside the gfw2-data bucket) under which save_and_upload() writes output rasters
s3_out_dir = 'climate/AFOLU_flux_model/LULUCF/outputs'

# Highest IPCC class code; matches `otherland` below
IPCC_class_max_val = 6

# IPCC codes
forest = 1
cropland = 2
settlement = 3
wetland = 4
grassland = 5
otherland = 6

# Year range of the analysis; check_chunk_for_data() steps through it in 5-year increments
first_year = 2000
last_year = 2020

# boto3 resource-style handle and bucket object for gfw2-data
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('gfw2-data')

# Module-level boto3 client. NOTE: save_and_upload() and make_index_shp() create their own
# client inside the function rather than using this one.
s3_client = boto3.client("s3")

<font size="6">General functions</font> 

In [9]:
def timestr():
    """Return the current local time formatted as YYYYMMDD_HH_MM_SS (used in log messages and file names)."""
    stamp_format = "%Y%m%d_%H_%M_%S"
    return time.strftime(stamp_format)

def boundstr(bounds):
    """Return the bounds as a single string: each value rounded to an integer, joined by underscores.

    Example: [0, -10, 2, -8] -> "0_-10_2_-8"
    """
    rounded_parts = (str(round(coordinate)) for coordinate in bounds)
    return "_".join(rounded_parts)

def calc_chunk_length_pixels(bounds):
    """Return the side length of a chunk in pixels, derived from its height in map units.

    Parameters
    ----------
    bounds : sequence of 4 numbers
        Chunk bounds; only bounds[1] and bounds[3] (the two y values) are used.

    Returns
    -------
    int
        (bounds[3] - bounds[1]) * 4000, rounded to the nearest whole pixel.
    """
    # 40000/10 = 4000 pixels per map unit (presumably 40000-pixel-wide 10-degree tiles -- TODO confirm)
    pixels_per_unit = 40000 / 10

    # Round rather than truncate: floating-point error in the subtraction can land just below
    # the true value (e.g. (0.3 - 0.2) * 4000 == 399.99999999999994), and int() alone would
    # then return one pixel too few.
    chunk_length_pixels = int(round((bounds[3] - bounds[1]) * pixels_per_unit))
    return chunk_length_pixels

In [10]:
# Returns list of all chunk boundaries within a bounding box for chunks of a given size
def get_chunk_bounds(chunk_params):
    """Tile a bounding box into square chunks.

    Parameters
    ----------
    chunk_params : sequence
        [min_x, min_y, max_x, max_y, chunk_size]

    Returns
    -------
    list of [x_min, y_min, x_max, y_max] lists
        One entry per chunk, ordered row by row from min_y upward and, within a row,
        from min_x upward. Chunks in the last row/column may extend past max_y/max_x
        when the box is not an exact multiple of chunk_size. Empty if the box is empty.

    Raises
    ------
    ValueError
        If chunk_size is not a positive number (a zero or negative size would otherwise
        make the loops below run forever).
    """
    min_x = chunk_params[0]
    min_y = chunk_params[1]
    max_x = chunk_params[2]
    max_y = chunk_params[3]
    chunk_size = chunk_params[4]

    # `not (... > 0)` also rejects NaN
    if not chunk_size > 0:
        raise ValueError(f"chunk_size must be a positive number, got {chunk_size!r}")

    chunks = []

    # Coordinates are advanced by repeated addition (not multiplication) so the
    # resulting bounds are exactly the same as they have always been.
    y = min_y
    while y < max_y:
        x = min_x
        while x < max_x:
            chunks.append([x, y, x + chunk_size, y + chunk_size])
            x += chunk_size
        y += chunk_size

    return chunks

In [11]:
# Returns the encompassing tile_id string in the form YYN/S_XXXE/W based on a coordinate
def xy_to_tile_id(top_left_x, top_left_y):
    """Return the id of the 10x10 tile containing a point, named after the tile's top-left corner.

    The latitude is rounded up and the longitude rounded down to the nearest multiple of 10,
    e.g. (0, -8) -> "00N_000E" and (-75.5, 42.3) -> "50N_080W".
    """
    tile_top = 10 * math.ceil(top_left_y / 10.0)
    tile_left = 10 * math.floor(top_left_x / 10.0)

    # Hemisphere letter comes from the sign; the number itself is always written unsigned
    if tile_top >= 0:
        lat_part = f"{tile_top:02d}N"
    else:
        lat_part = f"{-tile_top:02d}S"

    if tile_left >= 0:
        lng_part = f"{tile_left:03d}E"
    else:
        lng_part = f"{-tile_left:03d}W"

    return "_".join((lat_part, lng_part))

In [12]:
# Opens tile and reads band 1 within the provided bounds (i.e. one chunk), returning it as a numpy array
# If it can't open or read the chunk (no data in it), it returns an array of all 0s
def get_tile_dataset_rio(uri, bounds, chunk_length):
    """Read one chunk of band 1 from a raster.

    Parameters
    ----------
    uri : str
        Path/URI passed to rasterio.open.
    bounds : sequence of 4 numbers
        Unpacked into rasterio.windows.from_bounds to select the window to read.
    chunk_length : int
        Side length of the all-zero fallback array.

    Returns
    -------
    numpy.ndarray
        The data read from the window, or a (chunk_length, chunk_length) array of zeros
        when opening/reading fails or the window contains no pixels.
    """
    try:
        with rasterio.open(uri) as ds:
            window = rasterio.windows.from_bounds(*bounds, ds.transform)
            data = ds.read(1, window=window)
    except Exception:
        # Best effort: any read failure is treated as "no data in this chunk".
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate so a run can still be cancelled.
        return np.zeros((chunk_length, chunk_length))

    if data.size == 0:
        # Window fell outside the raster, so nothing was read
        return np.zeros((chunk_length, chunk_length))

    return data

In [13]:
# Submits the reads of every input layer for one chunk to a thread pool.
# Chunks are defined by a bounding box.
def prepare_to_download_chunk(bounds, download_dict):
    """Read all input layers for one chunk concurrently.

    Parameters
    ----------
    bounds : sequence of 4 numbers
        Chunk bounds; passed to each read and used to derive the tile id and chunk size.
    download_dict : dict
        Maps a layer name to the URI to read for that layer.

    Returns
    -------
    dict
        Maps each concurrent.futures.Future (one per layer) to its layer name.
        Leaving the ThreadPoolExecutor `with` block waits for the submitted reads,
        so the futures are already finished when this function returns.
    """
    bounds_str = boundstr(bounds)
    tile_id = xy_to_tile_id(bounds[0], bounds[3])
    chunk_length_pixels = calc_chunk_length_pixels(bounds)

    # All layers are submitted up front so the reads from S3 overlap instead of running one after another
    with concurrent.futures.ThreadPoolExecutor() as pool:

        dask_print(f"Requesting data in chunk {bounds_str} in {tile_id}: {timestr()}")

        futures = {
            pool.submit(get_tile_dataset_rio, layer_uri, bounds, chunk_length_pixels): layer_name
            for layer_name, layer_uri in download_dict.items()
        }

    return futures

In [14]:
def check_chunk_for_data(layers, item_to_check, bounds_str, tile_id):
    """Check whether a chunk has any data in the layers of one input across the analysis years.

    Looks at layers[f"{item_to_check}{year}"] for every 5th year from first_year+5
    through last_year and stops at the first year whose pixel values sum to more than 0.

    Parameters
    ----------
    layers : dict
        Maps layer names (f"{item_to_check}{year}") to arrays.
    item_to_check : str
        Layer-name prefix to check.
    bounds_str : str
        Chunk bounds string, used only in log messages.
    tile_id : str
        Tile id, used only in log messages.

    Returns
    -------
    0 or None
        0 when no year has data (the caller should skip the chunk);
        None when data was found, as before.
    """
    for year in range(first_year + 5, last_year + 1, 5):

        year_total = np.sum(layers[f"{item_to_check}{year}"])

        # One year with data is enough, so stop looking. (This previously used `continue`,
        # which did nothing at the end of the loop body: every remaining year was still
        # summed and logged.)
        if year_total > 0:
            dask_print(f"Data in chunk {bounds_str} for {year}. Proceeding.")
            return None

    # If there's no data in the chunk, stop working on the chunk
    dask_print(f"No data in chunk {bounds_str} in {tile_id}. Skipping: {timestr()}")
    return 0

In [41]:
# Saves array as a raster locally, then uploads it to s3
def save_and_upload(bounds, chunk_length_pixels, tile_id, bounds_str, output_dict, is_final):
    """Write each output array of a chunk to a temporary GeoTIFF and upload it to the gfw2-data bucket.

    Parameters
    ----------
    bounds : sequence of 4 numbers
        Chunk bounds, used to build the raster's geotransform.
    chunk_length_pixels : int
        Width and height of the output raster in pixels.
    tile_id : str
        Tile id, used in the file name and log messages.
    bounds_str : str
        Chunk bounds string, used in the file name and log messages.
    output_dict : dict
        Maps an output name (used in the file name) to a sequence in which
        element 0 is the array to write, element 2 is the data-meaning folder name and
        element 3 is the year (or year range) folder name.
    is_final : bool
        If True the file name has no timestamp; otherwise the current time is appended.

    Returns
    -------
    None
    """
    s3_client = boto3.client("s3") # Needs to be in the same function as the upload_file call

    transform = rasterio.transform.from_bounds(*bounds, width=chunk_length_pixels, height=chunk_length_pixels)

    file_info = f'{tile_id}__{bounds_str}'

    # For every output file, saves from array to local raster, then to s3.
    # Can't save directly to s3, unfortunately, so need to save locally first.
    for key, value in output_dict.items():

        data_meaning = value[2]
        year_out = value[3]

        dask_print(f"Saving {bounds_str} in {tile_id} for {year_out}: {timestr()}")

        if is_final:
            file_name = f"{file_info}__{key}.tif"
        else:
            file_name = f"{file_info}__{key}__{timestr()}.tif"

        local_path = f"/tmp/{file_name}"

        try:
            with rasterio.open(local_path, 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype='uint8', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                dst.write(value[0].astype(rasterio.uint8), 1)

            s3_path = f"{s3_out_dir}/{data_meaning}/{year_out}/{chunk_length_pixels}_pixels/{time.strftime('%Y%m%d')}"

            dask_print(f"Uploading {bounds_str} in {tile_id} for {year_out} to {s3_path}: {timestr()}")

            s3_client.upload_file(local_path, "gfw2-data", Key=f"{s3_path}/{file_name}")
        finally:
            # Deletes the local raster even when the write or upload fails, so failed
            # chunks don't pile up on the worker's disk
            if os.path.exists(local_path):
                os.remove(local_path)

In [93]:
def make_index_shp(year, type, max_rasters=3):
    """Build a shapefile of raster footprints for one year and output type, and upload it to s3.

    Lists the output folder for the given year/type with `aws s3 ls`, reads the bounds of the
    .tif files found there, writes one rectangle per raster to a local shapefile, and uploads
    the shapefile to the same s3 folder.

    Parameters
    ----------
    year : int
        Output year. For "IPCC_basic_change" the folder covers year-5 to year.
    type : str
        "IPCC_basic_classes" or "IPCC_basic_change". (The name shadows the builtin `type`;
        it is kept so existing callers keep working.)
    max_rasters : int or None, optional
        Maximum number of rasters to include in the index. Defaults to 3, which matches the
        previous hard-coded cut-off. NOTE(review): that cut-off looks like a leftover debugging
        limit -- pass None to index every raster in the folder.

    Returns
    -------
    str
        A completion message containing the year, type and time.

    Raises
    ------
    ValueError
        If `type` is not one of the two supported output types.
    subprocess.CalledProcessError
        If the `aws s3 ls` call fails.
    """
    dask_print(f"Year: {year}; type: {type}: {timestr()}")

    # NOTE: the date folder is hard-coded to the 20240129 run
    if type == "IPCC_basic_classes":
        in_folder = f"climate/AFOLU_flux_model/LULUCF/outputs/{type}/{year}/8000_pixels/20240129/"
    elif type == "IPCC_basic_change":
        in_folder = f"climate/AFOLU_flux_model/LULUCF/outputs/{type}/{year-5}_{year}/8000_pixels/20240129/"
    else:
        # Previously an unknown type fell through and failed later with a NameError on in_folder
        raise ValueError(f"Unsupported type {type!r}: expected 'IPCC_basic_classes' or 'IPCC_basic_change'")

    full_in_folder = f's3://gfw2-data/{in_folder}'

    # Arguments are passed as a list (no shell), so the folder name is not interpreted by a shell
    cmd = ['aws', 's3', 'ls', full_in_folder]
    s3_contents_bytes = subprocess.check_output(cmd)

    s3_contents_str = s3_contents_bytes.decode('utf-8')
    # The object name is the last whitespace-separated field of each listing line
    filenames = [line.split()[-1] for line in s3_contents_str.splitlines() if line.strip()]

    tile_paths = [full_in_folder + filename for filename in filenames]

    # Collect the records first and build the GeoDataFrame once, instead of concatenating
    # a one-row frame per raster inside the loop
    locations = []
    geometries = []
    bound_boxes = []

    # https://gis.stackexchange.com/a/281996/121380
    for count, fname in enumerate(tile_paths):
        if not fname.endswith(".tif"):
            continue

        if max_rasters is not None and len(locations) >= max_rasters:
            break

        # Context manager closes the dataset as soon as the bounds have been read
        with rasterio.open(fname) as small_tif:
            bounds = small_tif.bounds

        dask_print(f"Raster {count}: {fname}")

        geometries.append(Polygon([(bounds[0], bounds[1]),
                                   (bounds[0], bounds[3]),
                                   (bounds[2], bounds[3]),
                                   (bounds[2], bounds[1])]))
        locations.append(fname)
        bound_boxes.append(boundstr(bounds))

    # The CRS is set on the frame itself so it is written with the file
    df = gpd.GeoDataFrame({"location": locations, "bound_box": bound_boxes}, geometry=geometries, crs="EPSG:4326")

    shp = f"{type}_{year}_output_index_{in_folder[-9:-1]}_dask.shp"

    dask_print(f"Saving locally {shp}: {timestr()}")

    df.to_file(shp)

    dask_print(f"Uploading {full_in_folder}{shp}: {timestr()}")

    s3_client = boto3.client("s3")  # Needs to be in the same function as the upload_file call

    # A shapefile is a set of files (.shp plus sidecars such as .shx and .dbf); the .shp on its
    # own cannot be opened, so upload every component that was written
    shp_base = os.path.splitext(shp)[0]
    for extension in (".shp", ".shx", ".dbf", ".prj", ".cpg"):
        component = f"{shp_base}{extension}"
        if os.path.exists(component):
            s3_client.upload_file(component, "gfw2-data", Key=f"{in_folder}{component}")

    return f"Completed Year: {year}; type: {type}: {timestr()}"

In [94]:
%%time

# Output types and years to build index shapefiles for
types = ["IPCC_basic_classes", "IPCC_basic_change"]
years = [2005, 2010, 2015, 2020]

# Creates list of lazy tasks to run (1 task = 1 year/type combination, so 8 tasks here)
delayed_result = [dask.delayed(make_index_shp)(year, type) for year in years for type in types]

# Actually runs the tasks; each returns its completion message
results = dask.compute(*delayed_result)
# Last expression of the cell, so the tuple of completion messages is displayed as the cell output
results

Year: 2010; type: IPCC_basic_change: 20240131_11_56_06
Year: 2020; type: IPCC_basic_classes: 20240131_11_56_06
Year: 2005; type: IPCC_basic_classes: 20240131_11_56_06
Year: 2020; type: IPCC_basic_change: 20240131_11_56_06
Year: 2015; type: IPCC_basic_classes: 20240131_11_56_06
Year: 2005; type: IPCC_basic_change: 20240131_11_56_06
Year: 2010; type: IPCC_basic_classes: 20240131_11_56_06
Year: 2015; type: IPCC_basic_change: 20240131_11_56_06
Raster 0: s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/8000_pixels/20240129/00N_000E__0_-10_2_-8__IPCC_change_2005_2010.tif
Raster 0: s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2020/8000_pixels/20240129/00N_000E__0_-10_2_-8__IPCC_classes_2020.tif
Raster 0: s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/8000_pixels/20240129/00N_000E__0_-10_2_-8__IPCC_change_2000_2005.tif
Raster 0: s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_cl

('Completed Year: 2005; type: IPCC_basic_classes: 20240131_11_56_33',
 'Completed Year: 2005; type: IPCC_basic_change: 20240131_11_56_30',
 'Completed Year: 2010; type: IPCC_basic_classes: 20240131_11_56_30',
 'Completed Year: 2010; type: IPCC_basic_change: 20240131_11_56_28',
 'Completed Year: 2015; type: IPCC_basic_classes: 20240131_11_56_30',
 'Completed Year: 2015; type: IPCC_basic_change: 20240131_11_56_30',
 'Completed Year: 2020; type: IPCC_basic_classes: 20240131_11_56_28',
 'Completed Year: 2020; type: IPCC_basic_change: 20240131_11_56_33')