In [2]:
import os

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import dask.config
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows

from numba import jit
import concurrent.futures

import boto3
import time
import math
import ctypes
import pandas as pd

import geopandas as gpd
import pandas as pd
import rioxarray
from shapely.geometry import Polygon
import subprocess

<font size="6">Cluster management</font> 

<font size="5">Creating clusters</font> 

In [3]:
# Full cluster
# Requests a 40-worker Coiled cluster in us-east-1 and binds it to `coiled_cluster`.
# NOTE(review): the test-cluster cell uses the same cluster name ("AFOLU_flux_model");
# presumably Coiled reconnects to a running cluster of that name instead of creating
# a new one with these settings -- confirm before switching between the two cells.
coiled_cluster = coiled.Cluster(
    n_workers=40,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="AFOLU_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    worker_memory = "64GiB" 
)

# Coiled cluster (cloud run)
# The bare `coiled_client` on the last line displays the client summary as the cell output.
# The summary only reflects workers that have joined when the cell finishes, so it can
# list fewer workers than the 40 requested above.
coiled_client = coiled_cluster.get_client()
coiled_client

Output()

Output()

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-repwp.dask.host/I-jwrOr_YCElDJmG/status,

0,1
Dashboard: https://cluster-repwp.dask.host/I-jwrOr_YCElDJmG/status,Workers: 10
Total threads: 160,Total memory: 610.32 GiB

0,1
Comm: tls://10.0.35.83:8786,Workers: 10
Dashboard: http://10.0.35.83:8787/status,Total threads: 160
Started: Just now,Total memory: 610.32 GiB

0,1
Comm: tls://10.0.32.11:44561,Total threads: 16
Dashboard: http://10.0.32.11:8787/status,Memory: 61.10 GiB
Nanny: tls://10.0.32.11:44683,
Local directory: /scratch/dask-scratch-space/worker-ev33yi2l,Local directory: /scratch/dask-scratch-space/worker-ev33yi2l

0,1
Comm: tls://10.0.44.182:34433,Total threads: 16
Dashboard: http://10.0.44.182:8787/status,Memory: 61.09 GiB
Nanny: tls://10.0.44.182:43021,
Local directory: /scratch/dask-scratch-space/worker-a9zjvdde,Local directory: /scratch/dask-scratch-space/worker-a9zjvdde

0,1
Comm: tls://10.0.38.165:46207,Total threads: 16
Dashboard: http://10.0.38.165:8787/status,Memory: 61.10 GiB
Nanny: tls://10.0.38.165:36779,
Local directory: /scratch/dask-scratch-space/worker-dqgrlk56,Local directory: /scratch/dask-scratch-space/worker-dqgrlk56

0,1
Comm: tls://10.0.33.243:33523,Total threads: 16
Dashboard: http://10.0.33.243:8787/status,Memory: 61.10 GiB
Nanny: tls://10.0.33.243:38867,
Local directory: /scratch/dask-scratch-space/worker-n0mp84lw,Local directory: /scratch/dask-scratch-space/worker-n0mp84lw

0,1
Comm: tls://10.0.39.253:45135,Total threads: 16
Dashboard: http://10.0.39.253:8787/status,Memory: 61.09 GiB
Nanny: tls://10.0.39.253:35873,
Local directory: /scratch/dask-scratch-space/worker-zx0et3pb,Local directory: /scratch/dask-scratch-space/worker-zx0et3pb

0,1
Comm: tls://10.0.33.6:39843,Total threads: 16
Dashboard: http://10.0.33.6:8787/status,Memory: 61.09 GiB
Nanny: tls://10.0.33.6:38279,
Local directory: /scratch/dask-scratch-space/worker-9cyjsvta,Local directory: /scratch/dask-scratch-space/worker-9cyjsvta

0,1
Comm: tls://10.0.45.207:38429,Total threads: 16
Dashboard: http://10.0.45.207:8787/status,Memory: 61.11 GiB
Nanny: tls://10.0.45.207:35205,
Local directory: /scratch/dask-scratch-space/worker-krg7urj2,Local directory: /scratch/dask-scratch-space/worker-krg7urj2

0,1
Comm: tls://10.0.35.202:43089,Total threads: 16
Dashboard: http://10.0.35.202:8787/status,Memory: 61.09 GiB
Nanny: tls://10.0.35.202:38751,
Local directory: /scratch/dask-scratch-space/worker-_yulytil,Local directory: /scratch/dask-scratch-space/worker-_yulytil

0,1
Comm: tls://10.0.43.180:42249,Total threads: 16
Dashboard: http://10.0.43.180:8787/status,Memory: 61.10 GiB
Nanny: tls://10.0.43.180:45033,
Local directory: /scratch/dask-scratch-space/worker-1rgtsfx4,Local directory: /scratch/dask-scratch-space/worker-1rgtsfx4

0,1
Comm: tls://10.0.37.202:39013,Total threads: 16
Dashboard: http://10.0.37.202:8787/status,Memory: 60.45 GiB
Nanny: tls://10.0.37.202:34601,
Local directory: /scratch/dask-scratch-space/worker-e2gaumpu,Local directory: /scratch/dask-scratch-space/worker-e2gaumpu


In [None]:
# Test cluster
# Smaller (8-worker) variant of the full cluster for trial runs. It rebinds the same
# `coiled_cluster` / `coiled_client` names, so run either this cell or the full-cluster cell.
# NOTE(review): the cluster name is identical to the full cluster's; presumably Coiled
# attaches to an already-running cluster with that name -- confirm, or shut the other one down first.
coiled_cluster = coiled.Cluster(
    n_workers=8,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="AFOLU_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    # worker_memory = "32GiB"   # Optional: uncomment to request a specific amount of memory per worker
)

# Coiled cluster (cloud run)
# The bare `coiled_client` on the last line displays the client summary as the cell output.
coiled_client = coiled_cluster.get_client()
coiled_client

In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
# processes=False keeps the scheduler and workers inside this notebook's own process.
# All three local-cluster cells bind `local_client`; run only the one you need.
local_client = Client(processes=False)
local_client

In [None]:
# Local cluster with dask.distributed's default settings (no arguments supplied).
# Rebinds `local_client`; a client created by another local-cluster cell is not closed by this.
local_client = Client()
local_client

In [None]:
# Local cluster with multiple workers
# Keeps a handle on the cluster itself (`local_cluster`) as well as on the client,
# unlike the other local cells, which only keep the client.
local_cluster = LocalCluster()  
local_client = Client(local_cluster)
local_client

<font size="5">Shutting down cloud and local clusters</font> 

In [None]:
# Restarts the workers attached to the Coiled client (the cluster itself keeps running).
# Requires `coiled_client` from one of the Coiled cluster cells above.
coiled_client.restart() 

In [None]:
# Shuts down the Coiled cluster created above. Requires `coiled_cluster` to be defined.
coiled_cluster.shutdown()

In [None]:
# Shuts down whichever local cluster `local_client` is currently bound to.
local_client.shutdown()

<font size="6">Variables</font> 

In [5]:
# General paths and constants

# s3 location of the composite land cover inputs
composite_LC_uri = 's3://gfw2-data/landcover/composite'

# Key prefix (no bucket, no trailing slash) under which save_and_upload_raster writes outputs
s3_out_dir = 'climate/AFOLU_flux_model/LULUCF/outputs'

# Highest IPCC class code; matches `otherland` below
IPCC_class_max_val = 6

# IPCC codes
forest = 1
cropland = 2
settlement = 3
wetland = 4
grassland = 5
otherland = 6

# First and last model years
# NOTE(review): make_tile_footprint_shp hard-codes 2000 rather than reading first_year -- keep them in sync
first_year = 2000
last_year = 2020

# Module-level s3 handles.
# The helper functions defined in the cells below create their own boto3 clients
# instead of using these, so these are only for use directly in the notebook.
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('gfw2-data')

s3_client = boto3.client("s3")

<font size="6">General functions</font> 

In [6]:
def timestr():
    """Return the current local time formatted as YYYYMMDD_HH_MM_SS (used in log messages and file names)."""
    stamp_format = "%Y%m%d_%H_%M_%S"
    return time.strftime(stamp_format)

def boundstr(bounds):
    """Return the bounds as one underscore-separated string, each coordinate rounded to the nearest integer.

    Note that rounding means chunks smaller than one degree can produce identical strings.
    """
    rounded_coords = map(round, bounds)
    return "_".join(map(str, rounded_coords))

def calc_chunk_length_pixels(bounds):
    """Return the side length of a chunk in pixels, based on its height in degrees.

    bounds: sequence whose elements [1] and [3] are the chunk's min and max y (degrees).

    The result is rounded to the nearest whole pixel. Plain int() truncation turned
    float noise into an off-by-one: e.g. 10.2 - 10.1 is 0.0999999..., which gave
    399 pixels instead of 400.
    """
    # 40000/10 as in the original expression; presumably 40000 pixels per 10-degree tile -- confirm
    pixels_per_degree = 40000 / 10
    height_degrees = bounds[3] - bounds[1]
    return int(round(height_degrees * pixels_per_degree))

In [7]:
# Returns list of all chunk boundaries within a bounding box for chunks of a given size
def get_chunk_bounds(chunk_params):
    """Tile a bounding box with square chunks.

    chunk_params: sequence of [min_x, min_y, max_x, max_y, chunk_size].

    Returns a list of [min_x, min_y, max_x, max_y] lists, one per chunk, ordered
    row by row from min_y upwards and, within a row, from min_x upwards. Chunks on
    the far edges extend past max_x / max_y when the box is not a whole multiple
    of chunk_size. An empty or inverted box returns [].

    Raises ValueError if chunk_size is not positive (the loop would never end).
    """
    min_x, min_y, max_x, max_y, chunk_size = (chunk_params[i] for i in range(5))

    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    def _chunk_count(start, stop):
        # Number of chunks needed to cover [start, stop). A span that is a whole number
        # of chunks up to float noise counts as exactly that many chunks.
        span = (stop - start) / chunk_size
        nearest = int(round(span))
        if math.isclose(span, nearest, rel_tol=0.0, abs_tol=1e-9):
            return max(nearest, 0)
        return max(math.ceil(span), 0)

    n_cols = _chunk_count(min_x, max_x)
    n_rows = _chunk_count(min_y, max_y)

    chunks = []
    for row in range(n_rows):
        # Each origin is computed from its index rather than accumulated with +=,
        # so float error does not build up across the grid.
        y = min_y + row * chunk_size
        for col in range(n_cols):
            x = min_x + col * chunk_size
            chunks.append([x, y, x + chunk_size, y + chunk_size])

    return chunks

In [8]:
# Returns the encompassing tile_id string in the form YYN/S_XXXE/W based on a coordinate
def xy_to_tile_id(top_left_x, top_left_y):
    """Name the 10x10 degree tile that contains a point, e.g. (10, 50) -> '50N_010E'.

    The tile is identified by its top-left corner: the latitude is rounded up and
    the longitude is rounded down to the nearest multiple of 10.
    """

    def _edge_label(edge, width, non_negative_suffix, negative_suffix):
        # Zero-pads the absolute value and appends the hemisphere letter
        if edge >= 0:
            return str(edge).zfill(width) + non_negative_suffix
        return str(-edge).zfill(width) + negative_suffix

    tile_top = math.ceil(top_left_y / 10.0) * 10
    tile_left = math.floor(top_left_x / 10.0) * 10

    lat_label = _edge_label(tile_top, 2, "N", "S")
    lng_label = _edge_label(tile_left, 3, "E", "W")

    return "_".join((lat_label, lng_label))

In [9]:
# Lazily opens tile within provided bounds (i.e. one chunk) and returns as a numpy array
# If it can't open the chunk (no data in it), it returns an array of the specified nodata value.
# TODO: It would be better if this just didn't return any array at all if there's no chunk. Returning an array of nodata is pretty inefficient.  
def get_tile_dataset_rio(uri, bounds, chunk_length_pixels, no_data_val):
    """Read band 1 of the raster at `uri` for the window covered by `bounds`.

    uri: path/URI passed to rasterio.open.
    bounds: [min_x, min_y, max_x, max_y] of the chunk, in the raster's coordinates.
    chunk_length_pixels: side length of the fallback array.
    no_data_val: fill value of the fallback array.

    Returns the array read from the raster. If opening or reading fails for any
    ordinary reason, returns a (chunk_length_pixels, chunk_length_pixels) array
    filled with no_data_val instead (best-effort behaviour kept from the original).
    NOTE(review): the fallback array takes numpy's default dtype for no_data_val,
    which may differ from the dtype of a successful read -- confirm callers cope.
    """
    try:
        with rasterio.open(uri) as ds:
            window = rasterio.windows.from_bounds(*bounds, ds.transform)
            data = ds.read(1, window=window)
    except Exception:
        # `except Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop the run instead of being turned into a nodata chunk.
        data = np.full((chunk_length_pixels, chunk_length_pixels), no_data_val)

    return data

In [10]:
# Downloads every input layer for one chunk in parallel threads.
# Chunks are defined by a bounding box.
def prepare_to_download_chunk(bounds, download_dict, no_data_val):
    """Fetch all input layers for one chunk concurrently.

    bounds: [min_x, min_y, max_x, max_y] of the chunk.
    download_dict: maps a layer name to the URI of the raster to read.
    no_data_val: fill value handed to get_tile_dataset_rio for unreadable layers.

    Returns a dict mapping each concurrent.futures.Future to its layer name.
    Leaving the executor's `with` block waits for all submitted reads, so every
    future is already finished when this function returns; callers collect the
    arrays with future.result().
    """
    chunk_label = boundstr(bounds)
    tile_id = xy_to_tile_id(bounds[0], bounds[3])
    side_pixels = calc_chunk_length_pixels(bounds)

    # All reads for the chunk are submitted together so they overlap in separate threads
    # rather than running one after another.
    with concurrent.futures.ThreadPoolExecutor() as pool:

        dask_print(f"Requesting data in chunk {chunk_label} in {tile_id}: {timestr()}")

        futures = {
            pool.submit(get_tile_dataset_rio, layer_uri, bounds, side_pixels, no_data_val): layer_name
            for layer_name, layer_uri in download_dict.items()
        }

    return futures

In [11]:
# Checks if tiles exist at all
def check_for_tile(download_dict):
    """Report whether at least one of the input tiles exists in the gfw2-data bucket.

    download_dict: maps a layer name to its full 's3://gfw2-data/...' URI.

    Returns 1 as soon as one tile is found, 0 if none is found or if
    download_dict is empty (an empty dict used to raise IndexError).
    """
    uris = list(download_dict.values())

    if not uris:
        dask_print("No input tiles supplied. Skipping chunk.")
        return 0

    s3 = boto3.client('s3')

    # The object key is the URI without its leading 's3://gfw2-data/' (15 characters)
    key_start = len('s3://gfw2-data/')

    for uri in uris:

        s3_key = uri[key_start:]

        try:
            s3.head_object(Bucket='gfw2-data', Key=s3_key)
        except Exception:
            # Best effort, as before: any failed lookup is treated as "tile not found".
            # NOTE(review): this also hides credential/network errors -- confirm that is intended.
            continue

        # Characters [-12:-4] of the URI are the tile id that precedes the 4-character extension
        dask_print(f"Tile id {uri[-12:-4]} exists. Proceeding.")
        return 1

    dask_print(f"Tile id {uris[0][-12:-4]} does not exist. Skipping chunk.")
    return 0

In [12]:
# Checks whether a chunk has data in it
def check_chunk_for_data(layers, item_to_check, bounds_str, tile_id, no_data_val):
    """Report whether any input layer of a chunk contains at least one non-nodata pixel.

    layers: maps a layer name to its array for the chunk.
    item_to_check, tile_id: not used by this function; kept so existing calls still work.
    bounds_str: chunk label used in the log messages.
    no_data_val: pixel value that means "no data".

    Returns 1 if some layer has a pixel different from no_data_val, otherwise 0
    (including when `layers` is empty).
    """
    for layer in layers.values():

        # np.any, not np.all: one valid pixel is enough for the chunk to count as having data.
        # The previous np.all test only passed when a layer had no nodata pixels at all, so
        # partially filled chunks were reported as empty.
        if np.any(layer != no_data_val):
            dask_print(f"Data in chunk {bounds_str}. Proceeding.")
            return 1

    dask_print(f"No data in chunk {bounds_str} for any input.")
    return 0

In [13]:
# Saves array as a raster locally, then uploads it to s3
def save_and_upload_raster(bounds, chunk_length_pixels, tile_id, bounds_str, output_dict, is_final):
    """Write each output array of a chunk to a GeoTIFF in /tmp and upload it to the gfw2-data bucket.

    bounds: [min_x, min_y, max_x, max_y] of the chunk; used to build the geotransform.
    chunk_length_pixels: width and height of the output rasters, in pixels.
    tile_id, bounds_str: combined into the output file names.
    output_dict: maps an output name to a sequence whose element [0] is the array to
        write (cast to uint8), [2] is the folder name for the kind of data and [3] is
        the year (or year label) used in the s3 path.
    is_final: if falsy, a timestamp is appended to the file name.

    The local file is removed whether or not the write and upload succeed, so a failed
    upload no longer leaves rasters behind in /tmp. Errors still propagate to the caller.
    """
    s3_client = boto3.client("s3") # Needs to be in the same function as the upload_file call

    transform = rasterio.transform.from_bounds(*bounds, width=chunk_length_pixels, height=chunk_length_pixels)

    file_info = f'{tile_id}__{bounds_str}'

    # For every output file, saves from array to local raster, then to s3.
    # Can't save directly to s3, unfortunately, so need to save locally first.
    for key, value in output_dict.items():

        data_meaning = value[2]
        year_out = value[3]

        dask_print(f"Saving {bounds_str} in {tile_id} for {year_out}: {timestr()}")

        if is_final:
            file_name = f"{file_info}__{key}.tif"
        else:
            file_name = f"{file_info}__{key}__{timestr()}.tif"

        local_path = f"/tmp/{file_name}"

        try:
            # NOTE(review): blockxsize/blockysize are passed without tiled=True; presumably
            # 400x400 internal tiles were intended -- confirm against the rasterio/GDAL docs.
            with rasterio.open(local_path, 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype='uint8', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                dst.write(value[0].astype(rasterio.uint8), 1)

            s3_path = f"{s3_out_dir}/{data_meaning}/{year_out}/{chunk_length_pixels}_pixels/{time.strftime('%Y%m%d')}"

            dask_print(f"Uploading {bounds_str} in {tile_id} for {year_out} to {s3_path}: {timestr()}")

            s3_client.upload_file(local_path, "gfw2-data", Key=f"{s3_path}/{file_name}")

        finally:
            # Deletes the local raster, including after a failed write or upload
            if os.path.exists(local_path):
                os.remove(local_path)

In [14]:
# Uploads a shapefile to s3
def upload_shp(full_in_folder, in_folder, shp):
    """Upload a shapefile and its .dbf, .prj and .shx companions from /tmp to the gfw2-data bucket.

    full_in_folder: full s3 folder URI; only used in the log message.
    in_folder: key prefix (ending in '/') that the files are uploaded under.
    shp: file name of the .shp in /tmp; the companion names are derived by
        replacing its last four characters.
    """
    dask_print(f"Uploading to {full_in_folder}{shp}: {timestr()}")

    # File name without its 4-character extension
    stem = shp[:-4]

    # The .shp itself goes first, followed by its companions
    part_names = [shp] + [f"{stem}.{extension}" for extension in ("dbf", "prj", "shx")]

    s3_client = boto3.client("s3")  # Needs to be in the same function as the upload_file call
    for part_name in part_names:
        s3_client.upload_file(f"/tmp/{part_name}", "gfw2-data", Key=f"{in_folder}{part_name}")

In [15]:
# Lists rasters in an s3 folder and returns their names as a list
def list_rasters_in_folder(full_in_folder):
    """Return the names of the GeoTIFFs directly inside an s3 folder.

    full_in_folder: s3 URI of the folder, passed to `aws s3 ls`.

    Only names ending in .tif or .tiff are returned. The previous substring test
    ("tif" anywhere in the name) also let through sidecar files such as
    name.tif.aux.xml, which are not rasters. Blank listing lines are skipped.

    Raises subprocess.CalledProcessError if the aws command exits with an error.
    """
    cmd = ['aws', 's3', 'ls', full_in_folder]
    listing = subprocess.check_output(cmd).decode('utf-8')

    rasters = []
    for line in listing.splitlines():
        fields = line.split()
        if not fields:
            continue

        # The entry name is the last whitespace-separated field of a listing line
        name = fields[-1]
        if name.endswith(('.tif', '.tiff')):
            rasters.append(name)

    return rasters

In [16]:
# Makes a shapefile of the footprints of rasters in a folder, for checking geographical completeness of rasters
def make_tile_footprint_shp(year, type, run_date="20240202", chunk_pixels=8000):
    """Build a footprint shapefile (via gdaltindex) for the rasters of one output folder and upload it.

    year: model year. For "IPCC_basic_change" the folder covers the 5 years ending in `year`.
    type: "IPCC_basic_classes" or "IPCC_basic_change"; selects the output folder.
    run_date: YYYYMMDD folder of the model run to index (default is the previously hard-coded run).
    chunk_pixels: chunk size folder to index (default is the previously hard-coded 8000).

    Returns a completion message, or None when there is nothing to do (change
    requested for year 2000, or no rasters in the folder).
    Raises ValueError for an unsupported `type` (this used to surface later as an
    UnboundLocalError), and subprocess.CalledProcessError if gdaltindex fails.
    """
    import tempfile  # local import: tempfile is not in the notebook's import cell

    supported_types = ("IPCC_basic_classes", "IPCC_basic_change")
    if type not in supported_types:
        raise ValueError(f"Unsupported type {type!r}; expected one of {supported_types}")

    # Task properties
    dask_print(f"Year: {year}; type: {type}: {timestr()}")

    if type == "IPCC_basic_change":
        if year == 2000:
            dask_print("No basic change for 1995-2000. Not making tile footprints.")
            return
        year = f"{year-5}_{year}"    # The year is actually compound for change rasters

    # Sets the input folder based on the data type being processed
    in_folder = f"climate/AFOLU_flux_model/LULUCF/outputs/{type}/{year}/{chunk_pixels}_pixels/{run_date}/"

    # Folder including s3 key
    full_in_folder = f's3://gfw2-data/{in_folder}'
    vsis3_in_folder = f'/vsis3/gfw2-data/{in_folder}'

    # List of the tile paths in the folder, in the /vsis3/ form that gdaltindex reads
    filenames = list_rasters_in_folder(full_in_folder)
    tile_paths = [vsis3_in_folder + filename for filename in filenames]

    if not tile_paths:
        dask_print(f"No rasters found in {full_in_folder}. Not making tile footprints.")
        return

    # Output shapefile name
    shp = f"{type}_{year}_raster_footprints_{run_date}.shp"
    dask_print(shp)

    # The list of rasters goes into a uniquely named temporary file. A fixed 's3_paths.txt'
    # in the working directory could be overwritten by another task running on the same
    # machine, and was never cleaned up.
    with tempfile.NamedTemporaryFile('w', prefix='s3_paths_', suffix='.txt', delete=False) as optfile:
        optfile.writelines(tile_path + '\n' for tile_path in tile_paths)
        optfile_path = optfile.name

    try:
        # NOTE(review): if /tmp/{shp} is left over from an earlier run, gdaltindex presumably
        # appends to it rather than replacing it -- confirm and clear it first if needed.
        cmd = ["gdaltindex", "-skip_different_projection", f"/tmp/{shp}", "--optfile", optfile_path]
        subprocess.check_call(cmd)
    finally:
        os.remove(optfile_path)

    # Uploads shapefile to s3
    upload_shp(full_in_folder, in_folder, shp)

    return(f"Completed Year: {year}; type: {type}: {timestr()}")
    
    # # Empty geodataframe with attribute columns
    # df = gpd.GeoDataFrame(columns=['location','geometry', 'bound_box'])

    # # Gets the bounding box of each raster and makes a new geodataframe from that, then adds that to the master geodataframe
    # # https://gis.stackexchange.com/a/281996/121380
    # for count, fname in enumerate(tile_paths):
    #     if fname.endswith(".tif"):

    #         count = count+1
            
    #         small_tif = rasterio.open(fname)
    #         bounds = small_tif.bounds
    #         polygon_geom = Polygon([(bounds[0], bounds[1]), 
    #                         (bounds[0], bounds[3]), 
    #                         (bounds[2], bounds[3]), 
    #                         (bounds[2], bounds[1])])

    #         gdf = gpd.GeoDataFrame({"location": fname, "geometry": [polygon_geom], "bound_box": boundstr(bounds)})

    #         # Adds the current raster's geodataframe to the master geodataframe
    #         df = pd.concat([gdf, df], ignore_index=True)   # https://github.com/geopandas/geopandas/issues/2606

    #         if count == 1:
    #             dask_print(f"Raster {count}: {fname}")
            
    #         if count % 100 == 0:
    #             dask_print(f"Raster {count}: {fname}")

    #         # # For testing. Cuts processing at specified number.
    #         # if count > 40:
    #         #     break

    # Output shapefile name
    # shp = f"{type}_{year}_raster_footprints_{in_folder[-9:-1]}_gdaltindex.shp"

    # # Saves shapefile locally
    # dask_print(f"Saving locally {shp}: {timestr()}") 
    # df.to_file(f"/tmp/{shp}", crs="EPSG:4326")

    # # Uploads shapefile to s3
    # upload_shp(full_in_folder, in_folder, shp)

    # return(f"Completed Year: {year}; type: {type}: {timestr()}")