In [1]:
import os
import boto3
import time
import math
import ctypes
import pandas as pd
import subprocess
import re

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import dask.config
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows
import geopandas as gpd
import pandas as pd
import rioxarray
import xarray as xr
from rioxarray.merge import merge_arrays

from numba import jit
import concurrent.futures

<font size="6">Cluster management</font> 

<font size="5">Creating clusters</font> 

In [None]:
# Full cluster
coiled_cluster = coiled.Cluster(
    n_workers=40,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="AFOLU_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    worker_memory = "64GiB" 
)

# Coiled cluster (cloud run)
coiled_client = coiled_cluster.get_client()
coiled_client

In [2]:
# Test cluster
coiled_cluster = coiled.Cluster(
    n_workers=1,
    use_best_zone=True, 
    compute_purchase_option="spot_with_fallback",
    idle_timeout="10 minutes",
    region="us-east-1",
    name="AFOLU_flux_model", 
    account='jterry64', # Necessary to use the AWS environment that Justin set up in Coiled
    worker_memory = "64GiB" 
)

# Coiled cluster (cloud run)
coiled_client = coiled_cluster.get_client()
coiled_client

Output()

Output()

0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: https://cluster-oqjyl.dask.host/NVxzUdcousztyxUs/status,

0,1
Dashboard: https://cluster-oqjyl.dask.host/NVxzUdcousztyxUs/status,Workers: 1
Total threads: 16,Total memory: 61.11 GiB

0,1
Comm: tls://10.0.47.229:8786,Workers: 1
Dashboard: http://10.0.47.229:8787/status,Total threads: 16
Started: Just now,Total memory: 61.11 GiB

0,1
Comm: tls://10.0.42.21:33747,Total threads: 16
Dashboard: http://10.0.42.21:8787/status,Memory: 61.11 GiB
Nanny: tls://10.0.42.21:42341,
Local directory: /scratch/dask-scratch-space/worker-1o2809al,Local directory: /scratch/dask-scratch-space/worker-1o2809al


In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
local_client = Client(processes=False)
local_client

In [None]:
local_client = Client()
local_client

In [None]:
# Local cluster with multiple workers
local_cluster = LocalCluster()  
local_client = Client(local_cluster)
local_client

<font size="5">Shutting down cloud and local clusters</font> 

In [None]:
coiled_client.restart() 

In [None]:
coiled_cluster.shutdown()

In [None]:
local_client.shutdown()

<font size="6">Variables</font> 

In [3]:
# General paths and constants

composite_LC_uri = 's3://gfw2-data/landcover/composite'

s3_out_dir = 'climate/AFOLU_flux_model/LULUCF/outputs'

IPCC_class_max_val = 6

# IPCC codes
forest = 1
cropland = 2
settlement = 3
wetland = 4
grassland = 5
otherland = 6

first_year = 2000
last_year = 2020

s3 = boto3.resource('s3')
my_bucket = s3.Bucket('gfw2-data')

s3_client = boto3.client("s3")

full_raster_dims = 40000

<font size="6">General functions</font> 

In [4]:
def timestr():
    return time.strftime("%Y%m%d_%H_%M_%S")

def boundstr(bounds):
    bounds_str = "_".join([str(round(x)) for x in bounds])
    return bounds_str

def calc_chunk_length_pixels(bounds):
    chunk_length_pixels = int((bounds[3]-bounds[1]) * (40000/10))
    return chunk_length_pixels

In [5]:
# Returns list of all chunk boundaries within a bounding box for chunks of a given size
def get_chunk_bounds(chunk_params):

    min_x = chunk_params[0]
    min_y = chunk_params[1]
    max_x = chunk_params[2]
    max_y = chunk_params[3]
    chunk_size = chunk_params[4]
    
    x, y = (min_x, min_y)
    chunks = []

    # Polygon Size
    while y < max_y:
        while x < max_x:
            bounds = [
                x,
                y,
                x + chunk_size,
                y + chunk_size,
            ]
            chunks.append(bounds)
            x += chunk_size
        x = min_x
        y += chunk_size

    return chunks

In [6]:
# Returns the encompassing tile_id string in the form YYN/S_XXXE/W based on a coordinate
def xy_to_tile_id(top_left_x, top_left_y):

    lat_ceil = math.ceil(top_left_y/10.0) * 10
    lng_floor = math.floor(top_left_x/10.0) * 10
    
    lng: str = f"{str(lng_floor).zfill(3)}E" if (lng_floor >= 0) else f"{str(-lng_floor).zfill(3)}W"
    lat: str = f"{str(lat_ceil).zfill(2)}N" if (lat_ceil >= 0) else f"{str(-lat_ceil).zfill(2)}S"

    return f"{lat}_{lng}"

In [7]:
# Lazily opens tile within provided bounds (i.e. one chunk) and returns as a numpy array
# If it can't open the chunk (no data in it), it returns an array of the specified nodata value.
# TODO: It would be better if this just didn't return any array at all if there's no chunk. Returning an array of nodata is pretty inefficient.  
def get_tile_dataset_rio(uri, bounds, chunk_length_pixels, no_data_val):

    bounds_str = boundstr(bounds)

    try:
        with rasterio.open(uri) as ds:
            window = rasterio.windows.from_bounds(*bounds, ds.transform)
            data = ds.read(1, window=window)
    except:
        data = np.full((chunk_length_pixels, chunk_length_pixels), no_data_val)

    return data

In [8]:
# Prepares list of chunks to download.
# Chunks are defined by a bounding box.
def prepare_to_download_chunk(bounds, download_dict, no_data_val):
 
    futures = {}

    bounds_str = boundstr(bounds)
    tile_id = xy_to_tile_id(bounds[0], bounds[3])
    chunk_length_pixels = calc_chunk_length_pixels(bounds)

    # Submit requests to S3 for input chunks but don't actually download them yet. This queueing of the requests before downloading them speeds up the downloading
    # Approach is to download all the input chunks up front for every year to make downloading more efficient, even though it means storing more upfront
    with concurrent.futures.ThreadPoolExecutor() as executor:
        
        dask_print(f"Requesting data in chunk {bounds_str} in {tile_id}: {timestr()}")

        for key, value in download_dict.items():
            futures[executor.submit(get_tile_dataset_rio, value, bounds, chunk_length_pixels, no_data_val)] = key

    return futures

In [9]:
# Checks if tiles exist at all
def check_for_tile(download_dict):

    s3 = boto3.client('s3')

    i=0

    while i < len(list(download_dict.values())):

        s3_key = list(download_dict.values())[i][15:]

        # Breaks the loop if the tile exists
        try:
            s3.head_object(Bucket='gfw2-data', Key=s3_key)
            dask_print(f"Tile id {list(download_dict.values())[i][-12:-4]} exists. Proceeding.")
            return 1
        except:
            pass
            
        i+=1

    dask_print(f"Tile id {list(download_dict.values())[0][-12:-4]} does not exist. Skipping chunk.")
    return 0

In [10]:
# Checks whether a chunk has data in it
def check_chunk_for_data(layers, item_to_check, bounds_str, tile_id, no_data_val):

    i=0

    while i < len(list(layers.values())):

        # Checks if all the pixels have the nodata value
        min = np.min(list(layers.values())[i])  # Can't use np.all because it doesn't work in chunks that are mostly water; says nodata in chunk even if there is land

        # Breaks the loop if there is data in the chunk
        if min < no_data_val:
            dask_print(f"Data in chunk {bounds_str}. Proceeding.")
            return 1

        i+=1

    dask_print(f"No data in chunk {bounds_str} for any input.")
    return 0

In [11]:
# Saves array as a raster locally, then uploads it to s3
def save_and_upload_small_raster_set(bounds, chunk_length_pixels, tile_id, bounds_str, output_dict, is_final):

    s3_client = boto3.client("s3") # Needs to be in the same function as the upload_file call

    transform = rasterio.transform.from_bounds(*bounds, width=chunk_length_pixels, height=chunk_length_pixels)

    file_info = f'{tile_id}__{bounds_str}'

    # For every output file, saves from array to local raster, then to s3.
    # Can't save directly to s3, unfortunately, so need to save locally first.
    for key, value in output_dict.items():

        data_array = value[0]
        data_meaning = value[2]
        year_out = value[3]

        array_dtype = data_array.dtype

        if not is_final:
            dask_print(f"Saving {bounds_str} in {tile_id} for {year_out}: {timestr()}")

        if is_final:
            file_name = f"{file_info}__{key}.tif"
        else:
            file_name = f"{file_info}__{key}__{timestr()}.tif"

        with rasterio.open(f"/tmp/{file_name}", 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype='uint8', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
            dst.write(data_array.astype(rasterio.uint8), 1)

        s3_path = f"{s3_out_dir}/{data_meaning}/{year_out}/{chunk_length_pixels}_pixels/{time.strftime('%Y%m%d')}"

        if not is_final:
            dask_print(f"Uploading {bounds_str} in {tile_id} for {year_out} to {s3_path}: {timestr()}")

        s3_client.upload_file(f"/tmp/{file_name}", "gfw2-data", Key=f"{s3_path}/{file_name}")

        # Deletes the local raster
        os.remove(f"/tmp/{file_name}")

In [12]:
# Lists rasters in an s3 folder and returns their names as a list
def list_rasters_in_folder(full_in_folder):

    cmd = ['aws', 's3', 'ls', full_in_folder]
    s3_contents_bytes = subprocess.check_output(cmd)

    # Converts subprocess results to useful string
    s3_contents_str = s3_contents_bytes.decode('utf-8')
    s3_contents_list = s3_contents_str.splitlines()
    rasters = [line.split()[-1] for line in s3_contents_list]
    rasters = [i for i in rasters if "tif" in i]

    return rasters

In [13]:
# Uploads a shapefile to s3
def upload_shp(full_in_folder, in_folder, shp):

    dask_print(f"Uploading to {full_in_folder}{shp}: {timestr()}")

    shp_pattern = shp[:-4]

    s3_client = boto3.client("s3")  # Needs to be in the same function as the upload_file call
    s3_client.upload_file(f"/tmp/{shp}", "gfw2-data", Key=f"{in_folder[10:]}{shp}")
    s3_client.upload_file(f"/tmp/{shp_pattern}.dbf", "gfw2-data", Key=f"{in_folder[10:]}{shp_pattern}.dbf")
    s3_client.upload_file(f"/tmp/{shp_pattern}.prj", "gfw2-data", Key=f"{in_folder[10:]}{shp_pattern}.prj")
    s3_client.upload_file(f"/tmp/{shp_pattern}.shx", "gfw2-data", Key=f"{in_folder[10:]}{shp_pattern}.shx")

    os.remove(f"/tmp/{shp}")
    os.remove(f"/tmp/{shp_pattern}.dbf")
    os.remove(f"/tmp/{shp_pattern}.prj")
    os.remove(f"/tmp/{shp_pattern}.shx")

In [14]:
# Makes a shapefile of the footprints of rasters in a folder, for checking geographical completeness of rasters
def make_tile_footprint_shp(input_dict):

    in_folder = list(input_dict.keys())[0]
    pattern = list(input_dict.values())[0]

    # Task properties
    dask_print(f"Making tile index shapefile for: {in_folder}: {timestr()}")

    # Folder including s3 key
    s3_in_folder = f's3://{in_folder}'
    vsis3_in_folder = f'/vsis3/{in_folder}'

    # List of all the filenames in the folder
    filenames = list_rasters_in_folder(s3_in_folder)

    # List of the tile paths in the folder
    tile_paths = []
    tile_paths = [vsis3_in_folder + filename for filename in filenames]

    file_paths = 's3_paths.txt'

    with open(f"/tmp/{file_paths}", 'w') as file:
        for item in tile_paths:
            file.write(item + '\n')

    # Output shapefile name
    shp = f"raster_footprints_{pattern}.shp"

    cmd = ["gdaltindex", "-t_srs", "EPSG:4326", f"/tmp/{shp}", "--optfile", f"/tmp/{file_paths}"]
    subprocess.check_call(cmd)

    # Uploads shapefile to s3
    upload_shp(s3_in_folder, in_folder, shp)

    return(f"Completed: {timestr()}")

In [15]:
def flatten_list(nested_list):
    return [x for xs in nested_list for x in xs]

def create_list_for_aggregation(s3_in_folders):

    list_of_s3_name_dicts_total = []   # Final list of dictionaries of s3 paths and output aggregated 10x10 rasters
    
    # Iterates through all the desired s3 folders
    for s3_in_folder in s3_in_folders:
    
        simple_file_names = []   # List of output aggregatd 10x10 rasters
    
        # Raw filenames in a folder
        filenames = list_rasters_in_folder(f"s3://{s3_in_folder}")
    
        # Iterates through all the files in a folder and converts them to the output names. 
        # Essentially [tile_id]__[pattern].tif. Drops the chunk bounds from the middle.
        for filename in filenames:
        
            result = filename[:10] + filename[filename.rfind("__") + len("__"):]   # Extracts the relevant parts of the raw file names
            simple_file_names.append(result)   # New list of simplified file names used for 10x10 degree outputs
    
        # Removes duplicate simplified file names.
        # There are duplicates because each 10x10 output raster has many constituent chunks, each of which have the same aggregated, final name
        simple_file_names = np.unique(simple_file_names).tolist()   
    
        # Makes a list of dictionaries, where the key is the input s3 path and the value is the output aggregated name
        list_of_s3_name_dicts = [{key: value} for value in simple_file_names for key in [s3_in_folder]]
    
        # dask_print(list_of_s3_name_dicts)
    
        # Adds the dictionary of s3 paths and output names for this folder to the list for all folders
        list_of_s3_name_dicts_total.append(list_of_s3_name_dicts)
    
    # Output of above is a nested list, where each input folder is its own inner list. Need to flatten to a list.
    list_of_s3_name_dicts_total = flatten_list(list_of_s3_name_dicts_total)
    
    print(f"There are {len(list_of_s3_name_dicts_total)} chunks to process in {len(s3_in_folders)} input folders.")

    return list_of_s3_name_dicts_total

In [16]:
# Saves an xarray data array locally as a raster and then uploads it to s3
def save_and_upload_raster_10x10(**kwargs):

    s3_client = boto3.client("s3") # Needs to be in the same function as the upload_file call

    data_array = kwargs['data']   # The data being saved
    out_file_name = kwargs['out_file_name']   # The output file name
    out_folder = kwargs['out_folder']   # The output folder

    dask_print(data_array)

    dask_print(f"Saving {out_file_name} locally")

    profile_kwargs = {'compress': 'lzw'}   # Adds attribute to compress the output raster 
    # data_array.rio.to_raster(f"{out_file_name}", **profile_kwargs)
    data_array.rio.to_raster(f"/tmp/{out_file_name}", **profile_kwargs)

    dask_print(f"Saving {out_file_name} to {out_folder[10:]}{out_file_name}")

    s3_client.upload_file(f"/tmp/{out_file_name}", "gfw2-data", Key=f"{out_folder[10:]}{out_file_name}")

    # Deletes the local raster
    os.remove(f"/tmp/{out_file_name}")

In [30]:
# Merges rasters that are <10x10 degrees into 10x10 degree rasters in the standard grid
def merge_small_tiles(s3_name_dict):

    in_folder = list(s3_name_dict.keys())[0]   # The input s3 folder for the small rasters
    out_file_name = list(s3_name_dict.values())[0]   # The output file name for the combined rasters

    s3_in_folder = f's3://{in_folder}'   # The input s3 folder with s3:// prepended
    vsis3_in_folder = f'/vsis3/{in_folder}'   # The input s3 folder with /vsis3/ prepended

    # Lists all the rasters in the specified s3 folder
    filenames = list_rasters_in_folder(s3_in_folder)   

    # Gets the tile_id from the output file name in the standard format
    tile_id = out_file_name[:8]

    # Limits the input rasters to the specified tile_id (the relevant 10x10 area)
    filenames_in_focus_area = [i for i in filenames if tile_id in i]
    
    # Lists the tile paths for the relevant rasters
    tile_paths = []
    tile_paths = [s3_in_folder + filename for filename in filenames_in_focus_area]

    dask_print(f"Opening small rasters in {tile_id} in {s3_in_folder}")

    # Opens the relevant rasters in a list of xarray data arrays
    small_rasters = [rioxarray.open_rasterio(tile_path, chunks=True) for tile_path in tile_paths]

    dask_print(f"Merging {tile_id} in {s3_in_folder}")

    # Merges the relevant small data arrays in the list
    merged = merge_arrays(small_rasters)  # https://corteva.github.io/rioxarray/stable/examples/merge.html

    dask_print(merged.dims)

    dask_print(merged)

    # Define the latitude and longitude coordinates
    lat = np.linspace(0, -10, 40000)  # 00 deg N to 10 deg S
    lon = np.linspace(0, 10, 40000)  # 0 deg E to 10 deg E
    
    # Create a 2D array filled with 255 values
    values = np.full((len(lat), len(lon)), 255, dtype='uint8')
    
    data_array = xr.DataArray(data=values, coords={'x': lon, 'y': lat,}, dims=('x', 'y'))
    
    # chunked_data_array = data_array.chunk({'lat': 400, 'lon': 400})
    
    # Print the resulting DataArray
    dask_print(data_array)

    combined = merged.combine_first(data_array)

    dask_print(combined)

    # # # Desired new dimensions
    # # new_width = 40000
    # # new_height = 40000   
    
    # # # Calculate padding sizes
    # # pad_width = new_width - merged.rio.width
    # # pad_height = new_height - merged.rio.height

    # # dask_print(pad_width)
    # # dask_print(pad_height)

    # # padded_x = np.linspace(merged.x.min(), merged.x.max() + pad_width, new_width)
    # # padded_y = np.linspace(merged.y.min(), merged.y.max() + pad_height, new_height)

    # # dask_print(padded_x)
    # # dask_print(padded_y)

    # # padded_image = merged.pad(pad_width={"x": (pad_width, 0), "y": (0, pad_height)}, constant_values=255)  # White padding
    # # padded_image = padded_image.assign_coords({"x": padded_x, "y": padded_y})

    # # dask_print(padded_image)

    # # Names the output folder. Same as the input folder but with the dimensions in pixels replaced
    # out_folder = re.sub(r'\d+_pixels', f'{full_raster_dims}_pixels', in_folder)

    # Saves the merged xarray data array locally and then to s3 
    save_and_upload_raster_10x10(data=combined, out_file_name=out_file_name, out_folder=out_folder)

    # # del merged
    # # del padded_image

    # return f"success for {s3_name_dict}"

In [31]:
%%time

s3_in_folders = [
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2000/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2005/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2010/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2015/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_classes/2020/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/8000_pixels/20240205/",
           # "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2010_2015/8000_pixels/20240205/",
           "gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/8000_pixels/20240205/"
          ]

list_of_s3_name_dicts_total = create_list_for_aggregation(s3_in_folders)

# For testing. Limits the number of output rasters
list_of_s3_name_dicts_total = list_of_s3_name_dicts_total[0:1]

delayed_result = [dask.delayed(merge_small_tiles)(s3_name_dict) for s3_name_dict in list_of_s3_name_dicts_total]

results = dask.compute(*delayed_result)
results

There are 280 chunks to process in 1 input folders.
Opening small rasters in 00N_000E in s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/8000_pixels/20240205/
Merging 00N_000E in s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2015_2020/8000_pixels/20240205/
('band', 'y', 'x')
<xarray.DataArray (band: 1, y: 24000, x: 16000)>
array([[[0, 0, 0, ..., 1, 1, 1],
        [0, 0, 0, ..., 1, 1, 1],
        [0, 0, 0, ..., 1, 1, 1],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]], dtype=uint8)
Coordinates:
  * x            (x) float64 6.0 6.0 6.001 6.001 6.001 ... 9.999 9.999 10.0 10.0
  * y            (y) float64 -0.000125 -0.000375 -0.000625 ... -5.999 -6.0 -6.0
  * band         (band) int64 1
    spatial_ref  int64 0
Attributes:
    AREA_OR_POINT:  Area
    scale_factor:   1.0
    add_offset:     0.0
<xarray.DataArray (x: 40000, y: 40000)>
array([[255, 255, 255, ..., 255

NameError: name 'out_folder' is not defined

In [20]:
x = xr.DataArray(
    [[0, 0, 0], [0, 0, 0], [0, 0, 0]],
    dims=("lat", "lon"),
    coords={"lat": [35.0, 40.0, 45.0], "lon": [100.0, 120.0, 140.0]},
    name="var1",
)
y = xr.DataArray(
    [[5.0, 6.0], [7.0, 8.0]],
    dims=("lat", "lon"),
    coords={"lat": [35.0, 40.0], "lon": [100.0, 120.0]},
    name="var2",
)
# z = xr.DataArray(
#     [[0.0, 3.0], [4.0, 9.0]],
#     dims=("lat", "lon"),
#     coords={"lat": [30.0, 60.0], "lon": [100.0, 150.0]},
#     name="var3",
# )

In [21]:
new = y.combine_first(x)
new

In [None]:
%%time

# Define the latitude and longitude coordinates
lat = np.linspace(10, 0, 40)  # 10 deg N to 0 deg N
lon = np.linspace(-40, -50, 40)  # 40 deg W to 50 deg W

print("Hello")

# Create a 2D array filled with 255 values
values = np.full((len(lat), len(lon)), 255, dtype='uint8')

# Create an xarray DataArray
data_array = xr.DataArray(data=values, coords={'lat': lat, 'lon': lon}, dims=('lat', 'lon'))

# chunked_data_array = data_array.chunk({'lat': 400, 'lon': 400})

# Print the resulting DataArray
data_array

In [None]:
#TODO: Function to track the number of land use changes per pixel