In [None]:
import os
import boto3
import time
import math
import pandas as pd
import subprocess
import re
import concurrent.futures
from osgeo import gdal

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.transform
import rasterio.windows
# import geopandas as gpd
import pandas as pd
import rioxarray
import xarray as xr
from rioxarray.merge import merge_arrays

# numba
from numba import jit
from numba.typed import Dict
from numba.core import types

In [None]:
# Merges rasters that are <10x10 degrees into 10x10 degree rasters in the standard grid.
# Approach is to merge rasters with gdal.Warp and then upload them to s3.
def merge_small_tiles_gdal(s3_name_no_data_dict):
    """Merge the small rasters belonging to one tile into a single raster and upload it to s3.

    The matching rasters are mosaicked with gdal.Warp into a temporary file under /tmp,
    with the output extent set to the bounds returned by get_10x10_tile_bounds(tile_id).
    The result is uploaded to the gfw2-data bucket and the temporary file is always removed
    afterwards, whether or not the merge and upload succeeded.

    Args:
        s3_name_no_data_dict: Dictionary whose first item maps the input s3 folder
            (starting with the bucket name, without the s3:// prefix) to a sequence of
            [output file name, no data value]. Only the first item is read.
            The first 8 characters of the output file name are used as the tile_id.

    Returns:
        The string "success for <s3_name_no_data_dict>".

    Raises:
        ValueError: If no raster listed in the input folder contains the tile_id.
        RuntimeError: If gdal.Warp does not return an output dataset.
    """

    bucket = "gfw2-data"   # in_folder starts with this bucket and the output is uploaded to it

    # Only the first key/value pair of the dictionary is used
    in_folder, out_file_name_no_data = list(s3_name_no_data_dict.items())[0]
    out_file_name = out_file_name_no_data[0]    # The output file name
    # out_file_name_no_data[1] holds the output no data value. It is not applied to the output;
    # pass dstNodata=out_file_name_no_data[1] to gdal.WarpOptions below to use it.

    s3_in_folder = f's3://{in_folder}'   # The input s3 folder with s3:// prepended
    vsis3_in_folder = f'/vsis3/{in_folder}'   # The input s3 folder with /vsis3/ prepended

    # Lists all the rasters in the specified s3 folder
    filenames = list_rasters_in_folder(s3_in_folder)

    # Gets the tile_id from the output file name in the standard format
    tile_id = out_file_name[:8]

    # Limits the input rasters to the specified tile_id (the relevant 10x10 area) and builds their /vsis3/ paths.
    # NOTE(review): plain concatenation assumes in_folder ends with "/" or the listed names start with one -- confirm.
    tile_paths = [vsis3_in_folder + filename for filename in filenames if tile_id in filename]

    # Fails early with a clear message instead of handing gdal.Warp an empty source list
    if not tile_paths:
        raise ValueError(f"No rasters matching {tile_id} found in {s3_in_folder}")

    print(f"Merging small rasters in {tile_id} in {vsis3_in_folder}")

    # Names the output folder. Same as the input folder but with the dimensions in pixels replaced.
    # The slice removes the leading "gfw2-data/" so that the result can be used as an s3 key prefix.
    out_folder = re.sub(r'\d+_pixels', f'{full_raster_dims}_pixels', in_folder[len(bucket) + 1:])
    out_key = f"{out_folder}{out_file_name}"

    min_x, min_y, max_x, max_y = get_10x10_tile_bounds(tile_id)
    output_extent = [min_x, min_y, max_x, max_y]  # Specify the extent in the order [xmin, ymin, xmax, ymax]

    warp_options = gdal.WarpOptions(outputBounds=output_extent, creationOptions=["COMPRESS=LZW"])

    local_path = f"/tmp/{out_file_name}"

    try:
        # Merges all output small rasters with the options above
        merged = gdal.Warp(local_path, tile_paths, options=warp_options)

        # Without gdal.UseExceptions(), a failed warp returns None rather than raising
        if merged is None:
            raise RuntimeError(f"gdal.Warp failed to create {local_path} for {tile_id}")

        # Dropping the last reference closes the dataset so the file is fully written before upload
        merged = None

        s3_client = boto3.client("s3") # Needs to be in the same function as the upload_file call

        print(f"Saving {out_file_name} to s3: {out_folder}{out_file_name}")

        s3_client.upload_file(local_path, bucket, Key=out_key)
    finally:
        # Deletes the local raster even when the merge or the upload failed
        if os.path.exists(local_path):
            os.remove(local_path)

    return f"success for {s3_name_no_data_dict}"

In [None]:
# Creates the list of aggregated 10x10 rasters that will be created (a list of dictionaries of input s3 folder and output aggregated raster name).
# These are the basis for the tasks.

# NOTE(review): the right-hand side of this assignment is missing, so this cell does not parse as written.
# Supply the input s3 folder before running -- presumably in the form create_list_for_aggregation expects; confirm.
s3_in_folder = 
list_of_s3_name_dicts_total = create_list_for_aggregation(s3_in_folder)

# Wraps one merge_small_tiles_gdal call per dictionary in dask.delayed. The calls are not computed in this cell.
delayed_result = [dask.delayed(merge_small_tiles_gdal)(s3_name_no_data_dict) for s3_name_no_data_dict in list_of_s3_name_dicts_total]