In [37]:
import os
import logging
import boto3
import rioxarray
import rasterio
from rasterio.merge import merge as merge_arrays
import dask
from dask.distributed import Client, LocalCluster
from dask.diagnostics import ProgressBar
import atexit

# Setup logging
# Configure the root logger once at import time: INFO level, with each message
# prefixed by a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# AWS S3 setup
# Bucket that every listing, raster read, and upload in this file targets.
s3_bucket = "gfw2-data"
# Local scratch directory where a merged raster is written before it is uploaded.
local_temp_dir = "/tmp/merged"

def s3_file_exists(bucket, key):
    """Report whether an object exists at ``s3://{bucket}/{key}``.

    The check is a single ``head_object`` request. It is best-effort: any
    failure yields ``False``, but a genuine "not found" response is logged
    differently from other failures (permissions, credentials, network) so the
    log does not claim a file is missing when the check itself could not be
    completed.

    Args:
        bucket: Name of the S3 bucket.
        key: Object key inside the bucket.

    Returns:
        True if the HEAD request succeeded, otherwise False.
    """
    s3 = boto3.client('s3')
    try:
        s3.head_object(Bucket=bucket, Key=key)
    except s3.exceptions.ClientError as err:
        # head_object has no response body, so a missing key surfaces as a
        # bare HTTP status code rather than a modeled NoSuchKey exception.
        error_code = str(err.response.get('Error', {}).get('Code', ''))
        if error_code in ('404', 'NoSuchKey', 'NotFound'):
            logging.info(f"File does not exist: s3://{bucket}/{key}")
        else:
            logging.warning(f"Could not check s3://{bucket}/{key}: {err}")
        return False
    except Exception as err:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still propagate.
        logging.warning(f"Could not check s3://{bucket}/{key}: {err}")
        return False

    logging.info(f"File exists: s3://{bucket}/{key}")
    return True

def list_s3_files(bucket, prefix):
    """Collect the keys of every object stored under ``prefix`` in ``bucket``.

    Results are fetched page by page through the ``list_objects_v2`` paginator.
    Listing is best-effort: if an error occurs it is logged and whatever keys
    were gathered up to that point are returned.

    Args:
        bucket: Name of the S3 bucket.
        prefix: Key prefix to list.

    Returns:
        A list of object keys (possibly empty or partial after an error).
    """
    s3_client = boto3.client('s3')
    found_keys = []
    try:
        pages = s3_client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)
        for page in pages:
            # A page without 'Contents' simply contributes no keys.
            found_keys.extend(entry['Key'] for entry in page.get('Contents', []))
    except Exception as e:
        logging.error(f"Error listing files in s3://{bucket}/{prefix}: {e}")
    return found_keys

def merge_tiles(tile_id, input_prefix, output_prefix, type_indicator):
    """Merge all small rasters of one tile into a single GeoTIFF and upload it.

    Every object under ``input_prefix`` whose key contains ``tile_id`` is opened
    from S3 and mosaicked with ``rasterio.merge.merge``. The first band of the
    mosaic is written LZW-compressed to ``local_temp_dir`` as
    ``{tile_id}_{type_indicator}.tif``, uploaded under ``output_prefix`` in
    ``s3_bucket``, and the local copy is then deleted.

    Args:
        tile_id: Tile identifier used to select input keys by substring match.
        input_prefix: S3 key prefix holding the small input rasters.
        output_prefix: S3 key prefix that receives the merged raster.
        type_indicator: Suffix used in the output file name.

    Returns:
        None. If no input raster matches ``tile_id`` the function logs that
        and returns without writing or uploading anything.
    """
    import posixpath  # S3 keys always use '/', whatever the host OS separator is

    small_raster_paths = [
        f's3://{s3_bucket}/{key}'
        for key in list_s3_files(s3_bucket, input_prefix)
        if tile_id in key
    ]

    if not small_raster_paths:
        logging.info(f"No small rasters found for tile {tile_id}.")
        return

    # Open rasters using rasterio directly. The handles are closed as soon as
    # the mosaic and the template metadata have been read, even if opening or
    # merging fails part-way through.
    small_rasters = []
    try:
        for path in small_raster_paths:
            small_rasters.append(rasterio.open(path))
        merged, out_transform = merge_arrays(small_rasters)
        out_meta = small_rasters[0].meta.copy()
    finally:
        for src in small_rasters:
            src.close()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(local_temp_dir, exist_ok=True)

    out_file = f'{tile_id}_{type_indicator}.tif'
    local_output_path = os.path.join(local_temp_dir, out_file)

    out_meta.update({
        "driver": "GTiff",
        "height": merged.shape[1],
        "width": merged.shape[2],
        "transform": out_transform,
        "compress": "lzw",
        # Only the first band is written below, so declare exactly one band
        # instead of inheriting the source count and leaving bands unwritten.
        "count": 1,
    })

    s3_output_path = posixpath.join(output_prefix, out_file)
    try:
        with rasterio.open(local_output_path, 'w', **out_meta) as dst:
            dst.write(merged[0], 1)  # Write the first band

        s3_client = boto3.client('s3')
        s3_client.upload_file(local_output_path, s3_bucket, s3_output_path)
        logging.info(f"Uploaded merged raster to s3://{s3_bucket}/{s3_output_path}")
    finally:
        # Never leave a (possibly partial) temp file behind, whether the
        # write/upload succeeded or raised.
        if os.path.exists(local_output_path):
            os.remove(local_output_path)

def cleanup():
    """Close the module-level Dask ``client`` and ``cluster`` if they exist.

    Both names are looked up dynamically in the module globals, so the function
    is safe to call even when no client or cluster was ever created (a plain
    reference to an unassigned global would raise ``NameError``). The client is
    closed before the cluster.

    Returns:
        None.
    """
    module_globals = globals()
    for name in ('client', 'cluster'):
        resource = module_globals.get(name)
        if resource is not None:
            resource.close()

def get_tile_ids(prefix):
    """Return the distinct tile IDs found under ``prefix`` in ``s3_bucket``.

    A tile ID is the first two underscore-separated fields of an object's file
    name, joined by an underscore.

    Args:
        prefix: S3 key prefix to scan.

    Returns:
        A sorted list of unique tile ID strings.
    """
    tile_ids = set()
    for key in list_s3_files(s3_bucket, prefix):
        file_name = os.path.basename(key)
        if not file_name:
            # Keys ending in '/' (folder placeholders) have no file name. They
            # would produce an empty tile ID, and an empty string is a
            # substring of every key, so it would match every raster.
            continue
        tile_ids.add("_".join(file_name.split('_')[:2]))
    # Sorted so the processing order is reproducible between runs.
    return sorted(tile_ids)

def process_tile_id(tile_id, input_prefix, output_prefix, type_indicator):
    """Process a single tile by delegating to ``merge_tiles``.

    This is the unit of work that ``main`` wraps in a Dask delayed task.

    Args:
        tile_id: Tile identifier to merge.
        input_prefix: S3 key prefix holding the small input rasters.
        output_prefix: S3 key prefix that receives the merged raster.
        type_indicator: Suffix used in the output file name.
    """
    merge_tiles(
        tile_id=tile_id,
        input_prefix=input_prefix,
        output_prefix=output_prefix,
        type_indicator=type_indicator,
    )

def main(input_prefixes, output_prefixes, tile_ids=None):
    """Merge the small rasters of every requested tile for each prefix pair.

    ``input_prefixes`` and ``output_prefixes`` are consumed pairwise. For each
    pair the tile IDs present under the input prefix are discovered, optionally
    restricted to ``tile_ids``, and each tile is merged and uploaded in turn.

    Tiles are computed one at a time on purpose: each task holds a whole merged
    raster in memory, so submitting them as one batch could multiply peak
    memory use.

    Args:
        input_prefixes: S3 key prefixes holding small input rasters.
        output_prefixes: S3 key prefixes for the merged rasters, matched to
            ``input_prefixes`` by position.
        tile_ids: Optional tile IDs to restrict processing to. IDs not present
            under an input prefix are skipped for that prefix. If empty or
            None, every tile found is processed.

    Returns:
        None. Errors raised while processing a tile propagate to the caller
        after "exit" has been printed.
    """
    try:
        # One progress-bar callback for the whole run; it still reports each
        # dask.compute call separately.
        with ProgressBar():
            for input_prefix, output_prefix in zip(input_prefixes, output_prefixes):
                # The output file suffix comes from the path segments of the
                # input prefix: "soil" if present, otherwise "state".
                type_indicator = "soil" if "soil" in input_prefix.split('/') else "state"
                available_tile_ids = get_tile_ids(input_prefix)

                if tile_ids:
                    # Filter the requested tile IDs down to those that exist,
                    # keeping the caller's order. The set makes each membership
                    # test O(1).
                    available_lookup = set(available_tile_ids)
                    tile_ids_to_process = [
                        tile_id for tile_id in tile_ids if tile_id in available_lookup
                    ]
                else:
                    tile_ids_to_process = available_tile_ids

                for tile_id in tile_ids_to_process:
                    dask_tile = dask.delayed(process_tile_id)(
                        tile_id, input_prefix, output_prefix, type_indicator
                    )
                    dask.compute(dask_tile)
    finally:
        print("exit")
        
# Script entry point: merge the soil and state rasters for a fixed list of tiles.
if __name__ == "__main__":
    # S3 key prefixes that are listed to find the small input rasters.
    input_prefixes = [
        'climate/AFOLU_flux_model/organic_soils/outputs/soil/2020/8000_pixels/20240607/',
        'climate/AFOLU_flux_model/organic_soils/outputs/state/2020/8000_pixels/20240607/'
    ]
    # S3 key prefixes that receive the merged rasters; main() pairs them with
    # input_prefixes by position.
    output_prefixes = [
        'climate/AFOLU_flux_model/organic_soils/outputs/soil/2020/10x10_degrees/20240607',
        'climate/AFOLU_flux_model/organic_soils/outputs/state/2020/10x10_degrees/20240607'
    ]
    # Tile IDs to process; main() skips any ID not found under an input prefix.
    tile_ids = ['50N_070E', '70N_040E', '00N_150E', '10N_130E', '20S_140E', '00N_020E', '10S_030E', '10N_080W', '20S_080W', '40N_120E', '70N_140E', '20N_010E', '30N_110W', '40N_120W', '00N_010E', '10N_070W', '10N_000E', '20N_060W', '20S_020E', '20S_010E', '30N_020W', '60N_090W', '40N_070W', '70N_160E', '80N_090E', '60N_100E', '10N_050E', '30S_060W', '40N_060E', '70N_070E', '20N_000E', '70N_050E', '70N_010E', '10N_080E', '60N_150W', '50N_010W', '20S_130E', '30S_070W', '60N_160E', '00N_060W', '20S_050W', '30N_100W', '40N_110W', '20N_120E', '60N_140W', '60N_060E', '70N_140W', '10N_100E', '00N_040W', '10S_060W', '50N_090W', '10S_050W', '60N_030E', '00N_080W', '10N_090W', '70N_120W', '80N_140E', '60N_160W', '10S_050E', '30N_010W', '30S_010E', '40S_070W', '70N_060E', '70N_150E', '60N_080E', '20S_070W', '20N_080E', '30N_060E', '60N_060W', '50N_060W', '70N_080W', '10N_010E', '40N_000E', '70N_090E', '00N_090W', '40N_110E', '20N_110E', '60N_040E', '60N_020E', '70N_160W', '30N_120W', '20N_070W', '60N_010E', '40N_080W', '40N_100W', '00N_130E', '00N_040E', '30N_100E', '20N_080W', '00N_070W', '30S_090W', '70N_180W', '70N_130E', '60N_120E', '70N_170E', '10S_040E', '20N_090E', '70N_030E', '30N_120E', '10S_150E', '50N_130E', '40N_010E', '10N_050W', '10S_100E', '10N_020W', '20N_110W', '10N_070E', '20S_060W', '50N_110E', '60N_000E', '30N_090W', '20N_090W', '50N_000E', '60N_070E', '00N_000E', '10S_160E', '20N_020W', '50N_100E', '60N_080W', '40N_040E', '60N_170E', '70N_080E', '50N_150E', '40N_080E', '10S_080W', '40S_140E', '50N_140E', '50N_090E', '20N_070E', '70N_120E', '30S_020E', '30N_030E', '10S_140E', '80N_120W', '30N_050E', '10S_020E', '40N_010W', '10S_070W', '40N_030E', '20N_100E', '50N_030E', '70N_110E', '70N_090W', '00N_100E', '30S_170E', '50N_050E', '40S_170E', '50N_040E', '10N_120E', '40S_160E', '50N_130W', '20S_160E', '50S_080W', '80N_020E', '00N_160E', '30N_080E', '30S_140E', '50N_080W', '50N_100W', '80N_070E', '00N_110E', '40N_140E', '50N_080E', '60N_110W', '10S_130E', 
'20N_010W', '70N_150W', '40N_090E', '20N_020E', '60N_050E', '40N_100E', '10N_030E', '20N_040E', '80N_120E', '30N_110E', '70N_020E', '20S_110E', '30S_080W', '00N_140E', '20S_150E', '30S_030E', '40S_080W', '50N_110W', '80N_110E', '70N_010W', '10N_090E', '20N_100W', '30N_080W', '60N_130E', '10N_060W', '10N_010W', '40N_090W', '40N_130E', '10S_170E', '10S_010E', '50N_070W', '30S_150E', '50N_120E', '40N_050E', '00N_050W', '20S_040E', '10N_020E', '20N_030E', '40N_070E', '50N_010E', '40N_130W', '60N_100W', '60N_140E', '10S_040W', '40N_020E', '60N_090E', '00N_120E', '70N_100E', '10N_110E', '50N_120W', '10N_040E', '60N_150E', '60N_180W', '60N_110E', '50S_060W', '80N_060E', '60N_130W', '70N_000E', '80N_140W', '80N_130W', '30N_040E', '50N_020E', '00N_090E', '20N_120W', '60N_070W', '60N_120W', '80N_080E', '30S_130E', '70N_130W', '00N_030E', '30N_070E', '60N_020W', '20S_030E', '30N_090E', '60N_010W', '70N_030W', '70N_110W', '10N_140E', '20S_120E', '30S_110E', '20N_050E', '30S_120E', '60N_170W', '10S_110E', '50S_070W', '70N_020W', '70N_100W', '10S_120E']  # Replace with the list of tile IDs you want to process
    main(input_prefixes, output_prefixes, tile_ids)


exit


CancelledError: process_tile_id-56749b29-6817-440e-a850-dc2f44a41d40

2024-06-10 23:45:33,646 - Found credentials in shared credentials file: ~/.aws/credentials
