The purpose of this notebook is to process the Dadap canals density raster into 10x10 degree tiles and to serve as a template for processing other input datasets. The code needs to check the CRS, projection, and cell size of the dataset and correct them if necessary. The code also needs to export chunks of the input dataset to S3, which can later be merged into 10x10 degree tiles.

In [53]:
import rioxarray
import rasterio
import xarray as xr
import pandas as pd
import os
import dask
import boto3
from dask.distributed import Client
import dask.array as da

# scipy basics
import numpy as np
import rasterio
import rasterio.transform
import rasterio.windows
from rasterio.windows import from_bounds
#import geopandas as gpd
import pandas as pd
import rioxarray
import xarray as xr
from rioxarray.merge import merge_arrays


In [54]:
# Input rasters: the raster used as the grid template for reprojection, and
# the Dadap canal-length raster that this notebook processes.
template_uri = "s3://gfw2-data/climate/AFOLU_flux_model/organic_soils/inputs/raw/GFW_Global_Peatlands/00N_110E.tif"
dadap_uri = "s3://gfw2-data/climate/AFOLU_flux_model/organic_soils/inputs/raw/Dadap_SEA_Drainage/canal_length_data/canal_length_1km.tif"
s3_base_dir = "s3://gfw2-data/climate/AFOLU_flux_model/organic_soils/"
dadap_pattern = "dadap_density"

# Output locations are derived from s3_base_dir so the scheme and bucket are
# written only once. The earlier literal began with "s3:/" (single slash),
# which is not a valid S3 URI. Plain "/" concatenation is used instead of
# os.path.join because S3 keys always use forward slashes.
# NOTE(review): confirm the upload helper expects a full "s3://" URI here
# rather than a bucket-relative key prefix.
processed_dir = f"{s3_base_dir}inputs/processed"
output_dir = f"{processed_dir}/{dadap_pattern}"

In [55]:
# def get_nodata_value(uri):
#     with rasterio.open(uri) as src:
#         return src.nodatavals[0]

In [56]:
# def preprocess_dadap_density():
#     print("Loading dadap density raster...")
#     dadap_density = get_dataset(dadap_uri, "dadap_density", template=None)

#     # Load a template raster from the Hansen dataset for reference
#     print("Loading template raster for CRS and resolution reference...")
#     template_raster = get_tile_dataset(peatlands_uri, "00N_110E.tif", "peatlands", template=None)

#     # Reproject and resample dadap_density to match template
#     print("Reprojecting and resampling dadap density raster...")
#     dadap_density = dadap_density.rio.reproject_match(template_raster)

#     return dadap_density

# dadap_density = preprocess_dadap_density()

In [57]:
# def clip_and_save_dadap_density(bounds, is_final):
#     xmin, ymin, xmax, ymax = bounds
#     bounds_str = boundstr(bounds)    # String form of chunk bounds
#     tile_id = xy_to_tile_id(bounds[0], bounds[3])    # tile_id in YYN/S_XXXE/W
#     chunk_length_pixels = calc_chunk_length_pixels(bounds)   # Chunk length in pixels (as opposed to decimal degrees)    
    
#     #xmin, ymin, xmax, ymax = bounds 
#     local_path = "/tmp"  # Temporary local path for saving the file
#     output_filename = f"{tile_id}_{dadap_pattern}.tif" if is_final else f"{tile_id}_{dadap_pattern}_{time.strftime('%Y%m%d%H%M%S')}.tif"
#     local_file_path = os.path.join(local_path, output_filename)
#     output_s3_path = os.path.join(output_dir, output_filename)

#     # Clip the raster 
#     # I am not clear on whether I actually need to clip the raster 
#     clipped = dadap_density.rio.clip_box(minx=xmin, miny=ymin, maxx=xmax, maxy=ymax)
    
#     # Check if there is any non-nodata value in the clipped raster
#     if np.any(clipped.data != clipped.rio.nodata):
#         # Save the raster locally
#         clipped.rio.to_raster(local_file_path)
#         print(f"Saved locally: {local_file_path}")

#         # Upload to S3
#         try:
#             s3 = boto3.client('s3')
#             s3.upload_file(local_file_path, "gfw2-data", output_s3_path.lstrip('/'))
#             print(f"Successfully uploaded {output_filename} to S3 at {output_s3_path}")
#         except NoCredentialsError:
#             print("Credentials not available for AWS S3.")
#         except Exception as e:
#             print(f"Failed to upload to S3: {str(e)}")

#         # Optionally, remove the local file after upload
#         os.remove(local_file_path)
#         print(f"Deleted local file: {local_file_path}")
#     else:
#         print(f"No valid data in chunk {bounds} for tile {tile_id}. Skipping this chunk.")

In [58]:
def get_tile_dataset_rio(uri, bounds, chunk_length_pixels):
    """Read band 1 of a raster inside ``bounds`` as a georeferenced DataArray.

    Args:
        uri: Path or URI that ``rasterio.open`` can read.
        bounds: ``(left, bottom, right, top)`` in the raster's CRS; unpacked
            positionally into ``rasterio.windows.from_bounds``.
        chunk_length_pixels: Not used by this function; kept so existing
            callers do not have to change.

    Returns:
        An ``xarray.DataArray`` with dims ``("y", "x")`` whose CRS and nodata
        value have been written through the ``rio`` accessor, or ``None`` when
        the window contains no pixels or reading fails (the error is printed).
    """
    bounds_str = boundstr(bounds)
    try:
        with rasterio.open(uri) as ds:
            declared_nodata = ds.nodatavals[0]
            no_data_val = declared_nodata if declared_nodata is not None else -9999

            # A non-boundless read only returns the part of the window that
            # overlaps the dataset. Crop the window explicitly so that the
            # transform computed below describes the pixels actually read;
            # otherwise a chunk overhanging the raster's top/left edge would
            # be georeferenced at the overhanging corner.
            window = rasterio.windows.crop(
                from_bounds(*bounds, ds.transform), ds.height, ds.width
            )
            data = ds.read(1, window=window)
            if data.size == 0:  # Skip chunks with no data
                print(f"No data in chunk {bounds_str}, skipping.")
                return None

            transform = ds.window_transform(window)
            n_rows, n_cols = data.shape
            # rioxarray interprets x/y coordinates as pixel centres, so offset
            # by half a pixel from the window's top-left corner. Using the
            # corner itself shifts the whole array by half a cell.
            # Assumes a north-up transform (no rotation terms) -- TODO confirm
            # for any new input dataset.
            x_centers = transform.c + transform.a * (np.arange(n_cols) + 0.5)
            y_centers = transform.f + transform.e * (np.arange(n_rows) + 0.5)

            data_array = xr.DataArray(
                data,
                dims=["y", "x"],
                coords={"x": x_centers, "y": y_centers},
            )
            data_array.rio.write_crs(ds.crs, inplace=True)
            data_array.rio.write_nodata(no_data_val, inplace=True)
            return data_array
    except Exception as e:
        # Best-effort: a failed chunk is reported and skipped by the caller
        # rather than aborting the whole run.
        print(f"Error reading data for bounds {bounds_str}: {e}")
        return None

In [59]:
def process_dadap_chunk(bounds, dadap_uri, template_uri, output_dir, is_final):
    """Reproject one chunk of the Dadap raster onto the template grid and upload it.

    Args:
        bounds: Chunk bounds, passed to ``get_tile_dataset_rio`` and the
            helper functions; ``bounds[0]`` and ``bounds[3]`` are used to
            derive the tile id.
        dadap_uri: Location of the Dadap raster to read.
        template_uri: Location of the raster whose grid the chunk is matched to.
        output_dir: Destination passed through to
            ``save_and_upload_small_raster_set``.
        is_final: Flag passed through to ``save_and_upload_small_raster_set``.

    Returns:
        A status string saying whether the chunk was processed or skipped.
    """
    # `time` is not among the notebook's top-level imports, so import it here;
    # without it the strftime call below raises NameError.
    import time

    print(f"Processing chunk with bounds: {bounds}")
    chunk_length_pixels = calc_chunk_length_pixels(bounds)
    tile_id = xy_to_tile_id(bounds[0], bounds[3])

    dadap_data = get_tile_dataset_rio(dadap_uri, bounds, chunk_length_pixels)
    if dadap_data is None:
        return f"Skipped chunk {bounds} due to no data"

    template_data = get_tile_dataset_rio(template_uri, bounds, chunk_length_pixels)
    if template_data is None:
        return f"Skipped chunk {bounds} due to no template data"

    # Bring the Dadap chunk onto the template's CRS, resolution and extent.
    dadap_data_matched = dadap_data.rio.reproject_match(template_data)

    # Each entry is [data, dtype, output name pattern, year string], as passed
    # to the upload helper.
    output_rasters = {
        "dadap_chunk": [dadap_data_matched, 'float32', 'dadap_density', time.strftime('%Y')]
    }
    save_and_upload_small_raster_set(
        bounds,
        chunk_length_pixels,
        tile_id,
        boundstr(bounds),
        output_rasters,
        is_final,
        output_dir,
    )
    return f"Processed and uploaded chunk {bounds}"


In [60]:
# Build the list of chunk bounds to process.
# Presumably [min_x, min_y, max_x, max_y, chunk_size_degrees] -- verify
# against get_chunk_bounds.
chunk_params = [110, -10, 120, 0, 2]
chunks = get_chunk_bounds(chunk_params)
print(f"Processing {len(chunks)} chunks")

# Runs covering more than 30 chunks are flagged as final.
is_final = len(chunks) > 30
if is_final:
    print("Running as final model.")

# Wrap the worker once, then create one lazy task per chunk; nothing executes
# until dask.compute is called.
delayed_process = dask.delayed(process_dadap_chunk)
delayed_result = [
    delayed_process(chunk, dadap_uri, template_uri, output_dir, is_final)
    for chunk in chunks
]

results = dask.compute(*delayed_result)
print(results)

Processing 25 chunks
Processing chunk with bounds: [110, -2, 112, 0]
Processing chunk with bounds: [118, -2, 120, 0]
Processing chunk with bounds: [114, -2, 116, 0]
Processing chunk with bounds: [110, -10, 112, -8]
Processing chunk with bounds: [116, -2, 118, 0]
Processing chunk with bounds: [112, -8, 114, -6]
Processing chunk with bounds: [118, -4, 120, -2]
Processing chunk with bounds: [114, -10, 116, -8]
Processing chunk with bounds: [112, -10, 114, -8]
Processing chunk with bounds: [116, -4, 118, -2]
Processing chunk with bounds: [118, -10, 120, -8]
Processing chunk with bounds: [110, -4, 112, -2]
No data in chunk 112_-8_114_-6, skipping.No data in chunk 118_-10_120_-8, skipping.
Processing chunk with bounds: [114, -8, 116, -6]No data in chunk 110_-10_112_-8, skipping.
No data in chunk 112_-10_114_-8, skipping.
No data in chunk 114_-10_116_-8, skipping.


Processing chunk with bounds: [110, -6, 112, -4]
Processing chunk with bounds: [114, -4, 116, -2]
Processing chunk with bounds: 

In [61]:
# # Convert processing to Dask delayed tasks
# tasks = [dask.delayed(process_tiles)(dadap_density, output_dir, [tile]) for tile in tiles]
# results = dask.compute(*tasks)