In [156]:
import os

# dask/parallelization libraries
import coiled
import dask
from dask.distributed import Client, LocalCluster
from dask.distributed import print as dask_print
import dask.config
import distributed

# scipy basics
import numpy as np
import rasterio
import rasterio.features
import rasterio.transform
import rasterio.windows
import rioxarray
from rioxarray.merge import merge_arrays

from numba import jit
import concurrent.futures

import boto3
import time
import math
import ctypes
import pandas as pd

## Making cloud and local clusters

In [None]:
# Full cluster: 20-worker Coiled cluster for cloud runs.
# NOTE(review): the test-cluster cell assigns the same `coiled_cluster`/`coiled_client` names and
# uses the same cluster name, so run only one of the two cells per session.
full_cluster_options = {
    "name": "AFOLU_flux_model",
    "account": "jterry64",  # Necessary to use the AWS environment that Justin set up in Coiled
    "region": "us-east-1",
    "use_best_zone": True,
    "n_workers": 20,
    "worker_memory": "32GiB",
    "compute_purchase_option": "spot_with_fallback",
    "idle_timeout": "10 minutes",
}
coiled_cluster = coiled.Cluster(**full_cluster_options)

# Coiled cluster (cloud run): client connected to the cluster; the bare name displays its summary
coiled_client = coiled_cluster.get_client()
coiled_client

In [None]:
# Test cluster: single-worker Coiled cluster with a larger (64GiB) worker for trial runs.
# NOTE(review): the full-cluster cell assigns the same `coiled_cluster`/`coiled_client` names and
# uses the same cluster name, so run only one of the two cells per session.
test_cluster_options = {
    "name": "AFOLU_flux_model",
    "account": "jterry64",  # Necessary to use the AWS environment that Justin set up in Coiled
    "region": "us-east-1",
    "use_best_zone": True,
    "n_workers": 1,
    "worker_memory": "64GiB",
    "compute_purchase_option": "spot_with_fallback",
    "idle_timeout": "10 minutes",
}
coiled_cluster = coiled.Cluster(**test_cluster_options)

# Coiled cluster (cloud run): client connected to the cluster; the bare name displays its summary
coiled_client = coiled_cluster.get_client()
coiled_client

In [None]:
# Local single-process cluster (local run). Will run .compute() on just one process, not a whole cluster.
# NOTE: the two local-cluster cells that follow also assign `local_client`;
# run only the one cell that matches the local setup you want.
local_client = Client(processes=False)
local_client

In [None]:
# Local client created with all-default settings (no cluster object or options are passed).
# Rebinds the name `local_client`, which the neighbouring local-cluster cells also assign.
local_client = Client()
local_client

In [134]:
# Local cluster with multiple workers
# Worker and thread counts are left to LocalCluster's defaults; the saved output below shows
# 4 workers x 2 threads on the machine this was last run on.
# The cluster object is kept in `local_cluster`, and the client is attached to it explicitly.
local_cluster = LocalCluster()  
local_client = Client(local_cluster)
local_client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 24.91 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:34629,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 24.91 GiB

0,1
Comm: tcp://127.0.0.1:45749,Total threads: 2
Dashboard: http://127.0.0.1:34781/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:34123,
Local directory: /tmp/dask-scratch-space/worker-ic4uyv3u,Local directory: /tmp/dask-scratch-space/worker-ic4uyv3u

0,1
Comm: tcp://127.0.0.1:41447,Total threads: 2
Dashboard: http://127.0.0.1:36587/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:41613,
Local directory: /tmp/dask-scratch-space/worker-ri2nu744,Local directory: /tmp/dask-scratch-space/worker-ri2nu744

0,1
Comm: tcp://127.0.0.1:38981,Total threads: 2
Dashboard: http://127.0.0.1:35569/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:41211,
Local directory: /tmp/dask-scratch-space/worker-e6ok6vco,Local directory: /tmp/dask-scratch-space/worker-e6ok6vco

0,1
Comm: tcp://127.0.0.1:36993,Total threads: 2
Dashboard: http://127.0.0.1:40827/status,Memory: 6.23 GiB
Nanny: tcp://127.0.0.1:43405,
Local directory: /tmp/dask-scratch-space/worker-btd02yso,Local directory: /tmp/dask-scratch-space/worker-btd02yso


## Shutting down cloud and local clusters

In [None]:
# Shuts down the Coiled (cloud) cluster held in `coiled_cluster`. Only valid after one of the
# Coiled cluster cells has been run in this session.
coiled_cluster.shutdown()

In [None]:
# Calls shutdown() on the local dask client held in `local_client`. Only valid after one of the
# local-cluster cells has been run in this session.
local_client.shutdown()

## Scripts

In [115]:
# s3 key prefix; presumably the folder of the 2020 composite land cover rasters (going by the path).
# NOTE(review): not referenced by any other cell visible in this part of the notebook -- confirm it is still needed.
complete_folder = 'landcover/composite/2020/raw/'

# Resource-style handle on the gfw2-data bucket; the serial merge loop uses it to list objects by prefix
s3 = boto3.resource('s3')
my_bucket = s3.Bucket('gfw2-data')

# Low-level s3 client; the serial merge loop uses it to upload files
s3_client = boto3.client("s3")

In [None]:
# Serial (non-dask) tile merge: for each 5-year interval, collects the small (2000_pixels)
# IPCC change rasters of one tile from s3, merges them into a single raster, and uploads
# the merged raster to the matching 40000_pixels folder.
# Uses `my_bucket` and `s3_client` from the s3 setup cell.

# Only rasters whose s3 key contains this tile id are merged (same for every interval)
tile_string = "50N_010E"

for year in range(2005, 2021, 5):

    # Date stamp is read once per interval so the input folder and the output key
    # cannot disagree if the run crosses midnight
    run_date = time.strftime('%Y%m%d')

    change_folder = f"climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/{year-5}_{year}/2000_pixels/{run_date}/"

    # Full s3 paths of every object under the interval's folder that belongs to the tile
    small_raster_paths = [
        's3://gfw2-data/' + object_summary.key
        for object_summary in my_bucket.objects.filter(Prefix=change_folder)
        if tile_string in object_summary.key
    ]

    print(small_raster_paths)

    # Fail with a clear message instead of passing an empty list to merge_arrays
    if not small_raster_paths:
        raise FileNotFoundError(f"No rasters for tile {tile_string} found under s3://gfw2-data/{change_folder}")

    # chunks=True opens the rasters lazily as dask-backed arrays
    small_rasters = [rioxarray.open_rasterio(path, chunks=True) for path in small_raster_paths]

    merged = merge_arrays(small_rasters)  # https://corteva.github.io/rioxarray/stable/examples/merge.html

    out_file = f'merged_change_{year-5}_{year}.tif'
    local_path = f'/tmp/{out_file}'

    try:
        # The merged raster is written locally first and then uploaded from that file
        merged.rio.to_raster(local_path)

        s3_client.upload_file(local_path, "gfw2-data",
                              Key=f"climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/{year-5}_{year}/40000_pixels/{run_date}/{out_file}")
    finally:
        # The local copy is removed even when writing or uploading fails
        if os.path.exists(local_path):
            os.remove(local_path)

    # Drops the reference to the merged array before the next interval is built
    del merged


In [212]:
def save_and_upload(bounds, chunk_length_pixels, tile_id, bounds_str, output_dict):
    """
    Writes every array in output_dict to a temporary local GeoTIFF and uploads it to s3.

    Relies on two names defined elsewhere in the notebook: `timestr()` (time stamp used in
    log messages and file names) and `s3_out_dir` (root s3 key prefix for outputs).

    :param bounds: chunk bounds, unpacked positionally into rasterio.transform.from_bounds
        (i.e. west, south, east, north)
    :param chunk_length_pixels: width and height of the square chunk, in pixels
    :param tile_id: tile identifier; used in log messages and the output file name
    :param bounds_str: string form of the bounds; used in log messages and the output file name
    :param output_dict: maps an output name to a sequence where element 0 is the array to
        write, element 2 is the s3 subfolder for that kind of data, and element 3 is the year
        label. Element 1 is not read here.
        NOTE(review): element 1 looks like a dtype string, but every raster is written as
        uint8 regardless -- confirm that is intended.
    :return: None
    """

    transform = rasterio.transform.from_bounds(*bounds, width=chunk_length_pixels, height=chunk_length_pixels)

    file_info = f'{tile_id}__{bounds_str}'

    s3_client = boto3.client("s3")

    # For every output file, saves from array to local raster, then to s3.
    # Can't save directly to s3, unfortunately, so need to save locally first.
    for key, value in output_dict.items():

        data_meaning = value[2]
        year_out = value[3]

        dask_print(f"Saving {bounds_str} in {tile_id} for {year_out}: {timestr()}")

        file_name = f"{file_info}__{key}__{timestr()}"
        local_path = f"/tmp/{file_name}.tif"

        try:
            with rasterio.open(local_path, 'w', driver='GTiff', width=chunk_length_pixels, height=chunk_length_pixels, count=1, dtype='uint8', crs='EPSG:4326', transform=transform, compress='lzw', blockxsize=400, blockysize=400) as dst:
                dst.write(value[0].astype(rasterio.uint8), 1)

            dask_print(f"Uploading {bounds_str} in {tile_id} for {year_out}: {timestr()}")

            # NOTE(review): the output folder is hard-coded to 40000_pixels regardless of chunk_length_pixels
            s3_client.upload_file(local_path, "gfw2-data", Key=f"{s3_out_dir}/{data_meaning}/{year_out}/40000_pixels/{time.strftime('%Y%m%d')}/{file_name}.tif")
        finally:
            # Deletes the local raster. It won't be used again.
            # Done in `finally` so a failed write or upload does not leave files behind in /tmp.
            if os.path.exists(local_path):
                os.remove(local_path)

In [225]:
def merge_small_tiles(year):
    """
    Merges the small (2000_pixels) IPCC change rasters of tile 50N_010E for the 5-year
    interval ending in `year` into one GeoTIFF and uploads it to the 40000_pixels folder on s3.

    Input rasters are read from today's dated folder under
    climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/{year-5}_{year}/2000_pixels/
    in the gfw2-data bucket; the merged raster is written to /tmp, uploaded, and then removed.

    :param year: end year of the interval (the interval is year-5 to year)
    :return: the string "success for {year}"
    :raises FileNotFoundError: if no raster for the tile exists in the input folder
    """

    bucket_name = "gfw2-data"

    # Date stamp is read once so the input folder and the output key cannot disagree
    # if the task runs across midnight
    run_date = time.strftime('%Y%m%d')

    change_folder = f"climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/{year-5}_{year}/2000_pixels/{run_date}/"

    tile_string = "50N_010E"

    # The client is created inside the function so that each dask task builds its own
    s3_client = boto3.client("s3")

    # Lists the folder through the s3 API rather than parsing `aws s3 ls` text output.
    # Delimiter="/" limits the listing to objects directly in the folder (no sub-folders).
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket_name, Prefix=change_folder, Delimiter="/")

    small_raster_paths = [
        f"s3://{bucket_name}/{s3_object['Key']}"
        for page in pages
        for s3_object in page.get("Contents", [])  # "Contents" is absent when a page has no objects
        if tile_string in s3_object["Key"]
    ]

    # Fail with a clear message instead of passing an empty list to merge_arrays
    if not small_raster_paths:
        raise FileNotFoundError(f"No rasters for tile {tile_string} found under s3://{bucket_name}/{change_folder}")

    # chunks=True opens the rasters lazily as dask-backed arrays
    small_rasters = [rioxarray.open_rasterio(path, chunks=True) for path in small_raster_paths]

    dask_print(f"Merging {year-5}_{year} to s3")

    merged = merge_arrays(small_rasters)  # https://corteva.github.io/rioxarray/stable/examples/merge.html

    out_file = f'merged_change_{year-5}_{year}.tif'
    local_path = f'/tmp/{out_file}'

    path_and_file = f"climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/{year-5}_{year}/40000_pixels/{run_date}/{out_file}"

    try:
        dask_print(f"Saving locally {year-5}_{year} to s3")

        # The merged raster is written locally first and then uploaded from that file
        merged.rio.to_raster(local_path)

        dask_print(f"Uploading {year-5}_{year} to s3")

        s3_client.upload_file(local_path, bucket_name, Key=path_and_file)

        dask_print(f"Done uploading {year-5}_{year} to s3")
    finally:
        # The local copy is removed even when writing or uploading fails
        if os.path.exists(local_path):
            os.remove(local_path)

    return f"success for {year}"

In [226]:
# End years of the 5-year change intervals: 2005, 2010, 2015, 2020
years = list(range(2005, 2021, 5))

# One lazy task per interval; nothing runs until dask.compute() is called
lazy_merge = dask.delayed(merge_small_tiles)
delayed_result = [lazy_merge(interval_end) for interval_end in years]

# Runs all interval merges and collects their return values as a tuple
results = dask.compute(*delayed_result)
results

Merging 2010_2015 to s3
Merging 2005_2010 to s3
Merging 2015_2020 to s3
Merging 2000_2005 to s3
Saving locally 2010_2015 to s3
Uploading 2010_2015 to s3
Saving locally 2000_2005 to s3
Saving locally 2005_2010 to s3
Uploading 2005_2010 to s3
Uploading 2000_2005 to s3
Saving locally 2015_2020 to s3
Uploading 2015_2020 to s3


('success for 2005',
 'success for 2010',
 'success for 2015',
 'success for 2020')

In [189]:
import dask
from dask import delayed, compute
import subprocess

# Define your processing function
def process_item(year):
    """
    Scratch task for checking that an s3 folder can be listed from inside a dask task.

    Prints the year, lists today's 2000_pixels IPCC change folder for the interval
    year-5 to year with the `aws s3 ls` command line call, prints the resulting s3 paths,
    and returns a dummy value.

    :param year: end year of the 5-year interval
    :return: year * 2 (placeholder result)
    """

    dask_print(year)

    date_stamp = time.strftime('%Y%m%d')
    change_folder = f"climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/{year-5}_{year}/2000_pixels/{date_stamp}/"
    s3_folder = f's3://gfw2-data/{change_folder}'

    # Output of `aws s3 ls` is one text line per entry; the last whitespace-separated
    # field of each line is taken as the file name
    listing_text = subprocess.check_output(['aws', 's3', 'ls', s3_folder]).decode('utf-8')

    small_raster_paths = []
    for listing_line in listing_text.splitlines():
        filename = listing_line.split()[-1]
        small_raster_paths.append(s3_folder + filename)

    dask_print(small_raster_paths)

    # Example processing
    return year * 2



# List of items to process: end years of the 5-year intervals (2005, 2010, 2015, 2020)
your_list = list(range(2005, 2021, 5))

# Create delayed tasks (iterates over the list defined in this cell, not a variable from another cell)
delayed_tasks = [delayed(process_item)(year) for year in your_list]

# Compute results in parallel
results = compute(*delayed_tasks)

# Convert tuple of results to a list
results_list = list(results)

print(results_list)  # Output: [4010, 4020, 4030, 4040]

2010
2015
2005
2020
['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/2000_pixels/20240126/50N_010E__10_49_10_50__IPCC_change_2005_2010__20240126_14_50_31.tif', 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/2000_pixels/20240126/50N_010E__10_49_11_50__IPCC_change_2005_2010__20240126_14_50_32.tif', 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/2000_pixels/20240126/50N_010E__10_50_10_50__IPCC_change_2005_2010__20240126_14_50_35.tif', 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2005_2010/2000_pixels/20240126/50N_010E__10_50_11_50__IPCC_change_2005_2010__20240126_14_50_28.tif']
['s3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/2000_pixels/20240126/50N_010E__10_49_10_50__IPCC_change_2000_2005__20240126_14_50_30.tif', 's3://gfw2-data/climate/AFOLU_flux_model/LULUCF/outputs/IPCC_basic_change/2000_2005/2000_pixels/2024012