In [1]:
import os
# Placeholder AWS settings that the Analysis Sandbox exports as
# environment variables by default.
aws_default_config = {
    #'AWS_NO_SIGN_REQUEST': 'YES',
    "AWS_SECRET_ACCESS_KEY": "fake",
    "AWS_ACCESS_KEY_ID": "fake",
}

# Public buckets cannot be read while these placeholder credentials are
# present in the environment; requests fail with:
# PermissionError: The AWS Access Key Id you provided does not exist in our records.
# Drop each of them from the environment (nothing happens for a variable
# that is not set).
for key in aws_default_config:
    os.environ.pop(key, None)

In [2]:
import logging
import time

import boto3
import click
import fsspec
from odc import dscache
from odc.aws import s3_download
from odc.stats._cli_common import parse_all_tasks

#from deafrica_conflux.cli.common import MutuallyExclusiveOption
from deafrica_conflux.cli.logs import logging_setup
from deafrica_conflux.io import check_file_exists, check_if_s3_uri
from deafrica_conflux.queues import get_queue_url, send_batch_with_retry
from deafrica_conflux.text import task_id_to_string

In [3]:
# Verbosity setting handed to logging_setup() when the logger is configured.
verbose = 1
# Location of the task cache database. An S3 URI is downloaded with
# s3_download() before the database is opened.
cachedb_file_path = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/dbs/wofs_ls_2023-03--P3M.db"
# Name of the SQS queue to push the task ids to. Exactly one of
# tasks_sqs_queue and tasks_text_file must be set; the other stays None.
tasks_sqs_queue = None
# Text file (S3 URI or local path) the task ids are written to, one per line.
tasks_text_file = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/tasks/wofs_ls_2023-03--P3M_tasks.txt"
# Filter passed to parse_all_tasks(); an empty string keeps every task.
task_filter = ""

In [4]:
# Set up logging through the project's logging_setup() helper, passing the
# verbosity chosen in the parameters cell, then fetch the logger that the
# remaining cells use for their progress messages.
logging_setup(verbose)
_log = logging.getLogger(__name__)

In [5]:
# Exactly one task destination must be configured: reject the case where
# both are set as well as the case where neither is.
if bool(tasks_sqs_queue) == bool(tasks_text_file):
    raise ValueError("Provide EITHER tasks_sqs_queue OR tasks_text_file!")

In [6]:
# Convert the inputs to plain strings so that pathlib.Path values are
# accepted as well; an unset tasks_text_file stays None.
cachedb_file_path = str(cachedb_file_path)
tasks_text_file = None if tasks_text_file is None else str(tasks_text_file)

In [7]:
# The cache db has to exist at the configured location.
if not check_file_exists(cachedb_file_path):
    raise FileNotFoundError(f"{cachedb_file_path} does not exist!")

# A db referenced by an S3 URI is downloaded first; cachedb_file_path is
# rebound to the value returned by s3_download() and checked once more.
if check_if_s3_uri(cachedb_file_path):
    cachedb_file_path = s3_download(cachedb_file_path)
    if not check_file_exists(cachedb_file_path):
        raise FileNotFoundError(
            f"{cachedb_file_path} does not exist! File did not download."
        )

[2024-01-19 17:41:11,819] {credentials.py:611} INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [8]:
# Open the cache database with dscache.open_ro().
cache = dscache.open_ro(cachedb_file_path)

# The grid to enumerate is recorded in the "stats/config" info dict
# stored in the database.
cfg = cache.get_info_dict("stats/config")
grid = cfg["grid"]

# Every tile index of that grid is one task; keep them in sorted order.
if cache:
    all_tasks = sorted(tile_idx for tile_idx, _ in cache.tiles(grid))
else:
    all_tasks = []
_log.info(f"Found {len(all_tasks):,d} tasks in the file")

[2024-01-19 17:41:12,633] {1540521160.py:9} INFO - Found 1,715 tasks in the file


  ref_resolver = jsonschema.RefResolver.from_schema(


In [9]:
# Narrow the task list down with the task filter; an empty filter keeps
# every task found in the cache db.
if len(task_filter) > 0:
    tasks = parse_all_tasks(task_filter, all_tasks)
    _log.info(f"Found {len(tasks):,d} tasks after filtering using filter {task_filter}")
else:
    tasks = all_tasks
    _log.info(f"Found {len(all_tasks):,d} tasks.")

# String form of every selected task id, in the same order.
tasks_str = list(map(task_id_to_string, tasks))

[2024-01-18 14:17:50,758] {3267516486.py:4} INFO - Found 1,715 tasks.


In [10]:
# Publish the task ids to the configured destination: the SQS queue when one
# is set, otherwise the tasks text file.
if tasks_sqs_queue:
    sqs_client = boto3.client("sqs")
    tasks_sqs_queue_url = get_queue_url(queue_name=tasks_sqs_queue, sqs_client=sqs_client)

    # Start from an empty queue: if it already holds messages, purge it
    # before pushing the new tasks.
    response = sqs_client.get_queue_attributes(
        QueueUrl=tasks_sqs_queue_url, AttributeNames=["All"]
    )
    # Attribute values come back as strings; the message count is a whole number.
    if int(response["Attributes"]["ApproximateNumberOfMessages"]) > 0:
        _log.info(f"Purging queue {tasks_sqs_queue_url}...")
        sqs_client.purge_queue(QueueUrl=tasks_sqs_queue_url)
        # A purge is not instantaneous: SQS documents that the message
        # deletion process takes up to 60 seconds, so wait that long before
        # sending new messages to the queue.
        purge_wait_seconds = 60
        time.sleep(purge_wait_seconds)
        _log.info(f"Purge of queue {tasks_sqs_queue_url} is complete.")

    # Push the task ids in batches; anything that could not be sent after
    # the retries is reported rather than silently dropped.
    _, failed_to_push = send_batch_with_retry(
        queue_url=tasks_sqs_queue_url, messages=tasks_str, max_retries=10, sqs_client=sqs_client
    )
    if failed_to_push:
        _log.error(f"Failed to push the tasks: {failed_to_push}")
elif tasks_text_file:
    # Pick the filesystem implementation that matches the destination path.
    if check_if_s3_uri(tasks_text_file):
        fs = fsspec.filesystem("s3")
    else:
        fs = fsspec.filesystem("file")
    # One task id per line.
    with fs.open(tasks_text_file, "w") as file:
        for task in tasks_str:
            file.write(f"{task}\n")
    _log.info(f"{len(tasks_str)} tasks written to: {tasks_text_file}.")


[2024-01-18 14:17:50,843] {3079867929.py:29} INFO - 1715 tasks written to: s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/tasks/wofs_ls_2023-03--P3M_tasks.txt.
