In [1]:
import os
# Default AWS settings that the Analysis Sandbox places in the
# environment variables. Only the keys are used below; the values are
# kept for reference.
aws_default_config = {
    #'AWS_NO_SIGN_REQUEST': 'YES',
    'AWS_SECRET_ACCESS_KEY': 'fake',
    'AWS_ACCESS_KEY_ID': 'fake',
}

# A public bucket can only be read once these credentials are removed
# from the environment; otherwise the request fails with
# "PermissionError: The AWS Access Key Id you provided does not exist
# in our records."
for env_var in aws_default_config:
    # pop() with a default is a no-op when the variable is not set.
    os.environ.pop(env_var, None)

In [2]:
import logging
import os
from importlib import import_module

import click
import datacube
import fsspec
from odc import dscache
from odc.aws import s3_download
from rasterio.errors import RasterioIOError

from deafrica_conflux.cli.logs import logging_setup
from deafrica_conflux.db import get_engine_waterbodies
from deafrica_conflux.drill import drill
from deafrica_conflux.io import (
    check_dir_exists,
    check_file_exists,
    check_if_s3_uri,
    table_exists,
    write_table_to_parquet,
)
from deafrica_conflux.plugins.utils import run_plugin, validate_plugin
from deafrica_conflux.stack import stack_waterbodies_parquet_to_db

In [3]:
# Verbosity level passed to logging_setup() and to the database
# stacking step.
verbose = 1
# File path (local or S3 URI) to the cache file database.
cachedb_file_path = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/wofs_ls_2023-03--P3M.db"
# Text file to read the task ids from, one task id per line.
tasks_text_file = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/wofs_ls_2023-03--P3M_tasks.txt"
# Name of the plugin. The plugin file must be in the
# deafrica_conflux/plugins/ directory.
plugin_name = "waterbodies_timeseries_v2"
# Path to the directory containing the polygons raster files.
polygons_rasters_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/historical_extent_rasters"
# Directory to write the drill outputs to.
output_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/drill_pq_files"
# When True, tasks are re-run even if their drill output already exists.
overwrite = True
# When True, drill outputs are also written to the Waterbodies database.
db = False
# When True, the drill result is always written to a Parquet file,
# whether or not the DataFrame is empty.
dump_empty_dataframe = True

In [4]:
# Set up logging through the project's logging_setup helper using the
# configured verbosity, then create the logger used by the remaining
# cells of this notebook.
logging_setup(verbose)
_log = logging.getLogger(__name__)

In [5]:
# Accept pathlib.Path values for any of the path settings by
# converting all of them to plain strings in one step.
(
    cachedb_file_path,
    tasks_text_file,
    polygons_rasters_directory,
    output_directory,
) = map(
    str,
    (
        cachedb_file_path,
        tasks_text_file,
        polygons_rasters_directory,
        output_directory,
    ),
)

In [6]:
# Import the plugin by name from the deafrica_conflux.plugins package.
module = import_module(f"deafrica_conflux.plugins.{plugin_name}")
# run_plugin is given the path of the plugin's source file rather than
# the imported module object.
plugin_file = module.__file__
plugin = run_plugin(plugin_file)
_log.info(f"Using plugin {plugin_file}")
# NOTE(review): presumably raises if the plugin is malformed -- confirm
# against deafrica_conflux.plugins.utils.validate_plugin.
validate_plugin(plugin)

# Get the drill name from the plugin. It is passed on as `drill_name`
# when checking for and writing the drill outputs.
drill_name = plugin.product_name

[2023-12-04 18:59:14,099] {2585789683.py:5} INFO - Using plugin /home/jovyan/dev/deafrica-conflux/deafrica_conflux/plugins/waterbodies_timeseries_v2.py


In [7]:
# The polygon rasters are a required input: stop right away if the
# directory holding them cannot be found.
if not check_dir_exists(polygons_rasters_directory):
    # Build the message once so the log entry and the exception carry
    # exactly the same, well-formed text.
    missing_rasters_message = f"Directory {polygons_rasters_directory} does not exist!"
    _log.error(missing_rasters_message)
    raise FileNotFoundError(missing_rasters_message)

[2023-12-04 18:59:14,310] {credentials.py:611} INFO - Found credentials in shared credentials file: ~/.aws/credentials


In [8]:
# Make sure the output directory exists, creating it on S3 or on the
# local file system depending on the kind of path that was given.
if not check_dir_exists(output_directory):
    output_protocol = "s3" if check_if_s3_uri(output_directory) else "file"
    fsspec.filesystem(output_protocol).makedirs(output_directory, exist_ok=True)
    _log.info(f"Created directory {output_directory}")

[2023-12-04 18:59:14,590] {562408864.py:7} INFO - Created directory s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/drill_pq_files


In [9]:
# The cache database is a required input: fail if it cannot be found.
if not check_file_exists(cachedb_file_path):
    _log.error(f"Could not find the database file {cachedb_file_path}!")
    raise FileNotFoundError(f"{cachedb_file_path} does not exist!")

# A database stored on S3 is downloaded first. From here on
# cachedb_file_path holds the value returned by s3_download, which is
# checked again to confirm the download produced a file.
if check_if_s3_uri(cachedb_file_path):
    cachedb_file_path = s3_download(cachedb_file_path)
    if not check_file_exists(cachedb_file_path):
        _log.error(f"{cachedb_file_path} did not download!")
        raise FileNotFoundError(f"{cachedb_file_path} does not exist! File did not download.")

In [10]:
# The tasks text file is a required input: log and fail if it is missing.
tasks_file_found = check_file_exists(tasks_text_file)
if not tasks_file_found:
    _log.error(f"Could not find the text file {tasks_text_file}!")
    raise FileNotFoundError(f"Could not find text file {tasks_text_file}!")

In [11]:
# Pick the filesystem matching where the tasks file lives (S3 URI or
# local file). `fs` is reused later to write the failed tasks file.
fs = fsspec.filesystem("s3" if check_if_s3_uri(tasks_text_file) else "file")

# Read one task id per line. Blank lines (for example a trailing empty
# line at the end of the file) are skipped so that no empty task id is
# handed to the drill.
with fs.open(tasks_text_file, "r") as tasks_file:
    stripped_lines = (line.strip() for line in tasks_file)
    tasks = [task_id for task_id in stripped_lines if task_id]
_log.info(f"Read {len(tasks)} tasks from file.")
_log.debug(f"Read {tasks} from file.")

[2023-12-04 18:59:15,385] {3293637472.py:9} INFO - Read 1715 tasks from file.


In [12]:
# The database engine is only created when the drill outputs are to be
# written to the Waterbodies database; `engine` stays undefined
# otherwise.
if db:
    engine = get_engine_waterbodies()

# Connect to the datacube
dc = datacube.Datacube(app="deafrica-conflux-drill")

# Open the cache file. By this point cachedb_file_path holds the
# downloaded copy when the configured path was an S3 URI.
# NOTE(review): open_ro presumably opens the cache read-only -- confirm
# against odc.dscache.
cache = dscache.open_ro(cachedb_file_path)

  ref_resolver = jsonschema.RefResolver.from_schema(


In [13]:
# For testing only: run just the first 10 tasks.
# Remove or adjust this slice to process the full list of tasks.
tasks = tasks[:10]
# Display the number of tasks that will be processed.
len(tasks)

10

In [14]:
# Run the polygon drill for every task. A task that raises one of the
# handled errors is logged and recorded in `failed_tasks` so that the
# remaining tasks still run; the failed ids are written out at the end.
failed_tasks = []
for i, task in enumerate(tasks):
    _log.info(f"Processing task {task} ({i + 1}/{len(tasks)})")

    # Unless overwriting, skip tasks whose drill output already exists.
    if not overwrite:
        _log.info(f"Checking existence of {task}")
        exists = table_exists(
            drill_name=drill_name, task_id_string=task, output_directory=output_directory
        )
        if exists:
            _log.info(f"Drill output for {task} already exists, skipping")
            continue

    try:
        # Perform the polygon drill.
        table = drill(
            plugin=plugin,
            task_id_string=task,
            cache=cache,
            polygon_rasters_split_by_tile_directory=polygons_rasters_directory,
            dc=dc,
        )
        # Write the table to a parquet file. Empty tables are only
        # written when dump_empty_dataframe is set.
        if dump_empty_dataframe or not table.empty:
            pq_file_name = write_table_to_parquet(
                drill_name=drill_name,
                task_id_string=task,
                table=table,
                output_directory=output_directory,
            )
            if db:
                _log.info(f"Writing {pq_file_name} to database")
                stack_waterbodies_parquet_to_db(
                    parquet_file_paths=[pq_file_name],
                    verbose=verbose,
                    engine=engine,
                    drop=False,
                )
    except (KeyError, TypeError, RasterioIOError, ValueError) as error:
        # Record the failure on the existing list. Rebinding the name to
        # the result of list.append() would replace the list with None.
        _log.exception(f"Found task {task} has {type(error).__name__}: {str(error)}")
        failed_tasks.append(task)
    else:
        _log.info(f"Task {task} successful")

# Write the failed task ids once, after all tasks have been attempted.
# The file is opened in append mode, so writing inside the loop would
# add the same ids again on every later iteration.
if failed_tasks:
    # The failed tasks file sits next to the tasks text file and is
    # named "<tasks file name>_failed_tasks<extension>".
    parent_folder, file_name = os.path.split(tasks_text_file)
    file_stem, file_extension = os.path.splitext(file_name)
    failed_tasks_text_file = os.path.join(
        parent_folder, file_stem + "_failed_tasks" + file_extension
    )

    # `fs` is the filesystem that was used to read the tasks text file.
    with fs.open(failed_tasks_text_file, "a") as failed_tasks_file:
        for failed_task in failed_tasks:
            failed_tasks_file.write(f"{failed_task}\n")

    _log.info(f"Failed tasks {failed_tasks} written to: {failed_tasks_text_file}.")

[2023-12-04 18:59:15,627] {1312493662.py:3} INFO - Processing task 2023-03-01/171/092 (1/10)
[2023-12-04 18:59:15,629] {drill.py:335} INFO - Finding polygon raster tile....
[2023-12-04 18:59:15,764] {io.py:445} INFO - Found 1 GeoTIFF files.
[2023-12-04 18:59:15,890] {drill.py:345} INFO - Loaded s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/senegal_basin/conflux/historical_extent_rasters/x171_y092.tif
[2023-12-04 18:59:15,904] {drill.py:371} INFO - Query object to use for loading data {'resampling': 'nearest', 'dask_chunks': {'x': 3200, 'y': 3200}, 'group_by': 'solar_day', 'like': GeoBox(3200, 3200, Affine(30.0, 0.0, -960000.0,
       0.0, -30.0, 1536000.0), PROJCS["WGS 84 / NSIDC EASE-Grid 2.0 Global",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Cylindrical_Equal