In [1]:
from dotenv import load_dotenv

# Path to the env file containing the waterbodies database credentials.
# Only necessary on the Sandbox.
dotenv_path = "/home/jovyan/.env"
# Load the variables from the env file into the process environment.
# NOTE(review): override=True presumably lets values from the file replace
# variables that are already set in the environment - confirm against the
# python-dotenv documentation.
load_dotenv(dotenv_path=dotenv_path, verbose=True, override=True)

True

In [2]:
import subprocess

# Path to file containing the tasks.
tasks_file = "/tmp/tasks"
# Index of the element of the top-level JSON array in the tasks file to
# extract. The selected element is parsed with json.loads in a later cell and
# iterated as a list of task dicts.
task_list_index = 200

# Let jq read the file directly instead of piping `cat` through a shell.
# With `cat file | jq ...` only jq's exit status is checked, so a missing or
# unreadable tasks file silently yields an empty string; with the file passed
# as an argument jq exits non-zero and check_output raises CalledProcessError.
jq_command = ["jq", f".[{task_list_index}]", tasks_file]
task_list = subprocess.check_output(jq_command, universal_newlines=True)

In [3]:
import json
import logging
import os

import click
from datacube import Datacube

from waterbodies.db import get_waterbodies_engine
from waterbodies.hopper import find_task_datasets_ids
from waterbodies.io import check_directory_exists, get_filesystem
from waterbodies.logs import logging_setup
from waterbodies.surface_area_change import (  # noqa F401
    add_waterbody_observations_to_db,
    check_task_exists,
    get_waterbody_observations,
)
from waterbodies.text import get_task_id_str_from_tuple

In [4]:
# Run parameters for this notebook.
# Note: `task_list` (the JSON text of the tasks to process) is not set here;
# it is read from the tasks file in an earlier cell.

# Verbosity level passed to logging_setup.
verbose = 3
# Selects the processing branch in the task loop:
# "backlog-processing" or "gap-filling".
run_type = "backlog-processing"
# Directory passed to get_waterbody_observations as the location of the
# historical extent rasters; its existence is checked before processing.
historical_extent_rasters_directory = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent_rasters/"
)
# When False, backlog tasks for which check_task_exists reports an existing
# entry are skipped; when True they are processed again.
overwrite = False

In [5]:
# Set up logging.
# Configure logging through the project helper using the verbosity chosen in
# the parameters cell, then fetch the logger used by the rest of the notebook.
logging_setup(verbose)
_log = logging.getLogger(__name__)

In [6]:
# Stop early if the historical extent rasters directory cannot be found:
# log the problem, then raise it so the notebook run halts here.
rasters_directory_found = check_directory_exists(path=historical_extent_rasters_directory)
if not rasters_directory_found:
    missing_directory_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_directory_error)
    raise missing_directory_error

[2024-04-19 07:32:40,783] {credentials.py:557} INFO - Found credentials in environment variables.


In [7]:
# Product name passed to find_task_datasets_ids when run_type is "gap-filling".
product = "wofs_ls"

In [8]:
# Datacube handle passed to get_waterbody_observations and
# find_task_datasets_ids; the run type is used as the app name.
dc = Datacube(app=run_type)

In [9]:
# Connect to the database.
# The engine is passed to check_task_exists in the processing loop.
engine = get_waterbodies_engine()
# Display the engine so the target database can be checked before processing.
engine

Engine(postgresql+psycopg2://waterbodies_writer:***@db-writer:5432/waterbodies)

In [10]:
# Parse the JSON text held in task_list into Python objects; the processing
# loop iterates the result as a sequence of task dicts.
tasks = json.loads(task_list)
# Display how many tasks will be processed.
len(tasks)

34

In [11]:
def _get_and_report_waterbody_observations(
    task_id_str, solar_day, tile_id_x, tile_id_y, task_datasets_ids
):
    """Fetch the waterbody observations for one task and log a summary.

    The database write is commented out in this notebook, so the observations
    are only counted and logged, never stored.

    Parameters
    ----------
    task_id_str
        Task identifier string used in the log messages.
    solar_day, tile_id_x, tile_id_y
        Components identifying the task, forwarded to
        get_waterbody_observations.
    task_datasets_ids
        Dataset ids for the task, forwarded to get_waterbody_observations.

    Returns
    -------
    The value returned by get_waterbody_observations; None is logged as
    "no waterbody observations".
    """
    waterbody_observations = get_waterbody_observations(
        solar_day=solar_day,
        tile_id_x=tile_id_x,
        tile_id_y=tile_id_y,
        task_datasets_ids=task_datasets_ids,
        historical_extent_rasters_directory=historical_extent_rasters_directory,
        dc=dc,
    )
    if waterbody_observations is None:
        _log.info(f"Task {task_id_str} has no waterbody observations")
    else:
        # add_waterbody_observations_to_db(
        #    waterbody_observations=waterbody_observations, engine=engine, update_rows=True
        # )
        _log.info(f"Task {task_id_str} has {len(waterbody_observations)} waterbody observations")

        _log.info(f"Task {task_id_str} complete")
    return waterbody_observations


# Reject an unrecognised run type before looping; otherwise every task would
# be logged as "Processing" and then silently skipped with no failure recorded.
valid_run_types = ("backlog-processing", "gap-filling")
if run_type not in valid_run_types:
    invalid_run_type_error = ValueError(
        f"Unknown run type {run_type!r}, expected one of {valid_run_types}"
    )
    _log.error(invalid_run_type_error)
    raise invalid_run_type_error

# Tasks that raised during processing; written to disk in the next cell.
failed_tasks = []
for idx, task in enumerate(tasks):
    _log.info(f"Processing task: {task}   {idx+1}/{len(tasks)}")

    solar_day = task["solar_day"]
    tile_id_x = task["tile_id_x"]
    tile_id_y = task["tile_id_y"]
    task_datasets_ids = task["task_datasets_ids"]

    task_id_tuple = (solar_day, tile_id_x, tile_id_y)
    task_id_str = get_task_id_str_from_tuple(task_id_tuple)

    try:
        if run_type == "backlog-processing":
            # Only query the database when existing tasks may be skipped;
            # with overwrite=True every task is processed.
            exists = not overwrite and check_task_exists(task_id_str=task_id_str, engine=engine)

            if exists:
                _log.info(f"Task {task_id_str} already exists, skipping")
            else:
                waterbody_observations = _get_and_report_waterbody_observations(
                    task_id_str=task_id_str,
                    solar_day=solar_day,
                    tile_id_x=tile_id_x,
                    tile_id_y=tile_id_y,
                    task_datasets_ids=task_datasets_ids,
                )

        elif run_type == "gap-filling":
            # Find the dataset ids for the task; these replace the ids
            # carried in the task itself.
            task_datasets_ids = find_task_datasets_ids(
                solar_day=solar_day,
                tile_id_x=tile_id_x,
                tile_id_y=tile_id_y,
                dc=dc,
                product=product,
            )
            waterbody_observations = _get_and_report_waterbody_observations(
                task_id_str=task_id_str,
                solar_day=solar_day,
                tile_id_x=tile_id_x,
                tile_id_y=tile_id_y,
                task_datasets_ids=task_datasets_ids,
            )
    except Exception as error:
        # Record the failure and carry on with the remaining tasks.
        _log.exception(error)
        _log.error(f"Failed to process task {task}")
        failed_tasks.append(task)

[2024-04-19 07:32:41,634] {3310953911.py:3} INFO - Processing task: {'solar_day': '2019-08-22', 'tile_id_x': 213, 'tile_id_y': 102, 'task_datasets_ids': ['e9dee6e4-23c4-558b-b547-23053fb52d38', '4ef16a7c-7d97-5fe0-999c-c404de27f9ca']}   1/34
[2024-04-19 07:32:43,727] {3310953911.py:41} INFO - Task 2019-08-22/x213/y102 already exists, skipping
[2024-04-19 07:32:43,728] {3310953911.py:3} INFO - Processing task: {'solar_day': '2019-09-30', 'tile_id_x': 213, 'tile_id_y': 102, 'task_datasets_ids': ['29dca1af-dcb9-59cf-b423-c2ee1ee8965f']}   2/34
[2024-04-19 07:32:44,230] {3310953911.py:41} INFO - Task 2019-09-30/x213/y102 already exists, skipping
[2024-04-19 07:32:44,230] {3310953911.py:3} INFO - Processing task: {'solar_day': '2019-06-19', 'tile_id_x': 213, 'tile_id_y': 102, 'task_datasets_ids': ['daeb2dc5-85e8-58e5-89d8-1307c9e43f34', 'e1e626d6-2ad3-5cb8-a7d0-c08b880199aa']}   3/34
[2024-04-19 07:32:44,732] {3310953911.py:41} INFO - Task 2019-06-19/x213/y102 already exists, skipping
[2024

In [12]:
# Persist the tasks that failed in the processing loop, if there were any.
if len(failed_tasks) > 0:
    failed_tasks_json_array = json.dumps(failed_tasks)

    tasks_directory = "/tmp/"
    failed_tasks_output_file = os.path.join(tasks_directory, "failed_tasks")

    # Make sure the destination directory exists before writing to it.
    fs = get_filesystem(path=tasks_directory)
    tasks_directory_missing = not check_directory_exists(path=tasks_directory)
    if tasks_directory_missing:
        fs.mkdirs(path=tasks_directory, exist_ok=True)
        _log.info(f"Created directory {tasks_directory}")

    # Opened in append mode: each run adds one JSON array on its own line.
    with fs.open(failed_tasks_output_file, "a") as failed_tasks_handle:
        failed_tasks_handle.write(f"{failed_tasks_json_array}\n")
    _log.info(f"Failed tasks written to {failed_tasks_output_file}")