In [1]:
from dotenv import load_dotenv

# Path to the env file containing the waterbodies database credentials.
# Only necessary on the Sandbox (the path points into the Sandbox home directory).
dotenv_path = "/home/jovyan/.env"
# Load the variables from the env file into the process environment.
# override=True: values from the file replace variables that are already set;
# verbose=True: python-dotenv reports when the file cannot be found.
# The call is the last expression of the cell, so its boolean result is displayed.
load_dotenv(dotenv_path=dotenv_path, verbose=True, override=True)

True

In [3]:
import json
import logging
import os

import click
from datacube import Datacube
import numpy as np
from odc.stats.model import DateTimeRange

from waterbodies.hopper import create_tasks_from_datasets
from waterbodies.io import check_directory_exists, find_geotiff_files, get_filesystem
from waterbodies.logs import logging_setup
from waterbodies.text import format_task, get_tile_id_tuple_from_filename

In [4]:
# Run parameters for the task generation below.
# Verbosity level handed to logging_setup().
verbose = 3
# Label for this run; also used as the datacube application name.
run_type = "backlog-processing"
# Temporal range string parsed by DateTimeRange; for "2019--P1Y" the logged
# query spans 2019-01-01 00:00 to 2019-12-31 23:59:59.999999.
temporal_range = "2019--P1Y"
# Directory that is searched for the historical extent GeoTIFF files.
historical_extent_rasters_directory = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent_rasters/"
)
# Number of sections passed to np.array_split when chunking the task list.
max_parallel_steps = 7000

In [5]:
# Set up logging with the configured verbosity
# (presumably the verbosity is mapped to a log level -- see waterbodies.logs).
logging_setup(verbose)
# Logger used by the remaining cells of this notebook.
_log = logging.getLogger(__name__)

In [6]:
# Abort early when the rasters directory is missing; otherwise list the
# GeoTIFF files it contains.
if not check_directory_exists(path=historical_extent_rasters_directory):
    missing_directory_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_directory_error)
    raise missing_directory_error

historical_extent_rasters = find_geotiff_files(directory_path=historical_extent_rasters_directory)

[2024-04-19 16:38:59,147] {credentials.py:557} INFO - Found credentials in environment variables.


In [7]:
# One tile id tuple per historical extent raster, derived from the raster's file name.
# These are the tile ids later passed as `tile_ids_of_interest` when creating tasks.
tiles_containing_waterbodies = [
    get_tile_id_tuple_from_filename(file_path=geotiff_path)
    for geotiff_path in historical_extent_rasters
]

In [8]:
# Name of the datacube product whose datasets are queried below.
product = "wofs_ls"

In [9]:
# Parse the temporal range string into a DateTimeRange; its `start` and `end`
# attributes are used as the time bounds of the datacube query.
temporal_range_ = DateTimeRange(temporal_range)

In [10]:
# Connect to the datacube, using the run type as the application name.
dc = Datacube(app=run_type)

In [11]:
# Build the datacube query: the product to search and the (start, end) time window.
dc_query = {
    "product": product,
    "time": (temporal_range_.start, temporal_range_.end),
}

In [12]:
%%time
# Query the datacube for all datasets of the configured product (wofs_ls) whose
# acquisition times fall within the temporal range specified.
# `%%time` reports the cost of the query; the recorded run for 2019 took
# roughly a minute and a half of wall time.
datasets = dc.find_datasets(**dc_query)

CPU times: user 5.71 s, sys: 687 ms, total: 6.4 s
Wall time: 1min 26s


In [13]:
# Report how many datasets the query returned, together with the query itself.
_log.info(f"Found {len(datasets)} datasets matching the query {dc_query}")

[2024-04-19 16:40:44,149] {3047205837.py:1} INFO - Found 61203 datasets matching the query {'product': 'wofs_ls', 'time': (datetime.datetime(2019, 1, 1, 0, 0), datetime.datetime(2019, 12, 31, 23, 59, 59, 999999))}


In [14]:
%%time
# Build the tasks from the queried datasets, passing the tiles that have a
# historical extent raster as the tile ids of interest.
# NOTE(review): the exact shape of a task is defined in waterbodies.hopper;
# after format_task each task exposes a "solar_day" key -- confirm there.
tasks = create_tasks_from_datasets(
    datasets=datasets, tile_ids_of_interest=tiles_containing_waterbodies
)

Processing 61203 datasets: 100%|██████████| 61203/61203 [00:46<00:00, 1316.30it/s]


CPU times: user 49.1 s, sys: 279 ms, total: 49.4 s
Wall time: 49.2 s


In [15]:
# Convert every task to the output format expected downstream.
# NOTE: this rebinds `tasks` to its formatted version, so the cell is not
# idempotent -- re-run the task creation cell before running this one again.
tasks = [format_task(unformatted_task) for unformatted_task in tasks]

In [16]:
# Order the formatted tasks by their "solar_day" value, then log how many there are.
sorted_tasks = sorted(tasks, key=lambda task: task["solar_day"])
_log.info(f"Total number of tasks: {len(sorted_tasks)}")

[2024-04-19 16:41:34,583] {2708203668.py:3} INFO - Total number of tasks: 236711


In [17]:
# Split the sorted tasks into `max_parallel_steps` sections of near-equal size.
task_chunks = np.array_split(np.array(sorted_tasks), max_parallel_steps)
task_chunks = [chunk.tolist() for chunk in task_chunks]
# np.array_split returns empty sub-arrays when there are fewer tasks than
# sections. Drop them so that no chunk is empty and the count written out
# matches the number of chunks that actually contain tasks.
task_chunks = [chunk for chunk in task_chunks if chunk]
# Get the number of (non-empty) chunks, as a string ready to be written to file.
task_chunks_count = str(len(task_chunks))

In [18]:
# Serialise the list of task chunks to a JSON array string
# (one inner array per chunk), ready to be written to the tasks file.
task_chunks_json_array = json.dumps(task_chunks)

In [19]:
# Directory that receives the two output files.
tasks_directory = "/tmp/"
# File that receives the JSON array of task chunks.
tasks_output_file = os.path.join(tasks_directory, "tasks")
# File that receives the number of task chunks.
tasks_count_file = os.path.join(tasks_directory, "tasks_count")

In [20]:
# Filesystem object for the tasks directory; used below to create the
# directory and open the output files.
# NOTE(review): presumably selects local vs. remote storage from the path -- confirm in waterbodies.io.
fs = get_filesystem(path=tasks_directory)

In [21]:
# Create the tasks directory when it does not exist yet.
# exist_ok=True keeps mkdirs from failing if the directory appears in the meantime.
if not check_directory_exists(path=tasks_directory):
    fs.mkdirs(path=tasks_directory, exist_ok=True)
    _log.info(f"Created directory {tasks_directory}")

In [22]:
with fs.open(tasks_output_file, "w") as tasks_handle:
    tasks_handle.write(task_chunks_json_array)
_log.info(f"Tasks written to {tasks_output_file}")
# The number of chunks goes into its own file next to the task list.
with fs.open(tasks_count_file, "w") as count_handle:
    count_handle.write(task_chunks_count)
_log.info(f"Tasks count written to {tasks_count_file}")

[2024-04-19 16:41:35,382] {1726518394.py:3} INFO - Tasks written to /tmp/tasks
[2024-04-19 16:41:35,383] {1726518394.py:7} INFO - Tasks count written to /tmp/tasks_count
