In [1]:
from dotenv import load_dotenv

# Load the waterbodies database credentials from an env file.
# This step is only needed when running on the Sandbox.
dotenv_path = "/home/jovyan/.env"
load_dotenv(
    dotenv_path=dotenv_path,
    override=True,
    verbose=True,
)

True

In [2]:
import json
import logging
import os

import click
from datacube import Datacube
import numpy as np
from odc.stats.model import DateTimeRange

from waterbodies.hopper import create_tasks_from_datasets
from waterbodies.io import check_directory_exists, find_geotiff_files, get_filesystem
from waterbodies.logs import logging_setup
from waterbodies.text import format_task, get_tile_id_tuple_from_filename

In [3]:
# --- Run parameters --------------------------------------------------------
# Verbosity level handed to logging_setup.
verbose = 3
# Run type; also used as the datacube application name.
run_type = "backlog-processing"
# Number of pieces the task list is split into (empty pieces are dropped later).
max_parallel_steps = 7000

# --- Inputs: local test data (active) ---------------------------------------
temporal_range = "2016-04-05--P1D"
historical_extent_rasters_directory = (
    "/home/jovyan/dev/waterbodies/tests/data/historical_extent_rasters_directory/"
)

# --- Inputs: alternative configuration (inactive) ---------------------------
#   temporal_range = "2019--P1Y"
#   historical_extent_rasters_directory = "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent_rasters/"

In [4]:
# Configure logging with the chosen verbosity level, then create the logger
# that the remaining cells use for progress and error messages.
logging_setup(verbose)
_log = logging.getLogger(__name__)

In [5]:
# Fail fast if the directory holding the historical extent rasters is missing:
# log the problem and raise it so the notebook stops here.
if not check_directory_exists(path=historical_extent_rasters_directory):
    missing_directory_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_directory_error)
    raise missing_directory_error

# The directory exists: collect the GeoTIFF files found in it.
historical_extent_rasters = find_geotiff_files(
    directory_path=historical_extent_rasters_directory
)

In [6]:
# Derive one tile id per historical extent raster from its file name.
# These ids are later passed as the tiles of interest when creating tasks.
raster_paths = historical_extent_rasters
tiles_containing_waterbodies = [
    get_tile_id_tuple_from_filename(file_path=path) for path in raster_paths
]

In [7]:
# Name of the datacube product whose datasets are searched for below.
product = "wofs_ls"

In [8]:
# Parse the temporal range string into a DateTimeRange. Its `start` and `end`
# attributes are used as the time bounds of the datacube query.
temporal_range_ = DateTimeRange(temporal_range)

In [9]:
# Connect to the datacube, using the run type as the application name.
dc = Datacube(app=run_type)

In [10]:
# Build the datacube query: the product to search and the (start, end) time
# bounds taken from the parsed temporal range.
dc_query = {
    "product": product,
    "time": (temporal_range_.start, temporal_range_.end),
}

In [11]:
%%time
# Query the datacube for all datasets of the configured product whose
# acquisition times fall within the temporal range specified.
# NOTE(review): the recorded run shows ~50 ms of CPU time against more than a
# minute of wall time, so this cell presumably spends its time waiting on the
# database - confirm before trying to optimise anything on the Python side.
datasets = dc.find_datasets(**dc_query)

CPU times: user 48.8 ms, sys: 27 µs, total: 48.8 ms
Wall time: 1min 16s


In [12]:
# Report how many datasets the query returned, together with the query itself.
_log.info(f"Found {len(datasets)} datasets matching the query {dc_query}")

[2024-04-19 20:02:46,870] {3047205837.py:1} INFO - Found 174 datasets matching the query {'product': 'wofs_ls', 'time': (datetime.datetime(2016, 4, 5, 0, 0), datetime.datetime(2016, 4, 5, 23, 59, 59, 999999))}


In [13]:
%%time
# Build the processing tasks from the datasets found by the query, passing the
# tile ids derived from the historical extent rasters as the tiles of interest.
# NOTE(review): presumably datasets outside those tiles are dropped - confirm
# against create_tasks_from_datasets.
tasks = create_tasks_from_datasets(
    datasets=datasets, tile_ids_of_interest=tiles_containing_waterbodies
)

Processing 174 datasets: 100%|██████████| 174/174 [00:00<00:00, 381.19it/s]

CPU times: user 460 ms, sys: 4.8 ms, total: 465 ms
Wall time: 462 ms





In [14]:
# Convert every task into the expected output format.
# NOTE(review): this rebinds `tasks`, so re-running this cell on its own would
# feed already-formatted tasks back into format_task; re-run the task creation
# cell first.
tasks = list(map(format_task, tasks))

In [15]:
# Order a copy of the tasks by their "solar_day" entry (the sort is stable, so
# tasks sharing a solar day keep their relative order), then report the total.
sorted_tasks = list(tasks)
sorted_tasks.sort(key=lambda task: task["solar_day"])
_log.info(f"Total number of tasks: {len(sorted_tasks)}")

[2024-04-19 20:02:47,350] {2708203668.py:3} INFO - Total number of tasks: 1


In [16]:
# Distribute the sorted tasks over `max_parallel_steps` pieces.
# np.array_split always returns exactly that many pieces, so when there are
# fewer tasks than pieces the surplus pieces are empty; those are skipped
# while converting each piece back to a plain Python list.
task_chunks = [
    piece.tolist()
    for piece in np.array_split(np.array(sorted_tasks), max_parallel_steps)
    if len(piece)
]
# The number of non-empty chunks, kept as a string so it can be written to a
# text file as-is.
task_chunks_count = str(len(task_chunks))
task_chunks_count

'1'

In [17]:
# Serialise the list of task chunks to a JSON array string (a list of lists),
# which is what gets written to the tasks output file.
task_chunks_json_array = json.dumps(task_chunks)

In [18]:
# Directory receiving the outputs, and the two files written into it:
# "tasks" holds the chunked tasks, "tasks_count" holds the number of chunks.
tasks_directory = "/tmp/"
tasks_output_file, tasks_count_file = (
    os.path.join(tasks_directory, file_name)
    for file_name in ("tasks", "tasks_count")
)

In [19]:
# Get the filesystem object for the tasks directory; it is used to create the
# directory (if needed) and to open the output files for writing.
fs = get_filesystem(path=tasks_directory)

In [20]:
# Create the tasks directory when it does not exist yet, and log that it was
# created. Nothing is logged when the directory is already present.
if not check_directory_exists(path=tasks_directory):
    fs.mkdirs(path=tasks_directory, exist_ok=True)
    _log.info(f"Created directory {tasks_directory}")

In [21]:
# Write the JSON array of task chunks to the tasks file.
with fs.open(tasks_output_file, "w") as tasks_handle:
    tasks_handle.write(task_chunks_json_array)
_log.info(f"Tasks written to {tasks_output_file}")

# Write the number of chunks (already a string) to its own file.
with fs.open(tasks_count_file, "w") as count_handle:
    count_handle.write(task_chunks_count)
_log.info(f"Tasks count written to {tasks_count_file}")

[2024-04-19 20:02:47,486] {1726518394.py:3} INFO - Tasks written to /tmp/tasks
[2024-04-19 20:02:47,487] {1726518394.py:7} INFO - Tasks count written to /tmp/tasks_count
