In [1]:
from dotenv import load_dotenv

# Path to env file containing the waterbodies database credentials
# Only necessary on the Sandbox.
# NOTE(review): this is a hardcoded absolute path; it has to be adjusted when
# the notebook is run under a different user or outside the Sandbox.
dotenv_path = "/home/jovyan/.env"
# `override=True` lets values from the file replace variables that are already
# set in the environment. The call is the last expression of the cell, so its
# boolean result is displayed as the cell output.
load_dotenv(dotenv_path=dotenv_path, verbose=True, override=True)

True

In [2]:
# In the Argo workflow the temporal range comes from the
# generate-temporal-range step; this cell reproduces it for interactive use.
import json
import sys
from datetime import datetime, timedelta

# The range starts seven days before the current local date and is written
# as "<start date>--P7D".
lookback = timedelta(days=7)
today = datetime.now().date()
seven_days_ago = today - lookback
temporal_range = seven_days_ago.strftime("%Y-%m-%d") + "--P7D"

# Emit the range as a JSON string on stdout.
json.dump(temporal_range, sys.stdout)

"2024-04-12--P7D"

In [3]:
import json
import logging
import os
from datetime import timedelta
from itertools import groupby

import click
import numpy as np
from datacube import Datacube
from odc.stats.model import DateTimeRange

from waterbodies.hopper import create_tasks_from_datasets
from waterbodies.io import check_directory_exists, find_geotiff_files, get_filesystem
from waterbodies.logs import logging_setup
from waterbodies.text import format_task, get_tile_id_tuple_from_filename

In [4]:
# Parameters for this run.

# Verbosity level handed to `logging_setup` in the logging cell.
verbose = 3
# Passed as the `app` name when connecting to the datacube.
# NOTE(review): the value says "backlog-processing" while the warning further
# down talks about gap-filling; confirm this is the intended run type.
run_type = "backlog-processing"
# No-op reference: `temporal_range` is defined in the previous cell. This line
# has no effect other than raising NameError if that cell has not been run.
temporal_range
# Directory that is checked for existence and searched for GeoTIFF files.
historical_extent_rasters_directory = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent_rasters/"
)
# Number of sections the sorted task list is split into with `np.array_split`.
max_parallel_steps = 100

In [5]:
# Set up logging.
# `verbose` is the verbosity level chosen in the parameters cell.
logging_setup(verbose)
# Logger used by the remaining cells of this notebook.
_log = logging.getLogger(__name__)

In [6]:
# Stop early when the historical extent rasters directory is missing: the
# error is logged first and then raised.
historical_extent_dir_found = check_directory_exists(path=historical_extent_rasters_directory)
if not historical_extent_dir_found:
    missing_dir_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_dir_error)
    raise missing_dir_error

# The directory exists: look up the GeoTIFF files it holds.
historical_extent_rasters = find_geotiff_files(
    directory_path=historical_extent_rasters_directory
)

[2024-04-19 19:51:52,372] {credentials.py:557} INFO - Found credentials in environment variables.


In [7]:
# Tile ids of the tiles that actually contain waterbodies: one id is derived
# from the file name of each historical extent raster. The list is later
# passed as the tiles of interest when tasks are created.
tiles_containing_waterbodies = [
    get_tile_id_tuple_from_filename(file_path=raster_path)
    for raster_path in historical_extent_rasters
]

In [8]:
# Name of the datacube product whose datasets are searched in the queries below.
product = "wofs_ls"

In [9]:
# Parse the temporal range
# The parsed object exposes `.start` and `.end`, which the following cells use
# to check the length of the range and to build the datacube query.
temporal_range_ = DateTimeRange(temporal_range)

In [10]:
# Connect to the datacube
# The run type is passed as the application name of the connection.
dc = Datacube(app=run_type)

In [11]:
# Warn, without aborting, when the parsed temporal range spans more than
# 7 days: the run still proceeds, it is just expected to be slow.
if abs(temporal_range_.end - temporal_range_.start) > timedelta(days=7):
    _log.warning(
        "Gap-filling is only meant to be run for a temporal range of 7 days or less. "
        "If running for a larger temporal range please expect long run times."
    )

In [12]:
# Gap-filling differs from the other steps in how datasets are selected:
# here we are searching for datasets by their creation date (`creation_time`),
# not their acquisition date (`time`).
creation_time_window = (temporal_range_.start, temporal_range_.end)
dc_query_ = {"product": product, "creation_time": creation_time_window}

In [13]:
%%time
# Search the datacube for all wofs_ls datasets whose creation time (not acquisition time)
# falls within the temporal range specified.
# E.g. a dataset can have an acquisition date of 2023-12-15 but have been added to the
# datacube in 2024-02, which will be its creation date.
datasets_ = dc.find_datasets(**dc_query_)
# Log how many datasets matched, together with the query that was used.
_log.info(f"Found {len(datasets_)} datasets matching the query {dc_query_}")

[2024-04-19 19:52:05,478] {<timed exec>:6} INFO - Found 1128 datasets matching the query {'product': 'wofs_ls', 'creation_time': (datetime.datetime(2024, 4, 12, 0, 0), datetime.datetime(2024, 4, 18, 23, 59, 59, 999999))}
CPU times: user 75.6 ms, sys: 13.6 ms, total: 89.3 ms
Wall time: 12 s


In [14]:
%%time
# Get the ids of the tasks to process.
# The datasets come from the creation-time query above; only the tiles listed
# in `tiles_containing_waterbodies` are passed as tiles of interest.
tasks_ = create_tasks_from_datasets(
    datasets=datasets_, tile_ids_of_interest=tiles_containing_waterbodies
)

Processing 1128 datasets: 100%|██████████| 1128/1128 [00:01<00:00, 611.67it/s]


CPU times: user 1.97 s, sys: 24.4 ms, total: 2 s
Wall time: 1.99 s


In [15]:
# First half of updating each task with the datasets whose acquisition time
# matches the solar day in the task id: collect the task ids and bucket them
# by solar day, so that the next cell can query the datacube once per day.


def _solar_day(task_id):
    """Return the solar day, which is the first element of a task id."""
    return task_id[0]


# Every task maps task ids to dataset ids; only the ids are needed here.
task_ids_ = [task_id for task in tasks_ for task_id in task.keys()]
# `groupby` only merges adjacent items, hence the sort on the same key first.
sorted_task_ids = sorted(task_ids_, key=_solar_day)
grouped_task_ids = {
    solar_day: list(day_task_ids)
    for solar_day, day_task_ids in groupby(sorted_task_ids, key=_solar_day)
}

In [16]:
%%time
tasks = []
idx = 1
for solar_day, task_ids in grouped_task_ids.items():
    _log.info(
        f"Updating datasets for tasks with the solar day: {solar_day}  {idx}/{len(grouped_task_ids)}"
    )
    task_tile_ids = [(task_id[1], task_id[2]) for task_id in task_ids]
    dc_query = dict(product=product, time=(solar_day))
    datasets = dc.find_datasets(**dc_query)
    updated_tasks = create_tasks_from_datasets(
        datasets=datasets, tile_ids_of_interest=task_tile_ids
    )
    tasks.extend(updated_tasks)
    idx += 1

[2024-04-19 19:52:07,484] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-04  1/13


Processing 183 datasets: 100%|██████████| 183/183 [00:00<00:00, 506.09it/s]

[2024-04-19 19:52:52,907] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-05  2/13



Processing 185 datasets: 100%|██████████| 185/185 [00:00<00:00, 509.51it/s]

[2024-04-19 19:53:30,327] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-06  3/13



Processing 174 datasets: 100%|██████████| 174/174 [00:00<00:00, 509.24it/s]

[2024-04-19 19:54:05,365] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-07  4/13



Processing 169 datasets: 100%|██████████| 169/169 [00:00<00:00, 366.44it/s]

[2024-04-19 19:54:40,514] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-08  5/13



Processing 175 datasets: 100%|██████████| 175/175 [00:00<00:00, 506.06it/s]

[2024-04-19 19:55:16,974] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-09  6/13



Processing 179 datasets: 100%|██████████| 179/179 [00:00<00:00, 504.82it/s]

[2024-04-19 19:55:51,840] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-11  7/13



Processing 180 datasets: 100%|██████████| 180/180 [00:00<00:00, 507.28it/s]

[2024-04-19 19:56:26,865] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-12  8/13



Processing 183 datasets: 100%|██████████| 183/183 [00:00<00:00, 504.22it/s]

[2024-04-19 19:57:01,701] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-13  9/13



Processing 185 datasets: 100%|██████████| 185/185 [00:00<00:00, 519.04it/s]

[2024-04-19 19:57:37,522] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-14  10/13



Processing 89 datasets: 100%|██████████| 89/89 [00:00<00:00, 299.25it/s]

[2024-04-19 19:58:19,247] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-15  11/13



Processing 84 datasets: 100%|██████████| 84/84 [00:00<00:00, 503.82it/s]

[2024-04-19 19:59:07,225] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-16  12/13



Processing 92 datasets: 100%|██████████| 92/92 [00:00<00:00, 512.05it/s]

[2024-04-19 19:59:44,074] {<timed exec>:4} INFO - Updating datasets for tasks with the solar day: 2024-04-17  13/13



Processing 86 datasets: 100%|██████████| 86/86 [00:00<00:00, 504.85it/s]

CPU times: user 4.6 s, sys: 75.4 ms, total: 4.68 s
Wall time: 8min 19s





In [17]:
# Put the tasks in the correct format.
# NOTE: `tasks` is rebound to its own formatted version, so re-running this
# cell on its own would pass already-formatted tasks to `format_task` again;
# re-run the previous cell first.
tasks = [format_task(task) for task in tasks]

In [18]:
# Sort the tasks by solar day
# Each formatted task is looked up by its "solar_day" key for the sort.
sorted_tasks = sorted(tasks, key=lambda x: x["solar_day"])
_log.info(f"Total number of tasks: {len(sorted_tasks)}")

[2024-04-19 20:00:26,585] {2708203668.py:3} INFO - Total number of tasks: 4186


In [19]:
# Split the sorted tasks into `max_parallel_steps` sections and keep only the
# non-empty ones (there are empty sections when there are fewer tasks than
# sections). Each section is converted back to a plain list.
task_chunks = [
    chunk.tolist()
    for chunk in np.array_split(np.array(sorted_tasks), max_parallel_steps)
    if len(chunk) > 0
]
# The number of chunks is kept as a string because it is written to a text
# file further down; it is also displayed as the output of this cell.
task_chunks_count = f"{len(task_chunks)}"
task_chunks_count

'100'

In [20]:
# Convert list to json array.
# The result is one JSON string holding the list of chunks; it is written to
# the tasks output file further down.
task_chunks_json_array = json.dumps(task_chunks)

In [21]:
# Output locations: the tasks file and the tasks count file both live directly
# inside the tasks directory.
tasks_directory = "/tmp/"
tasks_output_file, tasks_count_file = (
    os.path.join(tasks_directory, file_name) for file_name in ("tasks", "tasks_count")
)

In [22]:
# Filesystem object for the tasks directory; the cells below use it to create
# the directory and to open the output files.
fs = get_filesystem(path=tasks_directory)

In [23]:
# Create the tasks directory when it does not exist yet, and log that it was
# created. Nothing happens when the directory is already there.
if not check_directory_exists(path=tasks_directory):
    fs.mkdirs(path=tasks_directory, exist_ok=True)
    _log.info(f"Created directory {tasks_directory}")

In [24]:
# Write the JSON array of task chunks to the tasks output file.
with fs.open(tasks_output_file, "w") as tasks_handle:
    tasks_handle.write(task_chunks_json_array)
_log.info(f"Tasks written to {tasks_output_file}")

# Write the number of chunks (already a string) to the tasks count file.
with fs.open(tasks_count_file, "w") as count_handle:
    count_handle.write(task_chunks_count)
_log.info(f"Tasks count written to {tasks_count_file}")

[2024-04-19 20:00:26,627] {1726518394.py:3} INFO - Tasks written to /tmp/tasks
[2024-04-19 20:00:26,628] {1726518394.py:7} INFO - Tasks count written to /tmp/tasks_count
