In [1]:
import logging
from odc.geo.geobox import GeoBox
from tqdm import tqdm
from datacube import Datacube
from odc.stats.model import DateTimeRange
from waterbodies.io import check_directory_exists, find_geotiff_files
from waterbodies.logs import logging_setup
from waterbodies.text import parse_tile_id_from_filename
from waterbodies.hopper import create_tasks_from_scenes, GRIDSPEC

In [2]:
from dotenv import load_dotenv
# Path to env file containing the waterbodies database credentials.
# Only necessary on the Sandbox.
# NOTE(review): this is a hard-coded absolute path; adjust it if your env
# file lives somewhere else.
env_path = "/home/jovyan/.env"
# Last expression of the cell, so the value returned by load_dotenv is displayed.
load_dotenv(env_path)

True

In [3]:
# Run parameters for this notebook.

# Verbosity level handed to the logging setup.
verbose = 1

# Kind of run being performed.
run_type = "gap-filling"

# Period of interest, parsed into a start/end range later in the notebook.
temporal_range = "2024-02--P1M"

# Location searched for the historical extent raster GeoTIFF files.
historical_extent_rasters_directory = (
    "s3://deafrica-services/waterbodies/v0.0.2/conflux/historical_extent_rasters/"
)

In [4]:
# Set up logging with the chosen verbosity, then create the logger used by
# the rest of the notebook (named after the current module via `__name__`).
logging_setup(verbose)
_log = logging.getLogger(__name__)

In [5]:
# Parse the temporal range.
# The datacube query built later in this notebook always reads
# `temporal_range_.start` and `temporal_range_.end`, so the parsed range must
# exist. Previously it was only bound when run_type was not "regular-update",
# which left the name undefined for that run type and surfaced much later as
# a NameError. Fail fast here with a clear, logged error instead.
if run_type == "regular-update":
    e = ValueError(
        f"Run type {run_type!r} does not provide a temporal range, but this notebook "
        "requires one to search for datasets by creation time."
    )
    _log.error(e)
    raise e

temporal_range_ = DateTimeRange(temporal_range)

In [6]:
# Stop early (logging the problem first) when the rasters directory is missing.
if not check_directory_exists(path=historical_extent_rasters_directory):
    missing_directory_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_directory_error)
    raise missing_directory_error

# The directory exists: collect the GeoTIFF files found in it.
historical_extent_rasters = find_geotiff_files(
    directory_path=historical_extent_rasters_directory
)

In [7]:
# Only tiles that actually contain waterbodies have a historical extent raster,
# so parse the tile id out of each raster file path found above.
tile_ids_of_interest = [
    parse_tile_id_from_filename(file_path=raster_path) for raster_path in historical_extent_rasters
]

In [8]:
# Name of the datacube product whose datasets are searched for below.
product = "wofs_ls"

In [9]:
# Connect to the datacube.
# `app` is the name this connection is opened under; here it is the run type.
dc = Datacube(app="gap-filling")

In [10]:
# What sets gap-filling apart from the other steps: datasets are looked up by
# the date they were created (`creation_time`) rather than the date they were
# acquired (`time`).
dc_query = {
    "product": product,
    "creation_time": (temporal_range_.start, temporal_range_.end),
}
dc_query

{'product': 'wofs_ls',
 'creation_time': (datetime.datetime(2024, 2, 1, 0, 0),
  datetime.datetime(2024, 2, 29, 23, 59, 59, 999999))}

In [11]:
%%time
# Search the datacube for all wofs_ls datasets whose creation time (not acquisition time)
# falls within the temporal range specified.
# E.g. a dataset can have an acquisition date of 2023-12-15 but have been added to the
# datacube in 2024-02, which will be its creation date.
scenes = dc.find_datasets(**dc_query)

CPU times: user 1.95 s, sys: 542 ms, total: 2.49 s
Wall time: 1min 24s


In [12]:
%%time
# Identify the tasks / waterbody observations affected by the scenes found above,
# limited to the tile ids of interest.
affected_tasks = create_tasks_from_scenes(scenes=scenes, tile_ids_of_interest=tile_ids_of_interest)

Processing   21,786 scenes: 100%|██████████| 21786/21786 [00:27<00:00, 801.06it/s]

[2024-03-28 20:20:46,878] {hopper.py:113} INFO - Filter the 5092 cells to keep only the cells containing the 2783 tile ids of interest.





[2024-03-28 20:20:47,047] {hopper.py:120} INFO - Total number of cells after filtering: 2783
[2024-03-28 20:20:47,048] {hopper.py:124} INFO - For each cell group the datasets by solar day
[2024-03-28 20:20:47,953] {hopper.py:137} INFO - Total of 19,541 unique dataset UUIDs.
[2024-03-28 20:20:47,954] {hopper.py:138} INFO - Total number of tasks: 85015
CPU times: user 28.1 s, sys: 323 ms, total: 28.4 s
Wall time: 28.3 s


In [13]:
# Collect the id (key) of every affected waterbody observation.
# The dataset ids stored against each key are deliberately ignored here.
task_ids = [task_id for task in affected_tasks for task_id, _ in task.items()]

In [14]:
# Pair every task id with its own empty list, a placeholder for the task's
# dataset ids. The drill function fills these in, which is easier to do in
# parallel than looping over each task here to update its dataset ids.
tasks = [{tid: []} for tid in task_ids]

In [15]:
# View the first 100 tasks.
# Each task is a single-entry dict mapping a task id to its (still empty)
# list of dataset ids.
tasks[:100]

[{('2024-01-11', 199, 66): []},
 {('2024-01-27', 199, 66): []},
 {('2024-02-03', 199, 66): []},
 {('2024-02-04', 199, 66): []},
 {('2024-01-16', 199, 66): []},
 {('2024-02-11', 199, 66): []},
 {('2024-01-26', 199, 66): []},
 {('2024-01-19', 199, 66): []},
 {('2024-01-18', 199, 66): []},
 {('2023-09-25', 199, 66): []},
 {('2023-07-06', 199, 66): []},
 {('2023-07-28', 199, 66): []},
 {('2023-12-15', 199, 66): []},
 {('2023-12-10', 199, 66): []},
 {('2023-07-23', 199, 66): []},
 {('2024-01-06', 199, 66): []},
 {('2023-07-01', 199, 66): []},
 {('2023-08-02', 199, 66): []},
 {('2023-08-19', 199, 66): []},
 {('2023-06-09', 199, 66): []},
 {('2023-10-07', 199, 66): []},
 {('2023-10-12', 199, 66): []},
 {('2023-06-14', 199, 66): []},
 {('2023-10-17', 199, 66): []},
 {('2023-08-29', 199, 66): []},
 {('2023-12-20', 199, 66): []},
 {('2023-09-20', 199, 66): []},
 {('2023-06-19', 199, 66): []},
 {('2023-11-13', 199, 66): []},
 {('2023-09-15', 199, 66): []},
 {('2023-07-11', 199, 66): []},
 {('2024