In [1]:
import logging

from datacube import Datacube
from odc.stats.model import DateTimeRange
from waterbodies.io import check_directory_exists, find_geotiff_files
from waterbodies.logs import logging_setup
from waterbodies.text import parse_tile_id_from_filename
from waterbodies.hopper import create_tasks_from_scenes

In [2]:
from dotenv import load_dotenv
# Path to env file containing the waterbodies database credentials
# Only necessary on the Sandbox.
# NOTE(review): this is an absolute, user-specific path; adjust it when running
# outside the Sandbox home directory.
env_path = "/home/jovyan/.env"
# Load the variables from the env file into this process's environment.
load_dotenv(env_path)

True

In [3]:
# Run parameters for this notebook.
# Verbosity level handed to the logging setup.
verbose = 1
# Processing mode; the temporal range below is only parsed when this is not
# "regular-update".
run_type = "backlog-processing"
# Temporal range string; with this value the parsed range covers 2014-01-01
# through the end of 2018 (see the query output further down).
temporal_range = "2014--P5Y"
# Directory holding the historical extent rasters (one GeoTIFF per tile).
historical_extent_rasters_directory = (
    "s3://deafrica-services/waterbodies/v0.0.2/conflux/historical_extent_rasters/"
)

In [4]:
# Set up logging.
# Configure logging through the project helper, driven by the `verbose` level
# chosen in the parameters cell.
logging_setup(verbose)
# Logger used by the cells below to report errors.
_log = logging.getLogger(__name__)

In [5]:
# Parse the temporal range into a DateTimeRange; its .start and .end bound the
# datacube query defined further down.
#
# The query cell reads `temporal_range_` unconditionally, so a "regular-update"
# run (for which no range is parsed) would otherwise fail later with an
# unhelpful NameError. Fail fast here with an explicit, logged error instead.
if run_type == "regular-update":
    e = ValueError(
        f"run_type {run_type!r} does not parse a temporal range, "
        "but this notebook needs one to query the datacube."
    )
    _log.error(e)
    raise e

temporal_range_ = DateTimeRange(temporal_range)

In [6]:
# Verify that the historical extent rasters directory exists before listing
# it; when it is missing, log the error and abort the notebook run.
directory_found = check_directory_exists(path=historical_extent_rasters_directory)
if not directory_found:
    e = FileNotFoundError(f"Directory {historical_extent_rasters_directory} does not exist!")
    _log.error(e)
    raise e

# The directory is present: collect the GeoTIFF files it contains.
historical_extent_rasters = find_geotiff_files(directory_path=historical_extent_rasters_directory)

In [7]:
# Only tiles that actually contain waterbodies have a historical extent raster,
# so the tile id parsed from each raster's file name gives the tiles to keep.
tile_ids_of_interest = [
    parse_tile_id_from_filename(file_path=raster_path) for raster_path in historical_extent_rasters
]

In [8]:
# Name of the datacube product whose datasets are searched by the query below.
product = "wofs_ls"

In [9]:
# Connect to the datacube
# The app name is a fixed label here; it is not derived from `run_type`.
dc = Datacube(app="backlog-processing")

In [10]:
# Define the datacube query: the product to search and the time window
# spanned by the parsed temporal range.
dc_query = {
    "product": product,
    "time": (temporal_range_.start, temporal_range_.end),
}
# Display the query so the resolved start/end datetimes can be checked.
dc_query

{'product': 'wofs_ls',
 'time': (datetime.datetime(2014, 1, 1, 0, 0),
  datetime.datetime(2018, 12, 31, 23, 59, 59, 999999))}

In [11]:
%%time
# Query the datacube for all wofs_ls datasets whose acquisition times fall within
# the temporal range specified.
# The cell is timed because this search is slow: the recorded run took just
# under two minutes of wall time.
scenes = dc.find_datasets(**dc_query)

CPU times: user 29.8 s, sys: 3.48 s, total: 33.2 s
Wall time: 1min 52s


In [12]:
%%time
# Turn the scenes into tasks, restricted to the tiles in tile_ids_of_interest.
# Per the helper's log output, cells are filtered to the tile ids of interest
# and the datasets of each cell are then grouped by solar day.
tasks = create_tasks_from_scenes(scenes=scenes, tile_ids_of_interest=tile_ids_of_interest)

Processing  298,847 scenes: 100%|██████████| 298847/298847 [06:00<00:00, 829.88it/s]

[2024-03-28 18:44:00,634] {hopper.py:148} INFO - Filter the 4649 cells to keep only the cells containing the 2783 tile ids of interest.





[2024-03-28 18:44:00,810] {hopper.py:155} INFO - Total number of cells after filtering: 2783
[2024-03-28 18:44:00,811] {hopper.py:159} INFO - For each cell group the datasets by solar day
[2024-03-28 18:44:12,377] {hopper.py:172} INFO - Total of 268,917 unique dataset UUIDs.
[2024-03-28 18:44:12,378] {hopper.py:173} INFO - Total number of tasks: 1157396
CPU times: user 6min 11s, sys: 2.05 s, total: 6min 13s
Wall time: 6min 11s


In [13]:
# View the first 100 tasks
# Each task displays as a single-entry dict mapping a (date string, int, int)
# key to a list of dataset UUIDs — presumably (solar day, tile x, tile y);
# TODO confirm against create_tasks_from_scenes.
tasks[:100]

[{('2016-04-05', 199, 34): [UUID('9b916e21-2229-5121-8333-0a8b3736d440')]},
 {('2018-08-01', 199, 34): [UUID('90805fa9-01f2-5054-85b2-98c245bba569')]},
 {('2018-07-16', 199, 34): [UUID('3fefcaf4-e8b8-5edc-a059-9938143d794a')]},
 {('2016-04-14', 199, 34): [UUID('93d84752-2d04-58ce-9532-547ed6826d23'),
   UUID('addec823-2fc6-5af7-90ec-9385a8240018')]},
 {('2017-09-08', 199, 34): [UUID('03403972-01b0-5225-a8c6-a72692d3a092'),
   UUID('1f3b6901-9068-5022-8400-bbe6c72c5aed')]},
 {('2016-06-17', 199, 34): [UUID('d5b2ef55-d110-525b-8c25-8fad0f36ebba'),
   UUID('72086146-58c7-586b-a0b2-9ea0273b2c3a')]},
 {('2016-05-16', 199, 34): [UUID('3d6eb622-0102-5a75-8aa9-50e5e7069063'),
   UUID('ec9af1b2-4d15-50f3-a6fb-ee26ce1dbbe9')]},
 {('2016-02-26', 199, 34): [UUID('4374a999-9e4b-5841-99bd-ad9efe4a7193'),
   UUID('29d90677-d278-5fa8-a630-a7b61582dbb4')]},
 {('2018-10-13', 199, 34): [UUID('2fafdd62-9240-5f22-92b7-89e37190709b'),
   UUID('0711eb91-815d-56c8-96b5-ad10156cf532')]},
 {('2014-06-28', 199, 