In [1]:
import os 

# Remove any AWS credentials already present in the process environment
# before the env file is loaded further down in this cell.
# `pop` with a default is used instead of `del` so the cell stays re-runnable:
# it does not raise KeyError when a variable is already unset.
# NOTE(review): presumably the variables are cleared so that the values from
# the env file take effect (load_dotenv does not override existing variables
# by default) -- confirm.
os.environ.pop("AWS_ACCESS_KEY_ID", None)
os.environ.pop("AWS_SECRET_ACCESS_KEY", None)

from dotenv import load_dotenv
# Path to env file containing the waterbodies database credentials
# Only necessary on the Sandbox.
# NOTE(review): this is an absolute, user-specific path; adjust it when running
# the notebook anywhere else.
env_path = "/home/jovyan/.env"
# Load the variables defined in the env file into the process environment.
# The call is the last expression of the cell, so its return value is what the
# cell displays as output.
load_dotenv(env_path)

True

In [2]:
import json
import logging
import sys

import click
from datacube import Datacube
from odc.stats.model import DateTimeRange

from waterbodies.hopper import create_tasks_from_scenes
from waterbodies.io import check_directory_exists, find_geotiff_files
from waterbodies.logs import logging_setup
from waterbodies.text import format_task, get_tile_id_tuple_from_filename

In [3]:
# Notebook parameters -- everything a reader might want to tune lives here.

# Directory that is searched for the historical extent GeoTIFF rasters.
historical_extent_rasters_directory = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent_rasters/"
)
# Time span to process; parsed further down with DateTimeRange.
temporal_range = "2019--P1W"
# Label for this run; also passed as the datacube application name.
run_type = "backlog-processing"
# Verbosity level handed to the logging setup.
verbose = 1

In [4]:
# Set up logging.
# The project's logging helper is called with the verbosity level chosen in the
# parameters cell.
logging_setup(verbose)
# Logger named after the current module; used below to report errors before
# they are raised.
_log = logging.getLogger(__name__)

In [5]:
# Stop early when the rasters directory is missing: the error is logged first
# and then raised, so it shows up both in the log and as the cell's failure.
if not check_directory_exists(path=historical_extent_rasters_directory):
    missing_directory_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_directory_error)
    raise missing_directory_error

# The directory exists: collect the GeoTIFF files found in it.
historical_extent_rasters = find_geotiff_files(
    directory_path=historical_extent_rasters_directory
)

[2024-04-05 12:17:33,964] {credentials.py:557} INFO - Found credentials in environment variables.


In [6]:
# Tiles that actually contain waterbodies: one tile id is derived from the
# file name of each historical extent raster.
tile_ids_of_interest = [
    get_tile_id_tuple_from_filename(file_path=geotiff_path)
    for geotiff_path in historical_extent_rasters
]

In [7]:
# Name of the datacube product to query; used as the `product` field of the
# datacube query built below.
product = "wofs_ls"

In [8]:
# Parse the temporal range
# DateTimeRange turns the temporal range string into an object exposing
# `.start` and `.end`. For "2019--P1W" the recorded query shows
# 2019-01-01 00:00 through 2019-01-07 23:59:59.999999.
temporal_range_ = DateTimeRange(temporal_range)

In [9]:
# Connect to the datacube
# The run type is passed as the `app` name for this connection.
dc = Datacube(app=run_type)

In [10]:
# Build the datacube query: the product to search for and the (start, end)
# bounds of the parsed temporal range.
dc_query = {
    "product": product,
    "time": (temporal_range_.start, temporal_range_.end),
}
# Display the query as the cell output.
dc_query

{'product': 'wofs_ls',
 'time': (datetime.datetime(2019, 1, 1, 0, 0),
  datetime.datetime(2019, 1, 7, 23, 59, 59, 999999))}

In [11]:
%%time
# Query the datacube for all wofs_ls datasets whose acquisition times fall within
# the temporal range specified.
# The recorded run needed 2min 43s of wall time against roughly 0.2 s of CPU
# time for the one-week range, and the following cell processed 1,195 scenes.
scenes = dc.find_datasets(**dc_query)

CPU times: user 170 ms, sys: 20.9 ms, total: 191 ms
Wall time: 2min 43s


In [12]:
%%time
# Build the tasks from the scenes found, restricted to the tile ids of interest.
# Per the recorded log output, the cells are filtered down to those tile ids
# and the datasets of each remaining cell are then grouped by solar day.
tasks = create_tasks_from_scenes(scenes=scenes, tile_ids_of_interest=tile_ids_of_interest)

Processing    1,195 scenes: 100%|██████████| 1195/1195 [00:02<00:00, 589.11it/s] 

[2024-04-05 12:20:20,829] {hopper.py:167} INFO - Filter the 4164 cells to keep only the cells containing the 2783 tile ids of interest.





[2024-04-05 12:20:20,952] {hopper.py:174} INFO - Total number of cells after filtering: 2730
[2024-04-05 12:20:20,953] {hopper.py:178} INFO - For each cell, group the datasets by solar day
[2024-04-05 12:20:20,988] {hopper.py:184} INFO - Total number of tasks: 4617
CPU times: user 2.19 s, sys: 16.9 ms, total: 2.21 s
Wall time: 2.2 s


In [13]:
# Run every task through format_task. The recorded preview shows the result as
# dicts with "solar_day", "tile_id_x", "tile_id_y" and "task_datasets_ids".
# NOTE(review): this rebinds `tasks`, so running the cell a second time would
# pass the already formatted tasks to format_task again.
tasks = [format_task(raw_task) for raw_task in tasks]

In [14]:
# Preview the first 10 formatted tasks by writing them as JSON to standard
# output.
json.dump(tasks[:10], sys.stdout)

[{"solar_day": "2019-01-05", "tile_id_x": 214, "tile_id_y": 83, "task_datasets_ids": ["5eccabe0-64d1-5b20-ad87-4e73505996cf", "b3f720d6-4bae-5f33-85a4-39516f3e4c0b"]}, {"solar_day": "2019-01-04", "tile_id_x": 214, "tile_id_y": 83, "task_datasets_ids": ["5b4f425f-8837-5cc8-92f9-da436e404847", "4ee2f719-2c6f-5dc9-a90e-839c5085384a"]}, {"solar_day": "2019-01-05", "tile_id_x": 215, "tile_id_y": 83, "task_datasets_ids": ["5eccabe0-64d1-5b20-ad87-4e73505996cf", "b3f720d6-4bae-5f33-85a4-39516f3e4c0b"]}, {"solar_day": "2019-01-06", "tile_id_x": 215, "tile_id_y": 83, "task_datasets_ids": ["9650962f-2448-5c47-8dc1-7d1d869458cb", "ac092109-221f-54fd-8c0c-c3daae28a8c0"]}, {"solar_day": "2019-01-05", "tile_id_x": 214, "tile_id_y": 84, "task_datasets_ids": ["5eccabe0-64d1-5b20-ad87-4e73505996cf", "b3f720d6-4bae-5f33-85a4-39516f3e4c0b"]}, {"solar_day": "2019-01-04", "tile_id_x": 214, "tile_id_y": 84, "task_datasets_ids": ["4ee2f719-2c6f-5dc9-a90e-839c5085384a"]}, {"solar_day": "2019-01-05", "tile_id_