In [1]:
from dotenv import load_dotenv

# Env file holding the waterbodies database credentials.
# Loading it is only necessary on the Sandbox.
dotenv_path = "/home/jovyan/.env"
# override=True: values from the file take precedence over variables already
# set in the environment. verbose=True: warn when the env file is missing.
load_dotenv(
    dotenv_path=dotenv_path,
    override=True,
    verbose=True,
)

True

In [2]:
import json
import logging
import os

import click
from datacube import Datacube
import numpy as np
from odc.stats.model import DateTimeRange

from waterbodies.hopper import create_tasks_from_scenes
from waterbodies.io import check_directory_exists, find_geotiff_files, get_filesystem
from waterbodies.logs import logging_setup
from waterbodies.text import format_task, get_tile_id_tuple_from_filename

In [3]:
# Run parameters.
# Verbosity level handed to logging_setup().
verbose = 3
# Label for this run; it is also passed to Datacube as the application name.
run_type = "backlog-processing"
# Period to process, in the string form parsed by DateTimeRange.
temporal_range = "2019--P1Y"
# Directory that is searched for the historical extent GeoTIFF files.
historical_extent_rasters_directory = (
    "s3://deafrica-waterbodies-dev/waterbodies/v0.0.2/historical_extent_rasters/"
)
# Number of sections the task list is split into.
max_parallel_steps = 7_000

In [4]:
# Set up logging at the verbosity chosen in the parameters cell, then create
# the logger used by the remaining cells of this notebook.
logging_setup(verbose)
_log = logging.getLogger(__name__)

In [5]:
# Fail fast when the rasters directory is missing: log the error, then raise it
# so the notebook stops here.
if not check_directory_exists(path=historical_extent_rasters_directory):
    missing_directory_error = FileNotFoundError(
        f"Directory {historical_extent_rasters_directory} does not exist!"
    )
    _log.error(missing_directory_error)
    raise missing_directory_error

# The directory exists: list the GeoTIFF files found in it.
historical_extent_rasters = find_geotiff_files(
    directory_path=historical_extent_rasters_directory
)

[2024-04-18 16:33:27,707] {credentials.py:557} INFO - Found credentials in environment variables.


In [6]:
# Tile ids of the tiles that actually contain waterbodies: one tile id tuple is
# derived from the file path of each historical extent raster.
tile_ids_of_interest = [
    get_tile_id_tuple_from_filename(file_path=geotiff_path)
    for geotiff_path in historical_extent_rasters
]

In [7]:
# Datacube product whose datasets are queried and turned into tasks.
product = "wofs_ls"

In [8]:
# Parse the temporal range string; the start and end of the parsed range bound
# the time dimension of the datacube query.
temporal_range_ = DateTimeRange(temporal_range)

In [9]:
# Connect to the datacube, passing the run type as the application name.
dc = Datacube(app=run_type)

In [10]:
# Datacube query: datasets of `product` whose time falls between the start and
# the end of the parsed temporal range.
dc_query = {
    "product": product,
    "time": (temporal_range_.start, temporal_range_.end),
}
# Display the query so the resolved time bounds can be checked.
dc_query

{'product': 'wofs_ls',
 'time': (datetime.datetime(2019, 1, 1, 0, 0),
  datetime.datetime(2019, 12, 31, 23, 59, 59, 999999))}

In [11]:
%%time
# Query the datacube for all wofs_ls datasets whose acquisition times fall within
# the temporal range specified.
# The query arguments are unpacked from dc_query; the cell is timed with %%time
# because the lookup is slow compared to the other cells of this notebook.
scenes = dc.find_datasets(**dc_query)

CPU times: user 6.21 s, sys: 781 ms, total: 6.99 s
Wall time: 1min 30s


In [12]:
%%time
# Build the tasks from the scenes found above, restricted to the tile ids of
# interest. Per the function's log output, the cells are filtered to those tile
# ids and the datasets of each cell are grouped by solar day.
tasks = create_tasks_from_scenes(scenes=scenes, tile_ids_of_interest=tile_ids_of_interest)

Processing   61,203 scenes: 100%|██████████| 61203/61203 [00:47<00:00, 1284.56it/s]

[2024-04-18 16:35:47,077] {hopper.py:167} INFO - Filter the 4450 cells to keep only the cells containing the 2783 tile ids of interest.
[2024-04-18 16:35:47,219] {hopper.py:174} INFO - Total number of cells after filtering: 2783





[2024-04-18 16:35:47,220] {hopper.py:178} INFO - For each cell, group the datasets by solar day
[2024-04-18 16:35:49,803] {hopper.py:184} INFO - Total number of tasks: 236711
CPU times: user 50.4 s, sys: 209 ms, total: 50.6 s
Wall time: 50.4 s


In [13]:
# Put every task in the correct format.
# NOTE: `tasks` is rebound here, so re-running this cell on its own would pass
# already-formatted tasks to format_task; re-run the task creation cell first.
tasks = list(map(format_task, tasks))

In [14]:
%%time
# Split the task list into at most `max_parallel_steps` chunks of near-equal
# size, each chunk converted back to a plain list.
# np.array_split always returns exactly `max_parallel_steps` sub-arrays, so when
# there are fewer tasks than steps the surplus sub-arrays are empty. Drop them:
# an empty chunk carries no work, and keeping it would inflate the chunk count
# that is written out next to the tasks.
task_chunks = [
    chunk.tolist()
    for chunk in np.array_split(np.array(tasks), max_parallel_steps)
    if chunk.size > 0
]

CPU times: user 54.2 ms, sys: 3.85 ms, total: 58 ms
Wall time: 56.5 ms


In [15]:
# Serialise the list of task chunks into a single JSON array string; this string
# is what gets written to the tasks file.
task_chunks_json_array = json.dumps(task_chunks)

In [17]:
# Directory the task files are written to, and the filesystem object used to
# create that directory and to open files inside it.
tasks_directory = "/tmp/"
fs = get_filesystem(path=tasks_directory)

In [18]:
# Create the output directory when it is missing. The existence check limits the
# "Created directory" log message to the case where the directory was absent;
# exist_ok=True is passed to mkdirs as well.
if not check_directory_exists(path=tasks_directory):
    fs.mkdirs(path=tasks_directory, exist_ok=True)
    _log.info(f"Created directory {tasks_directory}")

In [19]:
# Output paths inside the tasks directory: "tasks" receives the JSON array of
# task chunks and "tasks_count" receives the number of chunks.
tasks_output_file = os.path.join(tasks_directory, "tasks")
tasks_count_file = os.path.join(tasks_directory, "tasks_count")

In [20]:
# Write the JSON array of task chunks to the tasks file.
with fs.open(tasks_output_file, "w") as tasks_handle:
    tasks_handle.write(task_chunks_json_array)
_log.info(f"Tasks written to {tasks_output_file}")

# Write the number of chunks, as text, to its own file.
with fs.open(tasks_count_file, "w") as count_handle:
    count_handle.write(str(len(task_chunks)))
_log.info(f"Tasks count written to {tasks_count_file}")

[2024-04-18 16:36:04,509] {1198171069.py:3} INFO - Tasks written to /tmp/tasks
[2024-04-18 16:36:04,510] {1198171069.py:7} INFO - Tasks count written to /tmp/tasks_count
