This notebook downloads Sentinel-2 scenes from the Microsoft Planetary Computer (PC) and exports them in the format required by the inference notebook.


In [1]:
import requests
import planetary_computer
from pystac.extensions.eo import EOExtension as eo
import pystac_client
import planetary_computer
import geopandas as gpd
from pathlib import Path
import numpy as np
import requests
import json
from datetime import datetime
import shapely
import pandas as pd
import rasterio as rio
from tqdm.auto import tqdm


import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# WorldTides API key. It is read from a text file next to the notebook so the key
# itself never appears in the notebook; ask the notebook author if you need one.
world_tide_key_file = Path.cwd() / "world_tide_key.txt"
if not world_tide_key_file.is_file():
    # Fail here with an actionable message instead of deep inside the download loop.
    raise FileNotFoundError(
        f"WorldTides API key file not found: {world_tide_key_file}. "
        "Create it with the API key as its only content."
    )
world_tides_api_key = world_tide_key_file.read_text().strip()
if not world_tides_api_key:
    raise ValueError(f"WorldTides API key file is empty: {world_tide_key_file}")

In [3]:
# Output directory. Each exported scene ends up at about 2 GB, so make sure there is
# enough free space. The default is the original external-drive location; set the
# COASTLINE_EXPORT_DIR environment variable to export somewhere else without
# editing the notebook.
export_dir = Path(
    os.environ.get(
        "COASTLINE_EXPORT_DIR",
        "/Volumes/4TB SSD/Coastline data/inference_scenes_2",
    )
)
export_dir.mkdir(exist_ok=True, parents=True)
export_dir

PosixPath('/Volumes/4TB SSD/Coastline data/inference_scenes_2')

In [4]:
# First and last calendar year (both inclusive) used to build the scene search window.
extract_start_year = 2022
extract_end_year = 2022

# Asset keys downloaded for every selected scene, in the order they are stacked.
required_bands = ["B03", "B08"]

In [5]:
# Sentinel-2 tiling grid read from the local data folder; the file is currently
# limited to coastal Tas tiles.
sentinel_2_grid = Path.cwd().joinpath("data/Senntinele 2 grid coastal tas.gpkg")
s2_grid = gpd.read_file(sentinel_2_grid)

In [6]:
# STAC client for the Planetary Computer; the per-tile scene searches go through it.
catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1")

In [7]:
# keep the scenes of this grid tile and order them from least to most cloudy
def sort_by_clouds(row, items):
    """Return the items that belong to ``row``'s tile, least cloudy first.

    An item is kept when it reports a cloud cover value and ``row.Name`` occurs
    in its ``id``. The result is a new list sorted by ascending cloud cover.
    """

    def cloud_cover(candidate):
        return eo.ext(candidate).cloud_cover

    tile_items = []
    for candidate in items:
        if cloud_cover(candidate) is None:
            continue
        if row.Name not in candidate.id:
            continue
        tile_items.append(candidate)

    tile_items.sort(key=cloud_cover)
    return tile_items

In [8]:
# use the world tide api to get the tide height at the time of the image
def filter_by_tide(centroid, items, limit=6):
    """Rank scenes by tide height at acquisition time and keep the highest ones.

    For every item the WorldTides API is queried for the acquisition date at the
    centroid location, and the returned height entry closest in time to the
    acquisition is recorded as that item's tide.

    Args:
        centroid: Point-like geometry whose ``coords[0]`` starts with ``(lon, lat)``.
        items: Items exposing ``to_dict()["properties"]["datetime"]`` as an ISO
            timestamp.
        limit: Maximum number of rows to return.

    Returns:
        A DataFrame with columns ``"item"`` and ``"tide"``, sorted by tide height
        (highest first) and truncated to ``limit`` rows. When WorldTides reports
        "No location found", the first ``limit`` items are returned in their
        incoming order with a NaN tide. An empty ``items`` yields an empty frame
        with the same two columns.

    Raises:
        RuntimeError: If WorldTides answers without a ``heights`` list for any
            reason other than "No location found".
    """
    # The location is the same for every item, so unpack it once. Slicing guards
    # against a trailing Z coordinate.
    lon, lat = centroid.coords[0][:2]

    results = []
    for item in items:
        dt_str = item.to_dict()["properties"]["datetime"]
        dt_obj = datetime.fromisoformat(dt_str.replace("Z", "+00:00"))

        # Fetch data from API
        url = f"https://www.worldtides.info/api/v3?heights&date={dt_obj.date().isoformat()}&lat={lat}&lon={lon}&key={world_tides_api_key}"
        # A timeout keeps one stalled request from hanging the whole download run.
        response = requests.get(url, timeout=30)
        data = response.json()

        if data.get("error") == "No location found":
            print("No location found in world tides")
            # Same column names as the normal return value so callers can read
            # row["item"] in both cases.
            return pd.DataFrame(
                {"item": list(items), "tide": np.full(len(items), np.nan)}
            )[:limit]

        if "heights" not in data:
            raise RuntimeError(
                f"WorldTides request failed: {data.get('error', data)}"
            )

        target_timestamp = dt_obj.timestamp()
        heights = data["heights"]
        if heights:
            closest_entry = min(
                heights, key=lambda entry: abs(entry["dt"] - target_timestamp)
            )
            tide = closest_entry["height"]
        else:
            # No predictions for this date: record NaN rather than reusing the
            # previous item's value.
            tide = np.nan

        results.append({"item": item, "tide": tide})

    # convert to df and sort by tide height (NaN tides sort last)
    results_df = pd.DataFrame(results, columns=["item", "tide"])
    return results_df.sort_values(by="tide", ascending=False)[:limit]

In [9]:
# sign the url and download the band
def get_band(href, attempt=0, max_retries=3):
    """Sign ``href`` for Planetary Computer access and read its first band.

    Args:
        href: Unsigned asset URL.
        attempt: Number of attempts already made; callers normally leave this at 0.
        max_retries: How many times a failed read is retried before giving up.

    Returns:
        ``(array, profile)`` with the first band and a copy of the dataset
        profile, or ``(None, None)`` when every attempt failed.
    """
    while True:
        try:
            # Sign on every attempt rather than reusing a previously signed URL.
            signed_href = planetary_computer.sign(href)
            with rio.open(signed_href) as src:
                return src.read(1), src.profile.copy()
        except Exception as exc:
            # Only ordinary errors are retried: KeyboardInterrupt and SystemExit
            # propagate so a long download run can still be stopped.
            print(f"Failed to open {href}: {exc}")
            if attempt >= max_retries:
                print(f"Failed to open {href} after {attempt + 1} attempts")
                return None, None
            attempt += 1
            print(f"Trying again {attempt}")

In [10]:
# download the required number of bands
def downlaod_bands(items_with_tide, time_steps):
    """Download ``required_bands`` for scenes until ``time_steps`` scenes succeed.

    Scenes are tried in the row order of ``items_with_tide``. A scene is kept only
    when every required band downloads; otherwise the whole scene is dropped and
    the next row is tried.

    Args:
        items_with_tide: DataFrame with an ``"item"`` column whose values expose
            ``assets[band].href``.
        time_steps: Number of complete scenes wanted.

    Returns:
        ``(bands, profile)``: a flat list of band arrays (scene by scene, in
        ``required_bands`` order) and the profile of the last band that was kept.
        ``bands`` is shorter than ``time_steps * len(required_bands)`` when too few
        scenes could be downloaded; ``profile`` is ``{}`` when none succeeded.
    """
    bands_needed = time_steps * len(required_bands)
    bands = []
    profile = {}

    # The context manager closes the bar on every exit path.
    with tqdm(total=bands_needed, leave=False) as pbar:
        for _, row in items_with_tide.iterrows():
            scene_bands = []
            scene_profile = None

            for band_name in required_bands:
                href = row["item"].assets[band_name].href
                band_data, band_profile = get_band(href)
                if band_data is None:
                    print(f"Failed to download {href}")
                    scene_bands = []
                    break
                scene_bands.append(band_data)
                scene_profile = band_profile

            if not scene_bands:
                # Incomplete scene: keep the profile of the last good scene
                # instead of replacing it with the failed download's None.
                continue

            bands.extend(scene_bands)
            profile = scene_profile
            pbar.update(len(scene_bands))

            if len(bands) == bands_needed:
                break

    return bands, profile

In [11]:
# a grid location can be covered by several orbits; group the scenes per orbit
def split_by_orbits(items):
    """Group items by their ``sat:relative_orbit`` property.

    Returns a dict mapping each orbit value to the list of items from that orbit.
    Orbits appear in first-seen order and items keep their incoming order.
    """
    by_orbit = {}
    for scene in items:
        by_orbit.setdefault(scene.properties["sat:relative_orbit"], []).append(scene)
    return by_orbit

In [12]:
def export_tif(bands, profile, export_path):
    """Write a stack of band arrays to ``export_path`` as one multi-band raster.

    Args:
        bands: Sequence of equally shaped 2-D arrays, one per output band.
        profile: Raster profile used as keyword arguments for ``rio.open``. It is
            copied, so the caller's mapping is not modified; only ``count`` is
            overridden with the number of bands.
        export_path: Destination file path (``str`` or ``Path``).

    The raster is first written to a sibling ``*.partial.<ext>`` file and moved
    into place only after the write succeeded, so an interrupted export never
    leaves a truncated file at ``export_path``.
    """
    array = np.asarray(bands)

    out_profile = dict(profile)
    out_profile.update(count=array.shape[0])

    export_path = Path(export_path)
    # Keep the original extension on the temporary name.
    tmp_path = export_path.with_name(f"{export_path.stem}.partial{export_path.suffix}")

    try:
        with rio.open(tmp_path, "w", **out_profile) as dst:
            dst.write(array)
        tmp_path.replace(export_path)
    finally:
        # Only still present when the write or the move failed.
        if tmp_path.exists():
            tmp_path.unlink()

In [13]:
# Shuffle the grid rows (frac=1 samples every row) so the tiles are processed in a
# random order.
# NOTE(review): the shuffle is unseeded and overwrites s2_grid, so the order
# differs between runs and re-running this cell reshuffles — presumably
# intentional (e.g. to spread work across runs); confirm.
s2_grid = s2_grid.sample(frac=1)

In [14]:
# Preview the first five rows of the shuffled grid.
s2_grid.head(n=5)

Unnamed: 0,Name,geometry
10,55GDN,"MULTIPOLYGON Z (((145.78353 -42.44570 0.00000,..."
22,55HDS,"MULTIPOLYGON Z (((145.84726 -38.84277 0.00000,..."
8,55GCR,"MULTIPOLYGON Z (((144.66639 -39.72627 0.00000,..."
6,55GCP,"MULTIPOLYGON Z (((144.60284 -41.52631 0.00000,..."
18,55GFQ,"MULTIPOLYGON Z (((148.18270 -40.64480 0.00000,..."


In [15]:
# Number of scenes (time steps) merged into every exported raster.
time_steps = 6
# Raster layers per export: one per (time step, band) pair. Derived rather than
# hard-coded so it cannot drift from time_steps / required_bands; with the current
# settings this is 12.
target_bands = time_steps * len(required_bands)

In [16]:
def _merge_orbit_stacks(orbit_stacks):
    """Mosaic equally sized band stacks from several orbits into one stack.

    Layer ``i`` of the first orbit is the base; pixels that are still 0 are then
    filled from layer ``i`` of each following orbit in turn. Works on copies in the
    layers' own dtype, so the inputs are left untouched.
    """
    merged = [layer.copy() for layer in orbit_stacks[0]]
    for stack in orbit_stacks[1:]:
        for target, layer in zip(merged, stack):
            empty = target == 0
            target[empty] = layer[empty]
    return merged


# For every grid tile: search the year's scenes at the tile centroid, pick the
# least cloudy / highest-tide scenes per orbit, mosaic the orbits and export.
for _, row in s2_grid.iterrows():
    centroid = row.geometry.centroid
    export_path = export_dir / f"{row.Name}_{extract_start_year}_{extract_end_year}.tif"
    print(export_path)

    # Tiles exported by an earlier run are skipped, which makes the loop resumable.
    if export_path.exists():
        print(f"File exists for {row.Name}")
        continue

    # Sentinel-2 query parameters
    query = {
        "collections": ["sentinel-2-l2a"],
        "intersects": shapely.to_geojson(centroid),
        "datetime": f"{extract_start_year}-01-01T00:00:00Z/{extract_end_year}-12-31T23:59:59Z",
    }
    scenes = catalog.search(**query).get_all_items()
    if len(scenes) == 0:
        continue

    orbit_stacks = []
    profile = None
    tile_complete = True
    for orbit, orbit_scenes in split_by_orbits(scenes).items():
        # 20 least cloudy scenes of this tile, then the 8 with the highest tide.
        items_filtered = sort_by_clouds(row, orbit_scenes)[:20]
        if not items_filtered:
            # None of this orbit's scenes belong to the tile being processed.
            continue

        items_with_tide = filter_by_tide(centroid, items_filtered, limit=8)
        bands, orbit_profile = downlaod_bands(items_with_tide, time_steps)

        if len(bands) != target_bands or not orbit_profile:
            # A short stack cannot be merged layer by layer with the other
            # orbits; give up on the tile rather than export partial coverage.
            print(
                f"Orbit {orbit} of {row.Name} returned {len(bands)} of {target_bands} bands"
            )
            tile_complete = False
            break

        orbit_stacks.append(bands)
        profile = orbit_profile

    if not tile_complete or not orbit_stacks:
        print(f"Failed to download {row.Name}")
        continue

    merged_bands = _merge_orbit_stacks(orbit_stacks)
    export_tif(merged_bands, profile, export_path)

/Volumes/4TB SSD/Coastline data/inference_scenes_2/55GDN_2022_2022.tif
File exists for 55GDN
/Volumes/4TB SSD/Coastline data/inference_scenes_2/55HDS_2022_2022.tif




  0%|          | 0/12 [00:00<?, ?it/s]