This notebook downloads Sentinel-2 imagery from the Microsoft Planetary Computer (PC) and saves it in the format required by the inference notebook.


In [1]:
import requests
import planetary_computer
from pystac.extensions.eo import EOExtension as eo
import pystac_client
import planetary_computer
import geopandas as gpd
from pathlib import Path
import numpy as np
import requests
import json
from datetime import datetime
import shapely
import pandas as pd
import rasterio as rio
from tqdm.auto import tqdm
from shapely.geometry import Polygon, shape
from shapely.ops import transform
from rasterio.merge import merge
from concurrent.futures import ThreadPoolExecutor


import os
# NOTE(review): geopandas is already imported earlier in this cell (as `gpd`), so this
# assignment runs after that first import. Presumably it has to be set before the first
# geopandas import to have any effect -- confirm and move it to the top if so.
os.environ['USE_PYGEOS'] = '0'
import geopandas

In the next release, GeoPandas will switch to using Shapely by default, even if PyGEOS is installed. If you only have PyGEOS installed to get speed-ups, this switch should be smooth. However, if you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd


In [2]:
# WorldTides API key, read from a text file in the working directory
# (ask the notebook author if you need a key).
world_tide_key_file = Path.cwd() / "world_tide_key.txt"
world_tides_api_key = world_tide_key_file.read_text().strip()
# Fail fast on an empty file; otherwise the problem only shows up later, when the
# tide API request is made with a blank key.
if not world_tides_api_key:
    raise ValueError(f"WorldTides API key file is empty: {world_tide_key_file}")

In [3]:
# Output directory for the exported scenes. Each scene ends up at about 2 GB, so make
# sure the target volume has enough free space.
# The parent location can be overridden with the COASTLINE_EXPORT_DIR environment
# variable, so the notebook also runs on machines without this volume mounted; when the
# variable is unset the original location is used.
export_root = Path(os.environ.get("COASTLINE_EXPORT_DIR", "/Volumes/4TB SSD/Coastline data"))
export_dir = export_root / "inference_scenes_5"
export_dir.mkdir(exist_ok=True, parents=True)
export_dir

PosixPath('/Volumes/4TB SSD/Coastline data/inference_scenes_5')

In [4]:
# Asset keys of the bands downloaded for every scene.
required_bands = ["B03", "B08"]
# Number of scenes (time steps) stacked into each exported file.
time_steps = 6
# Band count expected in an exported file. Derived from the two settings above so it
# cannot drift out of sync with them (evaluates to 12 for the current values).
target_bands = time_steps * len(required_bands)
# Years searched for scenes: 1 January of the start year to 31 December of the end year.
extract_start_year = 2022
extract_end_year = 2022

In [5]:
# Sentinel-2 tiling grid; for now it only contains tiles for Tasmania (Tas).
sentinel_2_grid = Path.cwd().joinpath("data/Senntinele 2 grid coastal tas.gpkg")
s2_grid = gpd.read_file(sentinel_2_grid)

In [6]:
# STAC API client used for all scene searches in this notebook.
stac_api_url = "https://planetarycomputer.microsoft.com/api/stac/v1"
catalog = pystac_client.Client.open(stac_api_url)

In [7]:
def add_cloud_pct(items_df):
    """Add a ``cloud_pct`` column holding each item's ``eo:cloud_cover`` property.

    Args:
        items_df: DataFrame whose first column holds objects exposing a
            ``properties`` mapping with an ``"eo:cloud_cover"`` entry.

    Returns:
        The same DataFrame (modified in place) with the ``cloud_pct`` column added.
    """
    # Pick the first column by position explicitly. Looking it up as ``row[0]`` on a
    # row whose labels are strings relies on pandas falling back from labels to
    # positions, which pandas has deprecated.
    items = items_df.iloc[:, 0]
    items_df["cloud_pct"] = [item.properties["eo:cloud_cover"] for item in items]
    return items_df

In [8]:
# add_cloud_pct(items_df)

In [9]:
# use the WorldTides API to get the tide height at the time of each image
def add_tide_height(centroid, items_df):
    """Add a ``tide_height`` column with the tide height closest to each item's datetime.

    One API request is made per row, asking for the heights on the item's date at the
    centroid's location; the entry whose ``dt`` is nearest to the item's timestamp is
    used.

    Args:
        centroid: Geometry whose first coordinate pair is ``(lon, lat)``.
        items_df: DataFrame whose first column holds items; ``to_dict()`` on an item
            must expose ``["properties"]["datetime"]`` as an ISO-8601 string.

    Returns:
        The same DataFrame (modified in place) with the ``tide_height`` column added.

    Raises:
        ValueError: If a response contains no ``heights`` entries. Without this check
            the height found for the previous row would be reused silently.
        requests.HTTPError: If the API answers with an HTTP error status.
    """
    lon, lat = centroid.coords[0]
    tide_heights = []
    for _, row in items_df.iterrows():
        # The item sits in the first column; select it by position.
        dt_str = row.iloc[0].to_dict()["properties"]["datetime"]

        # datetime.fromisoformat() before Python 3.11 rejects a trailing "Z".
        acquired_at = datetime.fromisoformat(dt_str.replace("Z", "+00:00"))

        # Fetch data from API
        url = f"https://www.worldtides.info/api/v3?heights&date={acquired_at.date().isoformat()}&lat={lat}&lon={lon}&key={world_tides_api_key}"
        # A timeout keeps one stalled request from hanging the whole download.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()

        heights = data.get("heights")
        if not heights:
            # The URL is deliberately left out of the message: it contains the API key.
            raise ValueError(
                f"No tide heights returned for {acquired_at.date().isoformat()} "
                f"at lat={lat}, lon={lon}: {data.get('error', 'empty response')}"
            )

        target_timestamp = acquired_at.timestamp()
        # min() keeps the first entry on ties, like a strict '<' comparison would.
        closest_entry = min(
            heights, key=lambda entry: abs(entry["dt"] - target_timestamp)
        )
        tide_heights.append(closest_entry["height"])

    items_df["tide_height"] = tide_heights

    return items_df

In [10]:
# sign the url and download the band
def get_band(href, attempt=0, max_retries=3):
    """Sign ``href`` and read band 1 of the raster it points to, retrying on failure.

    Args:
        href: Unsigned asset URL.
        attempt: Number of retries already used; callers normally leave this at 0.
        max_retries: How many retries are allowed after the first try.

    Returns:
        ``(data, profile)`` where ``data`` is the result of ``src.read(1)`` and
        ``profile`` is a copy of the dataset profile, or ``(None, None)`` once the
        retries are used up.
    """
    while True:
        try:
            # Sign again on every try rather than reusing an earlier signed URL.
            signed_href = planetary_computer.sign(href)
            with rio.open(signed_href) as src:
                return src.read(1), src.profile.copy()
        except Exception as error:
            # 'Exception' rather than a bare 'except' so KeyboardInterrupt and
            # SystemExit still stop a long download run.
            print(f"Failed to open {href}: {error!r}")
            if attempt >= max_retries:
                print(f"Failed to open {href} after {max_retries} retries")
                return None, None
            attempt += 1
            print(f"Trying again {attempt}")

In [11]:
# download the required number of bands
def downlaod_bands(items_with_tide, time_steps):
    """Download ``required_bands`` for successive rows until ``time_steps`` scenes are collected.

    Rows are processed in the order given. A scene is kept only if every required band
    downloaded; a scene with any failed band is dropped and the next row is tried.

    Args:
        items_with_tide: DataFrame with an ``"item"`` column; each item exposes
            ``assets[band].href`` for every band in ``required_bands``.
        time_steps: Number of complete scenes wanted.

    Returns:
        ``(bands, profile)``: the band arrays of the kept scenes in download order
        (fewer than ``time_steps * len(required_bands)`` if the rows ran out), and the
        profile of the last band that was kept (``{}`` if nothing was kept).
    """
    bands = []
    profile = {}
    wanted = time_steps * len(required_bands)
    # Context manager so the progress bar is always closed, including on early exit.
    with tqdm(total=wanted, leave=False) as pbar:
        for _, row in items_with_tide.iterrows():
            scene_bands = []
            scene_profile = None

            for band_name in required_bands:
                href = row["item"].assets[band_name].href
                band_data, band_profile = get_band(href)
                if band_data is None:
                    print(f"Failed to download {href}")
                    scene_bands = []
                    break
                scene_bands.append(band_data)
                scene_profile = band_profile

            if not scene_bands:
                continue

            bands.extend(scene_bands)
            # Only remember profiles of kept bands, so a failed download later on
            # cannot replace a usable profile with None.
            profile = scene_profile
            # Advance the bar only for scenes that are actually kept.
            pbar.update(len(scene_bands))
            if len(bands) == wanted:
                break
    return bands, profile

In [12]:
# a grid location can be covered by more than one orbit; bucket the items per orbit
def split_by_orbits(items):
    """Group items by their ``sat:relative_orbit`` property.

    Returns a dict mapping each orbit value to the list of items carrying it. Orbits
    appear in first-seen order and items keep their input order within an orbit.
    """
    grouped = {}
    for entry in items:
        grouped.setdefault(entry.properties["sat:relative_orbit"], []).append(entry)
    return grouped

In [13]:
def export_tif(bands, profile, export_path):
    """Write ``bands`` as one multi-band raster file at ``export_path``.

    Args:
        bands: Sequence of equally shaped 2-D arrays, one per output band.
        profile: Raster profile used to create the file; its ``count`` is replaced
            by the number of bands. The caller's mapping is left unmodified.
        export_path: Destination path of the file.
    """
    array = np.array(bands)
    # Build a new dict instead of updating 'profile' in place, so the caller's
    # profile is not changed as a side effect of exporting.
    out_profile = {**profile, "count": array.shape[0]}
    with rio.open(export_path, "w", **out_profile) as dst:
        dst.write(array)

In [14]:
# Shuffle the tile order. No random_state is passed, so the order is generally
# different on every run.
# NOTE(review): presumably done so repeated or concurrent runs do not all start on
# the same tiles -- confirm.
s2_grid = s2_grid.sample(frac=1)

In [15]:
# Preview the first rows of the shuffled grid.
s2_grid.head()

Unnamed: 0,Name,geometry
23,55HES,"MULTIPOLYGON Z (((146.99977 -38.84846 0.00000,..."
15,55GEQ,"MULTIPOLYGON Z (((146.99976 -40.65086 0.00000,..."
5,55GCN,"MULTIPOLYGON Z (((144.56882 -42.42636 0.00000,..."
8,55GCR,"MULTIPOLYGON Z (((144.66639 -39.72627 0.00000,..."
7,55GCQ,"MULTIPOLYGON Z (((144.63532 -40.62664 0.00000,..."


In [18]:
def _rank_orbit_scenes(orbit_scenes, centroid, max_candidates=20):
    """Order one orbit's scenes: lowest cloud bucket first, highest tide first within a bucket."""
    items_df = pd.DataFrame({"item": list(orbit_scenes)})
    items_df = add_cloud_pct(items_df)
    items_df = items_df.sort_values(by="cloud_pct", ascending=True)
    # Keep only the least cloudy candidates: the tide lookup makes one API request
    # per remaining scene. copy() so the columns added below go onto an independent
    # frame rather than a slice of the sorted one.
    items_df = items_df.head(max_candidates).copy()
    items_df = add_tide_height(centroid, items_df)
    # Round cloud cover to the nearest 10 so scenes with similar cloud cover fall in
    # the same bucket and are then ordered by tide height.
    items_df["cloud_pct"] = items_df["cloud_pct"].apply(lambda x: round(x / 10) * 10)
    return items_df.sort_values(
        by=["cloud_pct", "tide_height"], ascending=[True, False]
    )


def _fill_empty_pixels(target, source):
    """Copy ``source`` into ``target`` in place wherever ``target`` is still 0."""
    empty = target == 0
    target[empty] = source[empty]


def download_scene(row):
    """Download, merge and export the band stack for one grid tile.

    For every orbit covering the tile, the best scenes (low cloud, high tide) are
    downloaded. The orbits are then merged band by band: pixels that are 0 in an
    earlier orbit are filled from later orbits. The result is written to
    ``export_dir`` unless the file already exists.

    Args:
        row: An ``(index, row)`` pair as produced by ``s2_grid.iterrows()``; the row
            must provide ``Name`` and ``geometry``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    _, row = row
    centroid = row.geometry.centroid
    export_path = export_dir / f"{row.Name}_{extract_start_year}_{extract_end_year}.tif"
    print(export_path)

    if export_path.exists():
        print(f"File exists for {row.Name}")
        return

    # Sentinel-2 query parameters
    query = {
        "collections": ["sentinel-2-l2a"],
        "intersects": shapely.to_geojson(centroid),
        "datetime": f"{extract_start_year}-01-01T00:00:00Z/{extract_end_year}-12-31T23:59:59Z",
        "query": {"s2:mgrs_tile": {"eq": row.Name}},
    }
    # NOTE(review): newer pystac-client releases deprecate get_all_items() in favour
    # of item_collection() -- confirm against the installed version.
    scenes = catalog.search(**query).get_all_items()
    if len(scenes) == 0:
        return

    expected_bands = time_steps * len(required_bands)
    merged_bands = None
    profile = None
    for orbit, orbit_scenes in split_by_orbits(scenes).items():
        ranked_items = _rank_orbit_scenes(orbit_scenes, centroid)
        bands, orbit_profile = downlaod_bands(ranked_items, time_steps)

        # Every orbit must deliver a full stack. Stacking orbits of different
        # lengths cannot work, so report the failure explicitly and stop here.
        if len(bands) != expected_bands:
            print(
                f"Failed to download {row.Name}: orbit {orbit} returned "
                f"{len(bands)} of {expected_bands} bands"
            )
            return

        profile = orbit_profile
        if merged_bands is None:
            # The first orbit seeds the result; the arrays keep their source dtype.
            merged_bands = list(bands)
        else:
            # Merge orbit by orbit so only one extra orbit is held in memory at a time.
            for target, source in zip(merged_bands, bands):
                _fill_empty_pixels(target, source)

    if merged_bands is not None and len(merged_bands) == target_bands:
        export_tif(merged_bands, profile, export_path)
    else:
        print(f"Failed to download {row.Name}")

In [19]:
# Call download_scene for every tile with a thread pool.
# Each future is checked explicitly: Executor.map() only re-raises a worker's exception
# when its results are consumed, so failures would otherwise disappear silently.
with ThreadPoolExecutor(max_workers=4) as pool:
    pending = [
        (tile_row.Name, pool.submit(download_scene, (tile_index, tile_row)))
        for tile_index, tile_row in s2_grid.iterrows()
    ]
    for tile_name, future in pending:
        # exception() waits for the task to finish and returns None on success.
        error = future.exception()
        if error is not None:
            print(f"Download failed for {tile_name}: {error!r}")

/Volumes/4TB SSD/Coastline data/inference_scenes_5/55HES_2022_2022.tif/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GEQ_2022_2022.tif

/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GCR_2022_2022.tif
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GCN_2022_2022.tif
File exists for 55GCR
File exists for 55GEQ
File exists for 55HES
File exists for 55GCN
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GFQ_2022_2022.tif
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GFR_2022_2022.tif
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55HCS_2022_2022.tif
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GCQ_2022_2022.tif
File exists for 55GFR
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GER_2022_2022.tif
File exists for 55HCS
/Volumes/4TB SSD/Coastline data/inference_scenes_5/54GYA_2022_2022.tif
File exists for 54GYA
/Volumes/4TB SSD/Coastline data/inference_scenes_5/55GEM_2022_2022.tif
File exists for 55GFQ
/Volumes/4TB SSD/Coastline data/inference_s



  0%|          | 0/12 [00:00<?, ?it/s]