In [7]:
import requests
from bs4 import BeautifulSoup
import re
import lxml
from datetime import datetime, timedelta
from shapely.geometry import box
from fiona.drvsupport import supported_drivers
import geopandas as gpd

- Input a date range and a polygon for area(s) of interest
- Find folders that are within that date range
- Find images that intersect the polygons
- Clip the images that intersect by the polygons and save geotiff of the interecting area

In [30]:
base_url = "https://data.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2"

In [31]:
start_date = "2023-05-01"
end_date = "2023-05-10"

In [32]:
def filter_sentinel2_tiles(aoi, tiles_layer=None):
    if tiles_layer is None:
        supported_drivers["KML"] = "rw"
        tiles_layer = gpd.read_file(
            "https://sentinels.copernicus.eu/documents/247904/1955685/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml",
            driver="kml",
        )
    tile_list = gpd.sjoin(
        tiles_layer, aoi.to_crs(epsg=4326), how="inner", op="intersects"
    )["Name"].to_list()
    return [f"T{t}" for t in tile_list]

In [33]:
aoi = gpd.read_file("inputs/test_quarries.shp")
tile_list = filter_sentinel2_tiles(aoi)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [34]:
def create_date_url(base_url, input_date):
    year = input_date.strftime("%Y")
    month = input_date.strftime("%m")
    day = input_date.strftime("%d")
    return f"{base_url}/{year}/{month}/{day}"


def get_existing_folders(base_url, start_date, end_date):
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    current_date = start_date
    urls = []
    while current_date <= end_date:
        check_url = create_date_url(base_url, current_date)
        response = requests.get(check_url, timeout=5)
        if response.status_code == 200:
            urls.append(check_url)
        current_date += timedelta(days=1)
    return urls


def extract_xml_links(url, tile_list=None):
    """Extracts XML links from the given HTML webpage URL."""
    xml_links = []
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        for link in soup.find_all("a", href=True):
            href = link["href"]
            if href.endswith(".xml?download=1"):
                if isinstance(tile_list, list):
                    for t in tile_list:
                        if t in href:
                            xml_links.append(href)
                else:
                    xml_links.append

    return xml_links


def all_xml_list(base_url, start_date, end_date, tile_list=None):
    date_urls = get_existing_folders(base_url, start_date, end_date)
    xml_links = []
    for url in date_urls:
        xml_links.extend(extract_xml_links(url, tile_list))
    return xml_links

In [35]:
xml_links = all_xml_list(base_url, start_date, end_date, tile_list)
print(len(xml_links))
xml_links[0:2]

26


['https://dap.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2/2023/05/02/S2B_20230502_latn563lonw0021_T30VWH_ORB080_20230502121918_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml?download=1',
 'https://dap.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2/2023/05/02/S2B_20230502_latn563lonw0021_T30VWH_ORB080_20230502121918_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml?download=1']

In [None]:
def _extract_xml_cloud(xml_extract):
    supp = xml_extract.find("gmd:supplementalinformation")
    character_string = supp.find("gco:characterstring").text
    lines = character_string.split("\n")
    lines = ["".join(l.split()) for l in lines]
    for line in lines:
        if line.startswith("ARCSI_CLOUD_COVER"):
            arcsi_cloud_cover = line.split(":")[1].strip()
            val = arcsi_cloud_cover
            break
    return float(val)


def _clean_coord(coord):
    coord = coord.replace("\n", "")
    return float(coord)


def _extract_extent(xml_extract):
    minx = _clean_coord(xml_extract.find("gmd:westboundlongitude").text)
    miny = _clean_coord(xml_extract.find("gmd:southboundlatitude").text)
    maxx = _clean_coord(xml_extract.find("gmd:eastboundlongitude").text)
    maxy = _clean_coord(xml_extract.find("gmd:northboundlatitude").text)
    return minx, miny, maxy, maxy


def _read_xml(url):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the XML content using BeautifulSoup with lxml parser
        soup = BeautifulSoup(response.text, "lxml")
    return soup


def filter_xmls(xml_links, southern_most_lat=54, geometry=None, cloud_cover_max=0.4):
    retained_links = []
    for url in xml_links:
        # read the xml
        try:
            xml_extract = _read_xml(url)
        except:
            continue
        # first get the coords and see if image is too far south
        coords = _extract_extent(xml_extract)
        if coords[3] < southern_most_lat:
            continue
        # check if too cloudy overall
        if _extract_xml_cloud(xml_extract) > cloud_cover_max:
            continue
        # finally if specified check extent intersects input geometry
        if geometry is not None:
            image_geom = box(coords)
            if image_geom.intersects(geometry):
                retained_links.append(url)
        else:
            retained_links.append(url)
    return retained_links

In [7]:
filtered_xml_links = filter_xmls(xml_links)

In [9]:
len(filtered_xml_links)

20

In [12]:
def xml_to_tif_link(xml_link):
    return xml_link.replace("_meta.xml?download=1", ".tif")

In [15]:
test_url = xml_to_tif_link(filtered_xml_links[0])

In [18]:
import rasterio as rio

with rio.open(test_url) as f:
    print(f.profile)
    print(f.bounds)

{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': 0.0, 'width': 11139, 'height': 11140, 'count': 10, 'crs': CRS.from_epsg(27700), 'transform': Affine(10.0, 0.0, 438940.0,
       0.0, -10.0, 871480.0), 'blockxsize': 512, 'blockysize': 512, 'tiled': True, 'compress': 'deflate', 'interleave': 'pixel'}
BoundingBox(left=438940.0, bottom=760080.0, right=550330.0, top=871480.0)


In [11]:
import geopandas as gpd

supported_drivers["KML"] = "rw"
grids = gpd.read_file(
    "https://sentinels.copernicus.eu/documents/247904/1955685/S2A_OPER_GIP_TILPAR_MPC__20151209T095117_V20150622T000000_21000101T000000_B00.kml",
    driver="kml",
)

In [9]:
aoi = gpd.read_file("inputs/test_quarries.shp")

In [14]:
aoi.crs

<Projected CRS: EPSG:27700>
Name: OSGB36 / British National Grid
Axis Info [cartesian]:
- E[east]: Easting (metre)
- N[north]: Northing (metre)
Area of Use:
- name: United Kingdom (UK) - offshore to boundary of UKCS within 49°45'N to 61°N and 9°W to 2°E; onshore Great Britain (England, Wales and Scotland). Isle of Man onshore.
- bounds: (-9.01, 49.75, 2.01, 61.01)
Coordinate Operation:
- name: British National Grid
- method: Transverse Mercator
Datum: Ordnance Survey of Great Britain 1936
- Ellipsoid: Airy 1830
- Prime Meridian: Greenwich

In [18]:
gpd.sjoin(grids, aoi.to_crs(epsg=4326), how="inner", op="intersects")["Name"].to_list()

  if await self.run_code(code, result, async_=asy):


['30UWG', '30VWH', '30VUH', '30VVH', '30VWH', '30VVK']

In [16]:
"test".contains("st")

AttributeError: 'str' object has no attribute 'contains'