In [84]:
import requests
from bs4 import BeautifulSoup
import re
import lxml
from datetime import datetime, timedelta
from shapely.geometry import box

- Input a date range and a polygon for area(s) of interest
- Find folders that are within that date range
- Find images that intersect the polygons
- Clip the images that intersect by the polygons and save geotiff of the interecting area

In [2]:
base_url = "https://data.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2"

In [3]:
start_date = "2023-05-01"
end_date = "2023-05-10"

In [4]:
def create_date_url(base_url, input_date):
    year = input_date.strftime("%Y")
    month = input_date.strftime("%m")
    day = input_date.strftime("%d")
    return f"{base_url}/{year}/{month}/{day}"


def get_existing_folders(base_url, start_date, end_date):
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    current_date = start_date
    urls = []
    while current_date <= end_date:
        check_url = create_date_url(base_url, current_date)
        response = requests.get(check_url, timeout=10)
        if response.status_code == 200:
            urls.append(check_url)
        current_date += timedelta(days=1)
    return urls


def extract_xml_links(url):
    """Extracts XML links from the given HTML webpage URL."""
    xml_links = []
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        for link in soup.find_all("a", href=True):
            href = link["href"]
            if href.endswith(".xml?download=1"):
                xml_links.append(href)

    return xml_links


def all_xml_list(base_url, start_date, end_date):
    date_urls = get_existing_folders(base_url, start_date, end_date)
    xml_links = []
    for url in date_urls:
        xml_links.extend(extract_xml_links(url))
    return xml_links

In [5]:
xml_links = all_xml_list(base_url, start_date, end_date)
print(len(xml_links))
xml_links[0:2]

370


['https://dap.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2/2023/05/01/S2A_20230501_latn500lonw0008_T30UXA_ORB137_20230501131147_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml?download=1',
 'https://dap.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2/2023/05/01/S2A_20230501_latn500lonw0008_T30UXA_ORB137_20230501131147_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml?download=1']

In [113]:
def _extract_xml_cloud(xml_extract):
    supp = xml_extract.find("gmd:supplementalinformation")
    character_string = supp.find("gco:characterstring").text
    lines = character_string.split("\n")
    lines = ["".join(l.split()) for l in lines]
    for line in lines:
        if line.startswith("ARCSI_CLOUD_COVER"):
            arcsi_cloud_cover = line.split(":")[1].strip()
            val = arcsi_cloud_cover
            break
    return float(val)


def _clean_coord(coord):
    coord = coord.replace("\n", "")
    return float(coord)


def _extract_extent(xml_extract):
    minx = _clean_coord(xml_extract.find("gmd:westboundlongitude").text)
    miny = _clean_coord(xml_extract.find("gmd:southboundlatitude").text)
    maxx = _clean_coord(xml_extract.find("gmd:eastboundlongitude").text)
    maxy = _clean_coord(xml_extract.find("gmd:northboundlatitude").text)
    return minx, miny, maxy, maxy


def _read_xml(url):
    # Send an HTTP GET request to the URL
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the XML content using BeautifulSoup with lxml parser
        soup = BeautifulSoup(response.text, "lxml")
    return soup


def filter_xmls(xml_links, southern_most_lat=54, geometry=None, cloud_cover_max=0.4):
    retained_links = []
    for url in xml_links:
        # read the xml
        try:
            xml_extract = _read_xml(url)
        except:
            continue
        # first get the coords and see if image is too far south
        coords = _extract_extent(xml_extract)
        if coords[3] < southern_most_lat:
            continue
        # check if too cloudy overall
        if _extract_xml_cloud(xml_extract) > cloud_cover_max:
            continue
        # finally if specified check extent intersects input geometry
        if geometry is not None:
            image_geom = box(coords)
            if image_geom.intersects(geometry):
                retained_links.append(url)
        else:
            retained_links.append(url)
    return retained_links

In [114]:
filtered_xml_links = filter_xmls(xml_links)

In [115]:
len(filtered_xml_links)

20