In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta

- Input a date range and a polygon for area(s) of interest
- Find folders that are within that date range
- Find images that intersect the polygons
- Clip the images that intersect by the polygons and save geotiff of the interecting area

In [2]:
base_url = "https://data.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2"

In [3]:
start_date = "2023-05-01"
end_date = "2023-05-10"

In [4]:
def create_date_url(base_url, input_date):
    year = input_date.strftime("%Y")
    month = input_date.strftime("%m")
    day = input_date.strftime("%d")
    return f"{base_url}/{year}/{month}/{day}"


def get_existing_folders(base_url, start_date, end_date):
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    current_date = start_date
    urls = []
    while current_date <= end_date:
        check_url = create_date_url(base_url, current_date)
        response = requests.get(check_url, timeout=10)
        if response.status_code == 200:
            urls.append(check_url)
        current_date += timedelta(days=1)
    return urls


def extract_xml_links(url):
    """Extracts XML links from the given HTML webpage URL."""
    xml_links = []
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")

        for link in soup.find_all("a", href=True):
            href = link["href"]
            if href.endswith(".xml?download=1"):
                xml_links.append(href)

    return xml_links


def all_xml_list(base_url, start_date, end_date):
    date_urls = get_existing_folders(base_url, start_date, end_date)
    xml_links = []
    for url in date_urls:
        xml_links.extend(extract_xml_links(url))
    return xml_links

In [5]:
xml_links = all_xml_list(base_url, start_date, end_date)
print(len(xml_links))
xml_links[0:2]

370


['https://dap.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2/2023/05/01/S2A_20230501_latn500lonw0008_T30UXA_ORB137_20230501131147_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml?download=1',
 'https://dap.ceda.ac.uk/neodc/sentinel_ard/data/sentinel_2/2023/05/01/S2A_20230501_latn500lonw0008_T30UXA_ORB137_20230501131147_utm30n_osgb_vmsk_sharp_rad_srefdem_stdsref_meta.xml?download=1']

In [7]:
def process_xml(url):
    # Input URL of the XML file
    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Check if the request was successful
        if response.status_code == 200:
            # Parse the XML content using BeautifulSoup
            soup = BeautifulSoup(response.content, "xml")

            # Find the ARCSI_CLOUD_COVER value using the tag name
            arcsi_cloud_cover_element = soup.find("gco:CharacterString")

            # Check if the element was found and extract the value
            if arcsi_cloud_cover_element is not None:
                arcsi_cloud_cover = arcsi_cloud_cover_element.text
                print("ARCSI_CLOUD_COVER:", arcsi_cloud_cover)
            else:
                print("ARCSI_CLOUD_COVER not found in the XML.")
        else:
            print("Failed to retrieve the XML. Status code:", response.status_code)

    except requests.exceptions.RequestException as e:
        print("Error:", e)

In [13]:
for x in xml_links:
    process_xml(x)

<Response [502]>