In [9]:
from lxml import etree   # import etree functionality from lxml library
import urllib.request  # import urllib.request module for opening URLs
import os  # import os module for interacting with OS

In [10]:
def get_rdfa(xmlfile: str) -> None:
    """Load an XML document from a URL or a local path and print a short summary.

    The document is parsed with lxml. On success, the root element's tag and
    the tags of up to the first five child elements are printed. Nothing is
    returned.

    Args:
        xmlfile: An ``http://`` / ``https://`` URL or a local file path. The
            string must end in ``.xml`` (checked case-insensitively).

    Raises:
        ValueError: If ``xmlfile`` does not end in ``.xml``.
        IOError: If anything fails while locating, fetching, parsing, or
            summarising the document. The underlying exception is attached
            as ``__cause__``.
    """
    # Reject anything that is not named like an XML file before doing any I/O.
    if not xmlfile.lower().endswith('.xml'):
        raise ValueError('Invalid file format.')

    try:
        # URL schemes are case-insensitive, so match on the lowercased string
        # (consistent with the case-insensitive extension check above).
        if xmlfile.lower().startswith(('http://', 'https://')):
            with urllib.request.urlopen(xmlfile) as response:
                xml_bytes = response.read()
            # Parse the raw bytes and wrap the root in an ElementTree so both
            # branches produce the same kind of object.
            tree = etree.ElementTree(etree.fromstring(xml_bytes))
        elif os.path.isfile(xmlfile):
            # Binary mode lets the parser honour the encoding declared in the file.
            with open(xmlfile, 'rb') as xml_file:
                tree = etree.parse(xml_file)
        else:
            # Caught by the handler below and re-raised as IOError, like every
            # other load failure.
            raise ValueError('Invalid file path or URL format.')

        root = tree.getroot()
        print(f'Successfully loaded {xmlfile} with XML root element {root.tag}.')

        # Preview at most the first five direct children of the root.
        print("First few elements:")
        for element in root[:5]:
            print(f"- {element.tag}")

    except Exception as e:
        # Chain explicitly so the original error is available via __cause__.
        raise IOError(f"Error loading XML: {e}") from e

In [11]:
# Smoke-test get_rdfa against a remote XML transcript hosted in the aad-data repository (dev branch).
get_rdfa('https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml')

Successfully loaded https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml with XML root element {http://www.tei-c.org/ns/1.0}TEI.
First few elements:
- {http://www.tei-c.org/ns/1.0}teiHeader
- {http://www.tei-c.org/ns/1.0}facsimile
- {http://www.tei-c.org/ns/1.0}text
