In [40]:
from lxml import etree   # import etree functionality from lxml library
import urllib.request  # import urllib.request module for opening URLs
import os  # import os module for interacting with OS

In [None]:
def get_rdfa(xmlfile: str) -> "etree._ElementTree":
    """Load an XML document from a local path or an HTTP(S) URL and return it.

    After a successful parse, a short confirmation is printed: the source,
    the root tag, and the tags of up to the first five child elements.

    Parameters
    ----------
    xmlfile : str
        Local file path or ``http://`` / ``https://`` URL. Must end in
        ``.xml`` (case-insensitive).

    Returns
    -------
    lxml.etree._ElementTree
        The parsed document tree.

    Raises
    ------
    ValueError
        If ``xmlfile`` does not end in ``.xml``.
    IOError
        If ``xmlfile`` is neither a URL nor an existing file, or if
        downloading, reading or parsing fails. The underlying exception is
        attached as ``__cause__``.
    """
    # Reject anything that is not named like an XML file before doing any I/O.
    if not xmlfile.lower().endswith(".xml"):
        raise ValueError("XML file format required.")

    try:
        if xmlfile.startswith(("http://", "https://")):
            # Remote source: download the raw bytes, then parse them into a tree.
            with urllib.request.urlopen(xmlfile) as response:
                xml_bytes = response.read()
            tree = etree.ElementTree(etree.fromstring(xml_bytes))
        elif os.path.isfile(xmlfile):
            # Local source: open in binary mode so the parser handles the
            # encoding declared in the document itself.
            with open(xmlfile, "rb") as file:
                tree = etree.parse(file)
        else:
            raise ValueError("Invalid file path or URL format.")

        # Print the root element to confirm successful parsing.
        root = tree.getroot()
        print(f"✅ Successfully loaded XML file: {xmlfile}")
        print(f"Root element: {root.tag}")

        # Preview at most the first five direct children of the root.
        print("First few elements:")
        for element in root[:5]:
            print(f"- {element.tag}")

    except Exception as e:
        # Every failure in the block above surfaces as IOError (as before);
        # chaining keeps the original exception available for debugging.
        raise IOError(f"Error loading XML: {e}") from e

    # Hand the parsed tree back so that `tree = get_rdfa(...)` is usable.
    return tree

In [42]:
# Sample input: a transcript XML from the aad-data repository (dev branch).
AAD_TRANSCRIPT_URL = (
    "https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/"
    "refs/heads/dev/data/xml/editions/aad-transcript__0032.xml"
)
tree = get_rdfa(AAD_TRANSCRIPT_URL)

✅ Successfully loaded XML file: https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml
Root element: {http://www.tei-c.org/ns/1.0}TEI
First few elements:
- {http://www.tei-c.org/ns/1.0}teiHeader
- {http://www.tei-c.org/ns/1.0}facsimile
- {http://www.tei-c.org/ns/1.0}text
