In [1]:
import os  # import os module for OS interaction
import urllib.request  # import urllib.request module for URL handling
from pathlib import Path  # import Path class for building file URIs

import pyRdfa  # import pyRdfa library for RDFa parsing
import rdflib  # import rdflib library for RDF graph processing
from lxml import etree   # import etree functionality from lxml library

In [2]:
def get_rdfa(xmlfile: str) -> 'rdflib.Graph':
    """Load an XML file, report its top-level structure and extract RDFa triples.

    The document is first parsed with lxml so that the root element and its
    direct child elements can be printed as feedback. The source location
    (not the parsed tree) is then handed to pyRdfa, which populates a fresh
    rdflib graph. Every triple in that graph is printed.

    Args:
        xmlfile: An ``http://`` / ``https://`` URL or the path of an existing
            local file. The name must end in ``.xml`` (case-insensitive).

    Returns:
        The rdflib graph filled by pyRdfa; it is empty when no triples
        were found.

    Raises:
        ValueError: If the name does not end in ``.xml``, if it is neither an
            HTTP(S) URL nor an existing file, or if a ``ValueError`` occurs
            while loading or parsing.
        OSError: For I/O failures and for any other exception raised while
            loading or parsing. The original exception is available as
            ``__cause__``.
    """
    # reject anything not named like an XML file before doing any I/O
    if not xmlfile.lower().endswith('.xml'):
        raise ValueError('Invalid file format.')

    try:
        # load XML file
        if xmlfile.startswith(('http://', 'https://')):  # remote source
            with urllib.request.urlopen(xmlfile) as response:
                root = etree.fromstring(response.read())  # parse raw bytes into the root element
            xml_source = xmlfile
        elif os.path.isfile(xmlfile):  # local source
            with open(xmlfile, 'rb') as xml_file:  # binary mode leaves encoding detection to the parser
                root = etree.parse(xml_file).getroot()
            # as_uri() yields a well-formed file URI: it percent-encodes special
            # characters and handles Windows drive letters, unlike 'file://' + path
            xml_source = Path(os.path.abspath(xmlfile)).as_uri()
        else:
            raise ValueError('Invalid file path or URL format.')

        # feedback successful XML parsing
        # iterating an lxml element also yields comments and processing
        # instructions, whose .tag is not a string; keep real elements only
        child_tags = [child.tag for child in root if isinstance(child.tag, str)]
        print(f'Successfully loaded {xmlfile}.\n'
              f'XML root element {root.tag} has direct child elements ' + ', '.join(child_tags) + '.')

        # parse RDFa
        g = rdflib.Graph()  # empty graph that receives the triples
        rdfa_parser = pyRdfa.pyRdfa()
        rdfa_parser.graph_from_source(xml_source, graph=g)  # pyRdfa loads the source itself

        # output RDF triples
        print(f'\n{len(g)} RDF triples detected:')
        if len(g) > 0:
            for subj, pred, obj in g:
                print(f'{subj} - {pred} - {obj}')
        else:
            print('No RDFa detected.')

        return g

    # re-raise with context in the message; 'from e' keeps the original
    # exception reachable as __cause__ for debugging
    except ValueError as e:
        raise ValueError(f'Value error encountered: {e}.') from e
    except OSError as e:
        raise OSError(f'OS error encountered: {e}.') from e
    except Exception as e:
        # OSError is kept here so callers that already catch it keep working
        raise OSError(f'Unexpected error encountered: {e}.') from e

In [3]:
# Demo run against a sample TEI transcript hosted on GitHub (dev branch).
# The call is the cell's last expression, so the returned graph's repr is shown too.
get_rdfa(
    'https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml'
)

Successfully loaded https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml.
XML root element {http://www.tei-c.org/ns/1.0}TEI has direct child elements {http://www.tei-c.org/ns/1.0}teiHeader, {http://www.tei-c.org/ns/1.0}facsimile, {http://www.tei-c.org/ns/1.0}text.


rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xml:base="https://id.acdh.oeaw.ac.at/auden-in-austria-digital" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.tei-c.org/ns/1.0"/ does not look like a valid URI, trying to serialize this will break.



8 RDF triples detected:
wd:Q62566483 - http://www.w3.org/1999/02/22-rdf-syntax-ns#type - crm:E21_Person
wd:Q123679692 - http://www.w3.org/1999/02/22-rdf-syntax-ns#type - crm:E21_Person
https://amp.acdh.oeaw.ac.at/amp_person_254.html - crm:P107_has_current_or_former_member - wd:Q62566483
wd:Q62566483 - http://www.w3.org/2000/01/rdf-schema#label - Thekla Clark
wd:Q123679692 - http://www.w3.org/2000/01/rdf-schema#label - John Clark
https://amp.acdh.oeaw.ac.at/amp_person_254.html - http://www.w3.org/2000/01/rdf-schema#label - Gäste
https://amp.acdh.oeaw.ac.at/amp_person_254.html - http://www.w3.org/1999/02/22-rdf-syntax-ns#type - crm:E74_Group
https://amp.acdh.oeaw.ac.at/amp_person_254.html - crm:P107_has_current_or_former_member - wd:Q123679692


<Graph identifier=Naed47e95a67e42e2a9f9889b20d4c8e4 (<class 'rdflib.graph.Graph'>)>