import

In [3]:
import os
import urllib.request
import logging
from typing import Dict, Optional, Set, Union, List
from lxml import etree
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import warnings

In [4]:
# Set up logging
# Configure the root logger to emit INFO and above as "<LEVEL>: <message>".
# Note: logging.basicConfig is a no-op if the root logger already has handlers.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# Module-level logger used by tei_rdfa for its verbose output.
logger = logging.getLogger(__name__)

def tei_rdfa(xmlfile: str, parse_element: Optional[str] = None, verbose: bool = True) -> Graph:
    """
    Extract RDFa content from a TEI/XML file and create an RDF graph.
    
    Parameters:
    xmlfile (str): Path or URL to a TEI/XML file; must end in ".xml" (case-insensitive)
    parse_element (str, optional): Local name of the TEI element to parse for RDFa
        content; when omitted the whole document is processed from the root
    verbose (bool): Whether to log a summary of the loaded document and the
        serialized graph (Turtle and RDF/XML) via the module logger
    
    Returns:
    rdflib.Graph: An RDF graph containing the extracted triples
    
    Raises:
    ValueError: If the file name does not end in ".xml", or a ValueError is
        raised while loading/processing (re-raised with a descriptive prefix)
    OSError: If an OS/network level error occurs while loading the file
        (re-raised with a descriptive prefix)
    """
    
    # Check XML file extension
    if not xmlfile.lower().endswith('.xml'):
        raise ValueError('Invalid file format. File must have .xml extension.')

    try:
        # Load XML file (the source URI returned alongside the tree is not needed here)
        tree, _ = load_xml_file(xmlfile)
            
        # Get root element and provide feedback
        root = tree.getroot()
        if verbose:
            # Comments and processing instructions have a callable (non-string)
            # tag in lxml; joining them would raise TypeError, so list only
            # real elements.
            child_elements = ", ".join(
                element.tag for element in root if isinstance(element.tag, str)
            )
            logger.info(
                'Successfully loaded %s.\n'
                'XML root element %s has direct child elements %s.',
                xmlfile, root.tag, child_elements,
            )
            
        # Set up namespaces for XPath
        namespaces = {
            'tei': 'http://www.tei-c.org/ns/1.0',
            'xml': 'http://www.w3.org/XML/1998/namespace'
        }
        
        # Get TEI prefixes from listPrefixDef
        prefix_map = get_tei_prefixes(tree, namespaces)
        
        # Create RDF graph and register namespaces
        g = Graph()
        for prefix, uri in prefix_map.items():
            g.bind(prefix, uri)
        
        # Process RDFa content
        if parse_element:
            process_specific_elements(tree, g, parse_element, namespaces, prefix_map)
        else:
            # Process the entire document
            process_element(root, None, g, prefix_map)
        
        # Output the graph in different formats if verbose.
        # Heading and serialization are emitted as ONE log record each: mixing
        # logger output with print() sends them to different streams, which
        # lets the headings appear detached from the graphs they announce.
        if verbose:
            logger.info("\nExtracted RDF Graph in Turtle format:\n%s",
                        g.serialize(format='turtle'))
            logger.info("\nExtracted RDF Graph in RDF/XML format:\n%s",
                        g.serialize(format='xml'))
        
        return g
        
    except ValueError as e:
        # Keep the exception type callers expect, but chain the original cause.
        raise ValueError(f'Value error encountered: {e}') from e
    except OSError as e:
        raise OSError(f'OS error encountered: {e}') from e


def load_xml_file(xmlfile: str, timeout: float = 30.0) -> tuple:
    """
    Load an XML file from a URL or local path.
    
    Parameters:
    xmlfile (str): Path or URL to a TEI/XML file
    timeout (float): Seconds to wait on the network when xmlfile is an
        http(s) URL; ignored for local files
    
    Returns:
    tuple: (ElementTree, source_uri) where source_uri is the URL itself or a
        file:// URI for the absolute local path
    
    Raises:
    ValueError: If xmlfile is neither an http(s) URL nor an existing file
    OSError: If the download fails or times out
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    if xmlfile.startswith(('http://', 'https://')):
        # Without a timeout a stalled connection would block forever.
        with urllib.request.urlopen(xmlfile, timeout=timeout) as response:
            xml_bytes = response.read()
        tree = etree.ElementTree(etree.fromstring(xml_bytes))
        return tree, xmlfile

    if os.path.isfile(xmlfile):
        tree = etree.parse(xmlfile)
        # as_uri() yields a well-formed file URI (percent-encoding, Windows
        # drive letters) where plain string concatenation would not.
        return tree, Path(os.path.abspath(xmlfile)).as_uri()

    raise ValueError('Invalid file path or URL format.')


def get_tei_prefixes(tree: etree.ElementTree, namespaces: Dict[str, str]) -> Dict[str, str]:
    """
    Extract namespace prefixes from the TEI listPrefixDef element.
    
    Each tei:prefixDef contributes its @ident as the prefix and the part of
    @replacementPattern that precedes the substitution placeholder as the
    base URI. Entries lacking either attribute are skipped. The 'rdf' and
    'rdfs' prefixes are always set to the rdflib namespaces, overriding any
    definition of the same name in the document.
    
    Parameters:
    tree (etree.ElementTree): The parsed XML tree
    namespaces (dict): Namespace mapping for XPath queries (must map 'tei')
    
    Returns:
    dict: A mapping of prefixes to their full URIs
    """
    prefix_map = {}
    
    # Look for prefixDef elements in the TEI header
    prefix_defs = tree.xpath('//tei:encodingDesc/tei:listPrefixDef/tei:prefixDef', namespaces=namespaces)
    
    for prefix_def in prefix_defs:
        prefix = prefix_def.get('ident')
        replacement_pattern = prefix_def.get('replacementPattern')
        if not prefix or not replacement_pattern:
            continue
            
        # Handle the supported placeholder formats. '{$1}' must be tested
        # before '$1': the latter is a substring of the former, so testing it
        # first would leave a stray '{' at the end of the base URI.
        for placeholder in ('{$1}', '$1'):
            if placeholder in replacement_pattern:
                base_uri = replacement_pattern.partition(placeholder)[0]
                break
        else:
            # If no substitution pattern found, use as is
            base_uri = replacement_pattern
            
        prefix_map[prefix] = base_uri
    
    # Add common RDF namespaces
    prefix_map['rdf'] = str(RDF)
    prefix_map['rdfs'] = str(RDFS)
    
    return prefix_map


def process_specific_elements(tree: etree.ElementTree, graph: Graph, element_name: str, 
                             namespaces: Dict[str, str], prefix_map: Dict[str, str]) -> None:
    """
    Run RDFa extraction on every TEI element with the given name.
    
    Each match is processed as an independent starting point (no inherited
    subject). A warning is issued when the document contains no match.
    
    Parameters:
    tree (etree.ElementTree): The parsed XML tree
    graph (rdflib.Graph): The graph to add triples to
    element_name (str): Name of element to process
    namespaces (dict): Namespace mapping for XPath queries
    prefix_map (dict): Mapping of prefixes to URIs
    """
    matches = tree.xpath(f'//tei:{element_name}', namespaces=namespaces)
    
    if matches:
        # Extract RDFa from each hit, anywhere in the document
        for match in matches:
            process_element(match, None, graph, prefix_map)
    else:
        warnings.warn(f"No elements matching '{element_name}' were found in the document.")


def process_element(element: etree.Element, parent_subject: Optional[str], 
                   graph: Graph, prefix_map: Dict[str, str]) -> Optional[str]:
    """
    Extract RDFa triples from an element and, depth-first, from its children.
    
    Parameters:
    element (etree.Element): The XML element to process
    parent_subject (str, optional): The subject URI from the parent element
    graph (rdflib.Graph): The graph to add triples to
    prefix_map (dict): Mapping of prefixes to URIs
    
    Returns:
    str or None: The subject URI of this element
    """
    subject_here = determine_subject(element, parent_subject, prefix_map)
    
    # Attributes are only interpreted once a subject is known
    if subject_here:
        # typeof -> rdf:type, property -> literal/resource object, rel -> resource object
        for handler in (process_typeof, process_property, process_rel):
            handler(element, subject_here, graph, prefix_map)
        
        # rev additionally needs the parent's subject
        process_rev(element, subject_here, parent_subject, graph, prefix_map)
    
    # Children inherit this element's subject (which may itself be inherited)
    for child in element:
        process_element(child, subject_here, graph, prefix_map)
    
    return subject_here


def determine_subject(element: etree.Element, parent_subject: Optional[str], 
                     prefix_map: Dict[str, str]) -> Optional[str]:
    """
    Work out which subject an element's RDFa statements are about.
    
    Precedence: @about, then @resource (only when neither @rel nor @rev is
    present), otherwise the subject inherited from the parent.
    
    Parameters:
    element (etree.Element): The XML element
    parent_subject (str, optional): The subject URI from the parent element
    prefix_map (dict): Mapping of prefixes to URIs
    
    Returns:
    str or None: The determined subject URI
    """
    attrs = element.attrib
    
    if 'about' in attrs:
        raw_subject = attrs['about']
    elif 'resource' in attrs and 'rel' not in attrs and 'rev' not in attrs:
        # With rel/rev present, @resource names an object rather than a subject
        raw_subject = attrs['resource']
    else:
        # Nothing on this element sets a subject: inherit
        return parent_subject
    
    return expand_uri(raw_subject, prefix_map)


def process_typeof(element: etree.Element, subject: str, graph: Graph, prefix_map: Dict[str, str]) -> None:
    """
    Add one rdf:type triple per whitespace-separated entry of @typeof.
    
    Parameters:
    element (etree.Element): The XML element
    subject (str): The subject URI
    graph (rdflib.Graph): The graph to add triples to
    prefix_map (dict): Mapping of prefixes to URIs
    """
    typeof_value = element.attrib.get('typeof')
    if typeof_value is None:
        return
        
    for type_token in typeof_value.split():
        class_uri = expand_uri(type_token, prefix_map)
        graph.add((URIRef(subject), RDF.type, URIRef(class_uri)))


def process_property(element: etree.Element, subject: str, graph: Graph, prefix_map: Dict[str, str]) -> None:
    """
    Process the property attribute to create predicate-object triples.
    
    The object is whatever determine_property_object yields for the element
    (a resource or a literal) and is shared by every property listed in the
    whitespace-separated @property value. Nothing is added when no object
    can be determined.
    
    Parameters:
    element (etree.Element): The XML element
    subject (str): The subject URI
    graph (rdflib.Graph): The graph to add triples to
    prefix_map (dict): Mapping of prefixes to URIs
    """
    if 'property' not in element.attrib:
        return
        
    # The object does not depend on the individual property, so determine it
    # once rather than once per property.
    obj_value = determine_property_object(element, prefix_map)
    
    # Compare against None explicitly: rdflib terms such as Literal('') are
    # falsy, so a plain truthiness test would silently drop an explicitly
    # empty @content value.
    if obj_value is None:
        return
        
    for prop in element.attrib['property'].split():
        expanded_prop = expand_uri(prop, prefix_map)
        graph.add((URIRef(subject), URIRef(expanded_prop), obj_value))


def determine_property_object(element: etree.Element, prefix_map: Dict[str, str]) -> Optional[Union[URIRef, Literal]]:
    """
    Pick the object for an element's @property statement.
    
    Sources are tried in order: @resource (as a URI), @content (as a literal,
    even if empty), the element's own stripped leading text, and finally the
    joined text of its child elements.
    
    Parameters:
    element (etree.Element): The XML element
    prefix_map (dict): Mapping of prefixes to URIs
    
    Returns:
    URIRef, Literal, or None: The determined object value
    """
    attrs = element.attrib
    
    # 1. Explicit resource reference
    if 'resource' in attrs:
        return URIRef(expand_uri(attrs['resource'], prefix_map))
        
    # 2. Explicit literal override
    if 'content' in attrs:
        return Literal(attrs['content'])
        
    # 3. Text directly inside the element (before any child element)
    own_text = element.text.strip() if element.text else ''
    if own_text:
        return Literal(own_text)
        
    # 4. Fall back to text held by child elements, space-separated
    if len(element) > 0:
        child_texts = extract_text_from_children(element)
        if child_texts:
            return Literal(" ".join(child_texts))
            
    return None


def extract_text_from_children(element: etree.Element) -> List[str]:
    """
    Collect the stripped, non-empty text of an element's direct children.
    
    Text is only gathered when every child is a leaf (has no children of its
    own); if any child has nested elements the result is empty.
    
    Parameters:
    element (etree.Element): The XML element
    
    Returns:
    list: List of text strings from child elements
    """
    children = list(element)
    
    # A single non-leaf child disables extraction altogether
    if any(len(child) != 0 for child in children):
        return []
    
    collected = []
    for child in children:
        stripped = (child.text or '').strip()
        if stripped:
            collected.append(stripped)
    
    return collected


def process_rel(element: etree.Element, subject: str, graph: Graph, prefix_map: Dict[str, str]) -> None:
    """
    Add predicate-resource triples for each entry of the rel attribute.
    
    Objects come from the element's own @resource when present; otherwise
    from the @resource of every descendant element.
    
    Parameters:
    element (etree.Element): The XML element
    subject (str): The subject URI
    graph (rdflib.Graph): The graph to add triples to
    prefix_map (dict): Mapping of prefixes to URIs
    """
    rel_value = element.attrib.get('rel')
    if rel_value is None:
        return
        
    own_resource = element.attrib.get('resource')
    if own_resource is not None:
        # The element names its object directly
        raw_targets = [own_resource]
    else:
        # Otherwise every descendant carrying @resource is an object
        raw_targets = [node.get('resource') for node in element.xpath('.//*[@resource]')]
    
    for rel_token in rel_value.split():
        predicate_uri = expand_uri(rel_token, prefix_map)
        for raw_target in raw_targets:
            target_uri = expand_uri(raw_target, prefix_map)
            graph.add((URIRef(subject), URIRef(predicate_uri), URIRef(target_uri)))


def process_rev(element: etree.Element, subject: Optional[str], parent_subject: Optional[str], 
               graph: Graph, prefix_map: Dict[str, str]) -> None:
    """
    Add reversed triples for each entry of the rev attribute.
    
    Requires both @rev and @resource on the element. The @resource value
    becomes the triple's subject and the element's subject (or, failing
    that, the parent's) becomes its object.
    
    Parameters:
    element (etree.Element): The XML element
    subject (str, optional): The subject URI of this element
    parent_subject (str, optional): The subject URI from the parent element
    graph (rdflib.Graph): The graph to add triples to
    prefix_map (dict): Mapping of prefixes to URIs
    """
    attrs = element.attrib
    if 'rev' not in attrs or 'resource' not in attrs:
        return
        
    rev_tokens = attrs['rev'].split()
    source_uri = expand_uri(attrs['resource'], prefix_map)
    
    # Prefer this element's subject, fall back to the parent's
    target_uri = subject or parent_subject
    if not target_uri:
        return
        
    for rev_token in rev_tokens:
        predicate_uri = expand_uri(rev_token, prefix_map)
        # Direction is flipped relative to rel: resource -> predicate -> subject
        graph.add((URIRef(source_uri), URIRef(predicate_uri), URIRef(target_uri)))


def expand_uri(uri_ref: str, prefix_map: Dict[str, str]) -> str:
    """
    Resolve a prefix:local reference against the known prefixes.
    
    The text before the first colon is looked up in prefix_map; on a hit the
    mapped base URI is prepended to the remainder. Anything else (no colon,
    unknown prefix) is returned unchanged.
    
    Parameters:
    uri_ref (str): The URI reference, possibly with a prefix
    prefix_map (dict): Mapping of prefixes to URIs
    
    Returns:
    str: The expanded URI
    """
    head, colon, tail = uri_ref.partition(':')
    
    # colon is empty when uri_ref contains no ':' at all
    if colon and head in prefix_map:
        return prefix_map[head] + tail
    
    return uri_ref

In [5]:
# Example run: fetch a remote TEI transcript and extract RDFa only from its
# <interpGrp> elements. verbose is left at its default (True), so a summary
# and the serialized graph are emitted as well as being returned in g.
g = tei_rdfa('https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml', 'interpGrp')

INFO: Successfully loaded https://raw.githubusercontent.com/auden-in-austria-digital/aad-data/refs/heads/dev/data/xml/editions/aad-transcript__0032.xml.
XML root element {http://www.tei-c.org/ns/1.0}TEI has direct child elements {http://www.tei-c.org/ns/1.0}teiHeader, {http://www.tei-c.org/ns/1.0}facsimile, {http://www.tei-c.org/ns/1.0}text.
INFO: 
Extracted RDF Graph in Turtle format:
INFO: 
Extracted RDF Graph in RDF/XML format:


@prefix crm: <http://www.cidoc-crm.org/cidoc-crm/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix wd: <https://www.wikidata.org/entity/> .

<https://amp.acdh.oeaw.ac.at/amp_person_254.html> a crm:E74_Group ;
    rdfs:label "Gäste" ;
    crm:P107_has_current_or_former_member wd:Q123679692,
        wd:Q62566483 .

wd:Q123679692 a crm:E21_Person ;
    rdfs:label "John Clark" .

wd:Q62566483 a crm:E21_Person ;
    rdfs:label "Thekla Clark" .


<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF
   xmlns:crm="http://www.cidoc-crm.org/cidoc-crm/"
   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
   xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
>
  <rdf:Description rdf:about="https://www.wikidata.org/entity/Q123679692">
    <rdf:type rdf:resource="http://www.cidoc-crm.org/cidoc-crm/E21_Person"/>
    <rdfs:label>John Clark</rdfs:label>
  </rdf:Description>
  <rdf:Description rdf:about="https://amp.acdh.oeaw.ac.at/amp_person_254.html">
    <rdf:type rdf:resource="ht