In [1]:
import re
import yaml

In [2]:
def compilePath(child, parentPath=''):
    """
    Generates the query paths for a given child and all its children

    >>> child = {"id": "work", "type": "crm:E36_Visual_Item", "query": "$subject crm:P128_carries ?value .", "children": [{"id": "work_creation", "query": "$subject crm:P94i_was_created_by ?value .", "children" : [{"id": "work_creator", "query" : "$subject crm:P14_carried_out_by ?value ." }] }] }
    >>> print(compilePath(child, ""))
    $subject crm:P128_carries ?value_work .
    $subject crm:P128_carries/crm:P94i_was_created_by ?value_work_creation .
    $subject crm:P128_carries/crm:P94i_was_created_by/crm:P14_carried_out_by ?value_work_creator .
    """
    query = child['query']
    optional = child['optional'] if 'optional' in child else False
    
    subjectPathPattern = r'(?:\$subject\s)([^\s]*)'
    subjectPath = re.search(subjectPathPattern, query).group(1)
    
    completePath = parentPath + '/' + subjectPath if parentPath else subjectPath

    query = query.replace(subjectPath, completePath)
    
    # Namespace variables by prefixing them with (unique) field id
    query = namespaceVariablesInQuery(query, child['id'])
    
    if optional:
        query = "OPTIONAL { %s }\n" % query
    
    if 'children' in child:
        for c in child['children']:
            query = query + "\n" + compilePath(c, completePath)
    
    return query

In [3]:
def namespaceVariablesInQuery(query, id):
    return re.sub(r'\?([^\s/,:;,\-\\\(\)]*)', r'?\1_' + id, query)

In [77]:
def compileQuery(node, **kwargs):
    """
    Generates a SPARQL query starting from a given node as subject and traversing through all children
    
    Keyword arguments:
    distinct -- use distinct keyword in select (default False)
    limit -- a limit for the query (default None)
    select -- a list of variables to use in the select statement (default None, uses value and label variables from model)

    >>> node = {"id": "artwork", "label": "Artwork", "type": "crm:E22_Human-Made_Object", "children": [{"id": "work", "type": "crm:E36_Visual_Item", "query": "$subject crm:P128_carries ?value .", "children": [{"id": "work_creation", "query": "$subject crm:P94i_was_created_by ?value .", "children" : [{"id": "work_creator", "optional": True, "query" : "$subject crm:P14_carried_out_by ?value ." }] }] }]}
    >>> print(compileQuery(node))
    SELECT $subject ?value_work ?value_work_creation ?value_work_creator {
    $subject a crm:E22_Human-Made_Object .
    $subject crm:P128_carries ?value_work .
    $subject crm:P128_carries/crm:P94i_was_created_by ?value_work_creation .
    OPTIONAL { $subject crm:P128_carries/crm:P94i_was_created_by/crm:P14_carried_out_by ?value_work_creator . }
    }

    >>> print(compileQuery(node, distinct=True, limit=10, select=['?subject','?value_work_creator']))
    SELECT DISTINCT ?subject ?value_work_creator {
    $subject a crm:E22_Human-Made_Object .
    $subject crm:P128_carries ?value_work .
    $subject crm:P128_carries/crm:P94i_was_created_by ?value_work_creation .
    OPTIONAL { $subject crm:P128_carries/crm:P94i_was_created_by/crm:P14_carried_out_by ?value_work_creator . }
    } LIMIT 10
    """
    
    query = "SELECT "
    
    if 'distinct' in kwargs:
        if kwargs['distinct'] == True:
            query += "DISTINCT "
    
    if 'select' in kwargs:
        variables = ' '.join(namespaceSelectsForNode(kwargs['select'], node))
    else:
        variables = "$subject " + ' '.join(getNamespacedValuesAndLabels(node))    
    
    query += variables
    query += " {\n"
    
    query += "$subject a " + node['type'] + " .\n"
    for child in node['children']:
        query += compilePath(child)
        
    query += "}"
    if 'limit' in kwargs:
         query += " LIMIT " + str(kwargs['limit'])
    return query


def getQueryForId(id, node):
    
    def traverseNode(id, node):
        if node['id'] == id:
            query.append(node['query'])
        elif 'children' in node:
            for child in node['children']:
                traverseNode(id, child)

    query = []
    traverseNode(id, node)
    
    return query[0]


def namespaceSelectsForNode(selects, node):
    namespacedSelects = []
    for select in selects:
        query = getQueryForId(select, node)
        namespacedSelects.append("?value_%s" % select)
        if '?label' in query:
            namespacedSelects.append("?label_%s" % select)
            
    return namespacedSelects

print(compileQuery(model[0], select=['work_creator'])) 

SELECT ?value_work_creator {
$subject a crm:E22_Man-Made_Object .
OPTIONAL {  $subject crm:P2_has_type ?value_artwork_genre . ?value_artwork_genre crm:P2_has_type aat:300056462 ; rdfs:label ?label_artwork_genre .  }
 $subject crm:P43_has_dimension ?value_artwork_dimension .  $subject crm:P45_consists_of ?value_artwork_material . OPTIONAL {  $subject crm:P2_has_type ?value_artwork_medium . ?value_artwork_medium crm:P2_has_type aat:300014842 ; rdfs:label ?label_artwork_medium .  }
 $subject crm:P128_carries ?value_work . 
 $subject crm:P128_carries/crm:P94i_was_created_by ?value_work_creation . 
OPTIONAL {  $subject crm:P128_carries/crm:P94i_was_created_by/crm:P14_carried_out_by ?value_work_creator .  }

OPTIONAL {  $subject crm:P128_carries/crm:P94i_was_created_by/crm:P9_consists_of ?subcreation_work_creator_with_role ; crm:P14_carried_out_by ?subcreation_person_work_creator_with_role . ?subcreation_work_creator_with_role crm:P2_has_type ?subcreation_type_work_creator_with_role ; crm:P1

In [40]:
def getNamespacedValuesAndLabels(node):
    
    def getQueries(children):
        for child in children:
            if 'query' in child:
                queries.append(namespaceVariablesInQuery(child['query'], child['id']))
            if 'children' in child:
                getQueries(child['children'])
                
    queries = []
    if 'children' in node:
        getQueries(node['children'])
        
    allQueries = ' '.join(queries)
    matches = re.findall(r'((?:\?value[^\s/,:,\-\\\(\)]*)|(?:\?label[^\s/,:,\-\\\(\)]*))', allQueries)
    return list(set(matches))
        
                
getNamespacedValuesAndLabels(model[0])

['?value_represented_event_place',
 '?value_represented_place_note',
 '?value_represented_group_same_as',
 '?value_work_creator_with_role',
 '?value_artwork_material',
 '?value_artwork_dimension',
 '?value_represented_event_same_as',
 '?value_represented_actor_same_as',
 '?label_artwork_genre',
 '?value_represented_place',
 '?value_represented_period',
 '?value_artwork_medium',
 '?value_artwork_genre',
 '?label_artwork_medium',
 '?value_work',
 '?value_work_creator_group_with_role',
 '?value_represented_event',
 '?value_represented_group',
 '?value_represented_type',
 '?value_work_creator',
 '?value_represented_event_time_span',
 '?value_represented_place_same_as',
 '?value_represented_type_same_as',
 '?value_represented_actor',
 '?value_work_creation']

In [7]:
def parseModelFromFile(inputFile):
    """
    Reads input model from filepath

    >>> model = parseModelFromFile('../models/bso.yml')
    >>> print(type(model))
    <class 'list'>
    """
    with open(inputFile, 'r') as f:
        modelData = yaml.safe_load(f.read())
    return modelData



In [8]:
def verifyModel(model):
    """
    Checks if a given model is valid. Returns "Ok" if yes. Otherwise lists errors

    >>> model = [{"id": "artwork", "query": "$subject a crm:E22_Man-Made_Object .", "children": [{"id": "work", "type": "crm:E36_Visual_Item", "query": "$subject crm:P128_carries ?value .", "children": [{"id": "work_creation", "query": "$subject crm:P94i_was_created_by ?value .", "children" : [{"id": "work_creator", "query" : "$subject crm:P14_carried_out_by ?value ." }] }]] }}]
    >>> verifyModel(model)
    
    """
    
    def verifyModelNode(node):
        id = None
        if not 'id' in node:
            errors.append("No id present in node")

        id = node['id']

        if not id in ids:
            ids.append(id)
        else:
            errors.append("Duplicate id %s" % id)

        if not 'query' in node and not 'type' in node:
            errors.append("No query or type present in node %s" % id)
        elif 'query' in node:
            if not re.search(r'\$subject\s', node['query']):
                errors.append("No $subject found in query of %s" %id)
            if not re.search(r'\?value[\s|\)]', node['query']):
                errors.append("No ?value found in query of %s" %id)

        if 'children' in node:
            for child in node['children']:
                verifyModelNode(child)

    ids = []
    errors = []
    for node in model:
        verifyModelNode(node)
    
    if len(errors):
        return "\n".join(errors)
    else:
        return "Ok"
        
    

In [9]:
inputFile = '../models/bso.yml'
model = parseModelFromFile(inputFile)

In [10]:
print(verifyModel(model))

Ok
