In [1]:
import copy
import json
from lxml import etree
from os.path import join
from SPARQLWrapper import SPARQLWrapper, JSON
from string import Template
from tqdm import tqdm

**Note**: Before running, ingest only the SFF data (**as individual graphs**) and the GND data into the instance, in order to avoid data from other collections (e.g. labels) being exported.

In [2]:
# Destination of the generated XML export.
outputFile = '../static/sff-export.xml'

# Paging of the object query. A LIMIT / OFFSET clause is only appended to
# the query when the corresponding value is truthy.
offset = 0
limit = False

In [3]:
# URL of the SPARQL endpoint (a Blazegraph instance on localhost) that all
# queries in this notebook are sent to.
endpoint = "http://localhost:7776/blazegraph/sparql"

In [4]:
def sparqlResultToDict(results):
    """Flatten a SPARQL JSON result into a list of plain dicts.

    Every entry of ``results["results"]["bindings"]`` becomes one dict that
    maps each variable name present in that entry to the ``"value"`` string
    of its binding. Variables missing from an entry are not added.
    """
    return [
        {variable: binding["value"] for variable, binding in solution.items()}
        for solution in results["results"]["bindings"]
    ]

In [5]:
# Single SPARQLWrapper client shared by every query in this notebook.
sparql = SPARQLWrapper(endpoint)
# Request JSON so that the converted result can be fed to sparqlResultToDict.
sparql.setReturnFormat(JSON)

In [6]:
# Selects every search:Object that has gnd:1196831858 (bound to ?sff) as its
# current or former curator, together with its label and the access point of
# its image.
# The rdfs: prefix is declared explicitly so that the query does not depend on
# the endpoint predefining it.
objectsQuery = """
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX gnd: <https://d-nb.info/gnd/>
PREFIX la: <https://linked.art/ns/terms/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX search: <https://platform.swissartresearch.net/search/>
SELECT ?subject ?label ?image WHERE {
    BIND(gnd:1196831858 as ?sff)
    ?subject a search:Object ;
        crm:P109_has_current_or_former_curator ?sff ;
        rdfs:label ?label ;
        crm:P128_carries/la:digitally_shown_by/la:digitally_available_via/la:access_point ?image
}
"""

In [7]:
# Optional paging (useful for test runs): a clause is appended only when the
# corresponding setting is truthy.
if limit:
    objectsQuery += "LIMIT %d " % limit
if offset:
    # The SPARQL keyword is OFFSET; the previous spelling "OFFST" produced an
    # invalid query as soon as a non-zero offset was configured.
    objectsQuery += "OFFSET %d " % offset

In [8]:
# Run the object query; `objects` is a list of dicts with the keys
# 'subject', 'label' and 'image' (the variables selected by objectsQuery).
sparql.setQuery(objectsQuery)
objects = sparqlResultToDict(sparql.query().convert())

In [9]:
# Prefix declarations prepended to every query built by executeModelQuery.
# rdfs: is declared explicitly because almost every model query uses
# rdfs:label; without the declaration the queries only work on endpoints
# that predefine the prefix.
namespaces = """
PREFIX aat: <http://vocab.getty.edu/aat/>
PREFIX crm: <http://www.cidoc-crm.org/cidoc-crm/>
PREFIX crmdig: <http://www.ics.forth.gr/isl/CRMdig/>
PREFIX gnd: <https://d-nb.info/gnd/>
PREFIX gndo: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX search: <https://platform.swissartresearch.net/search/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
"""

# Field definitions consumed by executeModelQuery. Each entry maps a field
# name to a dict with:
#   "query":  the SPARQL graph pattern that is placed inside the WHERE clause.
#             $subject is bound by executeModelQuery; the *_role entries also
#             use $parent, which the caller passes via additionalBinds.
#   "select": optional list of variable names to project. Entries without it
#             are queried with "SELECT DISTINCT ?value ?label".
# Variables that only occur inside an OPTIONAL block (e.g. ?gnd, ?note) may be
# missing from individual result rows.
model = {
    "artist_entity": {
        "select": ['value', 'label', 'gnd'],
        "query": """GRAPH ?g {
            $subject crm:P128_carries/crm:P94i_was_created_by/crm:P9_consists_of?/crm:P14_carried_out_by ?value .
            ?value rdfs:label ?label .
        }
        OPTIONAL {
            ?value crmdig:L54_is_same-as ?gnd .
            ?gnd a gndo:AuthorityResource .
        }
        """
    },
    # $subject is the artist here; $parent is the object the artist worked on.
    "artist_role": {
        "select": ['value', 'label', 'gnd'],
        "query": """GRAPH ?g {
            $parent crm:P128_carries/crm:P94i_was_created_by/crm:P9_consists_of ?event .
            ?event crm:P14_carried_out_by $subject ;
                crm:P2_has_type ?value .
            ?value rdfs:label ?label .
        }
            OPTIONAL {
                ?value crmdig:L54_is_same-as ?gnd .
                ?gnd a gndo:AuthorityResource .
            }
        """
    },
    "dimensions": {
        "select": ['label', 'value', 'unit'],
        "query": """GRAPH ?g {
            $subject crm:P43_has_dimension ?dimension .
            ?dimension rdfs:label ?label ;
                crm:P90_has_value ?value ;
                crm:P91_has_unit/rdfs:label ?unit.
            }
        FILTER(LANG(?label) = 'de')"""
    },
    "work_creation_dates": {
        "select": ['value', 'label', 'from', 'to'],
        "query": """$subject crm:P128_carries/crm:P94i_was_created_by/crm:P4_has_time-span ?date .
            ?date rdfs:label ?label ;
                crm:P82a_begin_of_the_begin ?from ;
                crm:P82b_end_of_the_end ?to
        """
    },
    "inscriptions": {
        "query": """$subject crm:P128_carries ?value .
        ?value a crm:E34_Inscription ;
            rdfs:label ?label"""
    },
    "identifiers": {
        "query": "$subject crm:P1_is_identified_by/rdfs:label ?value ."
    },
    "material": {
        "query": """$subject crm:P45_consists_of ?value .
        ?value rdfs:label ?label
        """
    },
    "notes": {
        "query": """$subject crm:P3_has_note ?value ."""
    },
    "parts": {
        "select" : ['value', 'label', 'note'],
        "query": """$subject crm:P46_is_composed_of ?value .
        ?value rdfs:label ?label .
        OPTIONAL {
            ?value crm:P3_has_note ?note .
        }"""
    },
    "producer_entity": {
        "select": ['value', 'label', 'gnd'],
        "query": """GRAPH ?g {
            $subject crm:P108i_was_produced_by/crm:P9_consists_of?/crm:P14_carried_out_by ?value .
            ?value rdfs:label ?label
        }
        OPTIONAL {
            ?value crmdig:L54_is_same-as ?gnd .
            ?gnd a gndo:AuthorityResource .
        }
        """
    },
    # $subject is the producer here; $parent is the produced object.
    "producer_role": {
        "select": ['value', 'label', 'gnd'],
        "query": """GRAPH ?g {
            $parent crm:P108i_was_produced_by/crm:P9_consists_of ?event .
            ?event crm:P14_carried_out_by $subject ;
                crm:P2_has_type ?value .
            ?value rdfs:label ?label .
            }
        OPTIONAL {
            ?value crmdig:L54_is_same-as ?gnd .
            ?gnd a gndo:AuthorityResource .
        }
        """
    },
    "provenienceNotes": {
        "query": """$subject crm:P24i_changed_ownership_through/crm:P3_has_note ?value"""
    },
    "remarks" : {
        "query": """$subject crm:P129i_is_subject_of ?value .
        ?value crm:P2_has_type aat:300435415 ;
            rdfs:label ?label"""
    },
    "represented_places": {
        "select": ['value', 'label', 'gnd'],
        "query": """GRAPH ?g {
            $subject crm:P128_carries/crm:P138_represents ?value .
            ?value a crm:E53_Place ;
                rdfs:label ?label .
        }
        OPTIONAL {
            ?value crmdig:L54_is_same-as ?gnd .
            ?gnd a gndo:AuthorityResource .
        }"""
    },
    "represented_types": {
        "select": ['value', 'label', 'gnd'],
        "query": """GRAPH ?g {
            $subject crm:P128_carries/crm:P138_represents ?value .
            ?value a crm:E55_Type ;
                rdfs:label ?label .
        }
        OPTIONAL {
            ?value crmdig:L54_is_same-as ?gnd .
            ?gnd a gndo:AuthorityResource .
        }"""
    },
    "technique": {
        "query": """GRAPH ?g {
            $subject crm:P108i_was_produced_by/crm:P32_used_general_technique ?value .
            ?value rdfs:label ?label .
        }
        """
    },
    "work_title": {
        "query": """$subject crm:P128_carries/crm:P1_is_identified_by ?title .
            ?title crm:P2_has_type aat:300417209 ;
                rdfs:label ?value ."""
    }
}


def executeModelQuery(identifier, subject, *, additionalBinds=(), debug=False):
    """Run the query registered under ``identifier`` in ``model`` for one subject.

    The full query is assembled from the shared ``namespaces`` prefixes, a
    ``SELECT DISTINCT`` head, one ``BIND`` per bound variable and the field's
    graph pattern.

    Args:
        identifier: Key of the field definition in ``model``.
        subject: IRI (without angle brackets) that is bound to ``$subject``.
        additionalBinds: Iterable of ``(variableName, iri)`` pairs; each pair
            is bound as ``$variableName``. Defaults to an empty (immutable)
            tuple rather than a shared mutable list.
        debug: When true, print the assembled query before executing it.

    Returns:
        The result rows as a list of dicts (see ``sparqlResultToDict``).

    Raises:
        KeyError: If ``identifier`` is not a key of ``model``.
    """
    field = model[identifier]
    # Fields without an explicit projection select ?value and ?label.
    selectVars = field['select'] if 'select' in field else ['value', 'label']
    head = "SELECT DISTINCT %s WHERE { \n" % ' '.join("?%s" % d for d in selectVars)
    binds = "BIND(<%s> as $subject) \n" % subject
    for name, iri in additionalBinds:
        binds += "BIND(<%s> as $%s) \n" % (iri, name)
    foot = "}"
    fullQuery = namespaces + head + binds + field['query'] + foot
    sparql.setQuery(fullQuery)
    if debug:
        print(fullQuery)
    return sparqlResultToDict(sparql.query().convert())

def _collectAgents(entityField, roleField, objectIri):
    """Return the agents found via ``entityField`` for an object, each with its roles."""
    agents = []
    for value in executeModelQuery(entityField, objectIri):
        agent = {
            "iri": value['value'],
            "label": value['label'],
            # ?gnd is OPTIONAL in the query and therefore may be absent.
            "gnd": value.get('gnd', ''),
        }
        roles = executeModelQuery(roleField, agent['iri'], additionalBinds=[("parent", objectIri)])
        agent['roles'] = [{"label": d['label'], "gnd": d.get('gnd', '')} for d in roles]
        agents.append(agent)
    return agents


def _labelList(field, objectIri, sourceKey):
    """Query ``field`` and wrap each row's ``sourceKey`` value as ``{"label": ...}``."""
    return [{"label": d[sourceKey]} for d in executeModelQuery(field, objectIri)]


def _labelGndList(field, objectIri):
    """Query ``field`` and return ``{"label", "gnd"}`` dicts; a missing gnd becomes ''."""
    return [{"label": d['label'], "gnd": d.get('gnd', '')}
            for d in executeModelQuery(field, objectIri)]


def createExportObject(objectIri):
    """Collect all exported fields of one object into a dict.

    Args:
        objectIri: IRI of the object, used as ``$subject`` of the model queries.

    Returns:
        A dict with the keys (in this insertion order) ``iri``, ``artists``,
        ``producers``, ``title``, ``dates``, ``technique``, ``material``,
        ``notes``, ``artistInscriptions``, ``remarks``, ``provenience``,
        ``identifiers``, ``keywords``, ``places`` and ``parts``. Apart from
        ``iri`` every value is a list of dicts.
    """
    obj = {"iri": objectIri}

    # Artists and producers share the same structure (entity plus roles).
    obj["artists"] = _collectAgents('artist_entity', 'artist_role', objectIri)
    obj["producers"] = _collectAgents('producer_entity', 'producer_role', objectIri)

    obj["title"] = _labelList('work_title', objectIri, 'value')
    obj["dates"] = executeModelQuery('work_creation_dates', objectIri)
    obj["technique"] = _labelList('technique', objectIri, 'label')
    obj["material"] = _labelList('material', objectIri, 'label')
    obj["notes"] = _labelList('notes', objectIri, 'value')
    obj["artistInscriptions"] = _labelList('inscriptions', objectIri, 'label')
    # Other inscriptions / remarks
    obj["remarks"] = _labelList('remarks', objectIri, 'label')
    obj["provenience"] = _labelList('provenienceNotes', objectIri, 'value')
    obj["identifiers"] = _labelList('identifiers', objectIri, 'value')

    obj["keywords"] = _labelGndList('represented_types', objectIri)
    obj["places"] = _labelGndList('represented_places', objectIri)

    # Parts and their dimensions
    obj["parts"] = []
    for value in executeModelQuery('parts', objectIri):
        dimensions = executeModelQuery('dimensions', value['value'])
        obj['parts'].append({
            "label": value['label'],
            # ?note is OPTIONAL in the 'parts' query; indexing it directly
            # raised KeyError for parts without a note.
            "note": value.get('note', ''),
            "dimensions": dimensions
        })

    return obj
   

In [10]:
# Build one export record per object returned by the object query;
# tqdm renders a progress bar while the per-object queries run.
export = [createExportObject(obj['subject']) for obj in tqdm(objects)]

100%|██████████| 677/677 [02:41<00:00,  4.20it/s]


In [11]:
# Work on a deep copy so that the uncompressed `export` stays untouched.
compressedExport = copy.deepcopy(export)
for record in compressedExport:
    for field, content in record.items():
        # Empty fields are set to None.
        if content == []:
            record[field] = None
            continue
        if len(content) != 1:
            continue
        # A field that holds a single object whose only key is "label"
        # (with a non-empty value) is collapsed to the bare label string.
        onlyEntry = content[0]
        if len(onlyEntry.keys()) == 1 and onlyEntry['label']:
            record[field] = onlyEntry['label']

In [12]:
# with open(outputFile, 'w', encoding='utf8') as f:
#     json.dump(compressedExport, f, indent=4, ensure_ascii=False)

In [13]:
# Serialise the compressed export to XML and parse it into a DOM so that it
# can be pretty-printed when written to disk.
from dicttoxml import dicttoxml
from xml.dom.minidom import parseString
# attr_type=False: presumably suppresses the type="..." attributes that
# dicttoxml otherwise adds to elements - confirm against the dicttoxml docs.
xml = dicttoxml(compressedExport, attr_type=False)
dom = parseString(xml)

In [14]:
# Write the pretty-printed XML. The encoding is fixed to UTF-8 because
# toprettyxml() without an encoding argument emits an XML declaration that
# names no encoding (readers then assume UTF-8), and the labels contain
# non-ASCII characters; the platform default encoding could otherwise corrupt
# the file or raise UnicodeEncodeError.
with open(outputFile, 'w', encoding='utf-8') as f:
    f.write(dom.toprettyxml())