From 02301ca030e570398e14bde1af3f02e19666bad5 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 25 Jul 2014 11:42:54 +0200 Subject: [PATCH 01/66] Example files from the prov-xml documentation. --- prov/tests/xml/example_01.xml | 13 +++++++++ prov/tests/xml/example_02.xml | 12 ++++++++ prov/tests/xml/example_03.xml | 14 +++++++++ prov/tests/xml/example_04.xml | 12 ++++++++ prov/tests/xml/example_05.xml | 14 +++++++++ prov/tests/xml/example_06.xml | 13 +++++++++ prov/tests/xml/example_07.xml | 14 +++++++++ prov/tests/xml/example_08.xml | 25 +++++++++++++++++ prov/tests/xml/example_09.xml | 25 +++++++++++++++++ prov/tests/xml/example_10.xml | 20 +++++++++++++ prov/tests/xml/example_11.xml | 48 +++++++++++++++++++++++++++++++ prov/tests/xml/example_12.xml | 20 +++++++++++++ prov/tests/xml/example_13.xml | 24 ++++++++++++++++ prov/tests/xml/example_14.xml | 22 +++++++++++++++ prov/tests/xml/example_15.xml | 21 ++++++++++++++ prov/tests/xml/example_16.xml | 32 +++++++++++++++++++++ prov/tests/xml/example_17.xml | 20 +++++++++++++ prov/tests/xml/example_18.xml | 13 +++++++++ prov/tests/xml/example_19.xml | 9 ++++++ prov/tests/xml/example_20.xml | 9 ++++++ prov/tests/xml/example_21.xml | 9 ++++++ prov/tests/xml/example_22.xml | 33 ++++++++++++++++++++++ prov/tests/xml/example_23.xml | 39 ++++++++++++++++++++++++++ prov/tests/xml/example_24.xml | 39 ++++++++++++++++++++++++++ prov/tests/xml/example_25.xml | 53 +++++++++++++++++++++++++++++++++++ prov/tests/xml/example_26.xml | 15 ++++++++++ prov/tests/xml/example_27.xml | 33 ++++++++++++++++++++++ prov/tests/xml/example_28.xml | 33 ++++++++++++++++++++++ prov/tests/xml/example_29.xml | 15 ++++++++++ prov/tests/xml/example_30.xml | 21 ++++++++++++++ prov/tests/xml/example_31.xml | 9 ++++++ prov/tests/xml/example_32.xml | 10 +++++++ prov/tests/xml/example_33.xml | 20 +++++++++++++ prov/tests/xml/example_34.xml | 13 +++++++++ prov/tests/xml/example_35.xml | 16 +++++++++++ prov/tests/xml/example_36.xml | 14 +++++++++ 
prov/tests/xml/example_37.xml | 17 +++++++++++ prov/tests/xml/example_38.xml | 22 +++++++++++++++ prov/tests/xml/example_39.xml | 26 +++++++++++++++++ prov/tests/xml/example_40.xml | 15 ++++++++++ prov/tests/xml/example_41.xml | 9 ++++++ prov/tests/xml/example_42.xml | 17 +++++++++++ 42 files changed, 858 insertions(+) create mode 100644 prov/tests/xml/example_01.xml create mode 100644 prov/tests/xml/example_02.xml create mode 100644 prov/tests/xml/example_03.xml create mode 100644 prov/tests/xml/example_04.xml create mode 100644 prov/tests/xml/example_05.xml create mode 100644 prov/tests/xml/example_06.xml create mode 100644 prov/tests/xml/example_07.xml create mode 100644 prov/tests/xml/example_08.xml create mode 100644 prov/tests/xml/example_09.xml create mode 100644 prov/tests/xml/example_10.xml create mode 100644 prov/tests/xml/example_11.xml create mode 100644 prov/tests/xml/example_12.xml create mode 100644 prov/tests/xml/example_13.xml create mode 100644 prov/tests/xml/example_14.xml create mode 100644 prov/tests/xml/example_15.xml create mode 100644 prov/tests/xml/example_16.xml create mode 100644 prov/tests/xml/example_17.xml create mode 100644 prov/tests/xml/example_18.xml create mode 100644 prov/tests/xml/example_19.xml create mode 100644 prov/tests/xml/example_20.xml create mode 100644 prov/tests/xml/example_21.xml create mode 100644 prov/tests/xml/example_22.xml create mode 100644 prov/tests/xml/example_23.xml create mode 100644 prov/tests/xml/example_24.xml create mode 100644 prov/tests/xml/example_25.xml create mode 100644 prov/tests/xml/example_26.xml create mode 100644 prov/tests/xml/example_27.xml create mode 100644 prov/tests/xml/example_28.xml create mode 100644 prov/tests/xml/example_29.xml create mode 100644 prov/tests/xml/example_30.xml create mode 100644 prov/tests/xml/example_31.xml create mode 100644 prov/tests/xml/example_32.xml create mode 100644 prov/tests/xml/example_33.xml create mode 100644 prov/tests/xml/example_34.xml create mode 
100644 prov/tests/xml/example_35.xml create mode 100644 prov/tests/xml/example_36.xml create mode 100644 prov/tests/xml/example_37.xml create mode 100644 prov/tests/xml/example_38.xml create mode 100644 prov/tests/xml/example_39.xml create mode 100644 prov/tests/xml/example_40.xml create mode 100644 prov/tests/xml/example_41.xml create mode 100644 prov/tests/xml/example_42.xml diff --git a/prov/tests/xml/example_01.xml b/prov/tests/xml/example_01.xml new file mode 100644 index 00000000..1a26c957 --- /dev/null +++ b/prov/tests/xml/example_01.xml @@ -0,0 +1,13 @@ + + + + prov:Plan + ex:Workflow + + + diff --git a/prov/tests/xml/example_02.xml b/prov/tests/xml/example_02.xml new file mode 100644 index 00000000..33efb1ff --- /dev/null +++ b/prov/tests/xml/example_02.xml @@ -0,0 +1,12 @@ + + + + ex:Workflow + + + diff --git a/prov/tests/xml/example_03.xml b/prov/tests/xml/example_03.xml new file mode 100644 index 00000000..9a646b38 --- /dev/null +++ b/prov/tests/xml/example_03.xml @@ -0,0 +1,14 @@ + + + + ex:Workflow + prov:Plan + prov:Entity + + + diff --git a/prov/tests/xml/example_04.xml b/prov/tests/xml/example_04.xml new file mode 100644 index 00000000..c5b93bad --- /dev/null +++ b/prov/tests/xml/example_04.xml @@ -0,0 +1,12 @@ + + + + ex:Workflow + + + diff --git a/prov/tests/xml/example_05.xml b/prov/tests/xml/example_05.xml new file mode 100644 index 00000000..85930361 --- /dev/null +++ b/prov/tests/xml/example_05.xml @@ -0,0 +1,14 @@ + + + + ex:Workflow + prov:Plan + prov:Entity + + + diff --git a/prov/tests/xml/example_06.xml b/prov/tests/xml/example_06.xml new file mode 100644 index 00000000..5fd2891b --- /dev/null +++ b/prov/tests/xml/example_06.xml @@ -0,0 +1,13 @@ + + + + document + 2 + + + diff --git a/prov/tests/xml/example_07.xml b/prov/tests/xml/example_07.xml new file mode 100644 index 00000000..8e088a2c --- /dev/null +++ b/prov/tests/xml/example_07.xml @@ -0,0 +1,14 @@ + + + + 2011-11-16T16:05:00 + 2011-11-16T16:06:00 + ex:edit + server.example.org + 
+ + diff --git a/prov/tests/xml/example_08.xml b/prov/tests/xml/example_08.xml new file mode 100644 index 00000000..d4947c28 --- /dev/null +++ b/prov/tests/xml/example_08.xml @@ -0,0 +1,25 @@ + + + + + + + + + + 2001-10-26T21:32:52 + p1 + + + + + + + + 2001-10-26T10:00:00 + p2 + + + diff --git a/prov/tests/xml/example_09.xml b/prov/tests/xml/example_09.xml new file mode 100644 index 00000000..4da86a02 --- /dev/null +++ b/prov/tests/xml/example_09.xml @@ -0,0 +1,25 @@ + + + + + + + + + + + + + p1 + + + + + + 2011-11-16T16:00:01 + p2 + + + diff --git a/prov/tests/xml/example_10.xml b/prov/tests/xml/example_10.xml new file mode 100644 index 00000000..79ec912d --- /dev/null +++ b/prov/tests/xml/example_10.xml @@ -0,0 +1,20 @@ + + + + traffic regulations enforcing + + + + fine paying, check writing, and mailing + + + + + + + + diff --git a/prov/tests/xml/example_11.xml b/prov/tests/xml/example_11.xml new file mode 100644 index 00000000..d3dba57c --- /dev/null +++ b/prov/tests/xml/example_11.xml @@ -0,0 +1,48 @@ + + + + email message + + + + Discuss + + + + + + 2011-11-16T16:05:00 + + + + + + + + + Write + + + + + + + + + + + + 2011-11-16T16:05:00 + + + + + + 2011-11-16T16:05:00 + + + diff --git a/prov/tests/xml/example_12.xml b/prov/tests/xml/example_12.xml new file mode 100644 index 00000000..18abb394 --- /dev/null +++ b/prov/tests/xml/example_12.xml @@ -0,0 +1,20 @@ + + + + approval document + + + + Editing + + + + + + + + diff --git a/prov/tests/xml/example_13.xml b/prov/tests/xml/example_13.xml new file mode 100644 index 00000000..72816093 --- /dev/null +++ b/prov/tests/xml/example_13.xml @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + 1998-09-03T01:31:00 + plane accident + + + diff --git a/prov/tests/xml/example_14.xml b/prov/tests/xml/example_14.xml new file mode 100644 index 00000000..a8e76dcc --- /dev/null +++ b/prov/tests/xml/example_14.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + physical transform + + + diff --git a/prov/tests/xml/example_15.xml 
b/prov/tests/xml/example_15.xml new file mode 100644 index 00000000..2beabc33 --- /dev/null +++ b/prov/tests/xml/example_15.xml @@ -0,0 +1,21 @@ + + + + rec54:WD + + + + rec54:WD + + + + + + + + diff --git a/prov/tests/xml/example_16.xml b/prov/tests/xml/example_16.xml new file mode 100644 index 00000000..80d545ea --- /dev/null +++ b/prov/tests/xml/example_16.xml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/prov/tests/xml/example_17.xml b/prov/tests/xml/example_17.xml new file mode 100644 index 00000000..f0cd8577 --- /dev/null +++ b/prov/tests/xml/example_17.xml @@ -0,0 +1,20 @@ + + + + map + + + + journal + + + + + + + + diff --git a/prov/tests/xml/example_18.xml b/prov/tests/xml/example_18.xml new file mode 100644 index 00000000..bfb7763c --- /dev/null +++ b/prov/tests/xml/example_18.xml @@ -0,0 +1,13 @@ + + + + prov:Person + Alice + 1234 + + + diff --git a/prov/tests/xml/example_19.xml b/prov/tests/xml/example_19.xml new file mode 100644 index 00000000..6a7b35a7 --- /dev/null +++ b/prov/tests/xml/example_19.xml @@ -0,0 +1,9 @@ + + + + + diff --git a/prov/tests/xml/example_20.xml b/prov/tests/xml/example_20.xml new file mode 100644 index 00000000..afcf0728 --- /dev/null +++ b/prov/tests/xml/example_20.xml @@ -0,0 +1,9 @@ + + + + + diff --git a/prov/tests/xml/example_21.xml b/prov/tests/xml/example_21.xml new file mode 100644 index 00000000..1839c46a --- /dev/null +++ b/prov/tests/xml/example_21.xml @@ -0,0 +1,9 @@ + + + + + diff --git a/prov/tests/xml/example_22.xml b/prov/tests/xml/example_22.xml new file mode 100644 index 00000000..b292a76c --- /dev/null +++ b/prov/tests/xml/example_22.xml @@ -0,0 +1,33 @@ + + + + prov:Person + + + + prov:Person + + + + rec54:WD + + + + + + editorship + + + + + + authorship + + + diff --git a/prov/tests/xml/example_23.xml b/prov/tests/xml/example_23.xml new file mode 100644 index 00000000..ca559a16 --- /dev/null +++ b/prov/tests/xml/example_23.xml @@ -0,0 +1,39 @@ + + + + workflow execution 
+ + + + operator + + + + designator + + + + + + loggedInUser + webapp + + + + + + + designer + project1 + + + + Workflow 1 + http://example.org/workflow1.bpel + + + diff --git a/prov/tests/xml/example_24.xml b/prov/tests/xml/example_24.xml new file mode 100644 index 00000000..ca559a16 --- /dev/null +++ b/prov/tests/xml/example_24.xml @@ -0,0 +1,39 @@ + + + + workflow execution + + + + operator + + + + designator + + + + + + loggedInUser + webapp + + + + + + + designer + project1 + + + + Workflow 1 + http://example.org/workflow1.bpel + + + diff --git a/prov/tests/xml/example_25.xml b/prov/tests/xml/example_25.xml new file mode 100644 index 00000000..3e24054b --- /dev/null +++ b/prov/tests/xml/example_25.xml @@ -0,0 +1,53 @@ + + + + workflow + + + + programmer + + + + researcher + + + + funder + + + + + + loggedInUser + + + + + + + + + + + + + + + + + line-management + + + + + + + contract + + + diff --git a/prov/tests/xml/example_26.xml b/prov/tests/xml/example_26.xml new file mode 100644 index 00000000..8bf744a9 --- /dev/null +++ b/prov/tests/xml/example_26.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + diff --git a/prov/tests/xml/example_27.xml b/prov/tests/xml/example_27.xml new file mode 100644 index 00000000..289696a7 --- /dev/null +++ b/prov/tests/xml/example_27.xml @@ -0,0 +1,33 @@ + + + + 1 + + + + + + + report + 2 + + + + + 2012-05-25T11:00:01 + + + + + + + + + diff --git a/prov/tests/xml/example_28.xml b/prov/tests/xml/example_28.xml new file mode 100644 index 00000000..289696a7 --- /dev/null +++ b/prov/tests/xml/example_28.xml @@ -0,0 +1,33 @@ + + + + 1 + + + + + + + report + 2 + + + + + 2012-05-25T11:00:01 + + + + + + + + + diff --git a/prov/tests/xml/example_29.xml b/prov/tests/xml/example_29.xml new file mode 100644 index 00000000..ba9b189d --- /dev/null +++ b/prov/tests/xml/example_29.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + diff --git a/prov/tests/xml/example_30.xml b/prov/tests/xml/example_30.xml new file mode 100644 index 00000000..4997ee18 --- 
/dev/null +++ b/prov/tests/xml/example_30.xml @@ -0,0 +1,21 @@ + + + + a news item for desktop + + + + a news item for mobile devices + + + + + + + + diff --git a/prov/tests/xml/example_31.xml b/prov/tests/xml/example_31.xml new file mode 100644 index 00000000..80753bbf --- /dev/null +++ b/prov/tests/xml/example_31.xml @@ -0,0 +1,9 @@ + + + + + diff --git a/prov/tests/xml/example_32.xml b/prov/tests/xml/example_32.xml new file mode 100644 index 00000000..9418897f --- /dev/null +++ b/prov/tests/xml/example_32.xml @@ -0,0 +1,10 @@ + + + + + diff --git a/prov/tests/xml/example_33.xml b/prov/tests/xml/example_33.xml new file mode 100644 index 00000000..068d4070 --- /dev/null +++ b/prov/tests/xml/example_33.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + diff --git a/prov/tests/xml/example_34.xml b/prov/tests/xml/example_34.xml new file mode 100644 index 00000000..9fd22624 --- /dev/null +++ b/prov/tests/xml/example_34.xml @@ -0,0 +1,13 @@ + + + + document + 2 + + + diff --git a/prov/tests/xml/example_35.xml b/prov/tests/xml/example_35.xml new file mode 100644 index 00000000..8b60fcbe --- /dev/null +++ b/prov/tests/xml/example_35.xml @@ -0,0 +1,16 @@ + + + + + + + + + + 2001-10-26T21:32:52 + p1 + + + diff --git a/prov/tests/xml/example_36.xml b/prov/tests/xml/example_36.xml new file mode 100644 index 00000000..68867fe5 --- /dev/null +++ b/prov/tests/xml/example_36.xml @@ -0,0 +1,14 @@ + + + + This is a human-readable label + + + + Voiture 01 + Car 01 + + + diff --git a/prov/tests/xml/example_37.xml b/prov/tests/xml/example_37.xml new file mode 100644 index 00000000..872a64f7 --- /dev/null +++ b/prov/tests/xml/example_37.xml @@ -0,0 +1,17 @@ + + + + StillImage + Le Louvre, Paris + + + + (5,5) + 10 + + + diff --git a/prov/tests/xml/example_38.xml b/prov/tests/xml/example_38.xml new file mode 100644 index 00000000..106dd73a --- /dev/null +++ b/prov/tests/xml/example_38.xml @@ -0,0 +1,22 @@ + + + + + + loggedInUser + webapp + + + + + + + designer + project1 + + + diff 
--git a/prov/tests/xml/example_39.xml b/prov/tests/xml/example_39.xml new file mode 100644 index 00000000..86f2a5a6 --- /dev/null +++ b/prov/tests/xml/example_39.xml @@ -0,0 +1,26 @@ + + + + document + 2 + + + + prov:Person + Alice + 1234 + + + + 2011-11-16T16:05:00 + 2011-11-16T16:06:00 + ex:edit + server.example.org + + + diff --git a/prov/tests/xml/example_40.xml b/prov/tests/xml/example_40.xml new file mode 100644 index 00000000..ba6af8d5 --- /dev/null +++ b/prov/tests/xml/example_40.xml @@ -0,0 +1,15 @@ + + + + abcd + + + + 4 + + + diff --git a/prov/tests/xml/example_41.xml b/prov/tests/xml/example_41.xml new file mode 100644 index 00000000..7bd661bb --- /dev/null +++ b/prov/tests/xml/example_41.xml @@ -0,0 +1,9 @@ + + + + + diff --git a/prov/tests/xml/example_42.xml b/prov/tests/xml/example_42.xml new file mode 100644 index 00000000..54ad9c9d --- /dev/null +++ b/prov/tests/xml/example_42.xml @@ -0,0 +1,17 @@ + + + + + + + bar + + + + + + From b539a0faef6d8bd967494a16336e4cd86b47245a Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 25 Jul 2014 11:50:00 +0200 Subject: [PATCH 02/66] Dummy structure for PROV-XML. 
--- .travis.yml | 2 +- prov/serializers/__init__.py | 4 +++- prov/serializers/provxml.py | 22 ++++++++++++++++++++++ setup.py | 1 + 4 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 prov/serializers/provxml.py diff --git a/.travis.yml b/.travis.yml index b8fc08b4..ad349cca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,7 @@ before_install: # Install packages install: - pip install -r requirements.txt - - pip install coverage coveralls pydot + - pip install coverage coveralls pydot lxml # Run test script: diff --git a/prov/serializers/__init__.py b/prov/serializers/__init__.py index 20b2c4f4..c4baa918 100644 --- a/prov/serializers/__init__.py +++ b/prov/serializers/__init__.py @@ -20,9 +20,11 @@ class Registry: @staticmethod def load_serializers(): from prov.serializers.provjson import ProvJSONSerializer + from prov.serializers.provxml import ProvXMLSerializer Registry.serializers = { - 'json': ProvJSONSerializer + 'json': ProvJSONSerializer, + 'xml': ProvXMLSerializer } diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py new file mode 100644 index 00000000..e4478979 --- /dev/null +++ b/prov/serializers/provxml.py @@ -0,0 +1,22 @@ +"""PROV-XML serializers for ProvDocument + +""" +__author__ = 'Lion Krischer' +__email__ = 'krischer@geophysik.uni-muenchen.de' + +import logging +logger = logging.getLogger(__name__) + +from prov import Serializer, Error + + +class ProvXMLException(Error): + pass + + +class ProvXMLSerializer(Serializer): + def serialize(self, stream, **kwargs): + pass + + def deserialize(self, stream, **kwargs): + pass diff --git a/setup.py b/setup.py index 919939ec..e23c7a4a 100644 --- a/setup.py +++ b/setup.py @@ -12,6 +12,7 @@ requirements = [ 'python-dateutil', + 'lxml' ] test_requirements = [ From d8f514257b555b0b91d6a4a2e755ae6d5a2d3904 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 13:10:32 +0200 Subject: [PATCH 03/66] Very basic test and partially working serialization. 
--- prov/model.py | 2 +- prov/serializers/provxml.py | 78 ++++++++++++++++++++++++++++++++++--- prov/tests/test_xml.py | 55 ++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 6 deletions(-) create mode 100644 prov/tests/test_xml.py diff --git a/prov/model.py b/prov/model.py index 7c8ed94b..78665521 100644 --- a/prov/model.py +++ b/prov/model.py @@ -83,7 +83,7 @@ def encoding_provn_value(value): elif isinstance(value, float): return u'"%g" %%%% xsd:float' % value elif isinstance(value, bool): - return u'"%i" %%%% xsd:boolean' % value + return u'"%i" %%%% xsd:boolean' % value else: # TODO: QName export return unicode(value) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index e4478979..817e63a6 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -5,18 +5,86 @@ __email__ = 'krischer@geophysik.uni-muenchen.de' import logging +from lxml import etree + logger = logging.getLogger(__name__) -from prov import Serializer, Error +import prov +import prov.constants + +NS_PROV = "http://www.w3.org/ns/prov#" +NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" -class ProvXMLException(Error): +class ProvXMLException(prov.Error): pass -class ProvXMLSerializer(Serializer): +class ProvXMLSerializer(prov.Serializer): def serialize(self, stream, **kwargs): - pass + # Build the namespace map for lxml and attach it to the root XML + # element. + nsmap = {ns.prefix: ns.uri for ns in + self.document._namespaces.get_registered_namespaces()} + if self.document._namespaces._default: + nsmap[None] = self.document._namespaces._default.uri + nsmap["prov"] = NS_PROV + + xml_root = etree.Element(_ns_prov("document"), nsmap=nsmap) + + # Filter functions used to sort the attributes according to the + # PROV-XML specification. 
+ filter_fcts = [ + lambda x: x[0] in prov.constants.PROV_ATTRIBUTE_QNAMES, + lambda x: x[0] in prov.constants.PROV_ATTRIBUTE_LITERALS, + lambda x: x[0] == prov.constants.PROV_LABEL, + lambda x: x[0] == prov.constants.PROV_LOCATION, + lambda x: x[0] == prov.constants.PROV_ROLE, + lambda x: x[0] == prov.constants.PROV_TYPE, + lambda x: x[0] == prov.constants.PROV_VALUE, + lambda x: True + ] + + for record in self.document._records: + rec_type = record.get_type() + rec_label = prov.constants.PROV_N_MAP[rec_type] + identifier = unicode(record._identifier) + + elem = etree.SubElement( + xml_root, _ns_prov(rec_label), + {_ns_prov("id"): identifier}) + + used_attributes = [] + for fct in filter_fcts: + _fct = lambda x: x not in used_attributes and fct(x) \ + and x[1] is not None + for attr, value in filter(_fct, record.attributes): + used_attributes.append((attr, value)) + subelem = etree.SubElement( + elem, _ns(attr.namespace.uri, attr.localpart)) + if isinstance(value, prov.model.Literal): + subelem.attrib[_ns_xsi("type")] = "%s:%s" % ( + value.datatype.namespace.prefix, + value.datatype.localpart) + value = value.value + else: + value = str(value) + subelem.text = value + + print "" + print etree.tostring(xml_root, pretty_print=True) + def deserialize(self, stream, **kwargs): - pass + raise NotImplementedError + + +def _ns(ns, tag): + return "{%s}%s" % (ns, tag) + + +def _ns_prov(tag): + return _ns(NS_PROV, tag) + +def _ns_xsi(tag): + return _ns(NS_XSI, tag) diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py new file mode 100644 index 00000000..ffd81419 --- /dev/null +++ b/prov/tests/test_xml.py @@ -0,0 +1,55 @@ +import inspect +import os +import unittest + +import prov.model as prov + + +EX_NS = ('ex', 'http://example.org/ns/ex#') +EX_TR = ('tr', 'http://example.org/ns/tr#') +EX_XSI = ('xsi', 'http://www.w3.org/2001/XMLSchema-instance') + +# Most general way to get the path. 
+DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(inspect.getfile( + inspect.currentframe()))), "xml") + + +class ProvXMLSerializationTestCase(unittest.TestCase): + def test_serialization_example_6(self): + """ + Test the serialization of example 6 which is a simple entity + description. + """ + document = prov.ProvDocument() + document.add_namespace(*EX_NS) + document.add_namespace(*EX_TR) + document.add_namespace(*EX_XSI) + + document.entity("tr:WD-prov-dm-20111215", ( + (prov.PROV_TYPE, prov.Literal("document", prov.XSD_QNAME)), + ("ex:version", "2") + )) + + document.serialize(format='xml') + + def test_serialization_example_7(self): + """ + Test the serialization of example 7 which is a basic activity. + """ + document = prov.ProvDocument() + document.add_namespace(*EX_NS) + document.add_namespace(*EX_XSI) + + document.activity( + "ex:a1", + "2011-11-16T16:05:00", + "2011-11-16T16:06:00", [ + (prov.PROV_TYPE, prov.Literal("ex:edit", prov.XSD_QNAME)), + ("ex:host", "server.example.org")]) + + document.serialize(format='xml') + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file From e76e3b30a3fbb706d7142f3c70f0d0a97a080253 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 15:02:39 +0200 Subject: [PATCH 04/66] Serialization tests including XML comparision. 
--- prov/serializers/provxml.py | 11 ++++++-- prov/tests/test_xml.py | 56 +++++++++++++++++++++++++++++++++---- 2 files changed, 59 insertions(+), 8 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 817e63a6..6e9eebc5 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -4,6 +4,7 @@ __author__ = 'Lion Krischer' __email__ = 'krischer@geophysik.uni-muenchen.de' +import datetime import logging from lxml import etree @@ -14,6 +15,7 @@ NS_PROV = "http://www.w3.org/ns/prov#" NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" +NS_XSD = "http://www.w3.org/2001/XMLSchema" class ProvXMLException(prov.Error): @@ -28,7 +30,10 @@ def serialize(self, stream, **kwargs): self.document._namespaces.get_registered_namespaces()} if self.document._namespaces._default: nsmap[None] = self.document._namespaces._default.uri + # Add the prov, XSI, and XSD namespaces by default. nsmap["prov"] = NS_PROV + nsmap["xsi"] = NS_XSI + nsmap["xsd"] = NS_XSD xml_root = etree.Element(_ns_prov("document"), nsmap=nsmap) @@ -67,12 +72,14 @@ def serialize(self, stream, **kwargs): value.datatype.namespace.prefix, value.datatype.localpart) value = value.value + if isinstance(value, datetime.datetime): + value = value.isoformat() else: value = str(value) subelem.text = value - print "" - print etree.tostring(xml_root, pretty_print=True) + et = etree.ElementTree(xml_root) + et.write(stream, pretty_print=True) def deserialize(self, stream, **kwargs): diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index ffd81419..f69c98e3 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -1,19 +1,58 @@ +import difflib import inspect +import io +from lxml import etree import os import unittest import prov.model as prov -EX_NS = ('ex', 'http://example.org/ns/ex#') -EX_TR = ('tr', 'http://example.org/ns/tr#') -EX_XSI = ('xsi', 'http://www.w3.org/2001/XMLSchema-instance') +EX_NS = ('ex', 'http://example.com/ns/ex#') +EX_TR = ('tr', 
'http://example.com/ns/tr#') # Most general way to get the path. DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(inspect.getfile( inspect.currentframe()))), "xml") +def compare_xml(doc1, doc2): + """ + Helper function to compare two XML files. It will parse both once again + and write them in a canonical fashion. + """ + try: + doc1.seek(0, 0) + except AttributeError: + pass + try: + doc2.seek(0, 0) + except AttributeError: + pass + + obj1 = etree.parse(doc1) + obj2 = etree.parse(doc2) + + buf = io.BytesIO() + obj1.write_c14n(buf) + buf.seek(0, 0) + str1 = buf.read() + str1 = [_i.strip() for _i in str1.splitlines() if _i.strip()] + + buf = io.BytesIO() + obj2.write_c14n(buf) + buf.seek(0, 0) + str2 = buf.read() + str2 = [_i.strip() for _i in str2.splitlines() if _i.strip()] + + unified_diff = difflib.unified_diff(str1, str2) + + err_msg = "\n".join(unified_diff) + if err_msg: + msg = "Strings are not equal.\n" + raise AssertionError(msg + err_msg) + + class ProvXMLSerializationTestCase(unittest.TestCase): def test_serialization_example_6(self): """ @@ -23,14 +62,16 @@ def test_serialization_example_6(self): document = prov.ProvDocument() document.add_namespace(*EX_NS) document.add_namespace(*EX_TR) - document.add_namespace(*EX_XSI) document.entity("tr:WD-prov-dm-20111215", ( (prov.PROV_TYPE, prov.Literal("document", prov.XSD_QNAME)), ("ex:version", "2") )) - document.serialize(format='xml') + with io.BytesIO() as actual: + document.serialize(format='xml', destination=actual) + compare_xml(os.path.join(DATA_PATH, "example_06.xml"), actual) + def test_serialization_example_7(self): """ @@ -38,7 +79,6 @@ def test_serialization_example_7(self): """ document = prov.ProvDocument() document.add_namespace(*EX_NS) - document.add_namespace(*EX_XSI) document.activity( "ex:a1", @@ -49,6 +89,10 @@ def test_serialization_example_7(self): document.serialize(format='xml') + with io.BytesIO() as actual: + document.serialize(format='xml', destination=actual) + 
compare_xml(os.path.join(DATA_PATH, "example_07.xml"), actual) + if __name__ == '__main__': From 6743f6b820e6a9c628ff1bc708f2bf376333ea84 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 16:56:59 +0200 Subject: [PATCH 05/66] Basic deserialization works. --- prov/serializers/provxml.py | 46 +++++++++++++++++++++++++++++++++++-- prov/tests/test_xml.py | 43 +++++++++++++++++++++++++++++++--- 2 files changed, 84 insertions(+), 5 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 6e9eebc5..001286c8 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -81,9 +81,51 @@ def serialize(self, stream, **kwargs): et = etree.ElementTree(xml_root) et.write(stream, pretty_print=True) - def deserialize(self, stream, **kwargs): - raise NotImplementedError + xml_doc = etree.parse(stream).getroot() + + document = prov.model.ProvDocument() + for key, value in xml_doc.nsmap.items(): + document.add_namespace(key, value) + + r_nsmap = {value: key for key, value in xml_doc.nsmap.items()} + + for element in xml_doc: + qname = etree.QName(element) + if qname.namespace == NS_PROV: + rec_type = prov.constants.PROV_RECORD_IDS_MAP[qname.localname] + rec_id = element.attrib[_ns_prov("id")] + attributes = [] + other_attributes = [] + for subel in element: + sqname = etree.QName(subel) + if sqname.namespace == NS_PROV: + _t = prov.constants.PROV[sqname.localname] + d = attributes + else: + _t = "%s:%s" % (r_nsmap[sqname.namespace], + sqname.localname) + d = other_attributes + + if len(subel.attrib) > 1: + raise NotImplementedError + elif len(subel.attrib) == 1: + key, value = subel.attrib.items()[0] + if key != "{%s}%s" % (NS_XSI, "type"): + raise NotImplementedError + _v = prov.model.Literal( + subel.text, + prov.constants.XSD[value.split(":")[1]]) + else: + _v = subel.text + d.append((_t, _v)) + document.add_record(rec_type, rec_id, attributes, + other_attributes) + + else: + raise NotImplementedError + + return 
document def _ns(ns, tag): diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index f69c98e3..5ec9d460 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -72,7 +72,6 @@ def test_serialization_example_6(self): document.serialize(format='xml', destination=actual) compare_xml(os.path.join(DATA_PATH, "example_06.xml"), actual) - def test_serialization_example_7(self): """ Test the serialization of example 7 which is a basic activity. @@ -87,12 +86,50 @@ def test_serialization_example_7(self): (prov.PROV_TYPE, prov.Literal("ex:edit", prov.XSD_QNAME)), ("ex:host", "server.example.org")]) - document.serialize(format='xml') - with io.BytesIO() as actual: document.serialize(format='xml', destination=actual) compare_xml(os.path.join(DATA_PATH, "example_07.xml"), actual) + def test_deserialization_example_6(self): + """ + Test the deserialization of example 6 which is a simple entity + description. + """ + actual_doc = prov.ProvDocument.deserialize( + source=os.path.join(DATA_PATH, "example_06.xml"), + format="xml") + + expected_document = prov.ProvDocument() + expected_document.add_namespace(*EX_NS) + expected_document.add_namespace(*EX_TR) + + expected_document.entity("tr:WD-prov-dm-20111215", ( + (prov.PROV_TYPE, prov.Literal("document", prov.XSD_QNAME)), + ("ex:version", "2") + )) + + self.assertEqual(actual_doc, expected_document) + + def test_deserialization_example_7(self): + """ + Test the deserialization of example 7 which is a simple activity + description. 
+ """ + actual_doc = prov.ProvDocument.deserialize( + source=os.path.join(DATA_PATH, "example_07.xml"), + format="xml") + + expected_document = prov.ProvDocument() + expected_document.add_namespace(*EX_NS) + + expected_document.activity( + "ex:a1", + "2011-11-16T16:05:00", + "2011-11-16T16:06:00", [ + (prov.PROV_TYPE, prov.Literal("ex:edit", prov.XSD_QNAME)), + ("ex:host", "server.example.org")]) + + self.assertEqual(actual_doc, expected_document) if __name__ == '__main__': From 746c72a54cefd503e2b81fcca95a367c23c1e490 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 21:06:09 +0200 Subject: [PATCH 06/66] Parameterized roundtrip test cases. One for each test file. --- prov/serializers/provxml.py | 7 +++++-- prov/tests/test_xml.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 001286c8..bdc3c2d9 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -1,5 +1,5 @@ -"""PROV-XML serializers for ProvDocument - +""" +PROV-XML serializers for ProvDocument """ __author__ = 'Lion Krischer' __email__ = 'krischer@geophysik.uni-muenchen.de' @@ -91,6 +91,8 @@ def deserialize(self, stream, **kwargs): r_nsmap = {value: key for key, value in xml_doc.nsmap.items()} for element in xml_doc: + if isinstance(element, etree._Comment): + continue qname = etree.QName(element) if qname.namespace == NS_PROV: rec_type = prov.constants.PROV_RECORD_IDS_MAP[qname.localname] @@ -135,5 +137,6 @@ def _ns(ns, tag): def _ns_prov(tag): return _ns(NS_PROV, tag) + def _ns_xsi(tag): return _ns(NS_XSI, tag) diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 5ec9d460..3243ae54 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -1,4 +1,5 @@ import difflib +import glob import inspect import io from lxml import etree @@ -53,7 +54,7 @@ def compare_xml(doc1, doc2): raise AssertionError(msg + err_msg) -class 
ProvXMLSerializationTestCase(unittest.TestCase): +class ProvXMLTestCase(unittest.TestCase): def test_serialization_example_6(self): """ Test the serialization of example 6 which is a simple entity @@ -132,5 +133,34 @@ def test_deserialization_example_7(self): self.assertEqual(actual_doc, expected_document) +class ProvXMLRoundTripFromFileTestCase(unittest.TestCase): + def _perform_round_trip(self, filename): + document = prov.ProvDocument.deserialize(source=filename, format="xml") + + with io.BytesIO() as new_xml: + document.serialize(format='xml', destination=new_xml) + compare_xml(filename, new_xml) + + +# Add one test for each found file. Lazy way to do metaprogramming... +# I think parametrized tests are justified in this case as the test +# function names make it clear what is going on. +for filename in glob.iglob(os.path.join( + DATA_PATH, "*" + os.path.extsep + "xml")): + name = os.path.splitext(os.path.basename(filename))[0] + test_name = "test_roundtrip_from_xml_%s" % name + + # Python creates closures by function calls... + def get_fct(f): + def fct(self): + self._perform_round_trip(f) + return fct + + fct = get_fct(filename) + fct.__name__ = test_name + + setattr(ProvXMLRoundTripFromFileTestCase, test_name, fct) + + if __name__ == '__main__': unittest.main() \ No newline at end of file From 0d6c93faa2e6cf95a987b41782c6d4b5fde5d837 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 21:13:39 +0200 Subject: [PATCH 07/66] Fixing XSD namespace --- prov/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/constants.py b/prov/constants.py index f398ae41..98dd540b 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -6,7 +6,7 @@ # Built-in namespaces from prov.identifier import Namespace -XSD = Namespace('xsd', 'http://www.w3.org/2001/XMLSchema#') +XSD = Namespace('xsd', 'http://www.w3.org/2001/XMLSchema') PROV = Namespace('prov', 'http://www.w3.org/ns/prov#') # C1. 
Entities/Activities From 8f9b78f09003dd3ffedbcd2e42436a5f58467bce Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 23:25:03 +0200 Subject: [PATCH 08/66] All examples now have xsi and xsd namespaces defined. They will be added by default. Workarounds to this would be awkward and frankly speaking not worth the trouble. --- prov/tests/xml/example_08.xml | 2 ++ prov/tests/xml/example_09.xml | 2 ++ prov/tests/xml/example_13.xml | 2 ++ prov/tests/xml/example_26.xml | 2 ++ prov/tests/xml/example_29.xml | 2 ++ prov/tests/xml/example_35.xml | 2 ++ prov/tests/xml/example_36.xml | 2 ++ 7 files changed, 14 insertions(+) diff --git a/prov/tests/xml/example_08.xml b/prov/tests/xml/example_08.xml index d4947c28..130c23a5 100644 --- a/prov/tests/xml/example_08.xml +++ b/prov/tests/xml/example_08.xml @@ -1,4 +1,6 @@ diff --git a/prov/tests/xml/example_09.xml b/prov/tests/xml/example_09.xml index 4da86a02..faa5a7ad 100644 --- a/prov/tests/xml/example_09.xml +++ b/prov/tests/xml/example_09.xml @@ -1,4 +1,6 @@ diff --git a/prov/tests/xml/example_13.xml b/prov/tests/xml/example_13.xml index 72816093..607174f5 100644 --- a/prov/tests/xml/example_13.xml +++ b/prov/tests/xml/example_13.xml @@ -1,4 +1,6 @@ diff --git a/prov/tests/xml/example_26.xml b/prov/tests/xml/example_26.xml index 8bf744a9..5ff74019 100644 --- a/prov/tests/xml/example_26.xml +++ b/prov/tests/xml/example_26.xml @@ -1,4 +1,6 @@ diff --git a/prov/tests/xml/example_29.xml b/prov/tests/xml/example_29.xml index ba9b189d..370ed02f 100644 --- a/prov/tests/xml/example_29.xml +++ b/prov/tests/xml/example_29.xml @@ -1,4 +1,6 @@ diff --git a/prov/tests/xml/example_35.xml b/prov/tests/xml/example_35.xml index 8b60fcbe..33797655 100644 --- a/prov/tests/xml/example_35.xml +++ b/prov/tests/xml/example_35.xml @@ -1,4 +1,6 @@ diff --git a/prov/tests/xml/example_36.xml b/prov/tests/xml/example_36.xml index 68867fe5..155154e9 100644 --- a/prov/tests/xml/example_36.xml +++ b/prov/tests/xml/example_36.xml @@ -1,4 +1,6 @@ 
From 95b076e931bdc7da521609ddbf4785e6f86fd8fc Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sat, 26 Jul 2014 23:53:24 +0200 Subject: [PATCH 09/66] More changes to id and attribute handling. Slowly approaching the correct state. --- prov/serializers/provxml.py | 33 +++++++++++++++++++++------------ prov/tests/test_xml.py | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index bdc3c2d9..cd58041c 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -53,11 +53,15 @@ def serialize(self, stream, **kwargs): for record in self.document._records: rec_type = record.get_type() rec_label = prov.constants.PROV_N_MAP[rec_type] - identifier = unicode(record._identifier) + identifier = unicode(record._identifier) \ + if record._identifier else None - elem = etree.SubElement( - xml_root, _ns_prov(rec_label), - {_ns_prov("id"): identifier}) + if identifier: + attrs = {_ns_prov("id"): identifier} + else: + attrs = None + + elem = etree.SubElement(xml_root, _ns_prov(rec_label), attrs) used_attributes = [] for fct in filter_fcts: @@ -96,7 +100,11 @@ def deserialize(self, stream, **kwargs): qname = etree.QName(element) if qname.namespace == NS_PROV: rec_type = prov.constants.PROV_RECORD_IDS_MAP[qname.localname] - rec_id = element.attrib[_ns_prov("id")] + + id_tag = _ns_prov("id") + rec_id = element.attrib[id_tag] if id_tag in element.attrib \ + else None + attributes = [] other_attributes = [] for subel in element: @@ -113,21 +121,22 @@ def deserialize(self, stream, **kwargs): raise NotImplementedError elif len(subel.attrib) == 1: key, value = subel.attrib.items()[0] - if key != "{%s}%s" % (NS_XSI, "type"): + if key == "{%s}%s" % (NS_XSI, "type"): + _v = prov.model.Literal( + subel.text, + prov.constants.XSD[value.split(":")[1]]) + elif key == _ns_prov("ref"): + _v = value + else: raise NotImplementedError - _v = prov.model.Literal( - subel.text, - 
prov.constants.XSD[value.split(":")[1]]) else: _v = subel.text d.append((_t, _v)) document.add_record(rec_type, rec_id, attributes, other_attributes) - else: raise NotImplementedError - - return document + return document def _ns(ns, tag): diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 3243ae54..4fa5cbe7 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -150,7 +150,7 @@ def _perform_round_trip(self, filename): name = os.path.splitext(os.path.basename(filename))[0] test_name = "test_roundtrip_from_xml_%s" % name - # Python creates closures by function calls... + # Python creates closures on function calls... def get_fct(f): def fct(self): self._perform_round_trip(f) From df4f5e74ed8edcfbbe91cdd26835d8ef74374209 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 01:31:01 +0200 Subject: [PATCH 10/66] New scheme to order the child elements in the XML. --- prov/serializers/provxml.py | 107 ++++++++++++++++++++++++------------ prov/tests/test_xml.py | 24 ++++++++ 2 files changed, 96 insertions(+), 35 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index cd58041c..4716f865 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -11,12 +11,63 @@ logger = logging.getLogger(__name__) import prov -import prov.constants +from prov.constants import * NS_PROV = "http://www.w3.org/ns/prov#" NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" NS_XSD = "http://www.w3.org/2001/XMLSchema" +# Force the order of child elements as it matters in XML. Not specified +# elements will keep the original order. Label, location, role, type, +# and value attributes will always come after the specified attributes. Any +# other attributes will come after that. 
+ELEMENT_ORDER = { + PROV_ACTIVITY: [PROV_ATTR_STARTTIME, PROV_ATTR_ENDTIME], + PROV_GENERATION: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], + PROV_USAGE: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], + PROV_COMMUNICATION: [PROV_ATTR_INFORMED, PROV_ATTR_INFORMANT], + PROV_START: [PROV_ATTR_ACTIVITY, PROV_ATTR_TRIGGER, PROV_ATTR_STARTER, + PROV_ATTR_TIME], + PROV_END: [PROV_ATTR_ACTIVITY, PROV_ATTR_TRIGGER, PROV_ATTR_ENDER, + PROV_ATTR_TIME], + PROV_INVALIDATION: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], + PROV_DERIVATION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, + PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], + PROV_ATTRIBUTION: [PROV_ATTR_ENTITY, PROV_AGENT], + PROV_ASSOCIATION: [PROV_ATTR_ACTIVITY, PROV_AGENT, PROV_ATTR_PLAN], + PROV_DELEGATION: [PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, + PROV_ATTR_ACTIVITY], + PROV_INFLUENCE: [PROV_ATTR_INFLUENCEE, PROV_ATTR_INFLUENCER], + PROV_SPECIALIZATION: [PROV_ATTR_SPECIFIC_ENTITY, PROV_ATTR_GENERAL_ENTITY], + PROV_MEMBERSHIP: [PROV_ATTR_COLLECTION, PROV_ATTR_ENTITY] +} + +def sorted_attributes(element, attributes): + """ + Helper function sorting attributes into the order required by PROV-XML. + """ + attributes = list(attributes) + if element in ELEMENT_ORDER: + order = list(ELEMENT_ORDER[element]) + else: + order = [] + # Append label, location, role, type, and value attributes. This is + # universal amongst all elements. + order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, + PROV_VALUE]) + + sorted_elements = [] + for item in order: + for e in list(attributes): + if e[0] != item: + continue + sorted_elements.append(e) + attributes.remove(e) + # Add remaining attributes. 
+ sorted_elements.extend(attributes) + + return sorted_elements + class ProvXMLException(prov.Error): pass @@ -37,22 +88,9 @@ def serialize(self, stream, **kwargs): xml_root = etree.Element(_ns_prov("document"), nsmap=nsmap) - # Filter functions used to sort the attributes according to the - # PROV-XML specification. - filter_fcts = [ - lambda x: x[0] in prov.constants.PROV_ATTRIBUTE_QNAMES, - lambda x: x[0] in prov.constants.PROV_ATTRIBUTE_LITERALS, - lambda x: x[0] == prov.constants.PROV_LABEL, - lambda x: x[0] == prov.constants.PROV_LOCATION, - lambda x: x[0] == prov.constants.PROV_ROLE, - lambda x: x[0] == prov.constants.PROV_TYPE, - lambda x: x[0] == prov.constants.PROV_VALUE, - lambda x: True - ] - for record in self.document._records: rec_type = record.get_type() - rec_label = prov.constants.PROV_N_MAP[rec_type] + rec_label = PROV_N_MAP[rec_type] identifier = unicode(record._identifier) \ if record._identifier else None @@ -63,23 +101,22 @@ def serialize(self, stream, **kwargs): elem = etree.SubElement(xml_root, _ns_prov(rec_label), attrs) - used_attributes = [] - for fct in filter_fcts: - _fct = lambda x: x not in used_attributes and fct(x) \ - and x[1] is not None - for attr, value in filter(_fct, record.attributes): - used_attributes.append((attr, value)) - subelem = etree.SubElement( - elem, _ns(attr.namespace.uri, attr.localpart)) - if isinstance(value, prov.model.Literal): - subelem.attrib[_ns_xsi("type")] = "%s:%s" % ( - value.datatype.namespace.prefix, - value.datatype.localpart) - value = value.value - if isinstance(value, datetime.datetime): - value = value.isoformat() - else: - value = str(value) + for attr, value in sorted_attributes(rec_type, record.attributes): + subelem = etree.SubElement( + elem, _ns(attr.namespace.uri, attr.localpart)) + if isinstance(value, prov.model.Literal): + subelem.attrib[_ns_xsi("type")] = "%s:%s" % ( + value.datatype.namespace.prefix, + value.datatype.localpart) + value = value.value + elif isinstance(value, 
datetime.datetime): + value = value.isoformat() + else: + value = str(value) + + if attr in PROV_ATTRIBUTE_QNAMES and value: + subelem.attrib[_ns_prov("ref")] = value + else: subelem.text = value et = etree.ElementTree(xml_root) @@ -99,7 +136,7 @@ def deserialize(self, stream, **kwargs): continue qname = etree.QName(element) if qname.namespace == NS_PROV: - rec_type = prov.constants.PROV_RECORD_IDS_MAP[qname.localname] + rec_type = PROV_RECORD_IDS_MAP[qname.localname] id_tag = _ns_prov("id") rec_id = element.attrib[id_tag] if id_tag in element.attrib \ @@ -110,7 +147,7 @@ def deserialize(self, stream, **kwargs): for subel in element: sqname = etree.QName(subel) if sqname.namespace == NS_PROV: - _t = prov.constants.PROV[sqname.localname] + _t = PROV[sqname.localname] d = attributes else: _t = "%s:%s" % (r_nsmap[sqname.namespace], @@ -124,7 +161,7 @@ def deserialize(self, stream, **kwargs): if key == "{%s}%s" % (NS_XSI, "type"): _v = prov.model.Literal( subel.text, - prov.constants.XSD[value.split(":")[1]]) + XSD[value.split(":")[1]]) elif key == _ns_prov("ref"): _v = value else: diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 4fa5cbe7..5650927e 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -91,6 +91,30 @@ def test_serialization_example_7(self): document.serialize(format='xml', destination=actual) compare_xml(os.path.join(DATA_PATH, "example_07.xml"), actual) + def test_serialization_example_8(self): + """ + Test the serialization of example 8 which deals with generation. 
+ """ + document = prov.ProvDocument() + document.add_namespace(*EX_NS) + + e1 = document.entity("ex:e1") + a1 = document.activity("ex:a1") + + document.wasGeneratedBy(entity=e1, activity=a1, + time="2001-10-26T21:32:52", + other_attributes={"ex:port": "p1"}) + + e2 = document.entity("ex:e2") + + document.wasGeneratedBy(entity=e2, activity=a1, + time="2001-10-26T10:00:00", + other_attributes={"ex:port": "p2"}) + + with io.BytesIO() as actual: + document.serialize(format='xml', destination=actual) + compare_xml(os.path.join(DATA_PATH, "example_08.xml"), actual) + def test_deserialization_example_6(self): """ Test the deserialization of example 6 which is a simple entity From 08e53b85169d443ddbbf9209d60a1d48fff27371 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 01:33:11 +0200 Subject: [PATCH 11/66] Fixing example 9. --- prov/tests/xml/example_09.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tests/xml/example_09.xml b/prov/tests/xml/example_09.xml index faa5a7ad..5b26df4a 100644 --- a/prov/tests/xml/example_09.xml +++ b/prov/tests/xml/example_09.xml @@ -13,7 +13,7 @@ - + 2011-11-16T16:00:00 p1 From 2fab7aa1c290e73f659bcbb699d441e957c28639 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 01:35:17 +0200 Subject: [PATCH 12/66] Fixing example 10. --- prov/tests/xml/example_10.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tests/xml/example_10.xml b/prov/tests/xml/example_10.xml index 79ec912d..3e24862f 100644 --- a/prov/tests/xml/example_10.xml +++ b/prov/tests/xml/example_10.xml @@ -5,7 +5,7 @@ xmlns:ex="http://example.com/ns/ex#"> - traffic regulations enforcing + traffic regulations enforcing From 244a28375bc2a704d6aa57a7153f194a57ec8be3 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 02:12:29 +0200 Subject: [PATCH 13/66] xsd type interference. 
--- prov/serializers/provxml.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 4716f865..dd67076d 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -24,7 +24,7 @@ ELEMENT_ORDER = { PROV_ACTIVITY: [PROV_ATTR_STARTTIME, PROV_ATTR_ENDTIME], PROV_GENERATION: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], - PROV_USAGE: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], + PROV_USAGE: [PROV_ATTR_ACTIVITY, PROV_ATTR_ENTITY, PROV_ATTR_TIME], PROV_COMMUNICATION: [PROV_ATTR_INFORMED, PROV_ATTR_INFORMANT], PROV_START: [PROV_ATTR_ACTIVITY, PROV_ATTR_TRIGGER, PROV_ATTR_STARTER, PROV_ATTR_TIME], @@ -108,16 +108,34 @@ def serialize(self, stream, **kwargs): subelem.attrib[_ns_xsi("type")] = "%s:%s" % ( value.datatype.namespace.prefix, value.datatype.localpart) - value = value.value + v = value.value elif isinstance(value, datetime.datetime): - value = value.isoformat() + v = value.isoformat() else: - value = str(value) - - if attr in PROV_ATTRIBUTE_QNAMES and value: - subelem.attrib[_ns_prov("ref")] = value + v = str(value) + + # If it is a type element and does not yet have an + # associated xsi type, try to infer it from the value. 
+ if attr == PROV_TYPE and _ns_xsi("type") not in subelem.attrib: + xsd_type = None + if isinstance(value, (str, unicode)): + xsd_type = XSD_STRING + elif isinstance(value, float): + xsd_type = XSD_DOUBLE + elif isinstance(value, int): + xsd_type = XSD_INT + elif isinstance(value, bool): + xsd_type = XSD_BOOLEAN + elif isinstance(value, datetime.datetime): + xsd_type = XSD_DATETIME + + if xsd_type is not None: + subelem.attrib[_ns_xsi("type")] = str(xsd_type) + + if attr in PROV_ATTRIBUTE_QNAMES and v: + subelem.attrib[_ns_prov("ref")] = v else: - subelem.text = value + subelem.text = v et = etree.ElementTree(xml_root) et.write(stream, pretty_print=True) @@ -158,7 +176,7 @@ def deserialize(self, stream, **kwargs): raise NotImplementedError elif len(subel.attrib) == 1: key, value = subel.attrib.items()[0] - if key == "{%s}%s" % (NS_XSI, "type"): + if key == _ns_xsi("type"): _v = prov.model.Literal( subel.text, XSD[value.split(":")[1]]) From f02aac6d8bb9d3256c64d2046e3a4bad31009d6e Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 02:22:13 +0200 Subject: [PATCH 14/66] Adding Sofware Agent, Organization, and Person agents. --- prov/constants.py | 12 ++++++++++++ prov/model.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/prov/constants.py b/prov/constants.py index 98dd540b..ccc365d1 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -24,6 +24,9 @@ # C3. 
Agents/Responsibility PROV_AGENT = PROV['Agent'] +PROV_SOFWARE_AGENT = PROV['SoftwareAgent'] +PROV_PERSON = PROV['Person'] +PROV_ORGANIZATION = PROV['Organization'] PROV_ATTRIBUTION = PROV['Attribution'] PROV_ASSOCIATION = PROV['Association'] PROV_DELEGATION = PROV['Delegation'] @@ -48,6 +51,9 @@ PROV_INVALIDATION: u'wasInvalidatedBy', PROV_DERIVATION: u'wasDerivedFrom', PROV_AGENT: u'agent', + PROV_SOFWARE_AGENT: u'softwareAgent', + PROV_PERSON: u'person', + PROV_ORGANIZATION: u'organization', PROV_ATTRIBUTION: u'wasAttributedTo', PROV_ASSOCIATION: u'wasAssociatedWith', PROV_DELEGATION: u'actedOnBehalfOf', @@ -68,6 +74,9 @@ PROV_ATTR_STARTER = PROV['starter'] PROV_ATTR_ENDER = PROV['ender'] PROV_ATTR_AGENT = PROV['agent'] +PROV_ATTR_SOFTWARE_AGENT = PROV['softwareAgent'] +PROV_ATTR_PERSON = PROV['person'] +PROV_ATTR_ORGANIZATION = PROV['organization'] PROV_ATTR_PLAN = PROV['plan'] PROV_ATTR_DELEGATE = PROV['delegate'] PROV_ATTR_RESPONSIBLE = PROV['responsible'] @@ -99,6 +108,9 @@ PROV_ATTR_STARTER, PROV_ATTR_ENDER, PROV_ATTR_AGENT, + PROV_ATTR_SOFTWARE_AGENT, + PROV_ATTR_PERSON, + PROV_ATTR_ORGANIZATION, PROV_ATTR_PLAN, PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, diff --git a/prov/model.py b/prov/model.py index 78665521..b70431e7 100644 --- a/prov/model.py +++ b/prov/model.py @@ -443,6 +443,21 @@ def get_type(self): return PROV_AGENT +class ProvSoftwareAgent(ProvElement): + def get_type(self): + return PROV_SOFWARE_AGENT + + +class ProvPerson(ProvElement): + def get_type(self): + return PROV_PERSON + + +class ProvOrganization(ProvElement): + def get_type(self): + return PROV_ORGANIZATION + + class ProvAttribution(ProvRelation): FORMAL_ATTRIBUTES = (PROV_ATTR_ENTITY, PROV_ATTR_AGENT) @@ -513,6 +528,9 @@ def get_type(self): PROV_INVALIDATION: ProvInvalidation, PROV_DERIVATION: ProvDerivation, PROV_AGENT: ProvAgent, + PROV_SOFWARE_AGENT: ProvSoftwareAgent, + PROV_PERSON: ProvPerson, + PROV_ORGANIZATION: ProvOrganization, PROV_ATTRIBUTION: ProvAttribution, 
PROV_ASSOCIATION: ProvAssociation, PROV_DELEGATION: ProvDelegation, From 6e02a32cde2c8b6775c7a924ddcf4331f3cdd597 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 02:25:30 +0200 Subject: [PATCH 15/66] Adding software agents, ... methods. --- prov/constants.py | 4 ++-- prov/model.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index ccc365d1..bdd82570 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -24,7 +24,7 @@ # C3. Agents/Responsibility PROV_AGENT = PROV['Agent'] -PROV_SOFWARE_AGENT = PROV['SoftwareAgent'] +PROV_SOFTWARE_AGENT = PROV['SoftwareAgent'] PROV_PERSON = PROV['Person'] PROV_ORGANIZATION = PROV['Organization'] PROV_ATTRIBUTION = PROV['Attribution'] @@ -51,7 +51,7 @@ PROV_INVALIDATION: u'wasInvalidatedBy', PROV_DERIVATION: u'wasDerivedFrom', PROV_AGENT: u'agent', - PROV_SOFWARE_AGENT: u'softwareAgent', + PROV_SOFTWARE_AGENT: u'softwareAgent', PROV_PERSON: u'person', PROV_ORGANIZATION: u'organization', PROV_ATTRIBUTION: u'wasAttributedTo', diff --git a/prov/model.py b/prov/model.py index b70431e7..f67268a4 100644 --- a/prov/model.py +++ b/prov/model.py @@ -445,7 +445,7 @@ def get_type(self): class ProvSoftwareAgent(ProvElement): def get_type(self): - return PROV_SOFWARE_AGENT + return PROV_SOFTWARE_AGENT class ProvPerson(ProvElement): @@ -528,7 +528,7 @@ def get_type(self): PROV_INVALIDATION: ProvInvalidation, PROV_DERIVATION: ProvDerivation, PROV_AGENT: ProvAgent, - PROV_SOFWARE_AGENT: ProvSoftwareAgent, + PROV_SOFTWARE_AGENT: ProvSoftwareAgent, PROV_PERSON: ProvPerson, PROV_ORGANIZATION: ProvOrganization, PROV_ATTRIBUTION: ProvAttribution, @@ -968,6 +968,18 @@ def communication(self, informed, informant, identifier=None, other_attributes=N def agent(self, identifier, other_attributes=None): return self.add_record(PROV_AGENT, identifier, None, other_attributes) + def software_agent(self, identifier, other_attributes=None): + return 
self.add_record(PROV_SOFTWARE_AGENT, identifier, None, + other_attributes) + + def organization(self, identifier, other_attributes=None): + return self.add_record(PROV_ORGANIZATION, identifier, None, + other_attributes) + + def person(self, identifier, other_attributes=None): + return self.add_record(PROV_PERSON, identifier, None, + other_attributes) + def attribution(self, entity, agent, identifier=None, other_attributes=None): return self.add_record( PROV_ATTRIBUTION, identifier, { From f82044d41e0f40a112499b293399d14bbdd01ebf Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 02:34:01 +0200 Subject: [PATCH 16/66] Added revision and quotation elements. Preliminary support has previously been available but this is a bit clearer IMHO. --- prov/constants.py | 4 ++++ prov/model.py | 16 ++++++++++++++-- prov/serializers/provxml.py | 4 ++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index bdd82570..17327749 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -21,6 +21,8 @@ # C2. Derivations PROV_DERIVATION = PROV['Derivation'] +PROV_REVISION = PROV['Revision'] +PROV_QUOTATION = PROV['Quotation'] # C3. 
Agents/Responsibility PROV_AGENT = PROV['Agent'] @@ -50,6 +52,8 @@ PROV_END: u'wasEndedBy', PROV_INVALIDATION: u'wasInvalidatedBy', PROV_DERIVATION: u'wasDerivedFrom', + PROV_REVISION: u'wasRevisionOf', + PROV_QUOTATION: u'wasQuotedFrom', PROV_AGENT: u'agent', PROV_SOFTWARE_AGENT: u'softwareAgent', PROV_PERSON: u'person', diff --git a/prov/model.py b/prov/model.py index f67268a4..ef8f0895 100644 --- a/prov/model.py +++ b/prov/model.py @@ -437,6 +437,16 @@ def get_type(self): return PROV_DERIVATION +class ProvRevision(ProvDerivation): + def get_type(self): + return PROV_REVISION + + +class ProvQuotation(ProvDerivation): + def get_type(self): + return PROV_QUOTATION + + ### Component 3: Agents, Responsibility, and Influence class ProvAgent(ProvElement): def get_type(self): @@ -527,6 +537,8 @@ def get_type(self): PROV_END: ProvEnd, PROV_INVALIDATION: ProvInvalidation, PROV_DERIVATION: ProvDerivation, + PROV_REVISION: ProvRevision, + PROV_QUOTATION: ProvQuotation, PROV_AGENT: ProvAgent, PROV_SOFTWARE_AGENT: ProvSoftwareAgent, PROV_PERSON: ProvPerson, @@ -1030,13 +1042,13 @@ def derivation(self, generatedEntity, usedEntity, activity=None, generation=None def revision(self, generatedEntity, usedEntity, activity=None, generation=None, usage=None, identifier=None, other_attributes=None): record = self.derivation(generatedEntity, usedEntity, activity, generation, usage, identifier, other_attributes) - record.add_asserted_type(PROV['Revision']) + record.add_asserted_type(PROV_REVISION) return record def quotation(self, generatedEntity, usedEntity, activity=None, generation=None, usage=None, identifier=None, other_attributes=None): record = self.derivation(generatedEntity, usedEntity, activity, generation, usage, identifier, other_attributes) - record.add_asserted_type(PROV['Quotation']) + record.add_asserted_type(PROV_QUOTATION) return record def primary_source(self, generatedEntity, usedEntity, activity=None, generation=None, usage=None, diff --git 
a/prov/serializers/provxml.py b/prov/serializers/provxml.py index dd67076d..cdd2bcbd 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -33,6 +33,10 @@ PROV_INVALIDATION: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], PROV_DERIVATION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], + PROV_REVISION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, + PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], + PROV_QUOTATION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, + PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], PROV_ATTRIBUTION: [PROV_ATTR_ENTITY, PROV_AGENT], PROV_ASSOCIATION: [PROV_ATTR_ACTIVITY, PROV_AGENT, PROV_ATTR_PLAN], PROV_DELEGATION: [PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, From d89e4c72d10b65356784c72e74a994c724c58e84 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 02:37:04 +0200 Subject: [PATCH 17/66] Added primary source object. Many more tests now pass. --- prov/constants.py | 2 ++ prov/model.py | 6 ++++++ prov/serializers/provxml.py | 2 ++ 3 files changed, 10 insertions(+) diff --git a/prov/constants.py b/prov/constants.py index 17327749..3300d777 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -23,6 +23,7 @@ PROV_DERIVATION = PROV['Derivation'] PROV_REVISION = PROV['Revision'] PROV_QUOTATION = PROV['Quotation'] +PROV_PRIMARY_SOURCE = PROV['PrimarySource'] # C3. 
Agents/Responsibility PROV_AGENT = PROV['Agent'] @@ -54,6 +55,7 @@ PROV_DERIVATION: u'wasDerivedFrom', PROV_REVISION: u'wasRevisionOf', PROV_QUOTATION: u'wasQuotedFrom', + PROV_PRIMARY_SOURCE: u'hadPrimarySource', PROV_AGENT: u'agent', PROV_SOFTWARE_AGENT: u'softwareAgent', PROV_PERSON: u'person', diff --git a/prov/model.py b/prov/model.py index ef8f0895..d2fc59a5 100644 --- a/prov/model.py +++ b/prov/model.py @@ -447,6 +447,11 @@ def get_type(self): return PROV_QUOTATION +class ProvPrimarySource(ProvDerivation): + def get_type(self): + return PROV_PRIMARY_SOURCE + + ### Component 3: Agents, Responsibility, and Influence class ProvAgent(ProvElement): def get_type(self): @@ -539,6 +544,7 @@ def get_type(self): PROV_DERIVATION: ProvDerivation, PROV_REVISION: ProvRevision, PROV_QUOTATION: ProvQuotation, + PROV_PRIMARY_SOURCE: ProvPrimarySource, PROV_AGENT: ProvAgent, PROV_SOFTWARE_AGENT: ProvSoftwareAgent, PROV_PERSON: ProvPerson, diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index cdd2bcbd..8bafca53 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -37,6 +37,8 @@ PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], PROV_QUOTATION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], + PROV_PRIMARY_SOURCE: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, + PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], PROV_ATTRIBUTION: [PROV_ATTR_ENTITY, PROV_AGENT], PROV_ASSOCIATION: [PROV_ATTR_ACTIVITY, PROV_AGENT, PROV_ATTR_PLAN], PROV_DELEGATION: [PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, From b7c072d66404f48da0740b175c06b217b10ef723 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 19:51:35 +0200 Subject: [PATCH 18/66] Adjusted example 18 to adhere to the spec. 
--- prov/tests/xml/example_18.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tests/xml/example_18.xml b/prov/tests/xml/example_18.xml index bfb7763c..04672beb 100644 --- a/prov/tests/xml/example_18.xml +++ b/prov/tests/xml/example_18.xml @@ -6,8 +6,8 @@ prov:Person - Alice 1234 + Alice From 0b98b984cc9380a38a546b2b6fef13e075167970 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 19:52:08 +0200 Subject: [PATCH 19/66] Alphabetical sorting of optional prov-"attributes". This is required by the PROV-XML spec. --- prov/serializers/provxml.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 8bafca53..f31214d4 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -69,7 +69,9 @@ def sorted_attributes(element, attributes): continue sorted_elements.append(e) attributes.remove(e) - # Add remaining attributes. + # Add remaining attributes. According to the spec, the other attributes + # have a fixed alphabetical order. + attributes.sort(key=lambda x: str(x[0])) sorted_elements.extend(attributes) return sorted_elements From 0f60c220556769a290a87f5d38505e30457cbbdf Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 19:56:36 +0200 Subject: [PATCH 20/66] Fixing example 22. --- prov/tests/xml/example_22.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tests/xml/example_22.xml b/prov/tests/xml/example_22.xml index b292a76c..17da28c1 100644 --- a/prov/tests/xml/example_22.xml +++ b/prov/tests/xml/example_22.xml @@ -7,7 +7,7 @@ xmlns:rec54="http://example.com/ns/rec54#"> - prov:Person + prov:Person From 1271841fe0c5d28fa4dd65ceb63274dd2d7f7e8c Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 20:04:56 +0200 Subject: [PATCH 21/66] Fixing example 23. 
--- prov/tests/xml/example_23.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tests/xml/example_23.xml b/prov/tests/xml/example_23.xml index ca559a16..c4e4ba98 100644 --- a/prov/tests/xml/example_23.xml +++ b/prov/tests/xml/example_23.xml @@ -32,8 +32,8 @@ - Workflow 1 http://example.org/workflow1.bpel + Workflow 1 From d00dc0627e4c57c5d72a1cf19decb41840d7765c Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 20:13:21 +0200 Subject: [PATCH 22/66] Fixing example 24. --- prov/tests/xml/example_24.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/tests/xml/example_24.xml b/prov/tests/xml/example_24.xml index ca559a16..c4e4ba98 100644 --- a/prov/tests/xml/example_24.xml +++ b/prov/tests/xml/example_24.xml @@ -32,8 +32,8 @@ - Workflow 1 http://example.org/workflow1.bpel + Workflow 1 From 6efd0afc595621c4f33107fb7602033e85232e91 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 20:15:03 +0200 Subject: [PATCH 23/66] A bunch of small fixes able to do the roundtrip for more examples. --- prov/constants.py | 2 ++ prov/model.py | 6 ++++++ prov/serializers/provxml.py | 10 +++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index 3300d777..22ebd2e6 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -32,6 +32,7 @@ PROV_ORGANIZATION = PROV['Organization'] PROV_ATTRIBUTION = PROV['Attribution'] PROV_ASSOCIATION = PROV['Association'] +PROV_PLAN = PROV['Plan'] PROV_DELEGATION = PROV['Delegation'] PROV_INFLUENCE = PROV['Influence'] # C4. 
Bundles @@ -62,6 +63,7 @@ PROV_ORGANIZATION: u'organization', PROV_ATTRIBUTION: u'wasAttributedTo', PROV_ASSOCIATION: u'wasAssociatedWith', + PROV_PLAN: u'plan', PROV_DELEGATION: u'actedOnBehalfOf', PROV_INFLUENCE: u'wasInfluencedBy', PROV_ALTERNATE: u'alternateOf', diff --git a/prov/model.py b/prov/model.py index d2fc59a5..09cdb6db 100644 --- a/prov/model.py +++ b/prov/model.py @@ -480,6 +480,11 @@ def get_type(self): return PROV_ATTRIBUTION +class ProvPlan(ProvEntity): + def get_type(self): + return PROV_PLAN + + class ProvAssociation(ProvRelation): FORMAL_ATTRIBUTES = (PROV_ATTR_ACTIVITY, PROV_ATTR_AGENT, PROV_ATTR_PLAN) @@ -551,6 +556,7 @@ def get_type(self): PROV_ORGANIZATION: ProvOrganization, PROV_ATTRIBUTION: ProvAttribution, PROV_ASSOCIATION: ProvAssociation, + PROV_PLAN: ProvPlan, PROV_DELEGATION: ProvDelegation, PROV_INFLUENCE: ProvInfluence, PROV_SPECIALIZATION: ProvSpecialization, diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index f31214d4..21a05365 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -39,8 +39,8 @@ PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], PROV_PRIMARY_SOURCE: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], - PROV_ATTRIBUTION: [PROV_ATTR_ENTITY, PROV_AGENT], - PROV_ASSOCIATION: [PROV_ATTR_ACTIVITY, PROV_AGENT, PROV_ATTR_PLAN], + PROV_ATTRIBUTION: [PROV_ATTR_ENTITY, PROV_ATTR_AGENT], + PROV_ASSOCIATION: [PROV_ATTR_ACTIVITY, PROV_ATTR_AGENT, PROV_ATTR_PLAN], PROV_DELEGATION: [PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, PROV_ATTR_ACTIVITY], PROV_INFLUENCE: [PROV_ATTR_INFLUENCEE, PROV_ATTR_INFLUENCER], @@ -48,6 +48,7 @@ PROV_MEMBERSHIP: [PROV_ATTR_COLLECTION, PROV_ATTR_ENTITY] } + def sorted_attributes(element, attributes): """ Helper function sorting attributes into the order required by PROV-XML. 
@@ -124,7 +125,8 @@ def serialize(self, stream, **kwargs): # If it is a type element and does not yet have an # associated xsi type, try to infer it from the value. - if attr == PROV_TYPE and _ns_xsi("type") not in subelem.attrib: + if attr in [PROV_TYPE, PROV_LOCATION] and \ + _ns_xsi("type") not in subelem.attrib: xsd_type = None if isinstance(value, (str, unicode)): xsd_type = XSD_STRING @@ -136,6 +138,8 @@ def serialize(self, stream, **kwargs): xsd_type = XSD_BOOLEAN elif isinstance(value, datetime.datetime): xsd_type = XSD_DATETIME + elif isinstance(value, prov.identifier.Identifier): + xsd_type = XSD_ANYURI if xsd_type is not None: subelem.attrib[_ns_xsi("type")] = str(xsd_type) From ff6119cba55a626bec1ee27a266d537e143050e6 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 22:48:38 +0200 Subject: [PATCH 24/66] Fixing example 32. --- prov/tests/xml/example_32.xml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prov/tests/xml/example_32.xml b/prov/tests/xml/example_32.xml index 9418897f..b33fc172 100644 --- a/prov/tests/xml/example_32.xml +++ b/prov/tests/xml/example_32.xml @@ -2,8 +2,7 @@ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:prov="http://www.w3.org/ns/prov#" - xmlns:bbc="http://www.bbc.co.uk/news" - xmlns:bbcmobile="http://www.bbc.co.uk/news/mobile"> + xmlns:ex="http://example.com/ns/ex#"> From c07eb7d9ade1391e25dfe8057d88f4ae03782813 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 22:49:06 +0200 Subject: [PATCH 25/66] Deserialization now in theory supports bundles. 
--- prov/serializers/provxml.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 21a05365..6400d1db 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -154,10 +154,13 @@ def serialize(self, stream, **kwargs): def deserialize(self, stream, **kwargs): xml_doc = etree.parse(stream).getroot() - document = prov.model.ProvDocument() + self.deserialize_subtree(xml_doc, document) + return document + + def deserialize_subtree(self, xml_doc, bundle): for key, value in xml_doc.nsmap.items(): - document.add_namespace(key, value) + bundle.add_namespace(key, value) r_nsmap = {value: key for key, value in xml_doc.nsmap.items()} @@ -172,6 +175,13 @@ def deserialize(self, stream, **kwargs): rec_id = element.attrib[id_tag] if id_tag in element.attrib \ else None + # Recursively build bundles. + if rec_type == PROV_BUNDLE: + new_bundle = prov.model.ProvBundle(document=bundle) + self.deserialize_subtree(element, new_bundle) + bundle.add_bundle(new_bundle, + new_bundle.valid_qualified_name(rec_id)) + attributes = [] other_attributes = [] for subel in element: @@ -199,11 +209,11 @@ def deserialize(self, stream, **kwargs): else: _v = subel.text d.append((_t, _v)) - document.add_record(rec_type, rec_id, attributes, - other_attributes) + bundle.add_record(rec_type, rec_id, attributes, + other_attributes) else: raise NotImplementedError - return document + return bundle def _ns(ns, tag): From 7c40e68caad6bde285339941444de296f9795e54 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 22:49:29 +0200 Subject: [PATCH 26/66] Proper collection and empty collection objects. 
--- prov/constants.py | 8 +++++++- prov/model.py | 21 +++++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index 22ebd2e6..552e1b2b 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -42,6 +42,8 @@ PROV_SPECIALIZATION = PROV['Specialization'] PROV_MENTION = PROV['Mention'] # C6. Collections +PROV_COLLECTION = PROV['Collection'] +PROV_EMPTY_COLLECTION = PROV['EmptyCollection'] PROV_MEMBERSHIP = PROV['Membership'] PROV_N_MAP = { @@ -69,6 +71,8 @@ PROV_ALTERNATE: u'alternateOf', PROV_SPECIALIZATION: u'specializationOf', PROV_MENTION: u'mentionOf', + PROV_COLLECTION: u'collection', + PROV_EMPTY_COLLECTION: u'emptyCollection', PROV_MEMBERSHIP: u'hadMember', PROV_BUNDLE: u'bundle', } @@ -100,6 +104,7 @@ PROV_ATTR_INFLUENCEE = PROV['influencee'] PROV_ATTR_INFLUENCER = PROV['influencer'] PROV_ATTR_COLLECTION = PROV['collection'] +PROV_ATTR_EMPTY_COLLECTION = PROV['emptyCollection'] # Literal properties PROV_ATTR_TIME = PROV['time'] @@ -133,7 +138,8 @@ PROV_ATTR_BUNDLE, PROV_ATTR_INFLUENCEE, PROV_ATTR_INFLUENCER, - PROV_ATTR_COLLECTION + PROV_ATTR_COLLECTION, + PROV_ATTR_EMPTY_COLLECTION ]) PROV_ATTRIBUTE_LITERALS = set([PROV_ATTR_TIME, PROV_ATTR_STARTTIME, PROV_ATTR_ENDTIME]) # Set of formal attributes of PROV records diff --git a/prov/model.py b/prov/model.py index 09cdb6db..5b4c089e 100644 --- a/prov/model.py +++ b/prov/model.py @@ -529,6 +529,16 @@ def get_type(self): ### Component 6: Collections +class ProvCollection(ProvEntity): + def get_type(self): + return PROV_COLLECTION + + +class ProvEmptyCollection(ProvCollection): + def get_type(self): + return PROV_EMPTY_COLLECTION + + class ProvMembership(ProvRelation): FORMAL_ATTRIBUTES = (PROV_ATTR_COLLECTION, PROV_ATTR_ENTITY) @@ -562,6 +572,8 @@ def get_type(self): PROV_SPECIALIZATION: ProvSpecialization, PROV_ALTERNATE: ProvAlternate, PROV_MENTION: ProvMention, + PROV_COLLECTION: ProvCollection, + PROV_EMPTY_COLLECTION: ProvEmptyCollection, 
PROV_MEMBERSHIP: ProvMembership, } @@ -1095,8 +1107,13 @@ def mention(self, specificEntity, generalEntity, bundle,): ) def collection(self, identifier, other_attributes=None): - record = self.add_record(PROV_ENTITY, identifier, None, other_attributes) - record.add_asserted_type(PROV['Collection']) + record = self.add_record(PROV_COLLECTION, identifier, None, + other_attributes) + return record + + def emptyCollection(self, identifier, other_attributes=None): + record = self.add_record(PROV_EMPTY_COLLECTION, identifier, None, + other_attributes) return record def membership(self, collection, entity): From 6262d138e910837ebfc1ef38caabea611e305d14 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 23:00:35 +0200 Subject: [PATCH 27/66] Some data types, like membership definitions can have multiple child elements of the same type. Thus the duplicate child attribute type check must not be executed there. --- prov/constants.py | 5 +++++ prov/model.py | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/prov/constants.py b/prov/constants.py index 552e1b2b..6ed3c70c 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -150,6 +150,11 @@ PROV_ID_ATTRIBUTES_MAP = dict((prov_id, attribute) for (prov_id, attribute) in PROV_RECORD_ATTRIBUTES) PROV_ATTRIBUTES_ID_MAP = dict((attribute, prov_id) for (prov_id, attribute) in PROV_RECORD_ATTRIBUTES) +# Some elements can have multiple attributes of the same type. 
+PROV_ELEMENTS_COLLECTION_LIKE = set([ + PROV_MEMBERSHIP +]) + # Extra definition for convenience PROV_TYPE = PROV['type'] PROV_LABEL = PROV['label'] diff --git a/prov/model.py b/prov/model.py index 5b4c089e..0e613f69 100644 --- a/prov/model.py +++ b/prov/model.py @@ -268,7 +268,8 @@ def add_attributes(self, attributes): if value is None: raise ProvException(u'Invalid value for attribute %s: %s' % (attr, original_value)) - if attr in PROV_ATTRIBUTES and self._attributes[attr]: + if attr in PROV_ATTRIBUTES and self._attributes[attr] and \ + self.get_type() not in PROV_ELEMENTS_COLLECTION_LIKE: existing_value = first(self._attributes[attr]) if value != existing_value: raise ProvException(u'Cannot have more than one value for attribute %s' % attr) From 945e9928c51708d8924f8194dab7b6dd02f6b1ec Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Sun, 27 Jul 2014 23:27:50 +0200 Subject: [PATCH 28/66] Proper handling of XML language attributes. --- prov/serializers/provxml.py | 13 ++++++++++--- prov/tests/xml/example_36.xml | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 6400d1db..8047e713 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -16,6 +16,7 @@ NS_PROV = "http://www.w3.org/ns/prov#" NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" NS_XSD = "http://www.w3.org/2001/XMLSchema" +NS_XML = "http://www.w3.org/XML/1998/namespace" # Force the order of child elements as it matters in XML. Not specified # elements will keep the original order. 
Label, location, role, type, @@ -114,9 +115,13 @@ def serialize(self, stream, **kwargs): subelem = etree.SubElement( elem, _ns(attr.namespace.uri, attr.localpart)) if isinstance(value, prov.model.Literal): - subelem.attrib[_ns_xsi("type")] = "%s:%s" % ( - value.datatype.namespace.prefix, - value.datatype.localpart) + if value.datatype not in \ + [None, PROV["InternationalizedString"]]: + subelem.attrib[_ns_xsi("type")] = "%s:%s" % ( + value.datatype.namespace.prefix, + value.datatype.localpart) + if value.langtag is not None: + subelem.attrib[_ns(NS_XML, "lang")] = value.langtag v = value.value elif isinstance(value, datetime.datetime): v = value.isoformat() @@ -204,6 +209,8 @@ def deserialize_subtree(self, xml_doc, bundle): XSD[value.split(":")[1]]) elif key == _ns_prov("ref"): _v = value + elif key == _ns(NS_XML, "lang"): + _v = prov.model.Literal(subel.text, langtag=value) else: raise NotImplementedError else: diff --git a/prov/tests/xml/example_36.xml b/prov/tests/xml/example_36.xml index 155154e9..62889bc3 100644 --- a/prov/tests/xml/example_36.xml +++ b/prov/tests/xml/example_36.xml @@ -9,8 +9,8 @@ - Voiture 01 Car 01 + Voiture 01 From 340d0b49f3d42e728aa3aa9575f04a52ad889707 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 00:57:31 +0200 Subject: [PATCH 29/66] Special case handling for example 4 and 5. These can be read but no round trip can be performed; a semantically identical variant of both will be returned. Also include some fixes to the PROV example files and small bug fixes.
--- prov/serializers/provxml.py | 15 ++++-- prov/tests/test_xml.py | 88 +++++++++++++++++++++++++++++++++++ prov/tests/xml/example_01.xml | 2 +- prov/tests/xml/example_04.xml | 12 ----- prov/tests/xml/example_05.xml | 14 ------ prov/tests/xml/example_37.xml | 2 +- prov/tests/xml/example_39.xml | 2 +- 7 files changed, 103 insertions(+), 32 deletions(-) delete mode 100644 prov/tests/xml/example_04.xml delete mode 100644 prov/tests/xml/example_05.xml diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 8047e713..71658440 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -130,7 +130,7 @@ def serialize(self, stream, **kwargs): # If it is a type element and does not yet have an # associated xsi type, try to infer it from the value. - if attr in [PROV_TYPE, PROV_LOCATION] and \ + if attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE] and \ _ns_xsi("type") not in subelem.attrib: xsd_type = None if isinstance(value, (str, unicode)): @@ -159,6 +159,12 @@ def serialize(self, stream, **kwargs): def deserialize(self, stream, **kwargs): xml_doc = etree.parse(stream).getroot() + + # Remove all comments. 
+ for c in xml_doc.xpath("//comment()"): + p = c.getparent() + p.remove(c) + document = prov.model.ProvDocument() self.deserialize_subtree(xml_doc, document) return document @@ -170,8 +176,6 @@ def deserialize_subtree(self, xml_doc, bundle): r_nsmap = {value: key for key, value in xml_doc.nsmap.items()} for element in xml_doc: - if isinstance(element, etree._Comment): - continue qname = etree.QName(element) if qname.namespace == NS_PROV: rec_type = PROV_RECORD_IDS_MAP[qname.localname] @@ -216,6 +220,11 @@ def deserialize_subtree(self, xml_doc, bundle): else: _v = subel.text d.append((_t, _v)) + + if _ns_xsi("type") in element.attrib: + value = element.attrib[_ns_xsi("type")] + other_attributes.append((PROV["type"], value)) + bundle.add_record(rec_type, rec_id, attributes, other_attributes) else: diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 5650927e..6ee77e08 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -34,6 +34,19 @@ def compare_xml(doc1, doc2): obj1 = etree.parse(doc1) obj2 = etree.parse(doc2) + # Remove comments from both. + for c in obj1.getroot().xpath("//comment()"): + p = c.getparent() + p.remove(c) + for c in obj2.getroot().xpath("//comment()"): + p = c.getparent() + p.remove(c) + + # Remove root text which is just whitespace in between the nodes. There + # should be nothing in any case. + obj1.getroot().text = None + obj2.getroot().text = None + buf = io.BytesIO() obj1.write_c14n(buf) buf.seek(0, 0) @@ -156,6 +169,81 @@ def test_deserialization_example_7(self): self.assertEqual(actual_doc, expected_document) + def test_deserialization_example_04_and_05(self): + """ + Example 4 and 5 have a different type specification. They use an + xsi:type as an attribute on an entity. This can be read but if + written again it will become an XML child element. This is + semantically identical but cannot be tested with a round trip. + """ + # Example 4. 
+ xml_string = """ + + + + ex:Workflow + + + + """ + with io.BytesIO() as xml: + xml.write(xml_string) + xml.seek(0, 0) + actual_document = prov.ProvDocument.deserialize(source=xml, + format="xml") + + expected_document = prov.ProvDocument() + expected_document.add_namespace(*EX_NS) + expected_document.add_namespace(*EX_TR) + + # The xsi:type attribute is mapped to a proper PROV attribute. + expected_document.entity("tr:WD-prov-dm-20111215", ( + (prov.PROV_TYPE, prov.Literal("ex:Workflow", prov.XSD_QNAME)), + (prov.PROV_TYPE, "prov:Plan"))) + + self.assertEqual(actual_document, expected_document) + + # Example 5. + xml_string = """ + + + + ex:Workflow + prov:Plan + prov:Entity + + + + """ + with io.BytesIO() as xml: + xml.write(xml_string) + xml.seek(0, 0) + actual_document = prov.ProvDocument.deserialize(source=xml, + format="xml") + + expected_document = prov.ProvDocument() + expected_document.add_namespace(*EX_NS) + expected_document.add_namespace(*EX_TR) + + # The xsi:type attribute is mapped to a proper PROV attribute. 
+ expected_document.entity("tr:WD-prov-dm-20111215", ( + (prov.PROV_TYPE, prov.Literal("ex:Workflow", prov.XSD_QNAME)), + (prov.PROV_TYPE, "prov:Entity"), + (prov.PROV_TYPE, "prov:Plan") + )) + + self.assertEqual(actual_document, expected_document) + class ProvXMLRoundTripFromFileTestCase(unittest.TestCase): def _perform_round_trip(self, filename): diff --git a/prov/tests/xml/example_01.xml b/prov/tests/xml/example_01.xml index 1a26c957..73412ff1 100644 --- a/prov/tests/xml/example_01.xml +++ b/prov/tests/xml/example_01.xml @@ -6,8 +6,8 @@ xmlns:tr="http://example.com/ns/tr#"> - prov:Plan ex:Workflow + prov:Plan diff --git a/prov/tests/xml/example_04.xml b/prov/tests/xml/example_04.xml deleted file mode 100644 index c5b93bad..00000000 --- a/prov/tests/xml/example_04.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - ex:Workflow - - - diff --git a/prov/tests/xml/example_05.xml b/prov/tests/xml/example_05.xml deleted file mode 100644 index 85930361..00000000 --- a/prov/tests/xml/example_05.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - ex:Workflow - prov:Plan - prov:Entity - - - diff --git a/prov/tests/xml/example_37.xml b/prov/tests/xml/example_37.xml index 872a64f7..2cdedfd8 100644 --- a/prov/tests/xml/example_37.xml +++ b/prov/tests/xml/example_37.xml @@ -5,8 +5,8 @@ xmlns:ex="http://example.com/ns/ex#"> - StillImage Le Louvre, Paris + StillImage diff --git a/prov/tests/xml/example_39.xml b/prov/tests/xml/example_39.xml index 86f2a5a6..d3a748d7 100644 --- a/prov/tests/xml/example_39.xml +++ b/prov/tests/xml/example_39.xml @@ -12,8 +12,8 @@ prov:Person - Alice 1234 + Alice From a16e9847dd46a8996a6c63428de28e3a080e041d Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 01:26:21 +0200 Subject: [PATCH 30/66] Everything but bundles and prov:other appears to work. 
--- prov/serializers/provxml.py | 13 ++++++++++--- prov/tests/test_xml.py | 14 ++++++++++---- prov/tests/xml/example_03.xml | 4 ++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 71658440..6cc0049d 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -66,14 +66,17 @@ def sorted_attributes(element, attributes): sorted_elements = [] for item in order: + this_type_list = [] for e in list(attributes): if e[0] != item: continue - sorted_elements.append(e) + this_type_list.append(e) attributes.remove(e) + this_type_list.sort(key=lambda x: (str(x[0]), str(x[1]))) + sorted_elements.extend(this_type_list) # Add remaining attributes. According to the spec, the other attributes # have a fixed alphabetical order. - attributes.sort(key=lambda x: str(x[0])) + attributes.sort(key=lambda x: (str(x[0]), str(x[1]))) sorted_elements.extend(attributes) return sorted_elements @@ -130,8 +133,12 @@ def serialize(self, stream, **kwargs): # If it is a type element and does not yet have an # associated xsi type, try to infer it from the value. + # The not startswith("prov:") check is a little bit hacky to + # avoid type interference when the type is a standard prov + # type. if attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE] and \ - _ns_xsi("type") not in subelem.attrib: + _ns_xsi("type") not in subelem.attrib and \ + not str(value).startswith("prov:"): xsd_type = None if isinstance(value, (str, unicode)): xsd_type = XSD_STRING diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 6ee77e08..bc1d6540 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -17,6 +17,14 @@ inspect.currentframe()))), "xml") +def remove_empty_tags(tree): + if tree.text is not None and tree.text.strip() == "": + tree.text = None + for elem in tree: + if etree.iselement(elem): + remove_empty_tags(elem) + + def compare_xml(doc1, doc2): """ Helper function to compare two XML files. 
It will parse both once again @@ -42,10 +50,8 @@ def compare_xml(doc1, doc2): p = c.getparent() p.remove(c) - # Remove root text which is just whitespace in between the nodes. There - # should be nothing in any case. - obj1.getroot().text = None - obj2.getroot().text = None + remove_empty_tags(obj1.getroot()) + remove_empty_tags(obj2.getroot()) buf = io.BytesIO() obj1.write_c14n(buf) diff --git a/prov/tests/xml/example_03.xml b/prov/tests/xml/example_03.xml index 9a646b38..a1871900 100644 --- a/prov/tests/xml/example_03.xml +++ b/prov/tests/xml/example_03.xml @@ -7,8 +7,8 @@ ex:Workflow - prov:Plan - prov:Entity + prov:Entity + prov:Plan From 4d402350730e4c34f32ca0918216ba5365ff6a37 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 01:44:02 +0200 Subject: [PATCH 31/66] Added full PROV example to round trip tests. The example is from the PROV Primer but was a little bit faulty and had to be fixed. --- prov/serializers/provxml.py | 3 +- .../xml/prov_primer_complete_example.xml | 225 ++++++++++++++++++ 2 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 prov/tests/xml/prov_primer_complete_example.xml diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 6cc0049d..266a3238 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -162,7 +162,8 @@ def serialize(self, stream, **kwargs): subelem.text = v et = etree.ElementTree(xml_root) - et.write(stream, pretty_print=True) + et.write(stream, pretty_print=True, xml_declaration=True, + encoding="UTF-8") def deserialize(self, stream, **kwargs): xml_doc = etree.parse(stream).getroot() diff --git a/prov/tests/xml/prov_primer_complete_example.xml b/prov/tests/xml/prov_primer_complete_example.xml new file mode 100644 index 00000000..75551a5e --- /dev/null +++ b/prov/tests/xml/prov_primer_complete_example.xml @@ -0,0 +1,225 @@ + + + + + + + + Crime rises in cities + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + prov:Person + Derek + mailto:derek@example.org + + + + prov:Organization + Chart Generators Inc + + + + + + + + + + + + + + + + + + + + + + + + + + + + exc:dataToCompose + + + + + + exc:regionsToAggregateBy + + + + + + exc:analyst + + + + + + exc:composedData + + + + + + + + + + + prov:Revision + + + + + + + + + + + prov:Revision + + + + + + + + prov:Plan + + + + prov:Person + + + + + + + + + + + + + + + + + + 2012-03-02T10:30:00 + + + + + 2012-04-01T15:21:00 + + + + + 2012-03-31T09:21:00 + + + + + 2012-04-01T15:21:00 + + + + + + 2012-03-31T09:21:00 + 2012-04-01T15:21:00 + + + + + + + + + + prov:Quotation + + + + + + + + + + + + + + + + + + + + + + From 0dd77cd82eb228ee3716042807caeed2203be4fb Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 11:30:00 +0200 Subject: [PATCH 32/66] Added Provenance Challange 1 like test file. --- prov/tests/xml/pc1.xml | 737 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 737 insertions(+) create mode 100644 prov/tests/xml/pc1.xml diff --git a/prov/tests/xml/pc1.xml b/prov/tests/xml/pc1.xml new file mode 100644 index 00000000..977c1617 --- /dev/null +++ b/prov/tests/xml/pc1.xml @@ -0,0 +1,737 @@ + + + + align_warp 1 + prim:align_warp + + + align_warp 2 + prim:align_warp + + + align_warp 3 + prim:align_warp + + + align_warp 4 + prim:align_warp + + + Reslice 1 + http://openprovenance.org/primitives#reslice + + + Reslice 2 + http://openprovenance.org/primitives#reslice + + + Reslice 3 + http://openprovenance.org/primitives#reslice + + + Reslice 4 + http://openprovenance.org/primitives#reslice + + + Softmean + http://openprovenance.org/primitives#softmean + + + Slicer 1 + http://openprovenance.org/primitives#slicer + + + Slicer 2 + http://openprovenance.org/primitives#slicer + + + Slicer 3 + http://openprovenance.org/primitives#slicer + + + Convert 1 + http://openprovenance.org/primitives#convert + + + Convert 2 + http://openprovenance.org/primitives#convert + + + Convert 3 + 
http://openprovenance.org/primitives#convert + + + Reference Image + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/reference.img + + + Reference Header + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/reference.hdr + + + Anatomy I2 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy2.img + + + Anatomy H2 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy2.hdr + + + Anatomy I1 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy1.img + + + Anatomy H1 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy1.hdr + + + Anatomy I3 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy3.img + + + Anatomy H3 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy3.hdr + + + Anatomy I4 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy4.img + + + Anatomy H4 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/anatomy4.hdr + + + Warp Params1 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/warp1.warp + + + Warp Params2 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/warp2.warp + + + Warp Params3 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/warp3.warp + + + Warp Params4 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/warp4.warp + + + Resliced I1 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced1.img + + + Resliced H1 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced1.hdr + + + Resliced I2 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced2.img + + + Resliced H2 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced2.hdr + + + 
Resliced I3 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced3.img + + + Resliced H3 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced3.hdr + + + Resliced I4 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced4.img + + + Resliced H4 + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/resliced4.hdr + + + Atlas Image + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas.img + + + Atlas Header + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas.hdr + + + Atlas X Slice + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas-x.pgm + + + slicer param 1 + http://openprovenance.org/primitives#String + -x .5 + + + Atlas Y Slice + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas-y.pgm + + + slicer param 2 + http://openprovenance.org/primitives#String + -y .5 + + + Atlas Z Slice + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas-z.pgm + + + slicer param 3 + http://openprovenance.org/primitives#String + -z .5 + + + Atlas X Graphic + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas-x.gif + + + Atlas Y Graphic + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas-y.gif + + + Atlas Z Graphic + http://openprovenance.org/primitives#File + http://www.ipaw.info/challenge/atlas-z.gif + + + John Doe + + + + + img + + + + + hdr + + + + + imgRef + + + + + hdrRef + + + + + img + + + + + hdr + + + + + imgRef + + + + + hdrRef + + + + + img + + + + + hdr + + + + + imgRef + + + + + hdrRef + + + + + img + + + + + hdr + + + + + imgRef + + + + + hdrRef + + + + + in + + + + + in + + + + + in + + + + + in + + + + + i1 + + + + + h1 + + + + + i2 + + + + + h2 + + + + + i3 + + + + + h3 + + + + + i4 + + + + + h4 + + + + + img + + + + + hdr + + + + + img + + + + + 
hdr + + + + + img + + + + + hdr + + + + + param + + + + + param + + + + + param + + + + + in + + + + + in + + + + + in + + + + + out + + + + + out + + + + + out + + + + + out + + + + + img + + + + + hdr + + + + + img + + + + + hdr + + + + + img + + + + + hdr + + + + + img + + + + + hdr + + + + + img + + + + + hdr + + + + + out + + + + + out + + + + + out + + + + + 2012-10-26T09:58:08.407000+01:00 + out + + + + + 2012-10-26T09:58:08.407000+01:00 + out + + + + + 2012-10-26T09:58:08.407000+01:00 + out + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From f7376531e8a1de5b828d18058a51242115ff3bc9 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 11:30:40 +0200 Subject: [PATCH 33/66] Changes to the xsd type inference. Also added an option to write xsd type attributes to most tags if desired. --- prov/serializers/provxml.py | 23 ++++++++++++++++++++--- prov/tests/test_xml.py | 15 +++++++++++---- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 266a3238..0abd0745 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -87,7 +87,16 @@ class ProvXMLException(prov.Error): class ProvXMLSerializer(prov.Serializer): - def serialize(self, stream, **kwargs): + def serialize(self, stream, force_types=False, **kwargs): + """ + :param stream: Where to save the output. + :type force_types: boolean, optional + :param force_types: Will force xsd:types to be written for most + attributes mainly only PROV-"attributes", e.g. tags not in the + PROV namespace. 
Off by default meaning xsd:type attributes will + only be set for prov:type, prov:location, and prov:value as is + done in the official PROV-XML specification. + """ # Build the namespace map for lxml and attach it to the root XML # element. nsmap = {ns.prefix: ns.uri for ns in @@ -131,14 +140,22 @@ def serialize(self, stream, **kwargs): else: v = str(value) + # xsd type inference. + # + # This is a bit messy and there are all kinds of special + # rules but it appears to get the job done. + # # If it is a type element and does not yet have an # associated xsi type, try to infer it from the value. # The not startswith("prov:") check is a little bit hacky to # avoid type interference when the type is a standard prov # type. - if attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE] and \ + if (force_types or attr in [PROV_TYPE, PROV_LOCATION, + PROV_VALUE]) and \ _ns_xsi("type") not in subelem.attrib and \ - not str(value).startswith("prov:"): + not str(value).startswith("prov:") and \ + not (attr in PROV_ATTRIBUTE_QNAMES and v) and \ + attr not in [PROV_ATTR_TIME, PROV_LABEL]: xsd_type = None if isinstance(value, (str, unicode)): xsd_type = XSD_STRING diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index bc1d6540..eb5ea589 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -252,11 +252,13 @@ def test_deserialization_example_04_and_05(self): class ProvXMLRoundTripFromFileTestCase(unittest.TestCase): - def _perform_round_trip(self, filename): - document = prov.ProvDocument.deserialize(source=filename, format="xml") + def _perform_round_trip(self, filename, force_types=False): + document = prov.ProvDocument.deserialize( + source=filename, format="xml") with io.BytesIO() as new_xml: - document.serialize(format='xml', destination=new_xml) + document.serialize(format='xml', destination=new_xml, + force_types=force_types) compare_xml(filename, new_xml) @@ -270,8 +272,13 @@ def _perform_round_trip(self, filename): # Python creates closures on function 
calls... def get_fct(f): + # Some test files have a lot of type declarations... + if name in ["pc1"]: + force_types = True + else: + force_types = False def fct(self): - self._perform_round_trip(f) + self._perform_round_trip(f, force_types=force_types) return fct fct = get_fct(filename) From 046b63adb3e8b82796ab3007175c8d7e01b3215d Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 11:31:32 +0200 Subject: [PATCH 34/66] Added readme to the xml example files. That way the origin of the files is documented. --- prov/tests/xml/README.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 prov/tests/xml/README.txt diff --git a/prov/tests/xml/README.txt b/prov/tests/xml/README.txt new file mode 100644 index 00000000..9ccb10d9 --- /dev/null +++ b/prov/tests/xml/README.txt @@ -0,0 +1,10 @@ +* The example_xx.xml files are the official PROV-XML examples from the + specification. Examples 4 and 5 are tested separately in the XML test case. + Some files had to be adjusted a bit to be valid according to the spec. +* prov_primer_complete_example.xml: This is the complete example from the + PROV Primer documentation. It had to be adjusted a bit to be valid + according to the spec. +* pc1.xml: Provenanve Challenge 1 like file from + https://github.com/lucmoreau/ProvToolbox/blob/master/prov-xml/src/test/resources/pc1.xml + Had to change a couple of minor ordering/formatting issues to be able to + do the round trip. \ No newline at end of file From c2308f76dedd6bc397ba578313d0f6e5f2dd0fba Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 14:07:44 +0200 Subject: [PATCH 35/66] Extracting the attribute order from the class definitions. 
--- prov/serializers/provxml.py | 38 +++---------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 0abd0745..c002afd4 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -11,6 +11,7 @@ logger = logging.getLogger(__name__) import prov +from prov.model import PROV_REC_CLS from prov.constants import * NS_PROV = "http://www.w3.org/ns/prov#" @@ -18,47 +19,14 @@ NS_XSD = "http://www.w3.org/2001/XMLSchema" NS_XML = "http://www.w3.org/XML/1998/namespace" -# Force the order of child elements as it matters in XML. Not specified -# elements will keep the original order. Label, location, role, type, -# and value attributes will always come after the specified attributes. Any -# other attributes will come after that. -ELEMENT_ORDER = { - PROV_ACTIVITY: [PROV_ATTR_STARTTIME, PROV_ATTR_ENDTIME], - PROV_GENERATION: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], - PROV_USAGE: [PROV_ATTR_ACTIVITY, PROV_ATTR_ENTITY, PROV_ATTR_TIME], - PROV_COMMUNICATION: [PROV_ATTR_INFORMED, PROV_ATTR_INFORMANT], - PROV_START: [PROV_ATTR_ACTIVITY, PROV_ATTR_TRIGGER, PROV_ATTR_STARTER, - PROV_ATTR_TIME], - PROV_END: [PROV_ATTR_ACTIVITY, PROV_ATTR_TRIGGER, PROV_ATTR_ENDER, - PROV_ATTR_TIME], - PROV_INVALIDATION: [PROV_ATTR_ENTITY, PROV_ATTR_ACTIVITY, PROV_ATTR_TIME], - PROV_DERIVATION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, - PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], - PROV_REVISION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, - PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], - PROV_QUOTATION: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, - PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], - PROV_PRIMARY_SOURCE: [PROV_ATTR_GENERATED_ENTITY, PROV_ATTR_USED_ENTITY, - PROV_ATTR_ACTIVITY, PROV_GENERATION, PROV_USAGE], - PROV_ATTRIBUTION: [PROV_ATTR_ENTITY, PROV_ATTR_AGENT], - PROV_ASSOCIATION: [PROV_ATTR_ACTIVITY, PROV_ATTR_AGENT, 
PROV_ATTR_PLAN], - PROV_DELEGATION: [PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, - PROV_ATTR_ACTIVITY], - PROV_INFLUENCE: [PROV_ATTR_INFLUENCEE, PROV_ATTR_INFLUENCER], - PROV_SPECIALIZATION: [PROV_ATTR_SPECIFIC_ENTITY, PROV_ATTR_GENERAL_ENTITY], - PROV_MEMBERSHIP: [PROV_ATTR_COLLECTION, PROV_ATTR_ENTITY] -} - def sorted_attributes(element, attributes): """ Helper function sorting attributes into the order required by PROV-XML. """ attributes = list(attributes) - if element in ELEMENT_ORDER: - order = list(ELEMENT_ORDER[element]) - else: - order = [] + order = list(PROV_REC_CLS[element].FORMAL_ATTRIBUTES) + # Append label, location, role, type, and value attributes. This is # universal amongst all elements. order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, From 5d3e9099409724e38d593fde678d2d259bb95352 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 28 Jul 2014 15:36:14 +0200 Subject: [PATCH 36/66] Compat for Python 2.6. No dictionary comprehension in Python 2.6. --- prov/serializers/provxml.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index c002afd4..4c8a901d 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -66,9 +66,9 @@ def serialize(self, stream, force_types=False, **kwargs): done in the official PROV-XML specification. """ # Build the namespace map for lxml and attach it to the root XML - # element. - nsmap = {ns.prefix: ns.uri for ns in - self.document._namespaces.get_registered_namespaces()} + # element. No dictionary comprehension in Python 2.6! + nsmap = dict((ns.prefix, ns.uri) for ns in + self.document._namespaces.get_registered_namespaces()) if self.document._namespaces._default: nsmap[None] = self.document._namespaces._default.uri # Add the prov, XSI, and XSD namespaces by default. 
@@ -166,7 +166,8 @@ def deserialize_subtree(self, xml_doc, bundle): for key, value in xml_doc.nsmap.items(): bundle.add_namespace(key, value) - r_nsmap = {value: key for key, value in xml_doc.nsmap.items()} + # No dictionary comprehension in Python 2.6. + r_nsmap = dict((value, key) for (key, value) in xml_doc.nsmap.items()) for element in xml_doc: qname = etree.QName(element) From a8133f733b6c4e28b7872999ad7239bda7ec555c Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Wed, 30 Jul 2014 15:39:06 +0200 Subject: [PATCH 37/66] Nicer node labels during plotting --- prov/dot.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/prov/dot.py b/prov/dot.py index 8e7fef95..e31fed69 100644 --- a/prov/dot.py +++ b/prov/dot.py @@ -16,15 +16,18 @@ __email__ = 'trungdong@donggiang.com' import cgi +from datetime import datetime import pydot from prov.model import ( ProvBundle, PROV_ACTIVITY, PROV_AGENT, PROV_ALTERNATE, PROV_ASSOCIATION, PROV_ATTRIBUTION, PROV_BUNDLE, PROV_COMMUNICATION, PROV_DERIVATION, PROV_DELEGATION, PROV_ENTITY, PROV_GENERATION, PROV_INFLUENCE, PROV_INVALIDATION, PROV_END, PROV_MEMBERSHIP, PROV_MENTION, PROV_SPECIALIZATION, PROV_START, PROV_USAGE, - Identifier, PROV_ATTRIBUTE_QNAMES + Identifier, PROV_ATTRIBUTE_QNAMES, PROV_SOFTWARE_AGENT, PROV_PERSON ) +from prov.serializers.provxml import sorted_attributes + # Visual styles for various elements (nodes) and relations (edges) # see http://graphviz.org/content/attrs @@ -35,6 +38,8 @@ PROV_ENTITY: {'shape': 'oval', 'style': 'filled', 'fillcolor': '#FFFC87', 'color': '#808080'}, PROV_ACTIVITY: {'shape': 'box', 'style': 'filled', 'fillcolor': '#9FB1FC', 'color': '#0000FF'}, PROV_AGENT: {'shape': 'house', 'style': 'filled', 'fillcolor': '#FED37F'}, + PROV_SOFTWARE_AGENT: {'shape': 'house', 'style': 'filled', 'fillcolor': '#FED37F'}, + PROV_PERSON: {'shape': 'house', 'style': 'filled', 'fillcolor': '#FED37F'}, # PROV_COLLECTION: {'label': 'wasGeneratedBy', 
'fontsize': 10.0}, PROV_BUNDLE: {'shape': 'folder', 'style': 'filled', 'fillcolor': 'aliceblue'}, # Relations @@ -105,12 +110,16 @@ def _attach_attribute_annotation(node, record): if not attributes: return # No attribute to display + attributes = sorted_attributes(record.get_type(), attributes) + ann_rows = [ANNOTATION_START_ROW] ann_rows.extend( ANNOTATION_ROW_TEMPLATE % ( attr.uri, cgi.escape(unicode(attr)), ' href=\"%s\"' % value.uri if isinstance(value, Identifier) else '', - cgi.escape(unicode(value))) + cgi.escape(unicode(value) + if not isinstance(value, datetime) else + unicode(value.isoformat()))) for attr, value in attributes ) ann_rows.append(ANNOTATION_END_ROW) @@ -123,7 +132,18 @@ def _add_bundle(bundle): count[2] += 1 subdot = pydot.Cluster(graph_name='c%d' % count[2], URL='"%s"' % bundle.identifier.uri) if use_labels: - subdot.set_label('"%s"' % unicode(bundle.label)) + if bundle.label == bundle.identifier: + bundle_label = '"%s"' % unicode(bundle.label) + else: + # Fancier label if both are different. The label will be + # the main node text, whereas the identifier will be a + # kind of suptitle. + bundle_label = ('<%s
' + '' + '%s>') + bundle_label = bundle_label % (unicode(bundle.label), + unicode(bundle.identifier)) + subdot.set_label('"%s"' % unicode(bundle_label)) else: subdot.set_label('"%s"' % unicode(bundle.identifier)) _bundle_to_dot(subdot, bundle) @@ -134,7 +154,17 @@ def _add_node(record): count[0] += 1 node_id = 'n%d' % count[0] if use_labels: - node_label = '"%s"' % unicode(record.label) + if record.label == record.identifier: + node_label = '"%s"' % unicode(record.label) + else: + # Fancier label if both are different. The label will be + # the main node text, whereas the identifier will be a + # kind of suptitle. + node_label = ('<%s
' + '' + '%s>') + node_label = node_label % (unicode(record.label), + unicode(record.identifier)) else: node_label = '"%s"' % unicode(record.identifier) From f236265da4d1ce72316e10ccb8375056bbda3581 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Thu, 31 Jul 2014 14:25:22 +0200 Subject: [PATCH 38/66] Added test for the prov:other element. And removed the corresponding example from the examples folder. --- prov/tests/test_xml.py | 43 +++++++++++++++++++++++++++++++++++ prov/tests/xml/example_42.xml | 17 -------------- 2 files changed, 43 insertions(+), 17 deletions(-) delete mode 100644 prov/tests/xml/example_42.xml diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index eb5ea589..95c853fa 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -5,6 +5,7 @@ from lxml import etree import os import unittest +import warnings import prov.model as prov @@ -250,6 +251,48 @@ def test_deserialization_example_04_and_05(self): self.assertEqual(actual_document, expected_document) + def test_other_elements(self): + """ + PROV XML uses the prov:other element to enable the storage of non + PROV information in a PROV XML document. It will be ignored by this + library and a warning will be raised informing the user. + """ + # This is example 42 from the PROV XML documentation. + xml_string = """ + + + + + + + bar + + + + + + + """ + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + + with io.BytesIO() as xml: + xml.write(xml_string) + xml.seek(0, 0) + doc = prov.ProvDocument.deserialize(source=xml, format="xml") + + self.assertEqual(len(w), 1) + self.assertTrue( + "Document contains non-PROV information in . It will " + "be ignored in this package." in str(w[0].message)) + + # This document contains nothing else.
+ self.assertEqual(len(doc._records), 0) + class ProvXMLRoundTripFromFileTestCase(unittest.TestCase): def _perform_round_trip(self, filename, force_types=False): diff --git a/prov/tests/xml/example_42.xml b/prov/tests/xml/example_42.xml deleted file mode 100644 index 54ad9c9d..00000000 --- a/prov/tests/xml/example_42.xml +++ /dev/null @@ -1,17 +0,0 @@ - - - - - - - bar - - - - - - From b428c3c097535b8d082cdb0a599cd5e7e457cbcb Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Thu, 31 Jul 2014 14:31:14 +0200 Subject: [PATCH 39/66] tags will now be ignored. A warning is raised to inform the user, but otherwise this is desired behaviour. --- prov/serializers/provxml.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 4c8a901d..0b9bdb0b 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -7,6 +7,7 @@ import datetime import logging from lxml import etree +import warnings logger = logging.getLogger(__name__) @@ -172,6 +173,14 @@ def deserialize_subtree(self, xml_doc, bundle): for element in xml_doc: qname = etree.QName(element) if qname.namespace == NS_PROV: + # Ignore the element storing non-PROV information. + if qname.localname == "other": + warnings.warn( + "Document contains non-PROV information in " + ". 
It will be ignored in this package.", + UserWarning) + continue + rec_type = PROV_RECORD_IDS_MAP[qname.localname] id_tag = _ns_prov("id") From 147591f6060587d2d2fb1b8066284c329773ef1b Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Thu, 31 Jul 2014 16:27:53 +0200 Subject: [PATCH 40/66] Adapting to changes in master --- prov/model.py | 6 +++--- prov/serializers/provxml.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/prov/model.py b/prov/model.py index ba4bc295..b9708ecb 100644 --- a/prov/model.py +++ b/prov/model.py @@ -1016,15 +1016,15 @@ def agent(self, identifier, other_attributes=None): return self.new_record(PROV_AGENT, identifier, None, other_attributes) def software_agent(self, identifier, other_attributes=None): - return self.add_record(PROV_SOFTWARE_AGENT, identifier, None, + return self.new_record(PROV_SOFTWARE_AGENT, identifier, None, other_attributes) def organization(self, identifier, other_attributes=None): - return self.add_record(PROV_ORGANIZATION, identifier, None, + return self.new_record(PROV_ORGANIZATION, identifier, None, other_attributes) def person(self, identifier, other_attributes=None): - return self.add_record(PROV_PERSON, identifier, None, + return self.new_record(PROV_PERSON, identifier, None, other_attributes) def attribution(self, entity, agent, identifier=None, other_attributes=None): diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 0b9bdb0b..55200215 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -228,7 +228,7 @@ def deserialize_subtree(self, xml_doc, bundle): value = element.attrib[_ns_xsi("type")] other_attributes.append((PROV["type"], value)) - bundle.add_record(rec_type, rec_id, attributes, + bundle.new_record(rec_type, rec_id, attributes, other_attributes) else: raise NotImplementedError From 5db31314a032d31f4c8b63bfc1ba371ccd3845f9 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Thu, 31 Jul 2014 18:33:03 +0200 Subject: [PATCH 41/66] In 
theory working PROV XML bundle deserialization. --- prov/serializers/provxml.py | 95 ++++++++++++++++++++++--------------- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 55200215..a7c10eef 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -56,6 +56,10 @@ class ProvXMLException(prov.Error): class ProvXMLSerializer(prov.Serializer): + def __init__(self, *args, **kwargs): + super(ProvXMLSerializer, self).__init__(*args, **kwargs) + self.__bundles = {} + def serialize(self, stream, force_types=False, **kwargs): """ :param stream: Where to save the output. @@ -181,48 +185,31 @@ def deserialize_subtree(self, xml_doc, bundle): UserWarning) continue - rec_type = PROV_RECORD_IDS_MAP[qname.localname] - id_tag = _ns_prov("id") rec_id = element.attrib[id_tag] if id_tag in element.attrib \ else None - # Recursively build bundles. - if rec_type == PROV_BUNDLE: - new_bundle = prov.model.ProvBundle(document=bundle) - self.deserialize_subtree(element, new_bundle) - bundle.add_bundle(new_bundle, - new_bundle.valid_qualified_name(rec_id)) - - attributes = [] - other_attributes = [] - for subel in element: - sqname = etree.QName(subel) - if sqname.namespace == NS_PROV: - _t = PROV[sqname.localname] - d = attributes - else: - _t = "%s:%s" % (r_nsmap[sqname.namespace], - sqname.localname) - d = other_attributes - - if len(subel.attrib) > 1: - raise NotImplementedError - elif len(subel.attrib) == 1: - key, value = subel.attrib.items()[0] - if key == _ns_xsi("type"): - _v = prov.model.Literal( - subel.text, - XSD[value.split(":")[1]]) - elif key == _ns_prov("ref"): - _v = value - elif key == _ns(NS_XML, "lang"): - _v = prov.model.Literal(subel.text, langtag=value) - else: - raise NotImplementedError - else: - _v = subel.text - d.append((_t, _v)) + # Deal with bundles or bundle contents. 
+ if qname.localname == "bundle": + b = bundle.bundle(identifier=rec_id) + attributes, other_attributes = \ + self._extract_attributes(element, r_nsmap) + if attributes: + msg = ("The bundle with identifier '%s' contains " + "attributes from the prov namespace which is " + "not allowed." % rec_id) + raise ValueError(msg) + b.add_attributes(other_attributes) + self.__bundles[rec_id] = b + continue + elif qname.localname == "bundleContent": + self.deserialize_subtree(element, self.__bundles[rec_id]) + continue + + rec_type = PROV_RECORD_IDS_MAP[qname.localname] + + attributes, other_attributes = self._extract_attributes( + element, r_nsmap) if _ns_xsi("type") in element.attrib: value = element.attrib[_ns_xsi("type")] @@ -234,6 +221,38 @@ def deserialize_subtree(self, xml_doc, bundle): raise NotImplementedError return bundle + def _extract_attributes(self, element, r_nsmap): + attributes = [] + other_attributes = [] + for subel in element: + sqname = etree.QName(subel) + if sqname.namespace == NS_PROV: + _t = PROV[sqname.localname] + d = attributes + else: + _t = "%s:%s" % (r_nsmap[sqname.namespace], + sqname.localname) + d = other_attributes + + if len(subel.attrib) > 1: + raise NotImplementedError + elif len(subel.attrib) == 1: + key, value = subel.attrib.items()[0] + if key == _ns_xsi("type"): + _v = prov.model.Literal( + subel.text, + XSD[value.split(":")[1]]) + elif key == _ns_prov("ref"): + _v = value + elif key == _ns(NS_XML, "lang"): + _v = prov.model.Literal(subel.text, langtag=value) + else: + raise NotImplementedError + else: + _v = subel.text + d.append((_t, _v)) + return attributes, other_attributes + def _ns(ns, tag): return "{%s}%s" % (ns, tag) From e4b16643b9bcc39b90d7fcd5efd83e3a17f1217a Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 1 Aug 2014 01:05:28 +0200 Subject: [PATCH 42/66] Read/Write bundles in PROV XML. 
--- prov/serializers/provxml.py | 90 ++++++++++++++++++++++++++----------- 1 file changed, 63 insertions(+), 27 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index a7c10eef..db9f3226 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -56,10 +56,6 @@ class ProvXMLException(prov.Error): class ProvXMLSerializer(prov.Serializer): - def __init__(self, *args, **kwargs): - super(ProvXMLSerializer, self).__init__(*args, **kwargs) - self.__bundles = {} - def serialize(self, stream, force_types=False, **kwargs): """ :param stream: Where to save the output. @@ -70,6 +66,16 @@ def serialize(self, stream, force_types=False, **kwargs): only be set for prov:type, prov:location, and prov:value as is done in the official PROV-XML specification. """ + xml_root = self.serialize_bundle(bundle=self.document, + force_types=force_types) + for bundle in self.document.bundles: + self.serialize_bundle(bundle=bundle, element=xml_root, + force_types=force_types) + et = etree.ElementTree(xml_root) + et.write(stream, pretty_print=True, xml_declaration=True, + encoding="UTF-8") + + def serialize_bundle(self, bundle, element=None, force_types=False): # Build the namespace map for lxml and attach it to the root XML # element. No dictionary comprehension in Python 2.6! 
nsmap = dict((ns.prefix, ns.uri) for ns in @@ -81,9 +87,16 @@ def serialize(self, stream, force_types=False, **kwargs): nsmap["xsi"] = NS_XSI nsmap["xsd"] = NS_XSD - xml_root = etree.Element(_ns_prov("document"), nsmap=nsmap) + if element is not None: + xml_bundle_root = etree.SubElement( + element, _ns_prov("bundleContent"), nsmap=nsmap) + else: + xml_bundle_root= etree.Element(_ns_prov("document"), nsmap=nsmap) + + if bundle.identifier: + xml_bundle_root.attrib[_ns_prov("id")] = unicode(bundle.identifier) - for record in self.document._records: + for record in bundle._records: rec_type = record.get_type() rec_label = PROV_N_MAP[rec_type] identifier = unicode(record._identifier) \ @@ -94,9 +107,24 @@ def serialize(self, stream, force_types=False, **kwargs): else: attrs = None - elem = etree.SubElement(xml_root, _ns_prov(rec_label), attrs) + # The bundle record is still of type entity. In PROV XML it + # actually is a proper bundle element. Loop through the + # attributes to check if an attribute designates the current + # element as a bundle element. + for attr, value in sorted_attributes(rec_type, record.attributes): + if self._check_if_bundle_entity(rec_type, attr, value): + rec_label = "bundle" + break + + elem = etree.SubElement(xml_bundle_root, + _ns_prov(rec_label), attrs) for attr, value in sorted_attributes(rec_type, record.attributes): + # Do not write the Bundle type specifier to the attributes. + # That information will be encoded in the parent element's tag. 
+ if self._check_if_bundle_entity(rec_type, attr, value): + continue + subelem = etree.SubElement( elem, _ns(attr.namespace.uri, attr.localpart)) if isinstance(value, prov.model.Literal): @@ -150,10 +178,7 @@ def serialize(self, stream, force_types=False, **kwargs): subelem.attrib[_ns_prov("ref")] = v else: subelem.text = v - - et = etree.ElementTree(xml_root) - et.write(stream, pretty_print=True, xml_declaration=True, - encoding="UTF-8") + return xml_bundle_root def deserialize(self, stream, **kwargs): xml_doc = etree.parse(stream).getroot() @@ -168,7 +193,13 @@ def deserialize(self, stream, **kwargs): return document def deserialize_subtree(self, xml_doc, bundle): + # Do not add namespaces already defined in the parent document in + # case it is a bundle. + doc_ns = [(i.prefix, i.uri) for i in bundle.document.namespaces] \ + if bundle.document is not None else [] for key, value in xml_doc.nsmap.items(): + if (key, value) in doc_ns: + continue bundle.add_namespace(key, value) # No dictionary comprehension in Python 2.6. @@ -189,28 +220,25 @@ def deserialize_subtree(self, xml_doc, bundle): rec_id = element.attrib[id_tag] if id_tag in element.attrib \ else None - # Deal with bundles or bundle contents. - if qname.localname == "bundle": + # Recursively read bundles. + if qname.localname == "bundleContent": b = bundle.bundle(identifier=rec_id) - attributes, other_attributes = \ - self._extract_attributes(element, r_nsmap) - if attributes: - msg = ("The bundle with identifier '%s' contains " - "attributes from the prov namespace which is " - "not allowed." % rec_id) - raise ValueError(msg) - b.add_attributes(other_attributes) - self.__bundles[rec_id] = b - continue - elif qname.localname == "bundleContent": - self.deserialize_subtree(element, self.__bundles[rec_id]) + self.deserialize_subtree(element, b) continue - rec_type = PROV_RECORD_IDS_MAP[qname.localname] - attributes, other_attributes = self._extract_attributes( element, r_nsmap) + # Bundles are a bit special. 
Their metadata is represented + # as an entity with type bundle. + if qname.localname == "bundle": + rec_type = PROV_ENTITY + other_attributes.insert(0, ( + PROV["type"], prov.model.Literal("prov:bundle", + XSD_QNAME))) + else: + rec_type = PROV_RECORD_IDS_MAP[qname.localname] + if _ns_xsi("type") in element.attrib: value = element.attrib[_ns_xsi("type")] other_attributes.append((PROV["type"], value)) @@ -221,6 +249,14 @@ def deserialize_subtree(self, xml_doc, bundle): raise NotImplementedError return bundle + def _check_if_bundle_entity(self, rec_type, attr, value): + if rec_type == PROV_ENTITY and attr == PROV_TYPE and ( + value == "prov:bundle" or + (isinstance(value, prov.model.Literal) and + value.value == "prov:bundle")): + return True + return False + def _extract_attributes(self, element, r_nsmap): attributes = [] other_attributes = [] From 9af9819934587d14e0f4ffcb2c73043aa9908cf9 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 1 Aug 2014 01:07:11 +0200 Subject: [PATCH 43/66] Formatting. 
--- prov/serializers/provxml.py | 9 ++++----- prov/tests/test_xml.py | 7 ++++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index db9f3226..cffd863a 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -13,7 +13,7 @@ import prov from prov.model import PROV_REC_CLS -from prov.constants import * +from prov.constants import * # NOQA NS_PROV = "http://www.w3.org/ns/prov#" NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" @@ -91,7 +91,7 @@ def serialize_bundle(self, bundle, element=None, force_types=False): xml_bundle_root = etree.SubElement( element, _ns_prov("bundleContent"), nsmap=nsmap) else: - xml_bundle_root= etree.Element(_ns_prov("document"), nsmap=nsmap) + xml_bundle_root = etree.Element(_ns_prov("document"), nsmap=nsmap) if bundle.identifier: xml_bundle_root.attrib[_ns_prov("id")] = unicode(bundle.identifier) @@ -251,9 +251,8 @@ def deserialize_subtree(self, xml_doc, bundle): def _check_if_bundle_entity(self, rec_type, attr, value): if rec_type == PROV_ENTITY and attr == PROV_TYPE and ( - value == "prov:bundle" or - (isinstance(value, prov.model.Literal) and - value.value == "prov:bundle")): + value == "prov:bundle" or (isinstance(value, prov.model.Literal) + and value.value == "prov:bundle")): return True return False diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 95c853fa..e88dbc0e 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -104,8 +104,8 @@ def test_serialization_example_7(self): "ex:a1", "2011-11-16T16:05:00", "2011-11-16T16:06:00", [ - (prov.PROV_TYPE, prov.Literal("ex:edit", prov.XSD_QNAME)), - ("ex:host", "server.example.org")]) + (prov.PROV_TYPE, prov.Literal("ex:edit", prov.XSD_QNAME)), + ("ex:host", "server.example.org")]) with io.BytesIO() as actual: document.serialize(format='xml', destination=actual) @@ -320,6 +320,7 @@ def get_fct(f): force_types = True else: force_types = False + def fct(self): 
self._perform_round_trip(f, force_types=force_types) return fct @@ -331,4 +332,4 @@ def fct(self): if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() From b5d60d7e3739ac82f3f2351aafafb6bd76757e16 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 1 Aug 2014 01:28:30 +0200 Subject: [PATCH 44/66] Better exceptions and some refactoring. --- prov/serializers/provxml.py | 149 +++++++++++++++++++----------------- 1 file changed, 79 insertions(+), 70 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index cffd863a..41d3739b 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -21,36 +21,6 @@ NS_XML = "http://www.w3.org/XML/1998/namespace" -def sorted_attributes(element, attributes): - """ - Helper function sorting attributes into the order required by PROV-XML. - """ - attributes = list(attributes) - order = list(PROV_REC_CLS[element].FORMAL_ATTRIBUTES) - - # Append label, location, role, type, and value attributes. This is - # universal amongst all elements. - order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, - PROV_VALUE]) - - sorted_elements = [] - for item in order: - this_type_list = [] - for e in list(attributes): - if e[0] != item: - continue - this_type_list.append(e) - attributes.remove(e) - this_type_list.sort(key=lambda x: (str(x[0]), str(x[1]))) - sorted_elements.extend(this_type_list) - # Add remaining attributes. According to the spec, the other attributes - # have a fixed alphabetical order. - attributes.sort(key=lambda x: (str(x[0]), str(x[1]))) - sorted_elements.extend(attributes) - - return sorted_elements - - class ProvXMLException(prov.Error): pass @@ -207,46 +177,45 @@ def deserialize_subtree(self, xml_doc, bundle): for element in xml_doc: qname = etree.QName(element) - if qname.namespace == NS_PROV: - # Ignore the element storing non-PROV information. 
- if qname.localname == "other": - warnings.warn( - "Document contains non-PROV information in " - ". It will be ignored in this package.", - UserWarning) - continue + if qname.namespace != NS_PROV: + raise ProvXMLException("Non PROV element discovered in " + "document or bundle.") + # Ignore the element storing non-PROV information. + if qname.localname == "other": + warnings.warn( + "Document contains non-PROV information in " + ". It will be ignored in this package.", + UserWarning) + continue - id_tag = _ns_prov("id") - rec_id = element.attrib[id_tag] if id_tag in element.attrib \ - else None + id_tag = _ns_prov("id") + rec_id = element.attrib[id_tag] if id_tag in element.attrib \ + else None - # Recursively read bundles. - if qname.localname == "bundleContent": - b = bundle.bundle(identifier=rec_id) - self.deserialize_subtree(element, b) - continue + # Recursively read bundles. + if qname.localname == "bundleContent": + b = bundle.bundle(identifier=rec_id) + self.deserialize_subtree(element, b) + continue - attributes, other_attributes = self._extract_attributes( - element, r_nsmap) + attributes, other_attributes = self._extract_attributes( + element, r_nsmap) - # Bundles are a bit special. Their metadata is represented - # as an entity with type bundle. - if qname.localname == "bundle": - rec_type = PROV_ENTITY - other_attributes.insert(0, ( - PROV["type"], prov.model.Literal("prov:bundle", - XSD_QNAME))) - else: - rec_type = PROV_RECORD_IDS_MAP[qname.localname] + # Bundles are a bit special. Their metadata is represented as an + # entity with type "bundle". 
+ if qname.localname == "bundle": + rec_type = PROV_ENTITY + other_attributes.insert(0, ( + PROV["type"], prov.model.Literal("prov:bundle", + XSD_QNAME))) + else: + rec_type = PROV_RECORD_IDS_MAP[qname.localname] - if _ns_xsi("type") in element.attrib: - value = element.attrib[_ns_xsi("type")] - other_attributes.append((PROV["type"], value)) + if _ns_xsi("type") in element.attrib: + value = element.attrib[_ns_xsi("type")] + other_attributes.append((PROV["type"], value)) - bundle.new_record(rec_type, rec_id, attributes, - other_attributes) - else: - raise NotImplementedError + bundle.new_record(rec_type, rec_id, attributes, other_attributes) return bundle def _check_if_bundle_entity(self, rec_type, attr, value): @@ -257,6 +226,13 @@ def _check_if_bundle_entity(self, rec_type, attr, value): return False def _extract_attributes(self, element, r_nsmap): + """ + Extract the PROV attributes from an etree element. + + :param element: The lxml.etree.Element instance. + :param r_nsmap: A reverse namespace map going from namespace URI to + prefix. + """ attributes = [] other_attributes = [] for subel in element: @@ -269,10 +245,7 @@ def _extract_attributes(self, element, r_nsmap): sqname.localname) d = other_attributes - if len(subel.attrib) > 1: - raise NotImplementedError - elif len(subel.attrib) == 1: - key, value = subel.attrib.items()[0] + for key, value in subel.attrib.items(): if key == _ns_xsi("type"): _v = prov.model.Literal( subel.text, @@ -282,13 +255,49 @@ def _extract_attributes(self, element, r_nsmap): elif key == _ns(NS_XML, "lang"): _v = prov.model.Literal(subel.text, langtag=value) else: - raise NotImplementedError - else: + warnings.warn( + "The element '%s' contains an attribute %s='%s' " + "which is not representable in the prov module's " + "internal data model and will thus be ignored."
% + (_t, str(key), str(value)), UserWarning) + + if not subel.attrib: _v = subel.text + d.append((_t, _v)) return attributes, other_attributes +def sorted_attributes(element, attributes): + """ + Helper function sorting attributes into the order required by PROV-XML. + """ + attributes = list(attributes) + order = list(PROV_REC_CLS[element].FORMAL_ATTRIBUTES) + + # Append label, location, role, type, and value attributes. This is + # universal amongst all elements. + order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, + PROV_VALUE]) + + sorted_elements = [] + for item in order: + this_type_list = [] + for e in list(attributes): + if e[0] != item: + continue + this_type_list.append(e) + attributes.remove(e) + this_type_list.sort(key=lambda x: (str(x[0]), str(x[1]))) + sorted_elements.extend(this_type_list) + # Add remaining attributes. According to the spec, the other attributes + # have a fixed alphabetical order. + attributes.sort(key=lambda x: (str(x[0]), str(x[1]))) + sorted_elements.extend(attributes) + + return sorted_elements + + def _ns(ns, tag): return "{%s}%s" % (ns, tag) From 2df53712918d723650d956b6570731f6bd375aed Mon Sep 17 00:00:00 2001 From: Trung Dong Huynh Date: Fri, 1 Aug 2014 01:04:44 +0100 Subject: [PATCH 45/66] Added test_attributes and test_statements for prov-xml --- prov/tests/test_xml.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index e88dbc0e..a8193259 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -8,6 +8,8 @@ import warnings import prov.model as prov +from prov.tests.test_attributes import TestAttributes +from prov.tests.test_statements import RoundTripFromPythonTest EX_NS = ('ex', 'http://example.com/ns/ex#') @@ -331,5 +333,13 @@ def fct(self): setattr(ProvXMLRoundTripFromFileTestCase, test_name, fct) +class TestProvXMLAttributes(TestAttributes): + FORMAT = 'xml' + + +class TestProvXMLStatements(RoundTripFromPythonTest): + FORMAT = 'xml' 
+ + if __name__ == '__main__': unittest.main() From ded8435169f4c8283c83e5288d9a7a784a5270f3 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 1 Aug 2014 11:43:38 +0200 Subject: [PATCH 46/66] Fixing XSD namespace issue. --- prov/constants.py | 2 +- prov/serializers/provxml.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index 6ed3c70c..ca79db7d 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -6,7 +6,7 @@ # Built-in namespaces from prov.identifier import Namespace -XSD = Namespace('xsd', 'http://www.w3.org/2001/XMLSchema') +XSD = Namespace('xsd', 'http://www.w3.org/2001/XMLSchema#') PROV = Namespace('prov', 'http://www.w3.org/ns/prov#') # C1. Entities/Activities diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 41d3739b..cdb92e71 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -15,9 +15,8 @@ from prov.model import PROV_REC_CLS from prov.constants import * # NOQA -NS_PROV = "http://www.w3.org/ns/prov#" +NS_PROV = prov.constants.PROV.uri NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" -NS_XSD = "http://www.w3.org/2001/XMLSchema" NS_XML = "http://www.w3.org/XML/1998/namespace" @@ -55,7 +54,9 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # Add the prov, XSI, and XSD namespaces by default. nsmap["prov"] = NS_PROV nsmap["xsi"] = NS_XSI - nsmap["xsd"] = NS_XSD + # The XSD namespace for some reason has no hash at the end for PROV + # XML but for all other serializations it does. + nsmap["xsd"] = prov.constants.XSD.uri.rstrip("#") if element is not None: xml_bundle_root = etree.SubElement( @@ -170,6 +171,8 @@ def deserialize_subtree(self, xml_doc, bundle): for key, value in xml_doc.nsmap.items(): if (key, value) in doc_ns: continue + if key == "xsd": + value = value.rstrip("#") + "#" bundle.add_namespace(key, value) # No dictionary comprehension in Python 2.6. 
From 9bdb2d881417157a1e179313bef9266afa6ceb3c Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 1 Aug 2014 14:07:17 +0200 Subject: [PATCH 47/66] Reestablished compatibility with PROV-N. --- prov/constants.py | 49 ++++++++++++++------ prov/dot.py | 5 +- prov/model.py | 92 ++++++------------------------------- prov/serializers/provxml.py | 57 ++++++++++++----------- 4 files changed, 82 insertions(+), 121 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index ca79db7d..cf3f6d17 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -77,6 +77,40 @@ PROV_BUNDLE: u'bundle', } +# Maps qualified names from the PROV namespace to their base class. If a +# name has no base class it maps to itself. This is needed, for example, by +# the PROV XML (de)serializer, where extended types are used a lot. +PROV_BASE_CLS = { + PROV_ENTITY: PROV_ENTITY, + PROV_ACTIVITY: PROV_ACTIVITY, + PROV_GENERATION: PROV_GENERATION, + PROV_USAGE: PROV_USAGE, + PROV_COMMUNICATION: PROV_COMMUNICATION, + PROV_START: PROV_START, + PROV_END: PROV_END, + PROV_INVALIDATION: PROV_INVALIDATION, + PROV_DERIVATION: PROV_DERIVATION, + PROV_REVISION: PROV_DERIVATION, + PROV_QUOTATION: PROV_DERIVATION, + PROV_PRIMARY_SOURCE: PROV_DERIVATION, + PROV_AGENT: PROV_AGENT, + PROV_SOFTWARE_AGENT: PROV_AGENT, + PROV_PERSON: PROV_AGENT, + PROV_ORGANIZATION: PROV_AGENT, + PROV_ATTRIBUTION: PROV_ATTRIBUTION, + PROV_ASSOCIATION: PROV_ASSOCIATION, + PROV_PLAN: PROV_ENTITY, + PROV_DELEGATION: PROV_DELEGATION, + PROV_INFLUENCE: PROV_INFLUENCE, + PROV_ALTERNATE: PROV_ALTERNATE, + PROV_SPECIALIZATION: PROV_SPECIALIZATION, + PROV_MENTION: PROV_MENTION, + PROV_COLLECTION: PROV_ENTITY, + PROV_EMPTY_COLLECTION: PROV_ENTITY, + PROV_MEMBERSHIP: PROV_ENTITY, + PROV_BUNDLE: PROV_ENTITY +} + # Identifiers for PROV's attributes PROV_ATTR_ENTITY = PROV['entity'] PROV_ATTR_ACTIVITY = PROV['activity'] @@ -86,9 +120,6 @@ PROV_ATTR_STARTER = PROV['starter'] PROV_ATTR_ENDER = PROV['ender'] PROV_ATTR_AGENT = PROV['agent']
-PROV_ATTR_SOFTWARE_AGENT = PROV['softwareAgent'] -PROV_ATTR_PERSON = PROV['person'] -PROV_ATTR_ORGANIZATION = PROV['organization'] PROV_ATTR_PLAN = PROV['plan'] PROV_ATTR_DELEGATE = PROV['delegate'] PROV_ATTR_RESPONSIBLE = PROV['responsible'] @@ -104,7 +135,6 @@ PROV_ATTR_INFLUENCEE = PROV['influencee'] PROV_ATTR_INFLUENCER = PROV['influencer'] PROV_ATTR_COLLECTION = PROV['collection'] -PROV_ATTR_EMPTY_COLLECTION = PROV['emptyCollection'] # Literal properties PROV_ATTR_TIME = PROV['time'] @@ -121,9 +151,6 @@ PROV_ATTR_STARTER, PROV_ATTR_ENDER, PROV_ATTR_AGENT, - PROV_ATTR_SOFTWARE_AGENT, - PROV_ATTR_PERSON, - PROV_ATTR_ORGANIZATION, PROV_ATTR_PLAN, PROV_ATTR_DELEGATE, PROV_ATTR_RESPONSIBLE, @@ -138,8 +165,7 @@ PROV_ATTR_BUNDLE, PROV_ATTR_INFLUENCEE, PROV_ATTR_INFLUENCER, - PROV_ATTR_COLLECTION, - PROV_ATTR_EMPTY_COLLECTION + PROV_ATTR_COLLECTION ]) PROV_ATTRIBUTE_LITERALS = set([PROV_ATTR_TIME, PROV_ATTR_STARTTIME, PROV_ATTR_ENDTIME]) # Set of formal attributes of PROV records @@ -150,11 +176,6 @@ PROV_ID_ATTRIBUTES_MAP = dict((prov_id, attribute) for (prov_id, attribute) in PROV_RECORD_ATTRIBUTES) PROV_ATTRIBUTES_ID_MAP = dict((attribute, prov_id) for (prov_id, attribute) in PROV_RECORD_ATTRIBUTES) -# Some elements can have multiple attributes of the same type. 
-PROV_ELEMENTS_COLLECTION_LIKE = set([ - PROV_MEMBERSHIP -]) - # Extra definition for convenience PROV_TYPE = PROV['type'] PROV_LABEL = PROV['label'] diff --git a/prov/dot.py b/prov/dot.py index e31fed69..a2658324 100644 --- a/prov/dot.py +++ b/prov/dot.py @@ -23,7 +23,7 @@ ProvBundle, PROV_ACTIVITY, PROV_AGENT, PROV_ALTERNATE, PROV_ASSOCIATION, PROV_ATTRIBUTION, PROV_BUNDLE, PROV_COMMUNICATION, PROV_DERIVATION, PROV_DELEGATION, PROV_ENTITY, PROV_GENERATION, PROV_INFLUENCE, PROV_INVALIDATION, PROV_END, PROV_MEMBERSHIP, PROV_MENTION, PROV_SPECIALIZATION, PROV_START, PROV_USAGE, - Identifier, PROV_ATTRIBUTE_QNAMES, PROV_SOFTWARE_AGENT, PROV_PERSON + Identifier, PROV_ATTRIBUTE_QNAMES ) from prov.serializers.provxml import sorted_attributes @@ -38,8 +38,6 @@ PROV_ENTITY: {'shape': 'oval', 'style': 'filled', 'fillcolor': '#FFFC87', 'color': '#808080'}, PROV_ACTIVITY: {'shape': 'box', 'style': 'filled', 'fillcolor': '#9FB1FC', 'color': '#0000FF'}, PROV_AGENT: {'shape': 'house', 'style': 'filled', 'fillcolor': '#FED37F'}, - PROV_SOFTWARE_AGENT: {'shape': 'house', 'style': 'filled', 'fillcolor': '#FED37F'}, - PROV_PERSON: {'shape': 'house', 'style': 'filled', 'fillcolor': '#FED37F'}, # PROV_COLLECTION: {'label': 'wasGeneratedBy', 'fontsize': 10.0}, PROV_BUNDLE: {'shape': 'folder', 'style': 'filled', 'fillcolor': 'aliceblue'}, # Relations @@ -110,6 +108,7 @@ def _attach_attribute_annotation(node, record): if not attributes: return # No attribute to display + # Sort the attributes similar to how PROV XML does it. attributes = sorted_attributes(record.get_type(), attributes) ann_rows = [ANNOTATION_START_ROW] diff --git a/prov/model.py b/prov/model.py index cfc5101e..a1539c83 100644 --- a/prov/model.py +++ b/prov/model.py @@ -256,6 +256,15 @@ def add_attributes(self, attributes): if isinstance(attributes, dict): # Converting the dictionary into a list of tuples (i.e. 
attribute-value pairs) attributes = attributes.items() + + # Check if one of the attributes specifies that the current type + # is a collection. In that case multiple attributes of the same + # type are allowed. + if PROV_ATTR_COLLECTION in [_i[0] for _i in attributes]: + is_collection = True + else: + is_collection = False + for attr_name, original_value in attributes: if original_value is None: continue @@ -276,8 +285,8 @@ def add_attributes(self, attributes): if value is None: raise ProvException(u'Invalid value for attribute %s: %s' % (attr, original_value)) - if attr in PROV_ATTRIBUTES and self._attributes[attr] and \ - self.get_type() not in PROV_ELEMENTS_COLLECTION_LIKE: + if not is_collection and attr in PROV_ATTRIBUTES and \ + self._attributes[attr]: existing_value = first(self._attributes[attr]) if value != existing_value: raise ProvException(u'Cannot have more than one value for attribute %s' % attr) @@ -446,42 +455,12 @@ def get_type(self): return PROV_DERIVATION -class ProvRevision(ProvDerivation): - def get_type(self): - return PROV_REVISION - - -class ProvQuotation(ProvDerivation): - def get_type(self): - return PROV_QUOTATION - - -class ProvPrimarySource(ProvDerivation): - def get_type(self): - return PROV_PRIMARY_SOURCE - - ### Component 3: Agents, Responsibility, and Influence class ProvAgent(ProvElement): def get_type(self): return PROV_AGENT -class ProvSoftwareAgent(ProvElement): - def get_type(self): - return PROV_SOFTWARE_AGENT - - -class ProvPerson(ProvElement): - def get_type(self): - return PROV_PERSON - - -class ProvOrganization(ProvElement): - def get_type(self): - return PROV_ORGANIZATION - - class ProvAttribution(ProvRelation): FORMAL_ATTRIBUTES = (PROV_ATTR_ENTITY, PROV_ATTR_AGENT) @@ -489,11 +468,6 @@ def get_type(self): return PROV_ATTRIBUTION -class ProvPlan(ProvEntity): - def get_type(self): - return PROV_PLAN - - class ProvAssociation(ProvRelation): FORMAL_ATTRIBUTES = (PROV_ATTR_ACTIVITY, PROV_ATTR_AGENT, PROV_ATTR_PLAN) @@ 
-538,16 +512,6 @@ def get_type(self): ### Component 6: Collections -class ProvCollection(ProvEntity): - def get_type(self): - return PROV_COLLECTION - - -class ProvEmptyCollection(ProvCollection): - def get_type(self): - return PROV_EMPTY_COLLECTION - - class ProvMembership(ProvRelation): FORMAL_ATTRIBUTES = (PROV_ATTR_COLLECTION, PROV_ATTR_ENTITY) @@ -566,23 +530,14 @@ def get_type(self): PROV_END: ProvEnd, PROV_INVALIDATION: ProvInvalidation, PROV_DERIVATION: ProvDerivation, - PROV_REVISION: ProvRevision, - PROV_QUOTATION: ProvQuotation, - PROV_PRIMARY_SOURCE: ProvPrimarySource, PROV_AGENT: ProvAgent, - PROV_SOFTWARE_AGENT: ProvSoftwareAgent, - PROV_PERSON: ProvPerson, - PROV_ORGANIZATION: ProvOrganization, PROV_ATTRIBUTION: ProvAttribution, PROV_ASSOCIATION: ProvAssociation, - PROV_PLAN: ProvPlan, PROV_DELEGATION: ProvDelegation, PROV_INFLUENCE: ProvInfluence, PROV_SPECIALIZATION: ProvSpecialization, PROV_ALTERNATE: ProvAlternate, PROV_MENTION: ProvMention, - PROV_COLLECTION: ProvCollection, - PROV_EMPTY_COLLECTION: ProvEmptyCollection, PROV_MEMBERSHIP: ProvMembership, } @@ -1019,18 +974,6 @@ def communication(self, informed, informant, identifier=None, other_attributes=N def agent(self, identifier, other_attributes=None): return self.new_record(PROV_AGENT, identifier, None, other_attributes) - def software_agent(self, identifier, other_attributes=None): - return self.new_record(PROV_SOFTWARE_AGENT, identifier, None, - other_attributes) - - def organization(self, identifier, other_attributes=None): - return self.new_record(PROV_ORGANIZATION, identifier, None, - other_attributes) - - def person(self, identifier, other_attributes=None): - return self.new_record(PROV_PERSON, identifier, None, - other_attributes) - def attribution(self, entity, agent, identifier=None, other_attributes=None): return self.new_record( PROV_ATTRIBUTION, identifier, { @@ -1081,13 +1024,13 @@ def derivation(self, generatedEntity, usedEntity, activity=None, generation=None def 
revision(self, generatedEntity, usedEntity, activity=None, generation=None, usage=None, identifier=None, other_attributes=None): record = self.derivation(generatedEntity, usedEntity, activity, generation, usage, identifier, other_attributes) - record.add_asserted_type(PROV_REVISION) + record.add_asserted_type(PROV['Revision']) return record def quotation(self, generatedEntity, usedEntity, activity=None, generation=None, usage=None, identifier=None, other_attributes=None): record = self.derivation(generatedEntity, usedEntity, activity, generation, usage, identifier, other_attributes) - record.add_asserted_type(PROV_QUOTATION) + record.add_asserted_type(PROV['Quotation']) return record def primary_source(self, generatedEntity, usedEntity, activity=None, generation=None, usage=None, @@ -1122,13 +1065,8 @@ def mention(self, specificEntity, generalEntity, bundle,): ) def collection(self, identifier, other_attributes=None): - record = self.new_record(PROV_COLLECTION, identifier, None, - other_attributes) - return record - - def emptyCollection(self, identifier, other_attributes=None): - record = self.new_record(PROV_EMPTY_COLLECTION, identifier, None, - other_attributes) + record = self.new_record(PROV_ENTITY, identifier, None, other_attributes) + record.add_asserted_type(PROV['Collection']) return record def membership(self, collection, entity): diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index cdb92e71..208199b2 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -69,7 +69,6 @@ def serialize_bundle(self, bundle, element=None, force_types=False): for record in bundle._records: rec_type = record.get_type() - rec_label = PROV_N_MAP[rec_type] identifier = unicode(record._identifier) \ if record._identifier else None @@ -82,20 +81,13 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # actually is a proper bundle element. 
Loop through the # attributes to check if an attribute designates the current # element as a bundle element. - for attr, value in sorted_attributes(rec_type, record.attributes): - if self._check_if_bundle_entity(rec_type, attr, value): - rec_label = "bundle" - break + attributes = list(record.attributes) + rec_label = self._derive_record_label(rec_type, attributes) elem = etree.SubElement(xml_bundle_root, _ns_prov(rec_label), attrs) - for attr, value in sorted_attributes(rec_type, record.attributes): - # Do not write the Bundle type specifier to the attributes. - # That information will be encoded in the parent element's tag. - if self._check_if_bundle_entity(rec_type, attr, value): - continue - + for attr, value in sorted_attributes(rec_type, attributes): subelem = etree.SubElement( elem, _ns(attr.namespace.uri, attr.localpart)) if isinstance(value, prov.model.Literal): @@ -204,29 +196,40 @@ def deserialize_subtree(self, xml_doc, bundle): attributes, other_attributes = self._extract_attributes( element, r_nsmap) - # Bundles are a bit special. Their metadata is represented as an - # entity with type "bundle". - if qname.localname == "bundle": - rec_type = PROV_ENTITY - other_attributes.insert(0, ( - PROV["type"], prov.model.Literal("prov:bundle", - XSD_QNAME))) - else: - rec_type = PROV_RECORD_IDS_MAP[qname.localname] + # Map the record type to its base type. + q_prov_name = PROV_RECORD_IDS_MAP[qname.localname] + rec_type = PROV_BASE_CLS[q_prov_name] if _ns_xsi("type") in element.attrib: value = element.attrib[_ns_xsi("type")] other_attributes.append((PROV["type"], value)) - bundle.new_record(rec_type, rec_id, attributes, other_attributes) + rec = bundle.new_record(rec_type, rec_id, attributes, + other_attributes) + + # Add the actual type in case a base type has been used. 
+ if rec_type != q_prov_name: + rec.add_asserted_type(q_prov_name) return bundle - def _check_if_bundle_entity(self, rec_type, attr, value): - if rec_type == PROV_ENTITY and attr == PROV_TYPE and ( - value == "prov:bundle" or (isinstance(value, prov.model.Literal) - and value.value == "prov:bundle")): - return True - return False + def _derive_record_label(self, rec_type, attributes): + """ + tries to derive the record label taking care of subtypes and what + not. It will also remove the type declaration for the attributes if + it was used to specialize the type . + """ + rec_label = PROV_N_MAP[rec_type] + + for key, value in list(attributes): + if key != PROV_TYPE: + continue + if isinstance(value, prov.model.Literal): + value = value.value + if value in PROV_BASE_CLS and PROV_BASE_CLS[value] != value: + attributes.remove((key, value)) + rec_label = PROV_N_MAP[value] + break + return rec_label def _extract_attributes(self, element, r_nsmap): """ From c5fabd6329d24be17e154504ef90ff33cd80bbac Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Fri, 1 Aug 2014 14:17:09 +0200 Subject: [PATCH 48/66] Removing more PROV-N types and putting them in separate mapping --- prov/constants.py | 50 +++++++++++++++++-------------------- prov/serializers/provxml.py | 11 +++++--- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/prov/constants.py b/prov/constants.py index cf3f6d17..d87647f8 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -21,18 +21,11 @@ # C2. Derivations PROV_DERIVATION = PROV['Derivation'] -PROV_REVISION = PROV['Revision'] -PROV_QUOTATION = PROV['Quotation'] -PROV_PRIMARY_SOURCE = PROV['PrimarySource'] # C3. 
Agents/Responsibility PROV_AGENT = PROV['Agent'] -PROV_SOFTWARE_AGENT = PROV['SoftwareAgent'] -PROV_PERSON = PROV['Person'] -PROV_ORGANIZATION = PROV['Organization'] PROV_ATTRIBUTION = PROV['Attribution'] PROV_ASSOCIATION = PROV['Association'] -PROV_PLAN = PROV['Plan'] PROV_DELEGATION = PROV['Delegation'] PROV_INFLUENCE = PROV['Influence'] # C4. Bundles @@ -42,8 +35,6 @@ PROV_SPECIALIZATION = PROV['Specialization'] PROV_MENTION = PROV['Mention'] # C6. Collections -PROV_COLLECTION = PROV['Collection'] -PROV_EMPTY_COLLECTION = PROV['EmptyCollection'] PROV_MEMBERSHIP = PROV['Membership'] PROV_N_MAP = { @@ -56,27 +47,32 @@ PROV_END: u'wasEndedBy', PROV_INVALIDATION: u'wasInvalidatedBy', PROV_DERIVATION: u'wasDerivedFrom', - PROV_REVISION: u'wasRevisionOf', - PROV_QUOTATION: u'wasQuotedFrom', - PROV_PRIMARY_SOURCE: u'hadPrimarySource', PROV_AGENT: u'agent', - PROV_SOFTWARE_AGENT: u'softwareAgent', - PROV_PERSON: u'person', - PROV_ORGANIZATION: u'organization', PROV_ATTRIBUTION: u'wasAttributedTo', PROV_ASSOCIATION: u'wasAssociatedWith', - PROV_PLAN: u'plan', PROV_DELEGATION: u'actedOnBehalfOf', PROV_INFLUENCE: u'wasInfluencedBy', PROV_ALTERNATE: u'alternateOf', PROV_SPECIALIZATION: u'specializationOf', PROV_MENTION: u'mentionOf', - PROV_COLLECTION: u'collection', - PROV_EMPTY_COLLECTION: u'emptyCollection', PROV_MEMBERSHIP: u'hadMember', PROV_BUNDLE: u'bundle', } +# Records defined as subtypes in PROV-N but top level types in for example +# PROV XML also need a mapping. +ADDITIONAL_N_MAP = { + PROV['Revision']: u'wasRevisionOf', + PROV['Quotation']: u'wasQuotedFrom', + PROV['PrimarySource']: u'hadPrimarySource', + PROV['SofwareAgent']: u'softwareAgent', + PROV['Person']: u'person', + PROV['Organization']: u'organization', + PROV['Plan']: u'plan', + PROV['Collection']: u'collection', + PROV['EmptyCollection']: u'emptyCollection', +} + # Maps qualified names from the PROV namespace to their base class. If it # has no baseclass it maps to itsself. 
This is needed for example for PROV # XML (de)serializer where extended types are used a lot. @@ -90,23 +86,23 @@ PROV_END: PROV_END, PROV_INVALIDATION: PROV_INVALIDATION, PROV_DERIVATION: PROV_DERIVATION, - PROV_REVISION: PROV_DERIVATION, - PROV_QUOTATION: PROV_DERIVATION, - PROV_PRIMARY_SOURCE: PROV_DERIVATION, + PROV['Revision']: PROV_DERIVATION, + PROV['Quotation']: PROV_DERIVATION, + PROV['PrimarySource']: PROV_DERIVATION, PROV_AGENT: PROV_AGENT, - PROV_SOFTWARE_AGENT: PROV_AGENT, - PROV_PERSON: PROV_AGENT, - PROV_ORGANIZATION: PROV_AGENT, + PROV['SofwareAgent']: PROV_AGENT, + PROV['Person']: PROV_AGENT, + PROV['Organization']: PROV_AGENT, PROV_ATTRIBUTION: PROV_ATTRIBUTION, PROV_ASSOCIATION: PROV_ASSOCIATION, - PROV_PLAN: PROV_ENTITY, + PROV['Plan']: PROV_ENTITY, PROV_DELEGATION: PROV_DELEGATION, PROV_INFLUENCE: PROV_INFLUENCE, PROV_ALTERNATE: PROV_ALTERNATE, PROV_SPECIALIZATION: PROV_SPECIALIZATION, PROV_MENTION: PROV_MENTION, - PROV_COLLECTION: PROV_ENTITY, - PROV_EMPTY_COLLECTION: PROV_ENTITY, + PROV['Collection']: PROV_ENTITY, + PROV['EmptyCollection']: PROV_ENTITY, PROV_MEMBERSHIP: PROV_ENTITY, PROV_BUNDLE: PROV_ENTITY } diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 208199b2..a011de58 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -19,6 +19,11 @@ NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" NS_XML = "http://www.w3.org/XML/1998/namespace" +FULL_NAMES_MAP = dict(PROV_N_MAP) +FULL_NAMES_MAP.update(ADDITIONAL_N_MAP) +FULL_PROV_RECORD_IDS_MAP = dict((FULL_NAMES_MAP[rec_type_id], rec_type_id) for + rec_type_id in FULL_NAMES_MAP) + class ProvXMLException(prov.Error): pass @@ -197,7 +202,7 @@ def deserialize_subtree(self, xml_doc, bundle): element, r_nsmap) # Map the record type to its base type. 
- q_prov_name = PROV_RECORD_IDS_MAP[qname.localname] + q_prov_name = FULL_PROV_RECORD_IDS_MAP[qname.localname] rec_type = PROV_BASE_CLS[q_prov_name] if _ns_xsi("type") in element.attrib: @@ -218,7 +223,7 @@ def _derive_record_label(self, rec_type, attributes): not. It will also remove the type declaration for the attributes if it was used to specialize the type . """ - rec_label = PROV_N_MAP[rec_type] + rec_label = FULL_NAMES_MAP[rec_type] for key, value in list(attributes): if key != PROV_TYPE: @@ -227,7 +232,7 @@ def _derive_record_label(self, rec_type, attributes): value = value.value if value in PROV_BASE_CLS and PROV_BASE_CLS[value] != value: attributes.remove((key, value)) - rec_label = PROV_N_MAP[value] + rec_label = FULL_NAMES_MAP[value] break return rec_label From a1e3e65653985548e15184881e5ae24b499d2044 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 18 Aug 2014 19:58:27 +0200 Subject: [PATCH 49/66] Adding XSI to the default namespaces. --- prov/constants.py | 1 + prov/model.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/prov/constants.py b/prov/constants.py index d87647f8..c362a7e4 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -8,6 +8,7 @@ XSD = Namespace('xsd', 'http://www.w3.org/2001/XMLSchema#') PROV = Namespace('prov', 'http://www.w3.org/ns/prov#') +XSI = Namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance') # C1. 
Entities/Activities PROV_ENTITY = PROV['Entity'] diff --git a/prov/model.py b/prov/model.py index a1539c83..e7aa2bd3 100644 --- a/prov/model.py +++ b/prov/model.py @@ -542,7 +542,7 @@ def get_type(self): } -DEFAULT_NAMESPACES = {'prov': PROV, 'xsd': XSD} +DEFAULT_NAMESPACES = {'prov': PROV, 'xsd': XSD, 'xsi': XSI} # Bundle From 3bd97625cc2013cb4144030efcc08ff2ccb18b5f Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 18 Aug 2014 20:23:27 +0200 Subject: [PATCH 50/66] Changed parsing of boolean values to be more robust --- prov/model.py | 10 +++++++++- prov/serializers/provxml.py | 13 ++++++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/prov/model.py b/prov/model.py index e7aa2bd3..8ae020e8 100644 --- a/prov/model.py +++ b/prov/model.py @@ -47,6 +47,14 @@ def parse_xsd_datetime(value): pass return None +def parse_boolean(value): + if value.lower() in ["false", "0"]: + return False + elif value.lower() in ["true", "1"]: + return False + else: + return None + DATATYPE_PARSERS = { datetime.datetime: parse_xsd_datetime, } @@ -58,7 +66,7 @@ def parse_xsd_datetime(value): XSD_DOUBLE: float, XSD_LONG: long, XSD_INT: int, - XSD_BOOLEAN: bool, + XSD_BOOLEAN: parse_boolean, XSD_DATETIME: parse_xsd_datetime, XSD_ANYURI: Identifier } diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index a011de58..21a93e7e 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -119,21 +119,23 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # The not startswith("prov:") check is a little bit hacky to # avoid type interference when the type is a standard prov # type. 
- if (force_types or attr in [PROV_TYPE, PROV_LOCATION, - PROV_VALUE]) and \ + if (force_types or + isinstance(value, bool) or + attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE]) and \ _ns_xsi("type") not in subelem.attrib and \ not str(value).startswith("prov:") and \ not (attr in PROV_ATTRIBUTE_QNAMES and v) and \ attr not in [PROV_ATTR_TIME, PROV_LABEL]: xsd_type = None - if isinstance(value, (str, unicode)): + if isinstance(value, bool): + xsd_type = XSD_BOOLEAN + v = v.lower() + elif isinstance(value, (str, unicode)): xsd_type = XSD_STRING elif isinstance(value, float): xsd_type = XSD_DOUBLE elif isinstance(value, int): xsd_type = XSD_INT - elif isinstance(value, bool): - xsd_type = XSD_BOOLEAN elif isinstance(value, datetime.datetime): xsd_type = XSD_DATETIME elif isinstance(value, prov.identifier.Identifier): @@ -276,6 +278,7 @@ def _extract_attributes(self, element, r_nsmap): _v = subel.text d.append((_t, _v)) + return attributes, other_attributes From b71cd90093de9e918a67ce834cb0363bcc1e72f4 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 18 Aug 2014 22:49:02 +0200 Subject: [PATCH 51/66] Better qualified name handling in PROV XML. Now properly read and written for records where it requires explicit setting. 
--- prov/serializers/provxml.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 21a93e7e..aedc196d 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) import prov +import prov.identifier from prov.model import PROV_REC_CLS from prov.constants import * # NOQA @@ -104,6 +105,10 @@ def serialize_bundle(self, bundle, element=None, force_types=False): if value.langtag is not None: subelem.attrib[_ns(NS_XML, "lang")] = value.langtag v = value.value + elif isinstance(value, prov.model.QualifiedName): + if attr not in PROV_ATTRIBUTE_QNAMES: + subelem.attrib[_ns_xsi("type")] = "xsd:QName" + v = str(value) elif isinstance(value, datetime.datetime): v = value.isoformat() else: @@ -201,7 +206,7 @@ def deserialize_subtree(self, xml_doc, bundle): continue attributes, other_attributes = self._extract_attributes( - element, r_nsmap) + element, r_nsmap, bundle.namespaces) # Map the record type to its base type. q_prov_name = FULL_PROV_RECORD_IDS_MAP[qname.localname] @@ -238,13 +243,14 @@ def _derive_record_label(self, rec_type, attributes): break return rec_label - def _extract_attributes(self, element, r_nsmap): + def _extract_attributes(self, element, r_nsmap, namespaces): """ Extract the PROV attributes from an etree element. :param element: The lxml.etree.Element instance. :param r_nsmap: A reverse namespace map going from prefix to namespace URI. + :param namespaces: The namespace set defined for the current bundle. 
""" attributes = [] other_attributes = [] @@ -260,9 +266,24 @@ def _extract_attributes(self, element, r_nsmap): for key, value in subel.attrib.items(): if key == _ns_xsi("type"): - _v = prov.model.Literal( - subel.text, - XSD[value.split(":")[1]]) + try: + _namespace, _localpart = subel.text.split(":") + except ValueError: + _namespace, _localpart = None, subel.text + # If it is an xsd:QName, make sure it is returned as a + # QualifiedName instance! + if value == "xsd:QName" and _namespace and \ + _namespace != "prov": + _ns_obj = Namespace(_namespace, subel.nsmap[_namespace]) + if _ns_obj not in namespaces: + raise ProvXMLException( + "QualifiedName '%s' has an unknown namespace." + % subel.text) + _v = prov.identifier.QualifiedName(_ns_obj, _localpart) + else: + _v = prov.model.Literal( + subel.text, + XSD[value.split(":")[1]]) elif key == _ns_prov("ref"): _v = value elif key == _ns(NS_XML, "lang"): From 3749ab1810f7274eaf48c890cca3508461618b7b Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 18 Aug 2014 22:52:14 +0200 Subject: [PATCH 52/66] Always write datetime xsd type in PROV XML --- prov/serializers/provxml.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index aedc196d..1cf362a1 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -124,8 +124,9 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # The not startswith("prov:") check is a little bit hacky to # avoid type interference when the type is a standard prov # type. 
+ ALWAYS_CHECK = (bool, datetime.datetime) if (force_types or - isinstance(value, bool) or + type(value) in ALWAYS_CHECK or attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE]) and \ _ns_xsi("type") not in subelem.attrib and \ not str(value).startswith("prov:") and \ From b7145cf0e22e00c661abb6f21f3f057fe3587594 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 18 Aug 2014 23:00:24 +0200 Subject: [PATCH 53/66] More necessary type conversions --- prov/model.py | 6 +++--- prov/serializers/provxml.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/prov/model.py b/prov/model.py index 8ae020e8..00328020 100644 --- a/prov/model.py +++ b/prov/model.py @@ -48,10 +48,10 @@ def parse_xsd_datetime(value): return None def parse_boolean(value): - if value.lower() in ["false", "0"]: - return False - elif value.lower() in ["true", "1"]: + if value.lower() in ("false", "0"): return False + elif value.lower() in ("true", "1"): + return True else: return None diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 1cf362a1..9637412a 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -124,7 +124,7 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # The not startswith("prov:") check is a little bit hacky to # avoid type interference when the type is a standard prov # type. 
+ ALWAYS_CHECK = (bool, datetime.datetime, int, float, long) if (force_types or type(value) in ALWAYS_CHECK or attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE]) and \ @@ -140,7 +140,7 @@ def serialize_bundle(self, bundle, element=None, force_types=False): xsd_type = XSD_STRING elif isinstance(value, float): xsd_type = XSD_DOUBLE - elif isinstance(value, int): + elif isinstance(value, (int, long)): xsd_type = XSD_INT elif isinstance(value, datetime.datetime): xsd_type = XSD_DATETIME From aa13756d8c0f25a93316c7b4ab6b0446d0a03052 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Mon, 18 Aug 2014 23:23:02 +0200 Subject: [PATCH 54/66] Setting the lang attribute on a Literal forces it to be type internationalized string as required by PROV XML. PROV JSON requires the type attribute to not be set in the case of the lang attribute being used. --- prov/model.py | 5 ++++- prov/serializers/provxml.py | 3 ++- prov/tests/test_attributes.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/prov/model.py b/prov/model.py index 00328020..3511cff4 100644 --- a/prov/model.py +++ b/prov/model.py @@ -105,7 +105,10 @@ def __init__(self, value, datatype=None, langtag=None): if datatype is None: logger.debug('Assuming prov:InternationalizedString as the type of "%s"@%s' % (value, langtag)) datatype = PROV["InternationalizedString"] - elif datatype != PROV["InternationalizedString"] and datatype != XSD_STRING: + # PROV JSON states that the type field must not be set when + # using the lang attribute and PROV XML requires it to be an + # internationalized string. + elif datatype != PROV["InternationalizedString"]: logger.warn( 'Invalid data type (%s) for "%s"@%s, overridden as prov:InternationalizedString.' 
% (datatype, value, langtag) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 9637412a..a1a5c413 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -124,7 +124,8 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # The not startswith("prov:") check is a little bit hacky to # avoid type interference when the type is a standard prov # type. - ALWAYS_CHECK = (bool, datetime.datetime, int, float, long) + ALWAYS_CHECK = (bool, datetime.datetime, int, float, long, + prov.identifier.Identifier) if (force_types or type(value) in ALWAYS_CHECK or attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE]) and \ diff --git a/prov/tests/test_attributes.py b/prov/tests/test_attributes.py index 4f216f6e..9f7297af 100644 --- a/prov/tests/test_attributes.py +++ b/prov/tests/test_attributes.py @@ -12,7 +12,7 @@ class TestAttributes(RoundTripTestCase): attribute_values = [ "un lieu", Literal("un lieu", langtag='fr'), - Literal("a place", datatype=XSD_STRING, langtag='en'), + Literal("a place", langtag='en'), Literal(1, XSD_INT), Literal(1, XSD_LONG), Literal(1, XSD_SHORT), From 4cfe9656b2be85e59ba4d573f3d06c02201c3c62 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 00:28:14 +0200 Subject: [PATCH 55/66] Fixing some more small errors and inconsistencies --- prov/serializers/provxml.py | 16 +++++++++++++--- prov/tests/test_xml.py | 13 ++++++++----- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index a1a5c413..4a71a59a 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -144,7 +144,14 @@ def serialize_bundle(self, bundle, element=None, force_types=False): elif isinstance(value, (int, long)): xsd_type = XSD_INT elif isinstance(value, datetime.datetime): - xsd_type = XSD_DATETIME + # Exception of the exception, while technically + # still correct, do not write XSD dateTime type for + # attributes in the 
PROV namespaces as the type is + # already declared in the XSD and PROV XML also does + # not specify it in the docs. + if attr.namespace.prefix != "prov" \ + or "time" not in attr.localpart.lower(): + xsd_type = XSD_DATETIME elif isinstance(value, prov.identifier.Identifier): xsd_type = XSD_ANYURI @@ -317,6 +324,9 @@ def sorted_attributes(element, attributes): order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, PROV_VALUE]) + sort_fct = lambda x: ( + str(x[0]), str(x[1].value if hasattr(x[1], "value") else x[1])) + sorted_elements = [] for item in order: this_type_list = [] @@ -325,11 +335,11 @@ def sorted_attributes(element, attributes): continue this_type_list.append(e) attributes.remove(e) - this_type_list.sort(key=lambda x: (str(x[0]), str(x[1]))) + this_type_list.sort(key=sort_fct) sorted_elements.extend(this_type_list) # Add remaining attributes. According to the spec, the other attributes # have a fixed alphabetical order. - attributes.sort(key=lambda x: (str(x[0]), str(x[1]))) + attributes.sort(key=sort_fct) sorted_elements.extend(attributes) return sorted_elements diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index a8193259..0ebc182e 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -7,6 +7,7 @@ import unittest import warnings +from prov.identifier import Namespace, QualifiedName import prov.model as prov from prov.tests.test_attributes import TestAttributes from prov.tests.test_statements import RoundTripFromPythonTest @@ -167,13 +168,14 @@ def test_deserialization_example_7(self): format="xml") expected_document = prov.ProvDocument() - expected_document.add_namespace(*EX_NS) + ex_ns = Namespace(*EX_NS) + expected_document.add_namespace(ex_ns) expected_document.activity( "ex:a1", "2011-11-16T16:05:00", "2011-11-16T16:06:00", [ - (prov.PROV_TYPE, prov.Literal("ex:edit", prov.XSD_QNAME)), + (prov.PROV_TYPE, QualifiedName(ex_ns, "edit")), ("ex:host", "server.example.org")]) self.assertEqual(actual_doc, 
expected_document) @@ -207,12 +209,13 @@ def test_deserialization_example_04_and_05(self): format="xml") expected_document = prov.ProvDocument() - expected_document.add_namespace(*EX_NS) + ex_ns = Namespace(*EX_NS) + expected_document.add_namespace(ex_ns) expected_document.add_namespace(*EX_TR) # The xsi:type attribute is mapped to a proper PROV attribute. expected_document.entity("tr:WD-prov-dm-20111215", ( - (prov.PROV_TYPE, prov.Literal("ex:Workflow", prov.XSD_QNAME)), + (prov.PROV_TYPE, QualifiedName(ex_ns, "Workflow")), (prov.PROV_TYPE, "prov:Plan"))) self.assertEqual(actual_document, expected_document) @@ -246,7 +249,7 @@ def test_deserialization_example_04_and_05(self): # The xsi:type attribute is mapped to a proper PROV attribute. expected_document.entity("tr:WD-prov-dm-20111215", ( - (prov.PROV_TYPE, prov.Literal("ex:Workflow", prov.XSD_QNAME)), + (prov.PROV_TYPE, QualifiedName(ex_ns, "Workflow")), (prov.PROV_TYPE, "prov:Entity"), (prov.PROV_TYPE, "prov:Plan") )) From f535cbef4bd9a97252dc37f226bb19b705ec96dc Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 08:35:33 +0200 Subject: [PATCH 56/66] Correct namespace handling for namespaces defined at the bundle level --- prov/serializers/provxml.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 4a71a59a..f33662a8 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -57,6 +57,9 @@ def serialize_bundle(self, bundle, element=None, force_types=False): self.document._namespaces.get_registered_namespaces()) if self.document._namespaces._default: nsmap[None] = self.document._namespaces._default.uri + for namespace in bundle.namespaces: + if namespace not in nsmap: + nsmap[namespace.prefix] = namespace.uri # Add the prov, XSI, and XSD namespaces by default. 
nsmap["prov"] = NS_PROV nsmap["xsi"] = NS_XSI @@ -164,6 +167,18 @@ def serialize_bundle(self, bundle, element=None, force_types=False): subelem.text = v return xml_bundle_root + def _add_xml_namespaces_to_bundle(self, xml_doc, bundle): + # Do not add namespaces already defined in the parent document in + # case it is a bundle. + doc_ns = [(i.prefix, i.uri) for i in bundle.document.namespaces] \ + if bundle.document is not None else [] + for key, value in xml_doc.nsmap.items(): + if (key, value) in doc_ns: + continue + if key == "xsd": + value = value.rstrip("#") + "#" + bundle.add_namespace(key, value) + def deserialize(self, stream, **kwargs): xml_doc = etree.parse(stream).getroot() @@ -177,16 +192,7 @@ def deserialize(self, stream, **kwargs): return document def deserialize_subtree(self, xml_doc, bundle): - # Do not add namespaces already defined in the parent document in - # case it is a bundle. - doc_ns = [(i.prefix, i.uri) for i in bundle.document.namespaces] \ - if bundle.document is not None else [] - for key, value in xml_doc.nsmap.items(): - if (key, value) in doc_ns: - continue - if key == "xsd": - value = value.rstrip("#") + "#" - bundle.add_namespace(key, value) + self._add_xml_namespaces_to_bundle(xml_doc, bundle) # No dictionary comprehension in Python 2.6. r_nsmap = dict((value, key) for (key, value) in xml_doc.nsmap.items()) @@ -210,6 +216,7 @@ def deserialize_subtree(self, xml_doc, bundle): # Recursively read bundles. if qname.localname == "bundleContent": + self._add_xml_namespaces_to_bundle(element, bundle) b = bundle.bundle(identifier=rec_id) self.deserialize_subtree(element, b) continue From 3e946d9a844afc3c9d8907d70dd2a81fc86fdc24 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 08:46:31 +0200 Subject: [PATCH 57/66] All tests pass. Fixed a bug in the base class assignments. 
--- prov/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prov/constants.py b/prov/constants.py index c362a7e4..dccbbfd8 100644 --- a/prov/constants.py +++ b/prov/constants.py @@ -104,7 +104,7 @@ PROV_MENTION: PROV_MENTION, PROV['Collection']: PROV_ENTITY, PROV['EmptyCollection']: PROV_ENTITY, - PROV_MEMBERSHIP: PROV_ENTITY, + PROV_MEMBERSHIP: PROV_MEMBERSHIP, PROV_BUNDLE: PROV_ENTITY } From 5cee0d29e9543b3846c69f3bbe8f8f60487bb63f Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 09:45:54 +0200 Subject: [PATCH 58/66] Cleanup and documentation. --- prov/serializers/provxml.py | 111 +++++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 27 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index f33662a8..bbbb090a 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -13,17 +13,17 @@ import prov import prov.identifier -from prov.model import PROV_REC_CLS +from prov.model import PROV_REC_CLS, DEFAULT_NAMESPACES from prov.constants import * # NOQA -NS_PROV = prov.constants.PROV.uri -NS_XSI = "http://www.w3.org/2001/XMLSchema-instance" -NS_XML = "http://www.w3.org/XML/1998/namespace" +# Create a dictionary containing all top-level PROV XML elements for an easy +# mapping. FULL_NAMES_MAP = dict(PROV_N_MAP) FULL_NAMES_MAP.update(ADDITIONAL_N_MAP) +# Inverse mapping. FULL_PROV_RECORD_IDS_MAP = dict((FULL_NAMES_MAP[rec_type_id], rec_type_id) for - rec_type_id in FULL_NAMES_MAP) + rec_type_id in FULL_NAMES_MAP) class ProvXMLException(prov.Error): @@ -33,13 +33,17 @@ class ProvXMLException(prov.Error): class ProvXMLSerializer(prov.Serializer): def serialize(self, stream, force_types=False, **kwargs): """ + Serializes to PROV XML. + :param stream: Where to save the output. :type force_types: boolean, optional :param force_types: Will force xsd:types to be written for most - attributes mainly only PROV-"attributes", e.g. 
tags not in the + attributes mainly PROV-"attributes", e.g. tags not in the PROV namespace. Off by default meaning xsd:type attributes will only be set for prov:type, prov:location, and prov:value as is - done in the official PROV-XML specification. + done in the official PROV-XML specification. Furthermore the + types will always be set if the Python type requires it. False + is a good default and it should rarely require changing. """ xml_root = self.serialize_bundle(bundle=self.document, force_types=force_types) @@ -51,6 +55,20 @@ def serialize(self, stream, force_types=False, **kwargs): encoding="UTF-8") def serialize_bundle(self, bundle, element=None, force_types=False): + """ + Serializes a bundle or document to PROV XML. + + :param bundle: The bundle or document. + :param element: The XML element to write to. Will be created if None. + :type force_types: boolean, optional + :param force_types: Will force xsd:types to be written for most + attributes mainly PROV-"attributes", e.g. tags not in the + PROV namespace. Off by default meaning xsd:type attributes will + only be set for prov:type, prov:location, and prov:value as is + done in the official PROV-XML specification. Furthermore the + types will always be set if the Python type requires it. False + is a good default and it should rarely require changing. + """ # Build the namespace map for lxml and attach it to the root XML # element. No dictionary comprehension in Python 2.6! nsmap = dict((ns.prefix, ns.uri) for ns in @@ -60,12 +78,14 @@ def serialize_bundle(self, bundle, element=None, force_types=False): for namespace in bundle.namespaces: if namespace not in nsmap: nsmap[namespace.prefix] = namespace.uri - # Add the prov, XSI, and XSD namespaces by default. - nsmap["prov"] = NS_PROV - nsmap["xsi"] = NS_XSI - # The XSD namespace for some reason has no hash at the end for PROV - # XML but for all other serializations it does. 
- nsmap["xsd"] = prov.constants.XSD.uri.rstrip("#") + + for key, value in DEFAULT_NAMESPACES.items(): + uri = value.uri + if value.prefix == "xsd": + # The XSD namespace for some reason has no hash at the end + # for PROV XML, but for all other serializations it does. + uri = uri.rstrip("#") + nsmap[value.prefix] = uri if element is not None: xml_bundle_root = etree.SubElement( @@ -86,10 +106,8 @@ def serialize_bundle(self, bundle, element=None, force_types=False): else: attrs = None - # The bundle record is still of type entity. In PROV XML it - # actually is a proper bundle element. Loop through the - # attributes to check if an attribute designates the current - # element as a bundle element. + # Derive the record label from its attributes which is sometimes + # needed. attributes = list(record.attributes) rec_label = self._derive_record_label(rec_type, attributes) @@ -106,7 +124,7 @@ def serialize_bundle(self, bundle, element=None, force_types=False): value.datatype.namespace.prefix, value.datatype.localpart) if value.langtag is not None: - subelem.attrib[_ns(NS_XML, "lang")] = value.langtag + subelem.attrib[_ns_xml("lang")] = value.langtag v = value.value elif isinstance(value, prov.model.QualifiedName): if attr not in PROV_ATTRIBUTE_QNAMES: @@ -127,6 +145,9 @@ def serialize_bundle(self, bundle, element=None, force_types=False): # The not startswith("prov:") check is a little bit hacky to # avoid type interference when the type is a standard prov # type. + # + # To enable a mapping of Python types to XML and back, + # the XSD type must be written for these types. ALWAYS_CHECK = (bool, datetime.datetime, int, float, long, prov.identifier.Identifier) if (force_types or @@ -168,6 +189,13 @@ def serialize_bundle(self, bundle, element=None, force_types=False): return xml_bundle_root def _add_xml_namespaces_to_bundle(self, xml_doc, bundle): + """ + Helper function adding the namespaces defined in the etree to the + bundle. + + :param xml_doc: An etree element. 
+ :param bundle: A prov bundle object. + """ # Do not add namespaces already defined in the parent document in # case it is a bundle. doc_ns = [(i.prefix, i.uri) for i in bundle.document.namespaces] \ @@ -180,6 +208,11 @@ def _add_xml_namespaces_to_bundle(self, xml_doc, bundle): bundle.add_namespace(key, value) def deserialize(self, stream, **kwargs): + """ + Deserialize from PROV XML to the internal prov document representation. + + :param stream: Input data. + """ xml_doc = etree.parse(stream).getroot() # Remove all comments. @@ -192,6 +225,13 @@ def deserialize(self, stream, **kwargs): return document def deserialize_subtree(self, xml_doc, bundle): + """ + Deserialize an etree element containing a PROV document or a bundle + and write it to the provided internal object. + + :param xml_doc: An etree element containing the information to read. + :param bundle: The bundle object to write to. + """ self._add_xml_namespaces_to_bundle(xml_doc, bundle) # No dictionary comprehension in Python 2.6. @@ -199,7 +239,7 @@ def deserialize_subtree(self, xml_doc, bundle): for element in xml_doc: qname = etree.QName(element) - if qname.namespace != NS_PROV: + if qname.namespace != DEFAULT_NAMESPACES["prov"].uri: raise ProvXMLException("Non PROV element discovered in " "document or bundle.") # Ignore the element storing non-PROV information. @@ -242,9 +282,12 @@ def deserialize_subtree(self, xml_doc, bundle): def _derive_record_label(self, rec_type, attributes): """ - tries to derive the record label taking care of subtypes and what - not. It will also remove the type declaration for the attributes if - it was used to specialize the type . + Helper function trying to derive the record label taking care of + subtypes and what not. It will also remove the type declaration for + the attributes if it was used to specialize the type. + + :param rec_type: The type of records. + :param attributes: The attributes of the record. 
""" rec_label = FULL_NAMES_MAP[rec_type] @@ -272,7 +315,7 @@ def _extract_attributes(self, element, r_nsmap, namespaces): other_attributes = [] for subel in element: sqname = etree.QName(subel) - if sqname.namespace == NS_PROV: + if sqname.namespace == DEFAULT_NAMESPACES["prov"].uri: _t = PROV[sqname.localname] d = attributes else: @@ -290,7 +333,8 @@ def _extract_attributes(self, element, r_nsmap, namespaces): # QualifiedName instance! if value == "xsd:QName" and _namespace and \ _namespace != "prov": - _ns_obj = Namespace(_namespace, subel.nsmap[_namespace]) + _ns_obj = Namespace(_namespace, + subel.nsmap[_namespace]) if _ns_obj not in namespaces: raise ProvXMLException( "QualifiedName '%s' has an unknown namespace." @@ -302,7 +346,7 @@ def _extract_attributes(self, element, r_nsmap, namespaces): XSD[value.split(":")[1]]) elif key == _ns_prov("ref"): _v = value - elif key == _ns(NS_XML, "lang"): + elif key == _ns_xml("lang"): _v = prov.model.Literal(subel.text, langtag=value) else: warnings.warn( @@ -322,6 +366,10 @@ def _extract_attributes(self, element, r_nsmap, namespaces): def sorted_attributes(element, attributes): """ Helper function sorting attributes into the order required by PROV-XML. + + :param element: The prov element used to derive the type and the + attribute order for the type. + :param attributes: The attributes to sort. """ attributes = list(attributes) order = list(PROV_REC_CLS[element].FORMAL_ATTRIBUTES) @@ -331,6 +379,10 @@ def sorted_attributes(element, attributes): order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, PROV_VALUE]) + # Sort function. The PROV XML specification talks about alphabetical + # sorting. We now interpret it as sorting by tag including the prefix + # first and then sorting by the text, also including the namespace + # prefix if given. 
sort_fct = lambda x: ( str(x[0]), str(x[1].value if hasattr(x[1], "value") else x[1])) @@ -357,8 +409,13 @@ def _ns(ns, tag): def _ns_prov(tag): - return _ns(NS_PROV, tag) + return _ns(DEFAULT_NAMESPACES['prov'].uri, tag) def _ns_xsi(tag): - return _ns(NS_XSI, tag) + return _ns(DEFAULT_NAMESPACES['xsi'].uri, tag) + + +def _ns_xml(tag): + NS_XML = "http://www.w3.org/XML/1998/namespace" + return _ns(NS_XML, tag) From 37515850063eeb0c9d56f47970bd66876974519f Mon Sep 17 00:00:00 2001 From: Trung Dong Huynh Date: Tue, 19 Aug 2014 12:11:44 +0100 Subject: [PATCH 59/66] Refactored the XML round-trip test class to include all currently available round-trip tests --- prov/tests/test_xml.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/prov/tests/test_xml.py b/prov/tests/test_xml.py index 0ebc182e..a24c7719 100644 --- a/prov/tests/test_xml.py +++ b/prov/tests/test_xml.py @@ -9,8 +9,8 @@ from prov.identifier import Namespace, QualifiedName import prov.model as prov -from prov.tests.test_attributes import TestAttributes -from prov.tests.test_statements import RoundTripFromPythonTest +from prov.tests.test_model import AllTestsBase +from prov.tests.utility import RoundTripTestCase EX_NS = ('ex', 'http://example.com/ns/ex#') @@ -336,11 +336,7 @@ def fct(self): setattr(ProvXMLRoundTripFromFileTestCase, test_name, fct) -class TestProvXMLAttributes(TestAttributes): - FORMAT = 'xml' - - -class TestProvXMLStatements(RoundTripFromPythonTest): +class RoundTripXMLTests(RoundTripTestCase, AllTestsBase): FORMAT = 'xml' From 9b887bc1ec9007e4aaeb6c9ec10457f4a1997804 Mon Sep 17 00:00:00 2001 From: Trung Dong Huynh Date: Tue, 19 Aug 2014 12:47:14 +0100 Subject: [PATCH 60/66] Added: Printing out the serialized content in when a round-trip test fails --- prov/tests/utility.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/prov/tests/utility.py b/prov/tests/utility.py index a23249e8..2932b2c2 100644 --- a/prov/tests/utility.py +++ b/prov/tests/utility.py @@ 
-74,4 +74,6 @@ def assertRoundTripEquivalence(self, prov_doc, msg=None): serialized_content = prov_doc.serialize(format=self.FORMAT, indent=4) prov_doc_new = ProvDocument.deserialize(content=serialized_content, format=self.FORMAT) + msg_extra = u"'%s' serialization content:\n%s" % (self.FORMAT, serialized_content) + msg = u'\n'.join((msg, msg_extra)) if msg else msg_extra self.assertEqual(prov_doc, prov_doc_new, msg) From 6249d4922d095f289da438bf67d198809299a0c0 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 15:42:39 +0200 Subject: [PATCH 61/66] Fixing unicode encoding issue in tests --- prov/serializers/provxml.py | 12 ++++++------ prov/tests/utility.py | 20 +++++++++++++++----- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index bbbb090a..1d92bb7c 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -129,11 +129,11 @@ def serialize_bundle(self, bundle, element=None, force_types=False): elif isinstance(value, prov.model.QualifiedName): if attr not in PROV_ATTRIBUTE_QNAMES: subelem.attrib[_ns_xsi("type")] = "xsd:QName" - v = str(value) + v = unicode(value) elif isinstance(value, datetime.datetime): v = value.isoformat() else: - v = str(value) + v = unicode(value) # xsd type inference. 
# @@ -154,7 +154,7 @@ def serialize_bundle(self, bundle, element=None, force_types=False): type(value) in ALWAYS_CHECK or attr in [PROV_TYPE, PROV_LOCATION, PROV_VALUE]) and \ _ns_xsi("type") not in subelem.attrib and \ - not str(value).startswith("prov:") and \ + not unicode(value).startswith("prov:") and \ not (attr in PROV_ATTRIBUTE_QNAMES and v) and \ attr not in [PROV_ATTR_TIME, PROV_LABEL]: xsd_type = None @@ -180,7 +180,7 @@ def serialize_bundle(self, bundle, element=None, force_types=False): xsd_type = XSD_ANYURI if xsd_type is not None: - subelem.attrib[_ns_xsi("type")] = str(xsd_type) + subelem.attrib[_ns_xsi("type")] = unicode(xsd_type) if attr in PROV_ATTRIBUTE_QNAMES and v: subelem.attrib[_ns_prov("ref")] = v @@ -353,7 +353,7 @@ def _extract_attributes(self, element, r_nsmap, namespaces): "The element '%s' contains an attribute %s='%s' " "which is not representable in the prov module's " "internal data model and will thus be ignored." % - (_t, str(key), str(value)), UserWarning) + (_t, unicode(key), unicode(value)), UserWarning) if not subel.attrib: _v = subel.text @@ -384,7 +384,7 @@ def sorted_attributes(element, attributes): # first and then sorting by the text, also including the namespace # prefix if given. 
sort_fct = lambda x: ( - str(x[0]), str(x[1].value if hasattr(x[1], "value") else x[1])) + unicode(x[0]), unicode(x[1].value if hasattr(x[1], "value") else x[1])) sorted_elements = [] for item in order: diff --git a/prov/tests/utility.py b/prov/tests/utility.py index 2932b2c2..43683c1e 100644 --- a/prov/tests/utility.py +++ b/prov/tests/utility.py @@ -1,3 +1,4 @@ +import io import logging import unittest @@ -72,8 +73,17 @@ def assertRoundTripEquivalence(self, prov_doc, msg=None): # This is a dummy test, just return return - serialized_content = prov_doc.serialize(format=self.FORMAT, indent=4) - prov_doc_new = ProvDocument.deserialize(content=serialized_content, format=self.FORMAT) - msg_extra = u"'%s' serialization content:\n%s" % (self.FORMAT, serialized_content) - msg = u'\n'.join((msg, msg_extra)) if msg else msg_extra - self.assertEqual(prov_doc, prov_doc_new, msg) + with io.BytesIO() as stream: + prov_doc.serialize(destination=stream, format=self.FORMAT, indent=4) + stream.seek(0, 0) + + prov_doc_new = ProvDocument.deserialize(source=stream, + format=self.FORMAT) + stream.seek(0, 0) + # Assume UTF-8 encoding which is forced by the particular + # PROV XML implementation and should also work for the PROV + # JSON implementation. 
+ msg_extra = u"'%s' serialization content:\n%s" % ( + self.FORMAT, stream.read().decode("utf-8")) + msg = u'\n'.join((msg, msg_extra)) if msg else msg_extra + self.assertEqual(prov_doc, prov_doc_new, msg) From f62ce2f728d11a14623cbf85e689a6a20cee90bc Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 15:47:22 +0200 Subject: [PATCH 62/66] Handling of default namespaces for PROV-XML --- prov/serializers/provxml.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 1d92bb7c..52582c95 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -203,9 +203,12 @@ def _add_xml_namespaces_to_bundle(self, xml_doc, bundle): for key, value in xml_doc.nsmap.items(): if (key, value) in doc_ns: continue - if key == "xsd": + elif key == "xsd": value = value.rstrip("#") + "#" - bundle.add_namespace(key, value) + elif key is None: + bundle.set_default_namespace(value) + else: + bundle.add_namespace(key, value) def deserialize(self, stream, **kwargs): """ From 9793e669de8b825891e5e6012b749104ab1f7615 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 15:53:18 +0200 Subject: [PATCH 63/66] Moving sorted_attributes() function to prov.models. 
--- prov/dot.py | 7 ++---- prov/model.py | 41 +++++++++++++++++++++++++++++++++++ prov/serializers/provxml.py | 43 +------------------------------------ 3 files changed, 44 insertions(+), 47 deletions(-) diff --git a/prov/dot.py b/prov/dot.py index a2658324..bc901dbc 100644 --- a/prov/dot.py +++ b/prov/dot.py @@ -23,10 +23,7 @@ ProvBundle, PROV_ACTIVITY, PROV_AGENT, PROV_ALTERNATE, PROV_ASSOCIATION, PROV_ATTRIBUTION, PROV_BUNDLE, PROV_COMMUNICATION, PROV_DERIVATION, PROV_DELEGATION, PROV_ENTITY, PROV_GENERATION, PROV_INFLUENCE, PROV_INVALIDATION, PROV_END, PROV_MEMBERSHIP, PROV_MENTION, PROV_SPECIALIZATION, PROV_START, PROV_USAGE, - Identifier, PROV_ATTRIBUTE_QNAMES -) - -from prov.serializers.provxml import sorted_attributes + Identifier, PROV_ATTRIBUTE_QNAMES, sorted_attributes) # Visual styles for various elements (nodes) and relations (edges) @@ -108,7 +105,7 @@ def _attach_attribute_annotation(node, record): if not attributes: return # No attribute to display - # Sort the attributes similar to how PROV XML does it. + # Sort the attributes. attributes = sorted_attributes(record.get_type(), attributes) ann_rows = [ANNOTATION_START_ROW] diff --git a/prov/model.py b/prov/model.py index 44586829..52411107 100644 --- a/prov/model.py +++ b/prov/model.py @@ -1324,3 +1324,44 @@ def deserialize(source=None, content=None, format='json', **args): else: with open(source) as f: return serializer.deserialize(f, **args) + + +def sorted_attributes(element, attributes): + """ + Helper function sorting attributes into the order required by PROV-XML. + + :param element: The prov element used to derive the type and the + attribute order for the type. + :param attributes: The attributes to sort. + """ + attributes = list(attributes) + order = list(PROV_REC_CLS[element].FORMAL_ATTRIBUTES) + + # Append label, location, role, type, and value attributes. This is + # universal amongst all elements. 
+ order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, + PROV_VALUE]) + + # Sort function. The PROV XML specification talks about alphabetical + # sorting. We now interpret it as sorting by tag including the prefix + # first and then sorting by the text, also including the namespace + # prefix if given. + sort_fct = lambda x: ( + unicode(x[0]), unicode(x[1].value if hasattr(x[1], "value") else x[1])) + + sorted_elements = [] + for item in order: + this_type_list = [] + for e in list(attributes): + if e[0] != item: + continue + this_type_list.append(e) + attributes.remove(e) + this_type_list.sort(key=sort_fct) + sorted_elements.extend(this_type_list) + # Add remaining attributes. According to the spec, the other attributes + # have a fixed alphabetical order. + attributes.sort(key=sort_fct) + sorted_elements.extend(attributes) + + return sorted_elements \ No newline at end of file diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index 52582c95..a7636b60 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -13,7 +13,7 @@ import prov import prov.identifier -from prov.model import PROV_REC_CLS, DEFAULT_NAMESPACES +from prov.model import DEFAULT_NAMESPACES, sorted_attributes from prov.constants import * # NOQA @@ -366,47 +366,6 @@ def _extract_attributes(self, element, r_nsmap, namespaces): return attributes, other_attributes -def sorted_attributes(element, attributes): - """ - Helper function sorting attributes into the order required by PROV-XML. - - :param element: The prov element used to derive the type and the - attribute order for the type. - :param attributes: The attributes to sort. - """ - attributes = list(attributes) - order = list(PROV_REC_CLS[element].FORMAL_ATTRIBUTES) - - # Append label, location, role, type, and value attributes. This is - # universal amongst all elements. - order.extend([PROV_LABEL, PROV_LOCATION, PROV_ROLE, PROV_TYPE, - PROV_VALUE]) - - # Sort function. 
The PROV XML specification talks about alphabetical - # sorting. We now interpret it as sorting by tag including the prefix - # first and then sorting by the text, also including the namespace - # prefix if given. - sort_fct = lambda x: ( - unicode(x[0]), unicode(x[1].value if hasattr(x[1], "value") else x[1])) - - sorted_elements = [] - for item in order: - this_type_list = [] - for e in list(attributes): - if e[0] != item: - continue - this_type_list.append(e) - attributes.remove(e) - this_type_list.sort(key=sort_fct) - sorted_elements.extend(this_type_list) - # Add remaining attributes. According to the spec, the other attributes - # have a fixed alphabetical order. - attributes.sort(key=sort_fct) - sorted_elements.extend(attributes) - - return sorted_elements - - def _ns(ns, tag): return "{%s}%s" % (ns, tag) From 592401ef8c42fdc85c62bf28be548a14127f101b Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 16:45:13 +0200 Subject: [PATCH 64/66] Workaround for weird behaviour of pypy when writing to the io classes. --- prov/serializers/provjson.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/prov/serializers/provjson.py b/prov/serializers/provjson.py index 7f15229d..21c2d890 100644 --- a/prov/serializers/provjson.py +++ b/prov/serializers/provjson.py @@ -9,7 +9,10 @@ from collections import defaultdict import datetime +import io import json +import platform +import StringIO from prov import Serializer, Error from prov.constants import * from prov.model import Literal, Identifier, QualifiedName, XSDQName, Namespace, ProvDocument, ProvBundle, \ @@ -44,6 +47,22 @@ def get_anon_id(self, obj, local_prefix="id"): class ProvJSONSerializer(Serializer): def serialize(self, stream, **kwargs): + # Special case handling for weird pypy behavior when writing to + # io.BytesIO and io.StringIO. 
+ if platform.python_implementation().lower() == "pypy": + if isinstance(stream, (io.StringIO, io.BytesIO)): + buf = StringIO.StringIO() + try: + json.dump(self.document, buf, cls=ProvJSONEncoder, + **kwargs) + buf.seek(0, 0) + if isinstance(stream, io.BytesIO): + stream.write(buf.read().encode('utf-8')) + else: + stream.write(buf.read()) + finally: + buf.close() + return json.dump(self.document, stream, cls=ProvJSONEncoder, **kwargs) def deserialize(self, stream, **kwargs): From c95036b4d73f3f1c2363657dcaa7c98ab9ad0670 Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Tue, 19 Aug 2014 17:10:24 +0200 Subject: [PATCH 65/66] Make sure it works with different types of file like objects. Turns out the weird behaviour was not pypy specific but rather a features of the JSON library. Not sure why it previously only manifested on pypy. --- prov/serializers/provjson.py | 29 +++++++++++++---------------- prov/serializers/provxml.py | 21 ++++++++++++++++++--- prov/tests/test_extras.py | 28 ++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 19 deletions(-) diff --git a/prov/serializers/provjson.py b/prov/serializers/provjson.py index 21c2d890..a7f79e3a 100644 --- a/prov/serializers/provjson.py +++ b/prov/serializers/provjson.py @@ -47,22 +47,19 @@ def get_anon_id(self, obj, local_prefix="id"): class ProvJSONSerializer(Serializer): def serialize(self, stream, **kwargs): - # Special case handling for weird pypy behavior when writing to - # io.BytesIO and io.StringIO. 
- if platform.python_implementation().lower() == "pypy": - if isinstance(stream, (io.StringIO, io.BytesIO)): - buf = StringIO.StringIO() - try: - json.dump(self.document, buf, cls=ProvJSONEncoder, - **kwargs) - buf.seek(0, 0) - if isinstance(stream, io.BytesIO): - stream.write(buf.read().encode('utf-8')) - else: - stream.write(buf.read()) - finally: - buf.close() - return + if isinstance(stream, (io.StringIO, io.BytesIO)): + buf = StringIO.StringIO() + try: + json.dump(self.document, buf, cls=ProvJSONEncoder, + **kwargs) + buf.seek(0, 0) + if isinstance(stream, io.BytesIO): + stream.write(buf.read().encode('utf-8')) + else: + stream.write(unicode(buf.read())) + finally: + buf.close() + return json.dump(self.document, stream, cls=ProvJSONEncoder, **kwargs) def deserialize(self, stream, **kwargs): diff --git a/prov/serializers/provxml.py b/prov/serializers/provxml.py index a7636b60..2d8aa33d 100644 --- a/prov/serializers/provxml.py +++ b/prov/serializers/provxml.py @@ -7,6 +7,8 @@ import datetime import logging from lxml import etree +import io +from StringIO import StringIO import warnings logger = logging.getLogger(__name__) @@ -50,9 +52,16 @@ def serialize(self, stream, force_types=False, **kwargs): for bundle in self.document.bundles: self.serialize_bundle(bundle=bundle, element=xml_root, force_types=force_types) + # No encoding must be specified when writing to String object which + # does not have the concept of an encoding as it should already + # represent unicode code points. 
et = etree.ElementTree(xml_root) - et.write(stream, pretty_print=True, xml_declaration=True, - encoding="UTF-8") + if isinstance(stream, (io.StringIO, StringIO)): + stream.write(unicode(etree.tostring(et, xml_declaration=True, + pretty_print=True))) + else: + et.write(stream, pretty_print=True, xml_declaration=True, + encoding="UTF-8") def serialize_bundle(self, bundle, element=None, force_types=False): """ @@ -216,7 +225,13 @@ def deserialize(self, stream, **kwargs): :param stream: Input data. """ - xml_doc = etree.parse(stream).getroot() + if isinstance(stream, (io.StringIO, StringIO)): + with io.BytesIO() as buf: + buf.write(stream.read().encode('utf-8')) + buf.seek(0, 0) + xml_doc = etree.parse(buf).getroot() + else: + xml_doc = etree.parse(stream).getroot() # Remove all comments. for c in xml_doc.xpath("//comment()"): diff --git a/prov/tests/test_extras.py b/prov/tests/test_extras.py index 8f75700f..8ecafd53 100644 --- a/prov/tests/test_extras.py +++ b/prov/tests/test_extras.py @@ -1,7 +1,10 @@ +import io +from StringIO import StringIO import unittest from prov.model import * from prov.dot import prov_to_dot +from prov.serializers import Registry EX_NS = Namespace('ex', 'http://example.org/') @@ -219,6 +222,31 @@ def test_document_helper_methods(self): self.assertTrue(document.has_bundles()) self.assertEqual(u'', str(document)) + def test_reading_and_writing_to_file_like_objects(self): + """ + Tests reading and writing to and from file like objects. + """ + # Create some random document. 
+ document = ProvDocument() + document.entity(EX2_NS["test"]) + + objects = [io.BytesIO, io.StringIO, StringIO] + + Registry.load_serializers() + formats = Registry.serializers.keys() + + for obj in objects: + for format in formats: + try: + buf = obj() + document.serialize(destination=buf, format=format) + buf.seek(0, 0) + new_document = ProvDocument.deserialize(source=buf, + format=format) + self.assertEqual(document, new_document) + finally: + buf.close() + # def test_document_unification(self): # # TODO: Improve testing of this... # document = ProvDocument() From 4498fa1a129c29a5ca53d0f781227efb01ec145b Mon Sep 17 00:00:00 2001 From: Lion Krischer Date: Thu, 21 Aug 2014 16:30:30 +0200 Subject: [PATCH 66/66] Added CLA. --- cla/krischer.rst | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 cla/krischer.rst diff --git a/cla/krischer.rst b/cla/krischer.rst new file mode 100644 index 00000000..b6610d33 --- /dev/null +++ b/cla/krischer.rst @@ -0,0 +1,41 @@ +======================================== +Individual Contributor License Agreement +======================================== + +Thank you for Your interest in contributing to `the prov package `_. This document clarifies the terms under which You, the person listed below, may make Contributions which may include without limitation, software, bug fixes, configuration changes, documentation, or any other materials to `the PROV Python package project `_ owned or managed by the University of Southampton. + +Please complete the following information about **You** and the **Contributions**. If You have questions about these terms, please contact us at tdh@ecs.soton.ac.uk. + +You accept and agree to the following terms and conditions for Your present and future Contributions submitted to `the PROV Python package project `_. Except for the license granted herein to the University of Southampton, You reserve all right, title, and interest in and to Your Contributions. 
+ +******** +Licenses +******** + +The PROV Python package project (code, documentation, and any other materials) are released under the terms of the MIT license as specified in the project's LICENSE file. + +***************** +You certify that: +***************** + +(a) Your Contributions are created in whole or in part by You and You have the right to submit it under the designated license; or + +(b) Your Contributions are based upon previous work that, to the best of your knowledge, is covered under an appropriate open source license and You have the right under that license to submit that work with modifications, whether created in whole or in part by You, under the designated license; or + +(c) Your Contributions are provided directly to You by some other person who certified (a) or (b) and You have not modified them. + +(d) You understand and agree that the PROV Python package and Your Contributions are public and that a record of the Contributions (including all metadata and personal information You submit with them) is maintained indefinitely and may be redistributed consistent with the University of Southampton's policies and the requirements of the MIT license where they are relevant. + +(e) You are granting Your Contributions to the University of Southampton under the terms of the MIT open source license. Please complete the following information to indicate your agreement. + + +Full name: Lion Krischer +************************ +Github account: `krischer `_ +********************************************************* + +Please type "I AGREE" below to indicate you agree to these terms. Your full name and Github account will be publicly available. + +Confirmation: +********************* +I AGREE