From 833a9e406379f59164172361920ea1bb1a440a7e Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Sat, 23 May 2020 01:29:16 +0100 Subject: [PATCH] Force top-level of metadata to be an object --- docs/metadata.rst | 7 +- python/tests/test_metadata.py | 364 +++++++++++++++-------- python/tskit/metadata.py | 8 +- python/tskit/metadata_schema.schema.json | 12 +- 4 files changed, 251 insertions(+), 140 deletions(-) diff --git a/docs/metadata.rst b/docs/metadata.rst index eee97bef39..ad40d59efd 100644 --- a/docs/metadata.rst +++ b/docs/metadata.rst @@ -19,11 +19,10 @@ is in the form of a `JSON Schema `_. A good to guide to creating JSON Schemas is at `Understanding JSON Schema `_. -In the most common case where the metadata schema specifies an object with properties, +The metadata schema must specify an object with properties; the keys and types of those properties are specified along with optional -long-form names, descriptions -and validations such as min/max or regex matching for strings. See -:ref:`sec_metadata_example` below. Names and descriptions can assist +long-form names, descriptions and validations such as min/max or regex matching for +strings. See :ref:`sec_metadata_example` below. Names and descriptions can assist downstream users in understanding and using the metadata. It is best practise to populate these fields if your files will be used by any third-party, or if you wish to remember what they were some time after making the file! 
diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 5595624b7b..3bb5116c15 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -344,6 +344,15 @@ def test_parse(self): with self.assertRaises(ValueError): metadata.parse_metadata_schema(json.dumps({"codec": "json"})[:-1]) + def test_bad_top_level_type(self): + for bad_type in ["array", "boolean", "integer", "null", "number", "string"]: + schema = { + "codec": "json", + "type": bad_type, + } + with self.assertRaises(exceptions.MetadataSchemaValidationError): + metadata.MetadataSchema(schema) + def test_null_codec(self): ms = metadata.MetadataSchema(None) self.assertEqual(str(ms), "") @@ -884,18 +893,27 @@ def round_trip(self, schema, row_data): self.assertEqual(ms.decode_row(ms.validate_and_encode_row(row_data)), row_data) def test_simple_types(self): - schema = {"codec": "struct", "type": "number", "binaryFormat": "i"} - self.round_trip(schema, 5) - schema = {"codec": "struct", "type": "number", "binaryFormat": "d"} - self.round_trip(schema, 5.5) - schema = {"codec": "struct", "type": "string", "binaryFormat": "10p"} - self.round_trip(schema, "42") - schema = {"codec": "struct", "type": "boolean", "binaryFormat": "?"} - self.round_trip(schema, True) - schema = {"codec": "struct", "type": "null"} - self.round_trip(schema, None) - schema = {"codec": "struct", "type": "null", "binaryFormat": "10x"} - self.round_trip(schema, None) + for type_, binaryFormat, value in ( + ("number", "i", 5), + ("number", "d", 5.5), + ("string", "10p", "foobar"), + ("boolean", "?", True), + ("boolean", "?", False), + ("null", "10x", None), + ): + schema = { + "codec": "struct", + "type": "object", + "properties": {type_: {"type": type_, "binaryFormat": binaryFormat}}, + } + self.round_trip(schema, {type_: value}) + + schema = { + "codec": "struct", + "type": "object", + "properties": {"null": {"type": "null"}}, + } + self.round_trip(schema, {"null": None}) def test_flat_object(self): 
schema = { @@ -948,78 +966,127 @@ def test_nested_object(self): def test_flat_array(self): schema = { "codec": "struct", - "type": "array", - "items": {"type": "number", "binaryFormat": "i"}, + "type": "object", + "properties": { + "array": { + "type": "array", + "items": {"type": "number", "binaryFormat": "i"}, + } + }, } - self.round_trip(schema, []) - self.round_trip(schema, [1]) - self.round_trip(schema, [1, 6, -900]) + self.round_trip(schema, {"array": []}) + self.round_trip(schema, {"array": [1]}) + self.round_trip(schema, {"array": [1, 6, -900]}) schema = { "codec": "struct", - "type": "array", - "items": {"type": "number", "binaryFormat": "d"}, + "type": "object", + "properties": { + "array": { + "type": "array", + "items": {"type": "number", "binaryFormat": "d"}, + } + }, } - self.round_trip(schema, []) - self.round_trip(schema, [1.5]) - self.round_trip(schema, [1.5, 6.7, -900.00001]) + self.round_trip(schema, {"array": []}) + self.round_trip(schema, {"array": [1.5]}) + self.round_trip(schema, {"array": [1.5, 6.7, -900.00001]}) def test_nested_array(self): schema = { "codec": "struct", - "type": "array", - "items": { - "type": "array", - "items": {"type": "number", "binaryFormat": "i"}, + "type": "object", + "properties": { + "array": { + "type": "array", + "items": { + "type": "array", + "items": {"type": "number", "binaryFormat": "i"}, + }, + } }, } - self.round_trip(schema, [[]]) - self.round_trip(schema, [[]]) - self.round_trip(schema, [[], []]) - self.round_trip(schema, [[1]]) - self.round_trip(schema, [[1, 6, -900]]) - self.round_trip(schema, [[0, 987, 234903], [1, 6, -900]]) + self.round_trip(schema, {"array": [[]]}) + self.round_trip(schema, {"array": [[], []]}) + self.round_trip(schema, {"array": [[1]]}) + self.round_trip(schema, {"array": [[1, 6, -900]]}) + self.round_trip(schema, {"array": [[0, 987, 234903], [1, 6, -900]]}) schema = { "codec": "struct", - "type": "array", - "items": { - "type": "array", - "items": {"type": "number", 
"binaryFormat": "d"}, + "type": "object", + "properties": { + "array": { + "type": "array", + "items": { + "type": "array", + "items": {"type": "number", "binaryFormat": "d"}, + }, + } }, } - self.round_trip(schema, [[]]) - self.round_trip(schema, [[]]) - self.round_trip(schema, [[], []]) - self.round_trip(schema, [[1.67]]) - self.round_trip(schema, [[1.34, 6.56422, -900.0000006]]) - self.round_trip(schema, [[0.0, 987.123, 234903.123], [1.1235, 6, -900]]) + self.round_trip(schema, {"array": [[]]}) + self.round_trip(schema, {"array": [[], []]}) + self.round_trip(schema, {"array": [[1.67]]}) + self.round_trip(schema, {"array": [[1.34, 6.56422, -900.0000006]]}) + self.round_trip( + schema, {"array": [[0.0, 987.123, 234903.123], [1.1235, 6, -900]]} + ) def test_array_of_objects(self): schema = { "codec": "struct", - "type": "array", - "items": { - "type": "object", - "properties": { - "int": {"type": "number", "binaryFormat": "i"}, - "float": {"type": "number", "binaryFormat": "d"}, - "padding": {"type": "null", "binaryFormat": "5x"}, - "str": {"type": "string", "binaryFormat": "10p"}, - "bool": {"type": "boolean", "binaryFormat": "?"}, - }, + "type": "object", + "properties": { + "array": { + "type": "array", + "items": { + "type": "object", + "properties": { + "int": {"type": "number", "binaryFormat": "i"}, + "float": {"type": "number", "binaryFormat": "d"}, + "padding": {"type": "null", "binaryFormat": "5x"}, + "str": {"type": "string", "binaryFormat": "10p"}, + "bool": {"type": "boolean", "binaryFormat": "?"}, + }, + }, + } }, } - self.round_trip(schema, []) + self.round_trip(schema, {"array": []}) self.round_trip( schema, - [{"padding": None, "float": 5.78, "int": 9, "bool": False, "str": "41"}], + { + "array": [ + { + "padding": None, + "float": 5.78, + "int": 9, + "bool": False, + "str": "41", + } + ] + }, ) self.round_trip( schema, - [ - {"padding": None, "float": 5.78, "int": 9, "bool": False, "str": "41"}, - {"str": "FOO", "int": 7, "bool": True, "float": 
45.7, "padding": None}, - ], + { + "array": [ + { + "padding": None, + "float": 5.78, + "int": 9, + "bool": False, + "str": "41", + }, + { + "str": "FOO", + "int": 7, + "bool": True, + "float": 45.7, + "padding": None, + }, + ], + }, ) def test_object_with_array(self): @@ -1042,26 +1109,36 @@ def test_object_with_array(self): def test_array_length_format(self): schema = { "codec": "struct", - "type": "array", - "arrayLengthFormat": "B", - "items": {"type": "number", "binaryFormat": "H"}, + "type": "object", + "properties": { + "array": { + "type": "array", + "arrayLengthFormat": "B", + "items": {"type": "number", "binaryFormat": "H"}, + } + }, } - self.round_trip(schema, []) + self.round_trip(schema, {"array": []}) self.round_trip( - schema, [1], + schema, {"array": [1]}, ) self.round_trip( - schema, list(range(255)), + schema, {"array": list(range(255))}, ) def test_string_encoding(self): schema = { "codec": "struct", - "type": "string", - "stringEncoding": "utf-16", - "binaryFormat": "40p", + "type": "object", + "properties": { + "string": { + "type": "string", + "stringEncoding": "utf-16", + "binaryFormat": "40p", + } + }, } - self.round_trip(schema, "Test string") + self.round_trip(schema, {"string": "Test string"}) def test_ordering_of_fields(self): row_data = { @@ -1146,22 +1223,13 @@ def test_bad_schema_union_type(self): metadata.MetadataSchema(schema) def test_bad_schema_hetrogeneous_array(self): - schema = { - "codec": "struct", - "type": "array", - "items": [{"type": "number"}, {"type": "string"}], - } - with self.assertRaisesRegex( - exceptions.MetadataSchemaValidationError, "is not of type 'object'" - ): - metadata.MetadataSchema(schema) schema = { "codec": "struct", "type": "object", "properties": { - "hetro_array": { + "array": { "type": "array", - "items": [{"type": "string"}, {"type": "number"}], + "items": [{"type": "number"}, {"type": "string"}], } }, } @@ -1171,18 +1239,30 @@ def test_bad_schema_hetrogeneous_array(self): 
metadata.MetadataSchema(schema) def test_bad_binary_format(self): - schema = {"codec": "struct", "type": "number", "binaryFormat": "int"} + schema = { + "codec": "struct", + "type": "object", + "properties": {"int": {"type": "number", "binaryFormat": "int"}}, + } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "does not match" ): metadata.MetadataSchema(schema) # Can't specify endianness - schema = {"codec": "struct", "type": "number", "binaryFormat": ">b"} + schema = { + "codec": "struct", + "type": "object", + "properties": {"int": {"type": "number", "binaryFormat": ">b"}}, + } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "does not match" ): metadata.MetadataSchema(schema) - schema = {"codec": "struct", "type": "null", "binaryFormat": "l"} + schema = { + "codec": "struct", + "type": "object", + "properties": {"null": {"type": "null", "binaryFormat": "l"}}, + } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "null type binaryFormat must be padding", @@ -1190,7 +1270,11 @@ def test_bad_binary_format(self): metadata.MetadataSchema(schema) def test_bad_array_length_format(self): - schema = {"codec": "struct", "type": "array", "arrayLengthFormat": "b"} + schema = { + "codec": "struct", + "type": "object", + "properties": {"array": {"type": "array", "arrayLengthFormat": "b"}}, + } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "does not match", ): @@ -1199,7 +1283,8 @@ def test_bad_array_length_format(self): def test_missing_binary_format(self): schema = { "codec": "struct", - "type": "number", + "type": "object", + "properties": {"int": {"type": "number"}}, } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, @@ -1210,9 +1295,14 @@ def test_missing_binary_format(self): def test_bad_string_encoding(self): schema = { "codec": "struct", - "type": "string", - "binaryFormat": "5s", - "stringEncoding": 58, + "type": "object", + "properties": { + "string": { + 
"type": "string", + "binaryFormat": "5s", + "stringEncoding": 58, + } + }, } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "is not of type", @@ -1222,9 +1312,14 @@ def test_bad_string_encoding(self): def test_bad_null_terminated(self): schema = { "codec": "struct", - "type": "string", - "binaryFormat": "5s", - "nullTerminated": 58, + "type": "object", + "properties": { + "string": { + "type": "string", + "binaryFormat": "5s", + "nullTerminated": 58, + } + }, } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "is not of type", @@ -1234,9 +1329,14 @@ def test_bad_null_terminated(self): def test_bad_no_length_encoding_exhaust_buffer(self): schema = { "codec": "struct", - "type": "string", - "binaryFormat": "5s", - "noLengthEncodingExhaustBuffer": 58, + "type": "object", + "properties": { + "string": { + "type": "string", + "binaryFormat": "5s", + "noLengthEncodingExhaustBuffer": 58, + } + }, } with self.assertRaisesRegex( exceptions.MetadataSchemaValidationError, "is not of type", @@ -1280,7 +1380,11 @@ def test_individual(self): "properties": { "pedigreeID": {"type": "integer", "binaryFormat": "q", "index": 1}, "age": {"type": "integer", "binaryFormat": "i", "index": 2}, - "subpopulationID": {"type": "integer", "binaryFormat": "i", "index": 3}, + "subpopulationID": { + "type": "integer", + "binaryFormat": "i", + "index": 3, + }, "sex": {"type": "integer", "binaryFormat": "i", "index": 4}, "flags": {"type": "integer", "binaryFormat": "I", "index": 5}, }, @@ -1316,33 +1420,42 @@ def test_individual(self): def test_mutation(self): schema = { "codec": "struct", - "type": "array", - "noLengthEncodingExhaustBuffer": True, - "items": { - "type": "object", - "properties": { - "mutationTypeID": { - "type": "integer", - "binaryFormat": "i", - "index": 1, - }, - "selectionCoeff": { - "type": "number", - "binaryFormat": "f", - "index": 2, - }, - "subpopulationID": { - "type": "integer", - "binaryFormat": "i", - "index": 3, - }, - 
"originGeneration": { - "type": "integer", - "binaryFormat": "i", - "index": 4, + "type": "object", + "properties": { + "stacked_mutation_array": { + "type": "array", + "noLengthEncodingExhaustBuffer": True, + "items": { + "type": "object", + "properties": { + "mutationTypeID": { + "type": "integer", + "binaryFormat": "i", + "index": 1, + }, + "selectionCoeff": { + "type": "number", + "binaryFormat": "f", + "index": 2, + }, + "subpopulationID": { + "type": "integer", + "binaryFormat": "i", + "index": 3, + }, + "originGeneration": { + "type": "integer", + "binaryFormat": "i", + "index": 4, + }, + "nucleotide": { + "type": "integer", + "binaryFormat": "b", + "index": 5, + }, + }, }, - "nucleotide": {"type": "integer", "binaryFormat": "b", "index": 5}, - }, + } }, } @@ -1417,7 +1530,10 @@ def test_mutation(self): ), ]: self.assertEqual( - metadata.MetadataSchema(schema).decode_row(example), expected + metadata.MetadataSchema(schema).decode_row(example)[ + "stacked_mutation_array" + ], + expected, ) def test_population(self): @@ -1425,7 +1541,11 @@ def test_population(self): "codec": "struct", "type": "object", "properties": { - "subpopulationID": {"type": "integer", "binaryFormat": "i", "index": 0}, + "subpopulationID": { + "type": "integer", + "binaryFormat": "i", + "index": 0, + }, "femaleCloneFraction": { "type": "number", "binaryFormat": "d", diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py index 8b3f406f6b..d2565d8c61 100644 --- a/python/tskit/metadata.py +++ b/python/tskit/metadata.py @@ -55,12 +55,14 @@ def replace_root_refs(obj): jsonschema.validators.Draft7Validator ) META_SCHEMA: Mapping[str, Any] = copy.deepcopy(TSKITMetadataSchemaValidator.META_SCHEMA) -# We need a top-level only required property so we need to rewite any reference +# We need a top-level only required property so we need to rewrite any reference # to the top-level schema to a copy in a definition. 
META_SCHEMA = replace_root_refs(META_SCHEMA) META_SCHEMA["definitions"]["root"] = copy.deepcopy(META_SCHEMA) META_SCHEMA["codec"] = {"type": "string"} META_SCHEMA["required"] = ["codec"] +# For interoperability reasons, force the top-level to be an object +META_SCHEMA["properties"]["type"] = {"enum": ["object"]} TSKITMetadataSchemaValidator.META_SCHEMA = META_SCHEMA @@ -201,12 +203,12 @@ def binary_format_validator(validator, types, instance, schema): META_SCHEMA["properties"]["nullTerminated"] = {"type": "boolean"} META_SCHEMA["definitions"]["root"]["properties"]["nullTerminated"] = META_SCHEMA[ "properties" -]["index"] +]["nullTerminated"] # noLengthEncodingExhaustBuffer is a boolean META_SCHEMA["properties"]["noLengthEncodingExhaustBuffer"] = {"type": "boolean"} META_SCHEMA["definitions"]["root"]["properties"][ "noLengthEncodingExhaustBuffer" -] = META_SCHEMA["properties"]["index"] +] = META_SCHEMA["properties"]["noLengthEncodingExhaustBuffer"] StructCodecSchemaValidator.META_SCHEMA = META_SCHEMA diff --git a/python/tskit/metadata_schema.schema.json b/python/tskit/metadata_schema.schema.json index f5ecf6887e..21f3845eae 100644 --- a/python/tskit/metadata_schema.schema.json +++ b/python/tskit/metadata_schema.schema.json @@ -105,17 +105,7 @@ "required": {"$ref": "#/definitions/stringArray"}, "then": {"$ref": "#/definitions/root"}, "title": {"type": "string"}, - "type": { - "anyOf": [ - {"$ref": "#/definitions/simpleTypes"}, - { - "items": {"$ref": "#/definitions/simpleTypes"}, - "minItems": 1, - "type": "array", - "uniqueItems": true, - }, - ] - }, + "type": {"enum": ["object"]}, "uniqueItems": {"default": false, "type": "boolean"}, }, "title": "Core schema meta-schema",