diff --git a/docs/metadata.rst b/docs/metadata.rst
index eee97bef39..ad40d59efd 100644
--- a/docs/metadata.rst
+++ b/docs/metadata.rst
@@ -19,11 +19,10 @@ is in the form of a
`JSON Schema `_. A good guide to creating JSON Schemas is at
`Understanding JSON Schema `_.
-In the most common case where the metadata schema specifies an object with properties,
+The metadata schema must specify an object with properties;
the keys and types of those properties are specified along with optional
-long-form names, descriptions
-and validations such as min/max or regex matching for strings. See
-:ref:`sec_metadata_example` below. Names and descriptions can assist
+long-form names, descriptions and validations such as min/max or regex matching for
+strings. See :ref:`sec_metadata_example` below. Names and descriptions can assist
downstream users in understanding and using the metadata. It is best practice to
populate these fields if your files will be used by any third party, or if you wish to
remember what they were some time after making the file!
diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py
index 5595624b7b..3bb5116c15 100644
--- a/python/tests/test_metadata.py
+++ b/python/tests/test_metadata.py
@@ -344,6 +344,15 @@ def test_parse(self):
with self.assertRaises(ValueError):
metadata.parse_metadata_schema(json.dumps({"codec": "json"})[:-1])
+ def test_bad_top_level_type(self):
+ for bad_type in ["array", "boolean", "integer", "null", "number", "string"]:
+ schema = {
+ "codec": "json",
+ "type": bad_type,
+ }
+ with self.assertRaises(exceptions.MetadataSchemaValidationError):
+ metadata.MetadataSchema(schema)
+
def test_null_codec(self):
ms = metadata.MetadataSchema(None)
self.assertEqual(str(ms), "")
@@ -884,18 +893,27 @@ def round_trip(self, schema, row_data):
self.assertEqual(ms.decode_row(ms.validate_and_encode_row(row_data)), row_data)
def test_simple_types(self):
- schema = {"codec": "struct", "type": "number", "binaryFormat": "i"}
- self.round_trip(schema, 5)
- schema = {"codec": "struct", "type": "number", "binaryFormat": "d"}
- self.round_trip(schema, 5.5)
- schema = {"codec": "struct", "type": "string", "binaryFormat": "10p"}
- self.round_trip(schema, "42")
- schema = {"codec": "struct", "type": "boolean", "binaryFormat": "?"}
- self.round_trip(schema, True)
- schema = {"codec": "struct", "type": "null"}
- self.round_trip(schema, None)
- schema = {"codec": "struct", "type": "null", "binaryFormat": "10x"}
- self.round_trip(schema, None)
+ for type_, binaryFormat, value in (
+ ("number", "i", 5),
+ ("number", "d", 5.5),
+ ("string", "10p", "foobar"),
+ ("boolean", "?", True),
+ ("boolean", "?", False),
+ ("null", "10x", None),
+ ):
+ schema = {
+ "codec": "struct",
+ "type": "object",
+ "properties": {type_: {"type": type_, "binaryFormat": binaryFormat}},
+ }
+ self.round_trip(schema, {type_: value})
+
+ schema = {
+ "codec": "struct",
+ "type": "object",
+ "properties": {"null": {"type": "null"}},
+ }
+ self.round_trip(schema, {"null": None})
def test_flat_object(self):
schema = {
@@ -948,78 +966,127 @@ def test_nested_object(self):
def test_flat_array(self):
schema = {
"codec": "struct",
- "type": "array",
- "items": {"type": "number", "binaryFormat": "i"},
+ "type": "object",
+ "properties": {
+ "array": {
+ "type": "array",
+ "items": {"type": "number", "binaryFormat": "i"},
+ }
+ },
}
- self.round_trip(schema, [])
- self.round_trip(schema, [1])
- self.round_trip(schema, [1, 6, -900])
+ self.round_trip(schema, {"array": []})
+ self.round_trip(schema, {"array": [1]})
+ self.round_trip(schema, {"array": [1, 6, -900]})
schema = {
"codec": "struct",
- "type": "array",
- "items": {"type": "number", "binaryFormat": "d"},
+ "type": "object",
+ "properties": {
+ "array": {
+ "type": "array",
+ "items": {"type": "number", "binaryFormat": "d"},
+ }
+ },
}
- self.round_trip(schema, [])
- self.round_trip(schema, [1.5])
- self.round_trip(schema, [1.5, 6.7, -900.00001])
+ self.round_trip(schema, {"array": []})
+ self.round_trip(schema, {"array": [1.5]})
+ self.round_trip(schema, {"array": [1.5, 6.7, -900.00001]})
def test_nested_array(self):
schema = {
"codec": "struct",
- "type": "array",
- "items": {
- "type": "array",
- "items": {"type": "number", "binaryFormat": "i"},
+ "type": "object",
+ "properties": {
+ "array": {
+ "type": "array",
+ "items": {
+ "type": "array",
+ "items": {"type": "number", "binaryFormat": "i"},
+ },
+ }
},
}
- self.round_trip(schema, [[]])
- self.round_trip(schema, [[]])
- self.round_trip(schema, [[], []])
- self.round_trip(schema, [[1]])
- self.round_trip(schema, [[1, 6, -900]])
- self.round_trip(schema, [[0, 987, 234903], [1, 6, -900]])
+ self.round_trip(schema, {"array": [[]]})
+ self.round_trip(schema, {"array": [[], []]})
+ self.round_trip(schema, {"array": [[1]]})
+ self.round_trip(schema, {"array": [[1, 6, -900]]})
+ self.round_trip(schema, {"array": [[0, 987, 234903], [1, 6, -900]]})
schema = {
"codec": "struct",
- "type": "array",
- "items": {
- "type": "array",
- "items": {"type": "number", "binaryFormat": "d"},
+ "type": "object",
+ "properties": {
+ "array": {
+ "type": "array",
+ "items": {
+ "type": "array",
+ "items": {"type": "number", "binaryFormat": "d"},
+ },
+ }
},
}
- self.round_trip(schema, [[]])
- self.round_trip(schema, [[]])
- self.round_trip(schema, [[], []])
- self.round_trip(schema, [[1.67]])
- self.round_trip(schema, [[1.34, 6.56422, -900.0000006]])
- self.round_trip(schema, [[0.0, 987.123, 234903.123], [1.1235, 6, -900]])
+ self.round_trip(schema, {"array": [[]]})
+ self.round_trip(schema, {"array": [[], []]})
+ self.round_trip(schema, {"array": [[1.67]]})
+ self.round_trip(schema, {"array": [[1.34, 6.56422, -900.0000006]]})
+ self.round_trip(
+ schema, {"array": [[0.0, 987.123, 234903.123], [1.1235, 6, -900]]}
+ )
def test_array_of_objects(self):
schema = {
"codec": "struct",
- "type": "array",
- "items": {
- "type": "object",
- "properties": {
- "int": {"type": "number", "binaryFormat": "i"},
- "float": {"type": "number", "binaryFormat": "d"},
- "padding": {"type": "null", "binaryFormat": "5x"},
- "str": {"type": "string", "binaryFormat": "10p"},
- "bool": {"type": "boolean", "binaryFormat": "?"},
- },
+ "type": "object",
+ "properties": {
+ "array": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "int": {"type": "number", "binaryFormat": "i"},
+ "float": {"type": "number", "binaryFormat": "d"},
+ "padding": {"type": "null", "binaryFormat": "5x"},
+ "str": {"type": "string", "binaryFormat": "10p"},
+ "bool": {"type": "boolean", "binaryFormat": "?"},
+ },
+ },
+ }
},
}
- self.round_trip(schema, [])
+ self.round_trip(schema, {"array": []})
self.round_trip(
schema,
- [{"padding": None, "float": 5.78, "int": 9, "bool": False, "str": "41"}],
+ {
+ "array": [
+ {
+ "padding": None,
+ "float": 5.78,
+ "int": 9,
+ "bool": False,
+ "str": "41",
+ }
+ ]
+ },
)
self.round_trip(
schema,
- [
- {"padding": None, "float": 5.78, "int": 9, "bool": False, "str": "41"},
- {"str": "FOO", "int": 7, "bool": True, "float": 45.7, "padding": None},
- ],
+ {
+ "array": [
+ {
+ "padding": None,
+ "float": 5.78,
+ "int": 9,
+ "bool": False,
+ "str": "41",
+ },
+ {
+ "str": "FOO",
+ "int": 7,
+ "bool": True,
+ "float": 45.7,
+ "padding": None,
+ },
+ ],
+ },
)
def test_object_with_array(self):
@@ -1042,26 +1109,36 @@ def test_object_with_array(self):
def test_array_length_format(self):
schema = {
"codec": "struct",
- "type": "array",
- "arrayLengthFormat": "B",
- "items": {"type": "number", "binaryFormat": "H"},
+ "type": "object",
+ "properties": {
+ "array": {
+ "type": "array",
+ "arrayLengthFormat": "B",
+ "items": {"type": "number", "binaryFormat": "H"},
+ }
+ },
}
- self.round_trip(schema, [])
+ self.round_trip(schema, {"array": []})
self.round_trip(
- schema, [1],
+ schema, {"array": [1]},
)
self.round_trip(
- schema, list(range(255)),
+ schema, {"array": list(range(255))},
)
def test_string_encoding(self):
schema = {
"codec": "struct",
- "type": "string",
- "stringEncoding": "utf-16",
- "binaryFormat": "40p",
+ "type": "object",
+ "properties": {
+ "string": {
+ "type": "string",
+ "stringEncoding": "utf-16",
+ "binaryFormat": "40p",
+ }
+ },
}
- self.round_trip(schema, "Test string")
+ self.round_trip(schema, {"string": "Test string"})
def test_ordering_of_fields(self):
row_data = {
@@ -1146,22 +1223,13 @@ def test_bad_schema_union_type(self):
metadata.MetadataSchema(schema)
def test_bad_schema_hetrogeneous_array(self):
- schema = {
- "codec": "struct",
- "type": "array",
- "items": [{"type": "number"}, {"type": "string"}],
- }
- with self.assertRaisesRegex(
- exceptions.MetadataSchemaValidationError, "is not of type 'object'"
- ):
- metadata.MetadataSchema(schema)
schema = {
"codec": "struct",
"type": "object",
"properties": {
- "hetro_array": {
+ "array": {
"type": "array",
- "items": [{"type": "string"}, {"type": "number"}],
+ "items": [{"type": "number"}, {"type": "string"}],
}
},
}
@@ -1171,18 +1239,30 @@ def test_bad_schema_hetrogeneous_array(self):
metadata.MetadataSchema(schema)
def test_bad_binary_format(self):
- schema = {"codec": "struct", "type": "number", "binaryFormat": "int"}
+ schema = {
+ "codec": "struct",
+ "type": "object",
+ "properties": {"int": {"type": "number", "binaryFormat": "int"}},
+ }
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError, "does not match"
):
metadata.MetadataSchema(schema)
# Can't specify endianness
- schema = {"codec": "struct", "type": "number", "binaryFormat": ">b"}
+ schema = {
+ "codec": "struct",
+ "type": "object",
+ "properties": {"int": {"type": "number", "binaryFormat": ">b"}},
+ }
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError, "does not match"
):
metadata.MetadataSchema(schema)
- schema = {"codec": "struct", "type": "null", "binaryFormat": "l"}
+ schema = {
+ "codec": "struct",
+ "type": "object",
+ "properties": {"null": {"type": "null", "binaryFormat": "l"}},
+ }
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError,
"null type binaryFormat must be padding",
@@ -1190,7 +1270,11 @@ def test_bad_binary_format(self):
metadata.MetadataSchema(schema)
def test_bad_array_length_format(self):
- schema = {"codec": "struct", "type": "array", "arrayLengthFormat": "b"}
+ schema = {
+ "codec": "struct",
+ "type": "object",
+ "properties": {"array": {"type": "array", "arrayLengthFormat": "b"}},
+ }
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError, "does not match",
):
@@ -1199,7 +1283,8 @@ def test_bad_array_length_format(self):
def test_missing_binary_format(self):
schema = {
"codec": "struct",
- "type": "number",
+ "type": "object",
+ "properties": {"int": {"type": "number"}},
}
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError,
@@ -1210,9 +1295,14 @@ def test_missing_binary_format(self):
def test_bad_string_encoding(self):
schema = {
"codec": "struct",
- "type": "string",
- "binaryFormat": "5s",
- "stringEncoding": 58,
+ "type": "object",
+ "properties": {
+ "string": {
+ "type": "string",
+ "binaryFormat": "5s",
+ "stringEncoding": 58,
+ }
+ },
}
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError, "is not of type",
@@ -1222,9 +1312,14 @@ def test_bad_string_encoding(self):
def test_bad_null_terminated(self):
schema = {
"codec": "struct",
- "type": "string",
- "binaryFormat": "5s",
- "nullTerminated": 58,
+ "type": "object",
+ "properties": {
+ "string": {
+ "type": "string",
+ "binaryFormat": "5s",
+ "nullTerminated": 58,
+ }
+ },
}
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError, "is not of type",
@@ -1234,9 +1329,14 @@ def test_bad_null_terminated(self):
def test_bad_no_length_encoding_exhaust_buffer(self):
schema = {
"codec": "struct",
- "type": "string",
- "binaryFormat": "5s",
- "noLengthEncodingExhaustBuffer": 58,
+ "type": "object",
+ "properties": {
+ "string": {
+ "type": "string",
+ "binaryFormat": "5s",
+ "noLengthEncodingExhaustBuffer": 58,
+ }
+ },
}
with self.assertRaisesRegex(
exceptions.MetadataSchemaValidationError, "is not of type",
@@ -1280,7 +1380,11 @@ def test_individual(self):
"properties": {
"pedigreeID": {"type": "integer", "binaryFormat": "q", "index": 1},
"age": {"type": "integer", "binaryFormat": "i", "index": 2},
- "subpopulationID": {"type": "integer", "binaryFormat": "i", "index": 3},
+ "subpopulationID": {
+ "type": "integer",
+ "binaryFormat": "i",
+ "index": 3,
+ },
"sex": {"type": "integer", "binaryFormat": "i", "index": 4},
"flags": {"type": "integer", "binaryFormat": "I", "index": 5},
},
@@ -1316,33 +1420,42 @@ def test_individual(self):
def test_mutation(self):
schema = {
"codec": "struct",
- "type": "array",
- "noLengthEncodingExhaustBuffer": True,
- "items": {
- "type": "object",
- "properties": {
- "mutationTypeID": {
- "type": "integer",
- "binaryFormat": "i",
- "index": 1,
- },
- "selectionCoeff": {
- "type": "number",
- "binaryFormat": "f",
- "index": 2,
- },
- "subpopulationID": {
- "type": "integer",
- "binaryFormat": "i",
- "index": 3,
- },
- "originGeneration": {
- "type": "integer",
- "binaryFormat": "i",
- "index": 4,
+ "type": "object",
+ "properties": {
+ "stacked_mutation_array": {
+ "type": "array",
+ "noLengthEncodingExhaustBuffer": True,
+ "items": {
+ "type": "object",
+ "properties": {
+ "mutationTypeID": {
+ "type": "integer",
+ "binaryFormat": "i",
+ "index": 1,
+ },
+ "selectionCoeff": {
+ "type": "number",
+ "binaryFormat": "f",
+ "index": 2,
+ },
+ "subpopulationID": {
+ "type": "integer",
+ "binaryFormat": "i",
+ "index": 3,
+ },
+ "originGeneration": {
+ "type": "integer",
+ "binaryFormat": "i",
+ "index": 4,
+ },
+ "nucleotide": {
+ "type": "integer",
+ "binaryFormat": "b",
+ "index": 5,
+ },
+ },
},
- "nucleotide": {"type": "integer", "binaryFormat": "b", "index": 5},
- },
+ }
},
}
@@ -1417,7 +1530,10 @@ def test_mutation(self):
),
]:
self.assertEqual(
- metadata.MetadataSchema(schema).decode_row(example), expected
+ metadata.MetadataSchema(schema).decode_row(example)[
+ "stacked_mutation_array"
+ ],
+ expected,
)
def test_population(self):
@@ -1425,7 +1541,11 @@ def test_population(self):
"codec": "struct",
"type": "object",
"properties": {
- "subpopulationID": {"type": "integer", "binaryFormat": "i", "index": 0},
+ "subpopulationID": {
+ "type": "integer",
+ "binaryFormat": "i",
+ "index": 0,
+ },
"femaleCloneFraction": {
"type": "number",
"binaryFormat": "d",
diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py
index 8b3f406f6b..d2565d8c61 100644
--- a/python/tskit/metadata.py
+++ b/python/tskit/metadata.py
@@ -55,12 +55,14 @@ def replace_root_refs(obj):
jsonschema.validators.Draft7Validator
)
META_SCHEMA: Mapping[str, Any] = copy.deepcopy(TSKITMetadataSchemaValidator.META_SCHEMA)
-# We need a top-level only required property so we need to rewite any reference
+# We need a top-level only required property so we need to rewrite any reference
# to the top-level schema to a copy in a definition.
META_SCHEMA = replace_root_refs(META_SCHEMA)
META_SCHEMA["definitions"]["root"] = copy.deepcopy(META_SCHEMA)
META_SCHEMA["codec"] = {"type": "string"}
META_SCHEMA["required"] = ["codec"]
+# For interoperability reasons, force the top-level schema "type" to be "object"
+META_SCHEMA["properties"]["type"] = {"enum": ["object"]}
TSKITMetadataSchemaValidator.META_SCHEMA = META_SCHEMA
@@ -201,12 +203,12 @@ def binary_format_validator(validator, types, instance, schema):
META_SCHEMA["properties"]["nullTerminated"] = {"type": "boolean"}
META_SCHEMA["definitions"]["root"]["properties"]["nullTerminated"] = META_SCHEMA[
"properties"
-]["index"]
+]["nullTerminated"]
# noLengthEncodingExhaustBuffer is a boolean
META_SCHEMA["properties"]["noLengthEncodingExhaustBuffer"] = {"type": "boolean"}
META_SCHEMA["definitions"]["root"]["properties"][
"noLengthEncodingExhaustBuffer"
-] = META_SCHEMA["properties"]["index"]
+] = META_SCHEMA["properties"]["noLengthEncodingExhaustBuffer"]
StructCodecSchemaValidator.META_SCHEMA = META_SCHEMA
diff --git a/python/tskit/metadata_schema.schema.json b/python/tskit/metadata_schema.schema.json
index f5ecf6887e..21f3845eae 100644
--- a/python/tskit/metadata_schema.schema.json
+++ b/python/tskit/metadata_schema.schema.json
@@ -105,17 +105,7 @@
"required": {"$ref": "#/definitions/stringArray"},
"then": {"$ref": "#/definitions/root"},
"title": {"type": "string"},
- "type": {
- "anyOf": [
- {"$ref": "#/definitions/simpleTypes"},
- {
- "items": {"$ref": "#/definitions/simpleTypes"},
- "minItems": 1,
- "type": "array",
- "uniqueItems": true,
- },
- ]
- },
+ "type": {"enum": ["object"]},
"uniqueItems": {"default": false, "type": "boolean"},
},
"title": "Core schema meta-schema",