diff --git a/appveyor.yml b/appveyor.yml index a100748590..6eb47f511a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -23,6 +23,7 @@ install: build_script: - cmd: cd python - cmd: python setup.py build_ext --inplace + # Install some modules needed for tests - cmd: python -m pip install PyVCF - cmd: python -m pip install newick - cmd: python -m pip install python_jsonschema_objects diff --git a/docs/data-model.rst b/docs/data-model.rst index f09be8213d..e77e9aba88 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -492,32 +492,14 @@ record char Provenance record. Metadata ======== -Users of the tables API sometimes need to store auxiliary information for -the various entities defined here. For example, in a forwards-time simulation, -the simulation engine may wish to store the time at which a particular mutation -arose or some other pertinent information. If we are representing real data, -we may wish to store information derived from a VCF INFO field, or associate -information relating to samples or populations. The columns defined in tables -here are deliberately minimal: we define columns only for information which -the library itself can use. All other information is considered to be -**metadata**, and is stored in the ``metadata`` columns of the various -tables. - -Arbitrary binary data can be stored in ``metadata`` columns, and the -``tskit`` library makes no attempt to interpret this information. How the -information held in this field is encoded is entirely the choice of client code. - -To ensure that metadata can be safely interchanged using the :ref:`sec_text_file_format`, -each row is `base 64 encoded `_. Thus, -binary information can be safely printed and exchanged, but may not be -human readable. - -.. todo:: - We plan on providing more sophisticated tools for working with metadata - in future, including the auto decoding metadata via pluggable - functions and the ability to store metadata schemas so that metadata - is self-describing. +Each table (excluding provenance) has a metadata column for storing and passing along +information that tskit does not use or interpret. See :ref:`sec_metadata` for details. +The metadata columns are :ref:`binary columns `. +When using the :ref:`sec_text_file_format`, to ensure that metadata can be safely +interchanged, each row is `base 64 encoded `_. +Thus, binary information can be safely printed and exchanged, but may not be +human readable. .. _sec_valid_tree_sequence_requirements: diff --git a/docs/index.rst b/docs/index.rst index 338d6104d9..f779d66550 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -17,6 +17,7 @@ Welcome to tskit's documentation! c-api cli data-model + metadata provenance development tutorial diff --git a/docs/metadata.rst b/docs/metadata.rst new file mode 100644 index 0000000000..cd73f56fee --- /dev/null +++ b/docs/metadata.rst @@ -0,0 +1,127 @@ +.. _sec_metadata: + +======== +Metadata +======== + +Every entity (nodes, mutations, edges, etc.) in a tskit tree sequence can have +metadata associated with it. This is intended for storing and passing on information +that tskit itself does not use or interpret. Examples include information derived from a VCF +INFO field, or administrative information (such as unique identifiers) relating to +samples and populations. Note that provenance information about how a tree sequence +was created should not be stored in metadata; instead, the provenance mechanisms in +tskit should be used. See :ref:`sec_provenance`. 
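+
+As a minimal sketch of this distinction (the sample identifier and program name here
+are invented purely for illustration):
+
+.. code-block:: python
+
+    import json
+
+    import tskit
+
+    tables = tskit.TableCollection(sequence_length=1.0)
+    # Facts about a sample belong in metadata (stored as raw bytes here;
+    # schema-aware encoding is described below).
+    tables.individuals.add_row(metadata=b"sample-id-1234")
+    # How the tree sequence was produced belongs in the provenance table.
+    tables.provenances.add_row(record=json.dumps({"program": "my_simulator"}))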
+ +The metadata for each entity is described by a schema for that entity type. This +schema allows the tskit Python API to encode and decode metadata and, most importantly, +tells downstream users and tools how to decode and interpret the metadata. This schema +is in the form of a +`JSON Schema `_. A good guide to creating JSON Schemas is +`Understanding JSON Schema `_. + +In the most common case, where the metadata schema specifies an object with properties, +the keys and types of those properties are specified along with optional +long-form names, descriptions +and validations such as min/max or regex matching for strings. See +:ref:`sec_metadata_example` below. Names and descriptions can assist +downstream users in understanding and using the metadata. It is best practice to +populate these fields if your files will be used by any third party, or if you wish to +remember what they were some time after making the file! + +The :ref:`sec_tutorial_metadata` tutorial shows how to use schemas and access metadata +in the tskit Python API. + +Note that the C API simply provides byte-array binary access to the metadata and +leaves encoding and decoding to the user. The same can be achieved with the Python +API; see :ref:`sec_tutorial_metadata_binary`. + +****** +Codecs +****** + +As the underlying metadata is raw binary (see +:ref:`data model `), it +must be encoded and decoded; in the case of the Python API, to and from Python objects. +The method for doing this is specified in the top-level schema property ``codec``. +Currently the Python API supports the ``json`` codec, which encodes metadata as +`JSON `_. We plan to support more codecs soon, such +as an efficient binary encoding (see :issue:`535`). It is possible to define a custom +codec using :meth:`tskit.register_metadata_codec`; however, this should only be used +when necessary, as downstream users of the metadata will not be able to decode it +without the custom codec. For an example, see :ref:`sec_tutorial_metadata_custom_codec`. + +.. _sec_metadata_example: + +******* +Example +******* + + +As an example, here is a schema using the ``json`` codec which could apply +to the individuals in a tree sequence: + +.. code-block:: json + + { + "codec": "json", + "type": "object", + "properties": { + "accession_number": {"type": "number"}, + "collection_date": { + "name": "Collection date", + "description": "Date of sample collection in ISO format", + "type": "string", + "pattern": "^([1-9][0-9]{3})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])?$" + } + }, + "required": ["accession_number"], + "additionalProperties": false + } + +This schema states that the metadata for each row of the table +is an object consisting of two properties. Property ``accession_number`` is a number +which must be specified (it is included in the ``required`` list). +Property ``collection_date`` is an optional string which must satisfy a regex +that checks it is a valid `ISO8601 `_ date. +Any other properties are not allowed (``additionalProperties`` is false). + +.. _sec_metadata_api_overview: + +**************************** +Python Metadata API Overview +**************************** + +Schemas are represented in the Python API by the :class:`tskit.MetadataSchema` +class, which can be assigned to, and retrieved from, tables via their ``metadata_schema`` +attribute (e.g. :attr:`tskit.IndividualTable.metadata_schema`). 
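+
+For example, a minimal sketch of this workflow (the schema and values here are a toy
+example):
+
+.. code-block:: python
+
+    import tskit
+
+    tables = tskit.TableCollection(sequence_length=1.0)
+    tables.individuals.metadata_schema = tskit.MetadataSchema(
+        {"codec": "json", "type": "object", "properties": {"name": {"type": "string"}}}
+    )
+    # add_row validates the metadata against the schema and encodes it
+    tables.individuals.add_row(metadata={"name": "A"})
+    # Row access decodes it back into a Python object
+    print(tables.individuals[0].metadata)  # {'name': 'A'}
+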
The schemas +for all tables can be retrieved from a :class:`tskit.TreeSequence` by the +:attr:`tskit.TreeSequence.table_metadata_schemas` attribute. + +Each table's ``add_row`` method (e.g. :meth:`tskit.IndividualTable.add_row`) will +validate and encode the metadata using the schema. + +Metadata will be lazily decoded if accessed via +``tables.individuals[0].metadata`` or ``tree_sequence.individual(0).metadata``. + +In the interests of efficiency, the bulk methods ``set_columns`` +(e.g. :meth:`tskit.IndividualTable.set_columns`) +and ``append_columns`` (e.g. :meth:`tskit.IndividualTable.append_columns`) do not +validate or encode metadata. See :ref:`sec_tutorial_metadata_bulk` for how to prepare +metadata for these methods. + +Metadata processing can be disabled and raw bytes stored/retrieved. See +:ref:`sec_tutorial_metadata_binary`. + +.. _sec_metadata_schema_schema: + +*************** +Full metaschema +*************** + +The schema for metadata schemas is formally defined using +`JSON Schema `_ and given in full here. Any schema passed to +:class:`tskit.MetadataSchema` is validated against this metaschema. + +.. literalinclude:: ../python/tskit/metadata_schema.schema.json + :language: json diff --git a/docs/python-api.rst b/docs/python-api.rst index 0068bd9d41..6157dad48b 100644 --- a/docs/python-api.rst +++ b/docs/python-api.rst @@ -321,10 +321,11 @@ Binary columns Columns storing binary data take the same approach as :ref:`sec_tables_api_text_columns` to encoding :ref:`variable length data `. -The difference between the two is -only raw :class:`bytes` values are accepted: no character encoding or -decoding is done on the data. Consider the following example:: - +The difference between the two is that only raw :class:`bytes` values are accepted: no +character encoding or decoding is done on the data. Consider the following example, +where a table has no ``metadata_schema``: arbitrary bytes can be stored, +no automatic encoding or decoding of objects is performed by the Python API, and we can +store and retrieve raw ``bytes``. (See :ref:`sec_metadata` for details):: >>> t = tskit.NodeTable() >>> t.add_row(metadata=b"raw bytes") @@ -388,30 +389,37 @@ and use, see :ref:`the table definitions `. .. autoclass:: tskit.IndividualTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.NodeTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.EdgeTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.MigrationTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.SiteTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.MutationTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.PopulationTable() :members: :inherited-members: + :special-members: __getitem__ .. autoclass:: tskit.ProvenanceTable() :members: @@ -461,6 +469,22 @@ Table functions .. autofunction:: tskit.unpack_bytes +.. _sec_metadata_api: + +******** +Metadata +******** + +The ``metadata`` module provides validation, encoding and decoding of metadata +using a schema. See :ref:`sec_metadata`, :ref:`sec_metadata_api_overview` and +:ref:`sec_tutorial_metadata`. + +.. autoclass:: tskit.MetadataSchema + :members: + :inherited-members: + +.. autofunction:: tskit.register_metadata_codec + .. 
_sec_stats_api: ********************** diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 8616cd6bb4..7ed39fe0d9 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -731,6 +731,210 @@ We can then finally obtain the tree sequence:: ┃ ┏┻┓ 0 1 2 +.. _sec_tutorial_metadata: + +********************* +Working with Metadata +********************* + +Metadata is information associated with entities that tskit doesn't use or interpret, +but which is useful to pass on to downstream analysis, such as sample IDs, dates, etc. See +:ref:`sec_metadata` for a full discussion. Each table has a +``metadata_schema`` that details the +contents and encoding of the metadata for each row. A metadata schema is a JSON document +that conforms to `JSON Schema `_ +(the full schema for tskit is given at :ref:`sec_metadata_schema_schema`). +Here's a simple example schema, which we'll apply to the individuals table: + +.. code-block:: python + + raw_schema = { + "codec": "json", + "type": "object", + "properties": { + "accession": {"type": "string"}, + "pcr": { + "name": "PCR Used", + "description": "Was PCR used on this sample", + "type": "boolean", + }, + }, + "required": ["accession", "pcr"], + "additionalProperties": False, + } + +The ``codec`` entry in the schema specifies how the metadata should be stored. +Currently only ``json`` is supported; others are planned (:issue:`535`). This schema +defines two properties, both of which are mandatory, as they are in the ``required`` +list. To avoid errors we set ``additionalProperties`` to false, as this causes any +unexpected properties in the metadata to fail validation. + +Schemas in tskit are held in :class:`tskit.MetadataSchema`: + +.. code-block:: python + + schema = tskit.MetadataSchema(raw_schema) + +This can be assigned to a table's +:attr:`metadata_schema `: + +.. code-block:: python + + tables = tskit.TableCollection(sequence_length=1) + tables.individuals.metadata_schema = schema + +Now that the table has a schema, calls to +:meth:`add_row ` will validate and encode the metadata: + +.. code-block:: python + + tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": True}) + +and if we try to add metadata that doesn't fit the schema, we'll get an error: + +.. code-block:: python + + tables.individuals.add_row(0, metadata={"accession": "Bob1234", "pcr": "false"}) + +:: + + Traceback (most recent call last): + File "/home/benj/projects/tskit/python/tskit/metadata.py", line 181, in validate_and_encode_row + self._validate_row(row) + File "/home/benj/projects/tskit/env/lib/python3.8/site-packages/jsonschema/validators.py", line 353, in validate + raise error + jsonschema.exceptions.ValidationError: 'false' is not of type 'boolean' + + Failed validating 'type' in schema['properties']['pcr']: + {'description': 'Was PCR used on this sample', + 'name': 'PCR Used', + 'type': 'boolean'} + + On instance['pcr']: + 'false' + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + File "/home/benj/projects/tskit/python/tskit/tables.py", line 409, in add_row + metadata = self.metadata_schema.validate_and_encode_row(metadata) + File "/home/benj/projects/tskit/python/tskit/metadata.py", line 183, in validate_and_encode_row + raise exceptions.MetadataValidationError from ve + tskit.exceptions.MetadataValidationError + +The metadata on the rows can be accessed directly: + +.. 
code-block:: python + + tables.individuals[0].metadata["accession"] + +:: + + 'Bob1234' + +Both the schema and the metadata can be retrieved from a :class:`tskit.TreeSequence`: + +.. code-block:: python + + ts = tables.tree_sequence() + ts.table_metadata_schemas.individual.schema + +:: + + {'codec': 'json', 'type': 'object', 'properties': {'accession': {...}}} + +.. code-block:: python + + ts.individual(0).metadata["accession"] + +:: + + 'Bob1234' + +.. _sec_tutorial_metadata_bulk: + ++++++++++++++++++++++++++++++++ +Metadata for bulk table methods ++++++++++++++++++++++++++++++++ + +In the interests of efficiency, each table's +:meth:`set_columns ` and +:meth:`append_columns ` methods do not attempt to validate or +encode metadata. You can call +:meth:`MetadataSchema.validate_and_encode_row ` +directly to prepare metadata for these methods: + +.. code-block:: python + + metadata_column = [ + {"accession": "etho1234", "pcr": True}, + {"accession": "richard1235", "pcr": False}, + {"accession": "albert1236", "pcr": True}, + ] + encoded_metadata_column = [ + table.metadata_schema.validate_and_encode_row(r) for r in metadata_column + ] + metadata, metadata_offset = tskit.pack_bytes(encoded_metadata_column) + table.set_columns(flags=[0, 0, 0], metadata=metadata, metadata_offset=metadata_offset) + +.. _sec_tutorial_metadata_binary: + ++++++++++++++++ +Binary metadata ++++++++++++++++ + +To disable the validation and encoding of metadata and store raw bytes, pass ``None`` to +:class:`tskit.MetadataSchema`: + +.. code-block:: python + + table.metadata_schema = tskit.MetadataSchema(None) + table.add_row(0, metadata=b"SOME CUSTOM BYTES #!@") + table[0].metadata + +:: + + b'SOME CUSTOM BYTES #!@' + +.. _sec_tutorial_metadata_custom_codec: + ++++++++++++++++++++++ +Custom Metadata Codec ++++++++++++++++++++++ + +It is possible to use your own encoding for metadata using +:meth:`tskit.register_metadata_codec`, although this will reduce the portability of +the resulting file. + +.. code-block:: python + + import msgpack + + class MsgPackCodec(tskit.AbstractMetadataCodec): + def __init__(self, schema): + pass + + def encode(self, obj): + return msgpack.dumps(obj) + + def decode(self, encoded): + return msgpack.loads(encoded) + + + tskit.register_metadata_codec(MsgPackCodec, "msgpack") + +This can then be referred to in a schema: + +.. code-block:: python + + raw_schema = { + "codec": "msgpack", + "title": "Example Metadata", + "type": "object", + "properties": {"one": {"type": "string"}, "two": {"type": "number"}}, + "required": ["one", "two"], + "additionalProperties": False, + } + schema = tskit.MetadataSchema(raw_schema) ************** Calculating LD diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index f04dfb187f..9b0c018a50 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,11 @@ In development **New features** +- Tables with a metadata column now have a ``metadata_schema`` that is used to + validate and encode metadata that is passed to ``add_row`` and to decode metadata + on calls to ``table[j]`` and e.g. ``tree_sequence.node(j)``. See :ref:`sec_metadata`. + (:user:`benjeffery`, :pr:`491`, :pr:`542`, :pr:`543`) + - Add classes to SVG drawings to allow easy adjustment and styling, and document the new ``tskit.Tree.draw_svg()`` and ``tskit.TreeSequence.draw_svg()`` methods. This also fixes :issue:`467` for duplicate SVG entity ``id`` s in Jupyter notebooks. 
diff --git a/python/requirements/CI/requirements.txt b/python/requirements/CI/requirements.txt index 391e3e1626..86fa38a653 100644 --- a/python/requirements/CI/requirements.txt +++ b/python/requirements/CI/requirements.txt @@ -8,6 +8,7 @@ flake8==3.7.9 h5py==2.10.0 jsonschema==3.2.0 kastore==0.2.2 +msgpack==1.0.0 msprime==0.7.4 networkx==2.4 newick==1.0.0 diff --git a/python/requirements/conda-minimal.txt b/python/requirements/conda-minimal.txt index 9aa1ee18b5..812d408dd6 100644 --- a/python/requirements/conda-minimal.txt +++ b/python/requirements/conda-minimal.txt @@ -4,6 +4,7 @@ nose h5py jsonschema svgwrite +msgpack-python msprime kastore biopython diff --git a/python/requirements/development.txt b/python/requirements/development.txt index 09394b8e38..a9c79719cf 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -8,6 +8,7 @@ flake8 h5py jsonschema kastore +msgpack msprime networkx newick diff --git a/python/tests/__init__.py b/python/tests/__init__.py index d5546d86f2..4d1ab0a554 100644 --- a/python/tests/__init__.py +++ b/python/tests/__init__.py @@ -239,7 +239,10 @@ def make_mutation(id_): node=node, derived_state=derived_state, parent=parent, - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=tskit.metadata.parse_metadata_schema( + ll_ts.get_table_metadata_schemas().mutation + ).decode_row, ) for j in range(tree_sequence.num_sites): @@ -250,7 +253,10 @@ def make_mutation(id_): position=pos, ancestral_state=ancestral_state, mutations=[make_mutation(ll_mut) for ll_mut in ll_mutations], - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=tskit.metadata.parse_metadata_schema( + ll_ts.get_table_metadata_schemas().site + ).decode_row, ) ) diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 7022b343f5..e76bb986a0 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -38,6 +38,7 @@ import uuid as _uuid import warnings +import attr import msprime import networkx as nx import numpy as np @@ -1342,6 +1343,124 @@ def test_sequence_iteration(self): self.assertEqual(n.id, 0) +class TestTreeSequenceMetadata(unittest.TestCase): + metadata_tables = [ + "node", + "edge", + "site", + "mutation", + "migration", + "individual", + "population", + ] + metadata_schema = tskit.MetadataSchema( + { + "codec": "json", + "title": "Example Metadata", + "type": "object", + "properties": { + "table": {"type": "string"}, + "string_prop": {"type": "string"}, + "num_prop": {"type": "number"}, + }, + "required": ["table", "string_prop", "num_prop"], + "additionalProperties": False, + }, + ) + + def test_metadata_schemas(self): + ts = msprime.simulate(5) + for table in self.metadata_tables: + tables = ts.dump_tables() + # Set and read back a unique schema for each table + schema = tskit.MetadataSchema({"codec": "json", "TEST": f"{table}-SCHEMA"}) + # Check via table API + getattr(tables, f"{table}s").metadata_schema = schema + self.assertEqual( + str(getattr(tables, f"{table}s").metadata_schema), str(schema) + ) + for other_table in self.metadata_tables: + if other_table != table: + self.assertEqual( + str(getattr(tables, f"{other_table}s").metadata_schema), "" + ) + # Check via tree-sequence API + new_ts = tskit.TreeSequence.load_tables(tables) + self.assertEqual( + str(getattr(new_ts.table_metadata_schemas, table)), str(schema), + ) + for other_table in self.metadata_tables: + if other_table != table: + self.assertEqual( + str(getattr(new_ts.table_metadata_schemas, other_table)), 
"" + ) + # Can't set schema via this API + with self.assertRaises(AttributeError): + new_ts.table_metadata_schemas = {} + # or modify the schema tuple return object + with self.assertRaises(attr.exceptions.FrozenInstanceError): + setattr( + new_ts.table_metadata_schemas, + table, + tskit.MetadataSchema({"codec": "json"}), + ) + + def test_metadata_round_trip_via_row_getters(self): + # A tree sequence with all entities + pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)] + migration_matrix = [[0, 1], [1, 0]] + ts = msprime.simulate( + population_configurations=pop_configs, + migration_matrix=migration_matrix, + mutation_rate=1, + record_migrations=True, + random_seed=1, + ) + tables = ts.dump_tables() + tables.individuals.add_row(location=[1, 2, 3]) + tables.individuals.add_row(location=[4, 5, 6]) + ts = tables.tree_sequence() + + for table in self.metadata_tables: + new_tables = ts.dump_tables() + tables_copy = ts.dump_tables() + table_obj = getattr(new_tables, f"{table}s") + table_obj.metadata_schema = self.metadata_schema + table_obj.clear() + # Write back the rows, but adding unique metadata + for j, row in enumerate(getattr(tables_copy, f"{table}s")): + row_data = attr.asdict(row) + row_data["metadata"] = { + "table": table, + "string_prop": f"Row number{j}", + "num_prop": j, + } + table_obj.add_row(**row_data) + new_ts = new_tables.tree_sequence() + # Check that all tables have data otherwise we'll silently not check one + assert getattr(new_ts, f"num_{table}s") > 0 + self.assertEqual( + getattr(new_ts, f"num_{table}s"), getattr(ts, f"num_{table}s") + ) + for j, row in enumerate(getattr(new_ts, f"{table}s")()): + self.assertDictEqual( + row.metadata, + { + "table": table, + "string_prop": f"Row number{row.id}", + "num_prop": row.id, + }, + ) + self.assertDictEqual( + getattr(new_ts, f"{table}")(j).metadata, + { + "table": table, + "string_prop": f"Row number{row.id}", + "num_prop": row.id, + }, + ) + + class TestPickle(HighLevelTestCase): """ Test pickling of a TreeSequence. @@ -2455,30 +2574,101 @@ def test_repr(self): self.assertGreater(len(repr(c)), 0) -class TestIndividualContainer(unittest.TestCase, SimpleContainersMixin): +class SimpleContainersWithMetadataMixin: + """ + Tests for the SimpleContainerWithMetadata classes. 
+ """ + + def test_metadata(self): + # Test decoding + instances = self.get_instances(5) + for j, inst in enumerate(instances): + self.assertEqual(inst.metadata, ("x" * j) + "decoded") + + # Decoder doesn't effect equality + (inst,) = self.get_instances(1) + (inst2,) = self.get_instances(1) + self.assertTrue(inst == inst2) + inst._metadata_decoder = lambda m: "different decoder" + self.assertTrue(inst == inst2) + inst._encoded_metadata = b"different" + self.assertFalse(inst == inst2) + + def test_decoder_run_once(self): + # For a given instance, the decoded metadata should be cached, with the decoder + # called once + (inst,) = self.get_instances(1) + times_run = 0 + + def decoder(m): + nonlocal times_run + times_run += 1 + return m.decode() + "decoded" + + inst._metadata_decoder = decoder + self.assertEqual(times_run, 0) + _ = inst.metadata + self.assertEqual(times_run, 1) + _ = inst.metadata + self.assertEqual(times_run, 1) + + +class TestIndividualContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): return [ - tskit.Individual(id_=j, flags=j, location=[j], nodes=[j], metadata=b"x" * j) + tskit.Individual( + id_=j, + flags=j, + location=[j], + nodes=[j], + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", + ) for j in range(n) ] -class TestNodeContainer(unittest.TestCase, SimpleContainersMixin): +class TestNodeContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): return [ tskit.Node( - id_=j, flags=j, time=j, population=j, individual=j, metadata=b"x" * j + id_=j, + flags=j, + time=j, + population=j, + individual=j, + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", ) for j in range(n) ] -class TestEdgeContainer(unittest.TestCase, SimpleContainersMixin): +class TestEdgeContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): - return [tskit.Edge(left=j, right=j, parent=j, child=j, id_=j) for j in range(n)] + return [ + tskit.Edge( + left=j, + right=j, + parent=j, + child=j, + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", + id_=j, + ) + for j in range(n) + ] -class TestSiteContainer(unittest.TestCase, SimpleContainersMixin): +class TestSiteContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): return [ tskit.Site( @@ -2486,13 +2676,16 @@ def get_instances(self, n): position=j, ancestral_state="A" * j, mutations=TestMutationContainer().get_instances(j), - metadata=b"x" * j, + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", ) for j in range(n) ] -class TestMutationContainer(unittest.TestCase, SimpleContainersMixin): +class TestMutationContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): return [ tskit.Mutation( @@ -2501,23 +2694,44 @@ def get_instances(self, n): node=j, derived_state="A" * j, parent=j, - metadata=b"x" * j, + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", ) for j in range(n) ] -class TestMigrationContainer(unittest.TestCase, SimpleContainersMixin): +class TestMigrationContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): return [ - tskit.Migration(left=j, right=j, node=j, source=j, dest=j, time=j) + tskit.Migration( + left=j, 
+ right=j, + node=j, + source=j, + dest=j, + time=j, + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", + ) for j in range(n) ] -class TestPopulationContainer(unittest.TestCase, SimpleContainersMixin): +class TestPopulationContainer( + unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin +): def get_instances(self, n): - return [tskit.Population(id_=j, metadata="x" * j) for j in range(n)] + return [ + tskit.Population( + id_=j, + encoded_metadata=b"x" * j, + metadata_decoder=lambda m: m.decode() + "decoded", + ) + for j in range(n) + ] class TestProvenanceContainer(unittest.TestCase, SimpleContainersMixin): diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 50be388e35..bcfde6888e 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -30,11 +30,14 @@ import tempfile import unittest +import msgpack import msprime import numpy as np import python_jsonschema_objects as pjs import tskit +import tskit.exceptions as exceptions +import tskit.metadata as metadata class TestMetadataHdf5RoundTrip(unittest.TestCase): @@ -163,7 +166,7 @@ def test_mutations(self): self.assertEqual(unpickled.two, metadata.two) -class TestJsonSchemaDecoding(unittest.TestCase): +class TestJSONSchemaDecoding(unittest.TestCase): """ Tests in which use json-schema to decode the metadata. """ @@ -284,3 +287,116 @@ def test_populations(self): expected = ["mno", ")(*&^%$#@!"] for a, b in zip(expected, p): self.assertEqual(a.encode("utf8"), b.metadata) + + +class TestMetadataModule(unittest.TestCase): + """ + Tests that use the metadata module + """ + + def test_metadata_schema(self): + # Bad jsonschema + with self.assertRaises(exceptions.MetadataSchemaValidationError): + metadata.MetadataSchema( + {"codec": "json", "additionalProperties": "THIS ISN'T RIGHT"}, + ) + # Bad codec + with self.assertRaises(exceptions.MetadataSchemaValidationError): + metadata.MetadataSchema({"codec": "morse-code"}) + # Missing codec + with self.assertRaises(exceptions.MetadataSchemaValidationError): + metadata.MetadataSchema({}) + schema = { + "codec": "json", + "title": "Example Metadata", + "type": "object", + "properties": {"one": {"type": "string"}, "two": {"type": "number"}}, + "required": ["one", "two"], + "additionalProperties": False, + } + ms = metadata.MetadataSchema(schema) + self.assertEqual(str(ms), json.dumps(schema)) + # Missing required properties + with self.assertRaises(exceptions.MetadataValidationError): + ms.validate_and_encode_row({}) + + def test_register_codec(self): + class TestCodec(metadata.AbstractMetadataCodec): + pass + + metadata.register_metadata_codec(TestCodec, "test") + self.assertEqual(TestCodec, metadata.codec_registry["test"]) + + def test_parse(self): + # Empty string gives MetaDataSchema with None codec + ms = metadata.parse_metadata_schema("") + self.assertIsInstance(ms, metadata.MetadataSchema) + self.assertEqual(ms.schema, None) + + # json gives MetaDataSchema with json codec + ms = metadata.parse_metadata_schema(json.dumps({"codec": "json"})) + self.assertIsInstance(ms, metadata.MetadataSchema) + self.assertDictEqual(ms.schema, {"codec": "json"}) + + # Bad JSON gives error + with self.assertRaises(ValueError): + 
metadata.parse_metadata_schema(json.dumps({"codec": "json"})[:-1]) + + def test_null_codec(self): + ms = metadata.MetadataSchema(None) + self.assertEqual(str(ms), "") + row = b"Some binary data that tskit can't interpret " + # Encode/decode are no-ops + self.assertEqual(row, ms.validate_and_encode_row(row)) + self.assertEqual(row, ms.decode_row(row)) + # Only bytes validate + with self.assertRaises(TypeError): + ms.validate_and_encode_row({}) + + def test_json_codec(self): + schema = { + "codec": "json", + "title": "Example Metadata", + "type": "object", + "properties": {"one": {"type": "string"}, "two": {"type": "number"}}, + "required": ["one", "two"], + "additionalProperties": False, + } + ms = metadata.MetadataSchema(schema) + # Valid row data + row_data = {"one": "tree", "two": 5} + self.assertEqual( + ms.validate_and_encode_row(row_data), json.dumps(row_data).encode() + ) + self.assertEqual(ms.decode_row(json.dumps(row_data).encode()), row_data) + # Round trip + self.assertEqual(ms.decode_row(ms.validate_and_encode_row(row_data)), row_data) + + def test_msgpack_codec(self): + class MsgPackCodec(metadata.AbstractMetadataCodec): + def __init__(self, schema): + pass + + def encode(self, obj): + return msgpack.dumps(obj) + + def decode(self, encoded): + return msgpack.loads(encoded) + + metadata.register_metadata_codec(MsgPackCodec, "msgpack") + + schema = { + "codec": "msgpack", + "title": "Example Metadata", + "type": "object", + "properties": {"one": {"type": "string"}, "two": {"type": "number"}}, + "required": ["one", "two"], + "additionalProperties": False, + } + ms = metadata.MetadataSchema(schema) + # Valid row data + row_data = {"one": "tree", "two": 5} + self.assertEqual(ms.validate_and_encode_row(row_data), msgpack.dumps(row_data)) + self.assertEqual(ms.decode_row(msgpack.dumps(row_data)), row_data) + # Round trip + self.assertEqual(ms.decode_row(ms.validate_and_encode_row(row_data)), row_data) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index c88cfb425c..062c972c6d 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -38,6 +38,8 @@ import _tskit import tests.tsutil as tsutil import tskit +import tskit.exceptions as exceptions +import tskit.metadata as metadata class Column: @@ -76,6 +78,14 @@ class CommonTestsMixin: we have to make this a mixin. """ + def make_input_data(self, num_rows): + input_data = {col.name: col.get_input(num_rows) for col in self.columns} + for list_col, offset_col in self.ragged_list_columns: + value = list_col.get_input(num_rows) + input_data[list_col.name] = value + input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + return input_data + def test_max_rows_increment(self): for bad_value in [-1, -(2 ** 10)]: self.assertRaises( @@ -148,11 +158,7 @@ def test_set_columns_string_errors(self): self.assertRaises(TypeError, table.set_columns, **kwargs) def test_set_columns_interface(self): - kwargs = {c.name: c.get_input(1) for c in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(1) - kwargs[list_col.name] = value - kwargs[offset_col.name] = [0, 1] + kwargs = self.make_input_data(1) # Make sure this works. 
table = self.table_class() table.set_columns(**kwargs) @@ -171,11 +177,7 @@ def test_set_columns_interface(self): self.assertRaises(ValueError, table.append_columns, **error_kwargs) def test_set_columns_from_dict(self): - kwargs = {c.name: c.get_input(1) for c in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(1) - kwargs[list_col.name] = value - kwargs[offset_col.name] = [0, 1] + kwargs = self.make_input_data(1) # Make sure this works. t1 = self.table_class() t1.set_columns(**kwargs) @@ -184,11 +186,7 @@ def test_set_columns_from_dict(self): self.assertEqual(t1, t2) def test_set_columns_dimension(self): - kwargs = {c.name: c.get_input(1) for c in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(1) - kwargs[list_col.name] = value - kwargs[offset_col.name] = [0, 1] + kwargs = self.make_input_data(1) table = self.table_class() table.set_columns(**kwargs) table.append_columns(**kwargs) @@ -199,8 +197,7 @@ def test_set_columns_dimension(self): error_kwargs[focal_col.name] = bad_dims self.assertRaises(ValueError, table.set_columns, **error_kwargs) self.assertRaises(ValueError, table.append_columns, **error_kwargs) - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(1) + for _, offset_col in self.ragged_list_columns: error_kwargs = dict(kwargs) for bad_dims in [5, [[1], [1]], np.zeros((2, 2))]: error_kwargs[offset_col.name] = bad_dims @@ -211,13 +208,9 @@ def test_set_columns_dimension(self): self.assertRaises(ValueError, table.set_columns, **error_kwargs) def test_set_columns_input_sizes(self): - num_rows = 100 - input_data = {col.name: col.get_input(num_rows) for col in self.columns} + input_data = self.make_input_data(100) col_map = {col.name: col for col in self.columns} for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) col_map[list_col.name] = list_col col_map[offset_col.name] = offset_col table = self.table_class() @@ -253,11 +246,7 @@ def test_set_column_attributes_empty(self): def test_set_column_attributes_data(self): table = self.table_class() for num_rows in [1, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table.set_columns(**input_data) for list_col, offset_col in self.ragged_list_columns: @@ -297,11 +286,7 @@ def test_set_column_attributes_data(self): def test_set_column_attributes_errors(self): table = self.table_class() num_rows = 10 - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table.set_columns(**input_data) for list_col, offset_col in self.ragged_list_columns: @@ -357,11 +342,7 @@ def test_add_row_data(self): def test_add_row_round_trip(self): for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) 
- input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) t1 = self.table_class() t1.set_columns(**input_data) for colname, input_array in input_data.items(): @@ -451,12 +432,9 @@ def test_truncate_errors(self): def test_append_columns_data(self): for num_rows in [0, 10, 100, 1000]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} + input_data = self.make_input_data(num_rows) offset_cols = set() - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + for _, offset_col in self.ragged_list_columns: offset_cols.add(offset_col.name) table = self.table_class() for j in range(1, 10): @@ -479,11 +457,7 @@ def test_append_columns_data(self): def test_append_columns_max_rows(self): for num_rows in [0, 10, 100, 1000]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) for max_rows in [0, 1, 8192]: table = self.table_class(max_rows_increment=max_rows) for j in range(1, 10): @@ -494,11 +468,7 @@ def test_append_columns_max_rows(self): def test_str(self): for num_rows in [0, 10]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() table.set_columns(**input_data) s = str(table) @@ -518,11 +488,7 @@ def test_repr_html(self): def test_copy(self): for num_rows in [0, 10]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() table.set_columns(**input_data) for _ in range(10): @@ -534,11 +500,7 @@ def test_copy(self): def test_pickle(self): for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() table.set_columns(**input_data) pkl = pickle.dumps(table) @@ -551,11 +513,7 @@ def test_pickle(self): def test_equality(self): for num_rows in [1, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) t1 = self.table_class() t2 = self.table_class() self.assertEqual(t1, t1) @@ -612,11 +570,7 @@ def test_equality(self): def test_bad_offsets(self): for num_rows in [10, 100]: - input_data = {col.name: 
col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) t = self.table_class() t.set_columns(**input_data) @@ -651,13 +605,46 @@ class MetadataTestsMixin: Tests for column that have metadata columns. """ + metadata_schema = metadata.MetadataSchema( + { + "codec": "json", + "title": "Example Metadata", + "type": "object", + "properties": { + "one": {"type": "string"}, + "two": {"type": "number"}, + "three": {"type": "array"}, + "four": {"type": "boolean"}, + }, + "required": ["one", "two", "three", "four"], + "additionalProperties": False, + }, + ) + + def metadata_example_data(self): + try: + self.val += 1 + except AttributeError: + self.val = 0 + return { + "one": "val one", + "two": self.val, + "three": list(range(self.val, self.val + 10)), + "four": True, + } + + def input_data_for_add_row(self): + input_data = {col.name: col.get_input(1) for col in self.columns} + kwargs = {col: data[0] for col, data in input_data.items()} + for col in self.string_colnames: + kwargs[col] = "x" + for col in self.binary_colnames: + kwargs[col] = b"x" + return kwargs + def test_random_metadata(self): for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() metadatas = [tsutil.random_bytes(10) for _ in range(num_rows)] metadata, metadata_offset = tskit.pack_bytes(metadatas) @@ -670,36 +657,29 @@ def test_random_metadata(self): self.assertEqual(metadatas, unpacked_metadatas) def test_optional_metadata(self): - for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) - table = self.table_class() - del input_data["metadata"] - del input_data["metadata_offset"] - table.set_columns(**input_data) - self.assertEqual(len(list(table.metadata)), 0) - self.assertEqual( - list(table.metadata_offset), [0 for _ in range(num_rows + 1)] - ) - # Supplying None is the same not providing the column. - input_data["metadata"] = None - input_data["metadata_offset"] = None - table.set_columns(**input_data) - self.assertEqual(len(list(table.metadata)), 0) - self.assertEqual( - list(table.metadata_offset), [0 for _ in range(num_rows + 1)] - ) + if not getattr(self, "metadata_mandatory", False): + for num_rows in [0, 10, 100]: + input_data = self.make_input_data(num_rows) + table = self.table_class() + del input_data["metadata"] + del input_data["metadata_offset"] + table.set_columns(**input_data) + self.assertEqual(len(list(table.metadata)), 0) + self.assertEqual( + list(table.metadata_offset), [0 for _ in range(num_rows + 1)] + ) + # Supplying None is the same not providing the column. 
+ input_data["metadata"] = None + input_data["metadata_offset"] = None + table.set_columns(**input_data) + self.assertEqual(len(list(table.metadata)), 0) + self.assertEqual( + list(table.metadata_offset), [0 for _ in range(num_rows + 1)] + ) def test_packset_metadata(self): for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() table.set_columns(**input_data) metadatas = [tsutil.random_bytes(10) for _ in range(num_rows)] @@ -708,9 +688,111 @@ def test_packset_metadata(self): self.assertTrue(np.array_equal(table.metadata, metadata)) self.assertTrue(np.array_equal(table.metadata_offset, metadata_offset)) + def test_set_metadata_schema(self): + metadata_schema2 = metadata.MetadataSchema({"codec": "json"}) + table = self.table_class() + # Default is no-op metadata codec + self.assertEqual(str(table.metadata_schema), str(metadata.MetadataSchema(None))) + # Set + table.metadata_schema = self.metadata_schema + self.assertEqual(str(table.metadata_schema), str(self.metadata_schema)) + # Overwrite + table.metadata_schema = metadata_schema2 + self.assertEqual(str(table.metadata_schema), str(metadata_schema2)) + # Remove + table.metadata_schema = "" + self.assertEqual(str(table.metadata_schema), str(metadata.MetadataSchema(None))) + # Set after remove + table.metadata_schema = self.metadata_schema + self.assertEqual(str(table.metadata_schema), str(self.metadata_schema)) + # Del should fail + with self.assertRaises(AttributeError): + del table.metadata_schema + # None should fail + with self.assertRaises(ValueError): + table.metadata_schema = None -class TestIndividualTable(unittest.TestCase, CommonTestsMixin, MetadataTestsMixin): + def test_default_metadata_schema(self): + # Default should allow bytes as in pre-exisiting code + table = self.table_class() + table.add_row( + **{**self.input_data_for_add_row(), "metadata": b"acceptable bytes"} + ) + # Adding non-bytes metadata should error + with self.assertRaises(TypeError): + table.add_row( + **{ + **self.input_data_for_add_row(), + "metadata": self.metadata_example_data(), + } + ) + + def test_row_round_trip_metadata_schema(self): + data = self.metadata_example_data() + table = self.table_class() + table.metadata_schema = self.metadata_schema + table.add_row(**{**self.input_data_for_add_row(), "metadata": data}) + self.assertDictEqual(table[0].metadata, data) + + def test_bad_row_metadata_schema(self): + metadata = self.metadata_example_data() + metadata["I really shouldn't be here"] = 6 + table = self.table_class() + table.metadata_schema = self.metadata_schema + with self.assertRaises(exceptions.MetadataValidationError): + table.add_row(**{**self.input_data_for_add_row(), "metadata": metadata}) + self.assertEqual(len(table), 0) + + def test_absent_metadata_with_required_schema(self): + table = self.table_class() + table.metadata_schema = self.metadata_schema + input_data = self.input_data_for_add_row() + del input_data["metadata"] + with self.assertRaises(exceptions.MetadataValidationError): + table.add_row(**{**input_data}) + def test_unsupported_type(self): + table = self.table_class() + table.metadata_schema = metadata.MetadataSchema( + { + "codec": "json", + "type": "object", + "properties": {"an_array": {"type": "array"}}, 
+ } + ) + input_data = self.input_data_for_add_row() + # Numpy is not a JSONSchema array + input_data["metadata"] = {"an_array": np.arange(10)} + with self.assertRaises(exceptions.MetadataValidationError): + table.add_row(**{**input_data}) + + def test_round_trip_set_columns(self): + for num_rows in [0, 10, 100]: + table = self.table_class() + table.metadata_schema = self.metadata_schema + input_data = self.make_input_data(num_rows) + del input_data["metadata"] + del input_data["metadata_offset"] + metadata_column = [self.metadata_example_data() for _ in range(num_rows)] + encoded_metadata_column = [ + table.metadata_schema.validate_and_encode_row(r) + for r in metadata_column + ] + packed_metadata, metadata_offset = tskit.util.pack_bytes( + encoded_metadata_column + ) + table.set_columns( + metadata=packed_metadata, metadata_offset=metadata_offset, **input_data + ) + table.append_columns( + metadata=packed_metadata, metadata_offset=metadata_offset, **input_data + ) + for j in range(num_rows): + self.assertEqual(table[j].metadata, metadata_column[j]) + self.assertEqual(table[j + num_rows].metadata, metadata_column[j]) + + +class TestIndividualTable(unittest.TestCase, CommonTestsMixin, MetadataTestsMixin): columns = [UInt32Column("flags")] ragged_list_columns = [ (DoubleColumn("location"), UInt32Column("location_offset")), @@ -935,11 +1017,7 @@ def test_add_row_bad_data(self): def test_packset_ancestral_state(self): for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() table.set_columns(**input_data) ancestral_states = [tsutil.random_strings(10) for _ in range(num_rows)] @@ -995,11 +1073,7 @@ def test_add_row_bad_data(self): def test_packset_derived_state(self): for num_rows in [0, 10, 100]: - input_data = {col.name: col.get_input(num_rows) for col in self.columns} - for list_col, offset_col in self.ragged_list_columns: - value = list_col.get_input(num_rows) - input_data[list_col.name] = value - input_data[offset_col.name] = np.arange(num_rows + 1, dtype=np.uint32) + input_data = self.make_input_data(num_rows) table = self.table_class() table.set_columns(**input_data) derived_states = [tsutil.random_strings(10) for _ in range(num_rows)] @@ -1111,7 +1185,8 @@ def test_packset_record(self): self.assertEqual(t[1].record, "BBBB") -class TestPopulationTable(unittest.TestCase, CommonTestsMixin): +class TestPopulationTable(unittest.TestCase, CommonTestsMixin, MetadataTestsMixin): + metadata_mandatory = True columns = [] ragged_list_columns = [(CharColumn("metadata"), UInt32Column("metadata_offset"))] equal_len_columns = [[]] diff --git a/python/tskit/__init__.py b/python/tskit/__init__.py index 8bbec03964..b12efe1fd5 100644 --- a/python/tskit/__init__.py +++ b/python/tskit/__init__.py @@ -54,3 +54,4 @@ from tskit.stats import * # NOQA from tskit.exceptions import * # NOQA from tskit.util import * # NOQA +from tskit.metadata import * # NOQA diff --git a/python/tskit/exceptions.py b/python/tskit/exceptions.py index 1e79f468a2..82a84b78c9 100644 --- a/python/tskit/exceptions.py +++ b/python/tskit/exceptions.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2017 University of Oxford # # 
Permission is hereby granted, free of charge, to any person obtaining a copy @@ -55,5 +55,17 @@ class DuplicatePositionsError(TskitException): class ProvenanceValidationError(TskitException): """ - A JSON document did non validate against the provenance schema. + A JSON document did not validate against the provenance schema. + """ + + +class MetadataValidationError(TskitException): + """ + A metadata object did not validate against the metadata schema. + """ + + +class MetadataSchemaValidationError(TskitException): + """ + A metadata schema object did not validate against the metaschema. """ diff --git a/python/tskit/metadata.py b/python/tskit/metadata.py new file mode 100644 index 0000000000..9ad6bccda0 --- /dev/null +++ b/python/tskit/metadata.py @@ -0,0 +1,212 @@ +# MIT License +# +# Copyright (c) 2020 Tskit Developers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Classes for metadata decoding, encoding and validation +""" +import abc +import copy +import json +from typing import Any +from typing import Optional +from typing import Type + +import jsonschema + +import tskit.exceptions as exceptions + + +def replace_root_refs(obj): + if type(obj) == list: + return [replace_root_refs(j) for j in obj] + elif type(obj) == dict: + ret = {k: replace_root_refs(v) for k, v in obj.items()} + if ret.get("$ref") == "#": + ret["$ref"] = "#/definitions/root" + return ret + else: + return obj + + +class TSKITMetadataSchemaValidator(jsonschema.validators.Draft7Validator): + """ + Our schema is the Draft7Validator schema with added codec information. + """ + + META_SCHEMA: dict = copy.deepcopy(jsonschema.validators.Draft7Validator.META_SCHEMA) + # We need a top-level-only required property, so we need to rewrite any reference + # to the top-level schema to a copy in a definition. + META_SCHEMA = replace_root_refs(META_SCHEMA) + META_SCHEMA["definitions"]["root"] = copy.deepcopy(META_SCHEMA) + META_SCHEMA["codec"] = {"type": "string"} + META_SCHEMA["required"] = ["codec"] + + +class AbstractMetadataCodec(metaclass=abc.ABCMeta): + """ + Superclass of all MetadataCodecs. 
+ """ + + def __init__(self, schema: dict) -> None: + raise NotImplementedError # pragma: no cover + + @abc.abstractmethod + def encode(self, obj: Any) -> bytes: + raise NotImplementedError # pragma: no cover + + @abc.abstractmethod + def decode(self, encoded: bytes) -> Any: + raise NotImplementedError # pragma: no cover + + +codec_registry = {} + + +def register_metadata_codec( + codec_cls: Type[AbstractMetadataCodec], codec_id: str +) -> None: + """ + Register a metadata codec class. + This function maintains a mapping from metadata codec identifiers used in schemas + to codec classes. When a codec class is registered, it will replace any class + previously registered under the same codec identifier, if present. + See :ref:`sec_tutorial_metadata_custom_codec` for example usage. + + :param str codec_id: String to use to refer to the codec in the schema. + """ + codec_registry[codec_id] = codec_cls + + +class JSONCodec(AbstractMetadataCodec): + def __init__(self, schema: dict) -> None: + pass + + def encode(self, obj: Any) -> bytes: + return json.dumps(obj).encode() + + def decode(self, encoded: bytes) -> Any: + return json.loads(encoded.decode()) + + +register_metadata_codec(JSONCodec, "json") + + +class NOOPCodec(AbstractMetadataCodec): + def __init__(self, schema: dict) -> None: + pass + + def encode(self, data: bytes) -> bytes: + return data + + def decode(self, data: bytes) -> bytes: + return data + + +def validate_bytes(data: Optional[bytes]) -> None: + if data is not None and not isinstance(data, bytes): + raise TypeError( + f"If no encoding is set metadata should be bytes, found {type(bytes)}" + ) + + +class MetadataSchema: + """ + Class for validating, encoding and decoding metadata. + + :param dict schema: A dict containing a valid JSONSchema object. + """ + + def __init__(self, schema: Optional[dict]) -> None: + self._schema = schema + + if schema is None: + self._string = "" + self._validate_row = validate_bytes + self.encode_row = NOOPCodec({}).encode + self.decode_row = NOOPCodec({}).decode + else: + try: + TSKITMetadataSchemaValidator.check_schema(schema) + except jsonschema.exceptions.SchemaError as ve: + raise exceptions.MetadataSchemaValidationError from ve + codec = schema["codec"] + try: + codec_instance = codec_registry[codec](schema) + except KeyError: + raise exceptions.MetadataSchemaValidationError( + f"Unrecognised metadata codec '{schema['codec']}'. " + f"Valid options are {str(list(codec_registry.keys()))}." + ) + self._string = json.dumps(schema) + self._validate_row = TSKITMetadataSchemaValidator(schema).validate + self.encode_row = codec_instance.encode + self.decode_row = codec_instance.decode + + def __str__(self) -> str: + return self._string + + @property + def schema(self) -> Optional[dict]: + # Make schema read-only + return self._schema + + def validate_and_encode_row(self, row: Any) -> bytes: + """ + Validate a row of metadata against this schema and return the encoded + representation. + """ + try: + self._validate_row(row) + except jsonschema.exceptions.ValidationError as ve: + raise exceptions.MetadataValidationError from ve + return self.encode_row(row) + + def decode_row(self, row: bytes) -> Any: + """ + Decode an encoded row of metadata, note that no validation is performed. + """ + # Set by __init__ + pass # pragma: no cover + + def encode_row(self, row: bytes) -> Any: + """ + Encode an encoded row of metadata, note that no validation is performed. 
+ """ + # Set by __init__ + pass # pragma: no cover + + +def parse_metadata_schema(encoded_schema: str) -> MetadataSchema: + """ + Create a schema object from its string encoding. The exact class returned is + determined by the ``encoding`` specification in the string. + + :param str encoded_schema: The string encoded schema. + :return: A subclass of AbstractMetadataSchema. + """ + if encoded_schema == "": + return MetadataSchema(schema=None) + else: + try: + decoded = json.loads(encoded_schema) + except json.decoder.JSONDecodeError: + raise ValueError(f"Metadata schema is not JSON, found {encoded_schema}") + return MetadataSchema(decoded) diff --git a/python/tskit/metadata_schema.schema.json b/python/tskit/metadata_schema.schema.json new file mode 100644 index 0000000000..f5ecf6887e --- /dev/null +++ b/python/tskit/metadata_schema.schema.json @@ -0,0 +1,226 @@ +{ + "$id": "http://json-schema.org/draft-07/schema#", + "$schema": "http://json-schema.org/draft-07/schema#", + "codec": {"type": "string"}, + "default": true, + "definitions": { + "nonNegativeInteger": {"minimum": 0, "type": "integer"}, + "nonNegativeIntegerDefault0": { + "allOf": [{"$ref": "#/definitions/nonNegativeInteger"}, {"default": 0}] + }, + "root": { + "$id": "http://json-schema.org/draft-07/schema#", + "$schema": "http://json-schema.org/draft-07/schema#", + "default": true, + "definitions": { + "nonNegativeInteger": {"minimum": 0, "type": "integer"}, + "nonNegativeIntegerDefault0": { + "allOf": [ + {"$ref": "#/definitions/nonNegativeInteger"}, + {"default": 0}, + ] + }, + "schemaArray": { + "items": {"$ref": "#/definitions/root"}, + "minItems": 1, + "type": "array", + }, + "simpleTypes": { + "enum": ["array", "boolean", "integer", "null", "number", "object", "string",] + }, + "stringArray": { + "default": [], + "items": {"type": "string"}, + "type": "array", + "uniqueItems": true, + }, + }, + "properties": { + "$comment": {"type": "string"}, + "$id": {"format": "uri-reference", "type": "string"}, + "$ref": {"format": "uri-reference", "type": "string"}, + "$schema": {"format": "uri", "type": "string"}, + "additionalItems": {"$ref": "#/definitions/root"}, + "additionalProperties": {"$ref": "#/definitions/root"}, + "allOf": {"$ref": "#/definitions/schemaArray"}, + "anyOf": {"$ref": "#/definitions/schemaArray"}, + "const": true, + "contains": {"$ref": "#/definitions/root"}, + "contentEncoding": {"type": "string"}, + "contentMediaType": {"type": "string"}, + "default": true, + "definitions": { + "additionalProperties": {"$ref": "#/definitions/root"}, + "default": {}, + "type": "object", + }, + "dependencies": { + "additionalProperties": { + "anyOf": [{"$ref": "#/definitions/root"}, + {"$ref": "#/definitions/stringArray"}, + ] + }, + "type": "object", + }, + "description": {"type": "string"}, + "else": {"$ref": "#/definitions/root"}, + "enum": {"items": true, "type": "array"}, + "examples": {"items": true, "type": "array"}, + "exclusiveMaximum": {"type": "number"}, + "exclusiveMinimum": {"type": "number"}, + "format": {"type": "string"}, + "if": {"$ref": "#/definitions/root"}, + "items": { + "anyOf": [ + {"$ref": "#/definitions/root"}, + {"$ref": "#/definitions/schemaArray"}, + ], + "default": true, + }, + "maxItems": {"$ref": "#/definitions/nonNegativeInteger"}, + "maxLength": {"$ref": "#/definitions/nonNegativeInteger"}, + "maxProperties": {"$ref": "#/definitions/nonNegativeInteger"}, + "maximum": {"type": "number"}, + "minItems": {"$ref": "#/definitions/nonNegativeIntegerDefault0"}, + "minLength": {"$ref": 
"#/definitions/nonNegativeIntegerDefault0"}, + "minProperties": {"$ref": "#/definitions/nonNegativeIntegerDefault0"}, + "minimum": {"type": "number"}, + "multipleOf": {"exclusiveMinimum": 0, "type": "number"}, + "not": {"$ref": "#/definitions/root"}, + "oneOf": {"$ref": "#/definitions/schemaArray"}, + "pattern": {"format": "regex", "type": "string"}, + "patternProperties": { + "additionalProperties": {"$ref": "#/definitions/root"}, + "default": {}, + "propertyNames": {"format": "regex"}, + "type": "object", + }, + "properties": { + "additionalProperties": {"$ref": "#/definitions/root"}, + "default": {}, + "type": "object", + }, + "propertyNames": {"$ref": "#/definitions/root"}, + "readOnly": {"default": false, "type": "boolean"}, + "required": {"$ref": "#/definitions/stringArray"}, + "then": {"$ref": "#/definitions/root"}, + "title": {"type": "string"}, + "type": { + "anyOf": [ + {"$ref": "#/definitions/simpleTypes"}, + { + "items": {"$ref": "#/definitions/simpleTypes"}, + "minItems": 1, + "type": "array", + "uniqueItems": true, + }, + ] + }, + "uniqueItems": {"default": false, "type": "boolean"}, + }, + "title": "Core schema meta-schema", + "type": ["object", "boolean"], + }, + "schemaArray": { + "items": {"$ref": "#/definitions/root"}, + "minItems": 1, + "type": "array", + }, + "simpleTypes": { + "enum": ["array", "boolean", "integer", "null", "number", "object", "string",] + }, + "stringArray": { + "default": [], + "items": {"type": "string"}, + "type": "array", + "uniqueItems": true, + }, + }, + "properties": { + "$comment": {"type": "string"}, + "$id": {"format": "uri-reference", "type": "string"}, + "$ref": {"format": "uri-reference", "type": "string"}, + "$schema": {"format": "uri", "type": "string"}, + "additionalItems": {"$ref": "#/definitions/root"}, + "additionalProperties": {"$ref": "#/definitions/root"}, + "allOf": {"$ref": "#/definitions/schemaArray"}, + "anyOf": {"$ref": "#/definitions/schemaArray"}, + "const": true, + "contains": {"$ref": "#/definitions/root"}, + "contentEncoding": {"type": "string"}, + "contentMediaType": {"type": "string"}, + "default": true, + "definitions": { + "additionalProperties": {"$ref": "#/definitions/root"}, + "default": {}, + "type": "object", + }, + "dependencies": { + "additionalProperties": { + "anyOf": [ + {"$ref": "#/definitions/root"}, + {"$ref": "#/definitions/stringArray"}, + ] + }, + "type": "object", + }, + "description": {"type": "string"}, + "else": {"$ref": "#/definitions/root"}, + "enum": {"items": true, "type": "array"}, + "examples": {"items": true, "type": "array"}, + "exclusiveMaximum": {"type": "number"}, + "exclusiveMinimum": {"type": "number"}, + "format": {"type": "string"}, + "if": {"$ref": "#/definitions/root"}, + "items": { + "anyOf": [ + {"$ref": "#/definitions/root"}, + {"$ref": "#/definitions/schemaArray"}, + ], + "default": true, + }, + "maxItems": {"$ref": "#/definitions/nonNegativeInteger"}, + "maxLength": {"$ref": "#/definitions/nonNegativeInteger"}, + "maxProperties": {"$ref": "#/definitions/nonNegativeInteger"}, + "maximum": {"type": "number"}, + "minItems": {"$ref": "#/definitions/nonNegativeIntegerDefault0"}, + "minLength": {"$ref": "#/definitions/nonNegativeIntegerDefault0"}, + "minProperties": {"$ref": "#/definitions/nonNegativeIntegerDefault0"}, + "minimum": {"type": "number"}, + "multipleOf": {"exclusiveMinimum": 0, "type": "number"}, + "not": {"$ref": "#/definitions/root"}, + "oneOf": {"$ref": "#/definitions/schemaArray"}, + "pattern": {"format": "regex", "type": "string"}, + "patternProperties": { + 
"additionalProperties": {"$ref": "#/definitions/root"}, + "default": {}, + "propertyNames": {"format": "regex"}, + "type": "object", + }, + "properties": { + "additionalProperties": {"$ref": "#/definitions/root"}, + "default": {}, + "type": "object", + }, + "propertyNames": {"$ref": "#/definitions/root"}, + "readOnly": {"default": false, "type": "boolean"}, + "required": {"$ref": "#/definitions/stringArray"}, + "then": {"$ref": "#/definitions/root"}, + "title": {"type": "string"}, + "type": { + "anyOf": [ + {"$ref": "#/definitions/simpleTypes"}, + { + "items": {"$ref": "#/definitions/simpleTypes"}, + "minItems": 1, + "type": "array", + "uniqueItems": true, + }, + ] + }, + "uniqueItems": {"default": false, "type": "boolean"}, + }, + "required": ["codec"], + "title": "Core schema meta-schema", + "type": ["object", "boolean"], +} diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 3ab62f994c..e0bc3c36e9 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -27,12 +27,15 @@ import datetime import json import warnings +from typing import Any +from typing import Tuple import attr import numpy as np import _tskit import tskit +import tskit.metadata as metadata import tskit.provenance as provenance import tskit.util as util @@ -143,9 +146,10 @@ class BaseTable: # The list of columns in the table. Must be set by subclasses. column_names = [] - def __init__(self, ll_table, row_class): + def __init__(self, ll_table, row_class, **kwargs): self.ll_table = ll_table self.row_class = row_class + super().__init__(**kwargs) def _check_required_args(self, **kwargs): for k, v in kwargs.items(): @@ -193,11 +197,23 @@ def __setattr__(self, name, value): object.__setattr__(self, name, value) def __getitem__(self, index): + """ + Return the specifed row of this table, decoding metadata if it is present. + Supports negative indexing, e.g. ``table[-5]``. + + :param int index: the zero-index of the desired row + """ if index < 0: index += len(self) if index < 0 or index >= len(self): raise IndexError("Index out of bounds") - return self.row_class(*self.ll_table.get_row(index)) + row = self.ll_table.get_row(index) + try: + row = self.decode_row(row) + except AttributeError: + # This means the class returns the low-level row unchanged. + pass + return self.row_class(*row) def clear(self): """ @@ -287,6 +303,12 @@ class MetadataMixin: Mixin class for tables that have a metadata column. """ + def __init__(self): + self.metadata_column_index = list( + attr.fields_dict(self.row_class).keys() + ).index("metadata") + self._update_metadata_schema_cache_from_ll() + def packset_metadata(self, metadatas): """ Packs the specified list of metadata values and updates the ``metadata`` @@ -301,6 +323,30 @@ def packset_metadata(self, metadatas): d["metadata_offset"] = offset self.set_columns(**d) + @property + def metadata_schema(self) -> metadata.MetadataSchema: + """ + The :class:`tskit.MetadataSchema` for this table. 
+ """ + return self._metadata_schema_cache + + @metadata_schema.setter + def metadata_schema(self, schema: metadata.MetadataSchema) -> None: + self.ll_table.metadata_schema = str(schema) + self._update_metadata_schema_cache_from_ll() + + def decode_row(self, row: Tuple[Any]) -> Tuple: + return ( + row[: self.metadata_column_index] + + (self._metadata_schema_cache.decode_row(row[self.metadata_column_index]),) + + row[self.metadata_column_index + 1 :] + ) + + def _update_metadata_schema_cache_from_ll(self) -> None: + self._metadata_schema_cache = metadata.parse_metadata_schema( + self.ll_table.metadata_schema + ) + class IndividualTable(BaseTable, MetadataMixin): """ @@ -330,6 +376,8 @@ class IndividualTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema """ column_names = [ @@ -362,17 +410,19 @@ def _text_header_and_rows(self): def add_row(self, flags=0, location=None, metadata=None): """ Adds a new row to this :class:`IndividualTable` and returns the ID of the - corresponding individual. + corresponding individual. Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. :param int flags: The bitwise flags for the new node. :param array-like location: A list of numeric values or one-dimensional numpy array describing the location of this individual. If not specified or None, a zero-dimensional location is stored. - :param bytes metadata: The binary-encoded metadata for the new individual. If not - specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added node. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(flags=flags, location=location, metadata=metadata) def set_columns( @@ -394,7 +444,8 @@ def set_columns( together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param flags: The bitwise flags for each individual. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -440,7 +491,8 @@ def append_columns( together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param flags: The bitwise flags for each individual. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -512,7 +564,9 @@ class NodeTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. 
:vartype metadata_offset: numpy.ndarray, dtype=np.uint32 - """ + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema +""" column_names = [ "time", @@ -548,7 +602,9 @@ def _text_header_and_rows(self): def add_row(self, flags=0, time=0, population=-1, individual=-1, metadata=None): """ Adds a new row to this :class:`NodeTable` and returns the ID of the - corresponding node. + corresponding node. Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. :param int flags: The bitwise flags for the new node. :param float time: The birth time for the new node. @@ -556,11 +612,11 @@ def add_row(self, flags=0, time=0, population=-1, individual=-1, metadata=None): Defaults to :data:`tskit.NULL`. :param int individual: The ID of the individual in which the new node was born. Defaults to :data:`tskit.NULL`. - :param bytes metadata: The binary-encoded metadata for the new node. If not - specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added node. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(flags, time, population, individual, metadata) def set_columns( @@ -580,7 +636,8 @@ def set_columns( which is equal to the number of nodes the table will contain. The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param flags: The bitwise flags for each node. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -628,7 +685,8 @@ def append_columns( which is equal to the number of nodes that will be added to the table. The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param flags: The bitwise flags for each node. Required. :type flags: numpy.ndarray, dtype=np.uint32 @@ -688,8 +746,9 @@ class EdgeTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 - - """ + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema +""" column_names = [ "left", @@ -725,17 +784,19 @@ def _text_header_and_rows(self): def add_row(self, left, right, parent, child, metadata=None): """ Adds a new row to this :class:`EdgeTable` and returns the ID of the - corresponding edge. + corresponding edge. Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. :param float left: The left coordinate (inclusive). :param float right: The right coordinate (exclusive). :param int parent: The ID of parent node. :param int child: The ID of child node. - :param bytes metadata: The binary-encoded metadata for the new edge. 
If not - specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added edge. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(left, right, parent, child, metadata) def set_columns( @@ -756,7 +817,8 @@ def set_columns( edges the table will contain). The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param left: The left coordinates (inclusive). @@ -799,7 +861,8 @@ def append_columns( additional edges to add to the table). The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param left: The left coordinates (inclusive). @@ -878,7 +941,9 @@ class MigrationTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 - """ + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema +""" column_names = [ "left", @@ -918,7 +983,9 @@ def _text_header_and_rows(self): def add_row(self, left, right, node, source, dest, time, metadata=None): """ Adds a new row to this :class:`MigrationTable` and returns the ID of the - corresponding migration. + corresponding migration. Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. :param float left: The left coordinate (inclusive). :param float right: The right coordinate (exclusive). @@ -926,11 +993,11 @@ def add_row(self, left, right, node, source, dest, time, metadata=None): :param int source: The ID of the source population. :param int dest: The ID of the destination population. :param float time: The time of the migration event. - :param bytes metadata: The binary-encoded metadata for the new migration. If not - specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added migration. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(left, right, node, source, dest, time, metadata) def set_columns( @@ -953,7 +1020,8 @@ def set_columns( migrations the table will contain). The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param left: The left coordinates (inclusive). :type left: numpy.ndarray, dtype=np.float64 @@ -1010,7 +1078,8 @@ def append_columns( additional migrations to add to the table). 
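The ``sec_tutorial_metadata_bulk`` cross-reference added to these ``set_columns``/``append_columns`` docstrings points at the bulk path, where rows must be encoded up front because the column-based API performs no validation or encoding. An illustrative sketch of that pattern using ``packset_metadata`` (shown earlier in this file), applied to a population table for brevity:

.. code-block:: python

    import tskit
    from tskit.metadata import MetadataSchema

    schema = MetadataSchema({"codec": "json", "type": "object"})
    tables = tskit.TableCollection(sequence_length=1)
    tables.populations.metadata_schema = schema
    for _ in range(3):
        tables.populations.add_row(metadata={})

    # Encode every row first; packset_metadata stores the raw bytes as-is.
    encoded_rows = [
        schema.validate_and_encode_row({"name": f"pop_{j}"}) for j in range(3)
    ]
    tables.populations.packset_metadata(encoded_rows)
    assert tables.populations[1].metadata == {"name": "pop_1"}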
The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns`. - See :ref:`sec_tables_api_binary_columns` for more information. + See :ref:`sec_tables_api_binary_columns` for more information and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param left: The left coordinates (inclusive). :type left: numpy.ndarray, dtype=np.float64 @@ -1074,7 +1143,9 @@ class SiteTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 - """ + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema +""" column_names = [ "position", @@ -1109,15 +1180,17 @@ def _text_header_and_rows(self): def add_row(self, position, ancestral_state, metadata=None): """ Adds a new row to this :class:`SiteTable` and returns the ID of the - corresponding site. + corresponding site. Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. :param float position: The position of this site in genome coordinates. :param str ancestral_state: The state of this site at the root of the tree. - :param bytes metadata: The binary-encoded metadata for the new site. If not - specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added site. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(position, ancestral_state, metadata) def set_columns( @@ -1142,7 +1215,8 @@ def set_columns( ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns` (see - :ref:`sec_tables_api_binary_columns` for more information). + :ref:`sec_tables_api_binary_columns` for more information) and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param position: The position of each site in genome coordinates. :type position: numpy.ndarray, dtype=np.float64 @@ -1195,7 +1269,8 @@ def append_columns( ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns` (see - :ref:`sec_tables_api_binary_columns` for more information). + :ref:`sec_tables_api_binary_columns` for more information) and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param position: The position of each site in genome coordinates. :type position: numpy.ndarray, dtype=np.float64 @@ -1269,7 +1344,9 @@ class MutationTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 - """ + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema +""" column_names = [ "site", @@ -1308,18 +1385,20 @@ def _text_header_and_rows(self): def add_row(self, site, node, derived_state, parent=-1, metadata=None): """ Adds a new row to this :class:`MutationTable` and returns the ID of the - corresponding mutation. + corresponding mutation. 
Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. :param int site: The ID of the site that this mutation occurs at. :param int node: The ID of the first node inheriting this mutation. :param str derived_state: The state of the site at this mutation's node. :param int parent: The ID of the parent mutation. If not specified, defaults to :attr:`NULL`. - :param bytes metadata: The binary-encoded metadata for the new mutation. If not - specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added mutation. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(site, node, derived_state, parent, metadata) def set_columns( @@ -1347,7 +1426,8 @@ def set_columns( ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns` (see - :ref:`sec_tables_api_binary_columns` for more information). + :ref:`sec_tables_api_binary_columns` for more information) and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param site: The ID of the site each mutation occurs at. :type site: numpy.ndarray, dtype=np.int32 @@ -1410,7 +1490,8 @@ def append_columns( ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns` (see - :ref:`sec_tables_api_binary_columns` for more information). + :ref:`sec_tables_api_binary_columns` for more information) and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param site: The ID of the site each mutation occurs at. :type site: numpy.ndarray, dtype=np.int32 @@ -1478,7 +1559,9 @@ class PopulationTable(BaseTable, MetadataMixin): :ivar metadata_offset: The array of offsets into the metadata column. See :ref:`sec_tables_api_binary_columns` for more details. :vartype metadata_offset: numpy.ndarray, dtype=np.uint32 - """ + :ivar metadata_schema: The metadata schema for this table's metadata column + :vartype metadata_schema: tskit.MetadataSchema +""" column_names = ["metadata", "metadata_offset"] @@ -1490,13 +1573,15 @@ def __init__(self, max_rows_increment=0, ll_table=None): def add_row(self, metadata=None): """ Adds a new row to this :class:`PopulationTable` and returns the ID of the - corresponding population. + corresponding population. Metadata, if specified, will be validated and encoded + according to the table's + :attr:`metadata_schema`. - :param bytes metadata: The binary-encoded metadata for the new population. - If not specified or None, a zero-length byte string is stored. + :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added population. :rtype: int """ + metadata = self.metadata_schema.validate_and_encode_row(metadata) return self.ll_table.add_row(metadata=metadata) def _text_header_and_rows(self): @@ -1517,7 +1602,8 @@ def set_columns(self, metadata=None, metadata_offset=None): The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns` (see - :ref:`sec_tables_api_binary_columns` for more information). + :ref:`sec_tables_api_binary_columns` for more information) and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param metadata: The flattened metadata array. 
Must be specified along with ``metadata_offset``. If not specified or None, an empty metadata @@ -1538,7 +1624,8 @@ def append_columns(self, metadata=None, metadata_offset=None): The ``metadata`` and ``metadata_offset`` parameters must be supplied together, and meet the requirements for :ref:`sec_encoding_ragged_columns` (see - :ref:`sec_tables_api_binary_columns` for more information). + :ref:`sec_tables_api_binary_columns` for more information) and + :ref:`sec_tutorial_metadata_bulk` for an example of how to prepare metadata. :param metadata: The flattened metadata array. Must be specified along with ``metadata_offset``. If not specified or None, an empty metadata diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 1abf9d15c6..9f1e5cf463 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -1,7 +1,7 @@ # # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2015-2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -32,13 +32,16 @@ import json import textwrap import warnings +from typing import Any +import attr import numpy as np import _tskit import tskit.drawing as drawing import tskit.exceptions as exceptions import tskit.formats as formats +import tskit.metadata as metadata import tskit.provenance as provenance import tskit.tables as tables import tskit.util as util @@ -65,7 +68,45 @@ def __repr__(self): return repr(self.__dict__) -class Individual(SimpleContainer): +class SimpleContainerWithMetadata(SimpleContainer): + """ + This class allows metadata to be lazily decoded and cached + """ + + class CachedMetadata: + """ + If we had python>=3.8 we could just use @functools.cached_property here. We + don't so we implement it similarly using a descriptor + """ + + def __get__(self, container: "SimpleContainerWithMetadata", owner: type): + decoded = container._metadata_decoder(container._encoded_metadata) + container.__dict__["metadata"] = decoded + return decoded + + metadata: Any = CachedMetadata() + + def __eq__(self, other: SimpleContainer) -> bool: + # We need to remove metadata and the decoder so we are just comparing + # the encoded metadata, along with the other attributes + other = {**other.__dict__} + other["metadata"] = None + other["_metadata_decoder"] = None + self_ = {**self.__dict__} + self_["metadata"] = None + self_["_metadata_decoder"] = None + return self_ == other + + def __repr__(self) -> str: + # Make sure we have a decoded metadata + _ = self.metadata + out = {**self.__dict__} + del out["_encoded_metadata"] + del out["_metadata_decoder"] + return repr(out) + + +class Individual(SimpleContainerWithMetadata): """ An :ref:`individual ` in a tree sequence. Since nodes correspond to genomes, individuals are associated with a collection @@ -87,28 +128,38 @@ class Individual(SimpleContainer): a numpy array (dtype=np.int32). If no nodes are associated with the individual this array will be empty. :vartype location: numpy.ndarray - :ivar metadata: The :ref:`metadata ` for this individual. - :vartype metadata: bytes + :ivar metadata: The decoded :ref:`metadata ` + for this individual. 
+ :vartype metadata: object """ - def __init__(self, id_=None, flags=0, location=None, nodes=None, metadata=""): + def __init__( + self, + id_=None, + flags=0, + location=None, + nodes=None, + encoded_metadata=b"", + metadata_decoder=lambda metadata: metadata, + ): self.id = id_ self.flags = flags self.location = location - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder self.nodes = nodes def __eq__(self, other): return ( self.id == other.id and self.flags == other.flags - and self.metadata == other.metadata + and self._encoded_metadata == other._encoded_metadata and np.array_equal(self.nodes, other.nodes) and np.array_equal(self.location, other.location) ) -class Node(SimpleContainer): +class Node(SimpleContainerWithMetadata): """ A :ref:`node ` in a tree sequence, corresponding to a single genome. The ``time`` and ``population`` are attributes of the @@ -129,18 +180,26 @@ class Node(SimpleContainer): :vartype population: int :ivar individual: The integer ID of the individual that this node was a part of. :vartype individual: int - :ivar metadata: The :ref:`metadata ` for this node. - :vartype metadata: bytes + :ivar metadata: The decoded :ref:`metadata ` for this node. + :vartype metadata: object """ def __init__( - self, id_=None, flags=0, time=0, population=NULL, individual=NULL, metadata="" + self, + id_=None, + flags=0, + time=0, + population=NULL, + individual=NULL, + encoded_metadata=b"", + metadata_decoder=lambda metadata: metadata, ): self.id = id_ self.time = time self.population = population self.individual = individual - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder self.flags = flags def is_sample(self): @@ -153,7 +212,7 @@ def is_sample(self): return self.flags & NODE_IS_SAMPLE -class Edge(SimpleContainer): +class Edge(SimpleContainerWithMetadata): """ An :ref:`edge ` in a tree sequence. @@ -175,17 +234,27 @@ class Edge(SimpleContainer): :ivar id: The integer ID of this edge. Varies from 0 to :attr:`TreeSequence.num_edges` - 1. :vartype id: int - :ivar metadata: The :ref:`metadata ` for this edge. - :vartype metadata: bytes + :ivar metadata: The decoded :ref:`metadata ` for this edge. + :vartype metadata: object """ - def __init__(self, left, right, parent, child, metadata=b"", id_=None): + def __init__( + self, + left, + right, + parent, + child, + encoded_metadata=b"", + id_=None, + metadata_decoder=lambda metadata: metadata, + ): self.id = id_ self.left = left self.right = right self.parent = parent self.child = child - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder def __repr__(self): return ( @@ -206,7 +275,7 @@ def span(self): return self.right - self.left -class Site(SimpleContainer): +class Site(SimpleContainerWithMetadata): """ A :ref:`site ` in a tree sequence. @@ -223,23 +292,32 @@ class Site(SimpleContainer): :ivar ancestral_state: The ancestral state at this site (i.e., the state inherited by nodes, unless mutations occur). :vartype ancestral_state: str - :ivar metadata: The :ref:`metadata ` for this site. - :vartype metadata: bytes + :ivar metadata: The decoded :ref:`metadata ` for this site. + :vartype metadata: object :ivar mutations: The list of mutations at this site. Mutations within a site are returned in the order they are specified in the underlying :class:`MutationTable`. 
:vartype mutations: list[:class:`Mutation`] """ - def __init__(self, id_, position, ancestral_state, mutations, metadata): + def __init__( + self, + id_, + position, + ancestral_state, + mutations, + encoded_metadata=b"", + metadata_decoder=lambda metadata: metadata, + ): self.id = id_ self.position = position self.ancestral_state = ancestral_state self.mutations = mutations - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder -class Mutation(SimpleContainer): +class Mutation(SimpleContainerWithMetadata): """ A :ref:`mutation ` in a tree sequence. @@ -268,8 +346,9 @@ class Mutation(SimpleContainer): To obtain further information about a mutation with a given ID, use :meth:`TreeSequence.mutation`. :vartype parent: int - :ivar metadata: The :ref:`metadata ` for this site. - :vartype metadata: bytes + :ivar metadata: The decoded :ref:`metadata ` for this + mutation. + :vartype metadata: object """ def __init__( @@ -279,17 +358,19 @@ def __init__( node=NULL, derived_state=None, parent=NULL, - metadata=None, + encoded_metadata=b"", + metadata_decoder=lambda metadata: metadata, ): self.id = id_ self.site = site self.node = node self.derived_state = derived_state self.parent = parent - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder -class Migration(SimpleContainer): +class Migration(SimpleContainerWithMetadata): """ A :ref:`migration ` in a tree sequence. @@ -312,9 +393,23 @@ class Migration(SimpleContainer): :vartype dest: int :ivar time: The time at which this migration occured at. :vartype time: float + :ivar metadata: The decoded :ref:`metadata ` for this + migration. + :vartype metadata: object """ - def __init__(self, left, right, node, source, dest, time, metadata=b"", id_=None): + def __init__( + self, + left, + right, + node, + source, + dest, + time, + encoded_metadata=b"", + metadata_decoder=lambda metadata: metadata, + id_=None, + ): self.id = id_ self.left = left self.right = right @@ -322,7 +417,8 @@ def __init__(self, left, right, node, source, dest, time, metadata=b"", id_=None self.source = source self.dest = dest self.time = time - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder def __repr__(self): return ( @@ -340,7 +436,7 @@ def __repr__(self): ) -class Population(SimpleContainer): +class Population(SimpleContainerWithMetadata): """ A :ref:`population ` in a tree sequence. @@ -350,13 +446,17 @@ class Population(SimpleContainer): :ivar id: The integer ID of this population. Varies from 0 to :attr:`TreeSequence.num_populations` - 1. :vartype id: int - :ivar metadata: The :ref:`metadata ` for this population. - :vartype metadata: bytes + :ivar metadata: The decoded :ref:`metadata ` + for this population. + :vartype metadata: object """ - def __init__(self, id_, metadata=""): + def __init__( + self, id_, encoded_metadata=b"", metadata_decoder=lambda metadata: metadata, + ): self.id = id_ - self.metadata = metadata + self._encoded_metadata = encoded_metadata + self._metadata_decoder = metadata_decoder class Variant(SimpleContainer): @@ -2624,8 +2724,31 @@ class TreeSequence: the :meth:`.variants` method iterates over all sites and their genotypes. 
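The ``SimpleContainerWithMetadata`` machinery above means each returned object carries the encoded bytes plus a decoder, and only pays the decoding cost if ``.metadata`` is actually accessed, caching the result afterwards. An illustrative sketch with a stand-in JSON decoder (in real use the decoder comes from the relevant table's schema, as wired up below):

.. code-block:: python

    import json

    import tskit

    node = tskit.Node(
        id_=0,
        flags=tskit.NODE_IS_SAMPLE,
        time=0.0,
        encoded_metadata=json.dumps({"label": "sample-0"}).encode(),
        metadata_decoder=lambda raw: json.loads(raw.decode()),
    )
    # First access decodes and caches; later accesses reuse the cached object.
    assert node.metadata == {"label": "sample-0"}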
""" + @attr.s(slots=True, frozen=True, kw_only=True, auto_attribs=True) + class _TableMetadataSchemas: + """ + Convenience class for returning schemas + """ + + node: metadata.MetadataSchema + edge: metadata.MetadataSchema + site: metadata.MetadataSchema + mutation: metadata.MetadataSchema + migration: metadata.MetadataSchema + individual: metadata.MetadataSchema + population: metadata.MetadataSchema + def __init__(self, ll_tree_sequence): self._ll_tree_sequence = ll_tree_sequence + metadata_schema_strings = self._ll_tree_sequence.get_table_metadata_schemas() + metadata_schema_instances = { + name: metadata.parse_metadata_schema(getattr(metadata_schema_strings, name)) + for name in vars(self._TableMetadataSchemas) + if not name.startswith("_") + } + self._table_metadata_schemas = self._TableMetadataSchemas( + **metadata_schema_instances + ) # Implement the pickle protocol for TreeSequence def __getstate__(self): @@ -2895,6 +3018,13 @@ def num_samples(self): """ return self._ll_tree_sequence.get_num_samples() + @property + def table_metadata_schemas(self) -> "_TableMetadataSchemas": + """ + The set of metadata schemas for the tables in this tree sequence. + """ + return self._table_metadata_schemas + @property def sample_size(self): # Deprecated alias for num_samples @@ -3165,9 +3295,10 @@ def edge_diffs(self): :rtype: :class:`collections.abc.Iterable` """ iterator = _tskit.TreeDiffIterator(self._ll_tree_sequence) + metadata_decoder = self.table_metadata_schemas.edge.decode_row for interval, edge_tuples_out, edge_tuples_in in iterator: - edges_out = [Edge(*e) for e in edge_tuples_out] - edges_in = [Edge(*e) for e in edge_tuples_in] + edges_out = [Edge(*(e + (metadata_decoder,))) for e in edge_tuples_out] + edges_in = [Edge(*(e + (metadata_decoder,))) for e in edge_tuples_in] yield interval, edges_out, edges_in def sites(self): @@ -3567,7 +3698,12 @@ def individual(self, id_): """ flags, location, metadata, nodes = self._ll_tree_sequence.get_individual(id_) return Individual( - id_=id_, flags=flags, location=location, metadata=metadata, nodes=nodes + id_=id_, + flags=flags, + location=location, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.individual.decode_row, + nodes=nodes, ) def node(self, id_): @@ -3590,7 +3726,8 @@ def node(self, id_): time=time, population=population, individual=individual, - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.node.decode_row, ) def edge(self, id_): @@ -3607,7 +3744,8 @@ def edge(self, id_): right=right, parent=parent, child=child, - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.edge.decode_row, ) def migration(self, id_): @@ -3634,7 +3772,8 @@ def migration(self, id_): source=source, dest=dest, time=time, - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.migration.decode_row, ) def mutation(self, id_): @@ -3657,7 +3796,8 @@ def mutation(self, id_): node=node, derived_state=derived_state, parent=parent, - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.mutation.decode_row, ) def site(self, id_): @@ -3675,7 +3815,8 @@ def site(self, id_): position=pos, ancestral_state=ancestral_state, mutations=mutations, - metadata=metadata, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.site.decode_row, ) def population(self, id_): @@ -3686,7 +3827,11 @@ def population(self, id_): :rtype: :class:`Population` """ (metadata,) = 
self._ll_tree_sequence.get_population(id_) - return Population(id_=id_, metadata=metadata) + return Population( + id_=id_, + encoded_metadata=metadata, + metadata_decoder=self.table_metadata_schemas.population.decode_row, + ) def provenance(self, id_): timestamp, record = self._ll_tree_sequence.get_provenance(id_)
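At the tree-sequence level the per-table schemas are exposed through ``table_metadata_schemas``, and the object accessors above pass the matching ``decode_row`` into each returned object, so metadata comes back decoded. An illustrative end-to-end sketch, assuming the low-level plumbing added in this diff:

.. code-block:: python

    import tskit
    from tskit.metadata import MetadataSchema

    tables = tskit.TableCollection(sequence_length=1)
    tables.populations.metadata_schema = MetadataSchema(
        {"codec": "json", "type": "object"}
    )
    tables.populations.add_row(metadata={"name": "A"})
    ts = tables.tree_sequence()

    # The per-table schema round-trips through the tree sequence...
    print(ts.table_metadata_schemas.population)
    # ...and Population.metadata is decoded lazily via that schema.
    assert ts.population(0).metadata == {"name": "A"}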