Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/data-model.rst
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,8 @@ interchanged, each row is `base 64 encoded <https://en.wikipedia.org/wiki/Base64
Thus, binary information can be safely printed and exchanged, but may not be
human readable.

The tree sequence itself also has metadata stored as a byte array.

.. _sec_valid_tree_sequence_requirements:

Valid tree sequence requirements
Expand Down
12 changes: 9 additions & 3 deletions docs/metadata.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
Metadata
========

Every entity (nodes, mutations, edges, etc.) in a tskit tree sequence can have
The tree sequence and every entity within it (nodes, mutations, edges, etc.) can have
metadata associated with it. This is intended for storing and passing on information
that tskit itself does not use or interpret. For example information derived from a VCF
INFO field, or administrative information (such as unique identifiers) relating to
Expand Down Expand Up @@ -170,11 +170,17 @@ attribute (e.g. :attr:`tskit.IndividualTable.metadata_schema`). The schemas
for all tables can be retrieved from a :class:`tskit.TreeSequence` by the
:attr:`tskit.TreeSequence.table_metadata_schemas` attribute.

The top-level tree sequence metadata schema is set via
:attr:`tskit.TableCollection.metadata_schema` and can be accessed via
:attr:`tskit.TreeSequence.metadata_schema`.

Each table's ``add_row`` method (e.g. :meth:`tskit.IndividualTable.add_row`) will
validate and encode the metadata using the schema.
validate and encode the metadata using the schema. This encoding will also happen when
tree sequence metadata is set (e.g. ``table_collection.metadata = {...}``).

Metadata will be lazily decoded if accessed via
``tables.individuals[0].metadata`` or ``tree_sequence.individual(0).metadata``.
``tables.individuals[0].metadata``, ``tree_sequence.individual(0).metadata`` or
``tree_sequence.metadata``.

In the interests of efficiency the bulk methods of ``set_columns``
(e.g. :meth:`tskit.IndividualTable.set_columns`)
Expand Down
3 changes: 3 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ In development
on calls to ``table[j]`` and e.g. ``tree_sequence.node(j)``. See :ref:`sec_metadata`.
(:user:`benjeffery`, :pr:`491`, :pr:`542`, :pr:`543`, :pr:`601`)

- The tree sequence now has top-level metadata with a schema.
(:user:`benjeffery`, :pr:`666`, :pr:`644`, :pr:`642`)

- Add classes to SVG drawings to allow easy adjustment and styling, and document the new
``tskit.Tree.draw_svg()`` and ``tskit.TreeSequence.draw_svg()`` methods. This also fixes
:issue:`467` for duplicate SVG entity ``id`` s in Jupyter notebooks.
Expand Down
34 changes: 32 additions & 2 deletions python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1368,7 +1368,37 @@ class TestTreeSequenceMetadata(unittest.TestCase):
},
)

def test_metadata_schemas(self):
def test_tree_sequence_metadata_schema(self):
    """
    The schema set on a TableCollection is reported by the TreeSequence
    built from it, and cannot be deleted or reassigned there.
    """
    collection = tskit.TableCollection(1)
    # Before any schema is set the tree sequence reports the null schema.
    unset_ts = collection.tree_sequence()
    self.assertEqual(
        str(unset_ts.metadata_schema), str(tskit.MetadataSchema(None))
    )
    collection.metadata_schema = self.metadata_schema
    tree_seq = collection.tree_sequence()
    self.assertEqual(str(tree_seq.metadata_schema), str(self.metadata_schema))
    # The attribute is read-only on the tree sequence.
    with self.assertRaises(AttributeError):
        del tree_seq.metadata_schema
    with self.assertRaises(AttributeError):
        tree_seq.metadata_schema = tskit.MetadataSchema(None)

def test_tree_sequence_metadata(self):
    """
    Metadata set on a TableCollection is returned, decoded, by the
    TreeSequence built from it, and cannot be assigned or deleted there.
    """
    collection = tskit.TableCollection(1)
    # With nothing set the metadata is empty bytes.
    unset_ts = collection.tree_sequence()
    self.assertEqual(unset_ts.metadata, b"")
    collection.metadata_schema = self.metadata_schema
    expected = {
        "table": "tree-sequence",
        "string_prop": "stringy",
        "num_prop": 42,
    }
    collection.metadata = expected
    tree_seq = collection.tree_sequence()
    self.assertEqual(tree_seq.metadata, expected)
    # The attribute is read-only on the tree sequence.
    with self.assertRaises(AttributeError):
        tree_seq.metadata = {"should": "fail"}
    with self.assertRaises(AttributeError):
        del tree_seq.metadata

def test_table_metadata_schemas(self):
ts = msprime.simulate(5)
for table in self.metadata_tables:
tables = ts.dump_tables()
Expand Down Expand Up @@ -1405,7 +1435,7 @@ def test_metadata_schemas(self):
tskit.MetadataSchema({"codec": "json"}),
)

def test_metadata_round_trip_via_row_getters(self):
def test_table_metadata_round_trip_via_row_getters(self):
# A tree sequence with all entities
pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)]
migration_matrix = [[0, 1], [1, 0]]
Expand Down
97 changes: 97 additions & 0 deletions python/tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,103 @@ def test_sequence_length_longer_than_edges(self):
self.assertEqual(len(tree.parent_dict), 0)


class TestTableCollectionMetadata(unittest.TestCase):
    """
    Tests for the top-level metadata and metadata schema of a TableCollection.
    """

    # JSON-codec schema requiring exactly the four keys produced by
    # ``metadata_example_data``; additional keys are rejected.
    metadata_schema = metadata.MetadataSchema(
        {
            "codec": "json",
            "title": "Example Metadata",
            "type": "object",
            "properties": {
                "one": {"type": "string"},
                "two": {"type": "number"},
                "three": {"type": "array"},
                "four": {"type": "boolean"},
            },
            "required": ["one", "two", "three", "four"],
            "additionalProperties": False,
        },
    )

    def metadata_example_data(self, val=0):
        """
        Return an example metadata dict with the four keys required by
        ``metadata_schema``; ``val`` varies the "two" and "three" entries.
        """
        example = {"one": "val one", "two": val}
        example["three"] = list(range(val, val + 10))
        example["four"] = True
        return example

    def test_set_metadata_schema(self):
        """
        The schema can be set, overwritten, removed and set again, but
        not deleted or assigned ``None``.
        """
        null_schema_str = str(metadata.MetadataSchema(None))
        json_only_schema = metadata.MetadataSchema({"codec": "json"})
        collection = tskit.TableCollection(1)
        # Default is no-op metadata codec
        self.assertEqual(str(collection.metadata_schema), null_schema_str)
        # Set, then overwrite
        for schema in (self.metadata_schema, json_only_schema):
            collection.metadata_schema = schema
            self.assertEqual(str(collection.metadata_schema), str(schema))
        # Assigning the empty string removes the schema
        collection.metadata_schema = ""
        self.assertEqual(str(collection.metadata_schema), null_schema_str)
        # Set after remove
        collection.metadata_schema = self.metadata_schema
        self.assertEqual(
            str(collection.metadata_schema), str(self.metadata_schema)
        )
        # Del should fail
        with self.assertRaises(AttributeError):
            del collection.metadata_schema
        # None should fail
        with self.assertRaises(ValueError):
            collection.metadata_schema = None

    def test_set_metadata(self):
        """
        Metadata can be set and overwritten under a schema, but not
        deleted or assigned ``None``.
        """
        collection = tskit.TableCollection(1)
        # Default is empty bytes
        self.assertEqual(collection.metadata, b"")

        collection.metadata_schema = self.metadata_schema
        first = self.metadata_example_data()
        second = self.metadata_example_data(val=2)
        # Set, then overwrite
        for payload in (first, second):
            collection.metadata = payload
            self.assertEqual(collection.metadata, payload)
        # Del should fail
        with self.assertRaises(AttributeError):
            del collection.metadata
        # None should fail
        with self.assertRaises(exceptions.MetadataValidationError):
            collection.metadata = None

    def test_default_metadata_schema(self):
        """
        With no schema set, bytes are accepted and a dict raises TypeError.
        """
        raw = b"acceptable bytes"
        collection = tskit.TableCollection(1)
        # Default should allow bytes
        collection.metadata = raw
        self.assertEqual(collection.metadata, raw)
        # Adding non-bytes metadata should error
        with self.assertRaises(TypeError):
            collection.metadata = self.metadata_example_data()

    def test_round_trip_metadata(self):
        """
        Metadata assigned under the schema is read back as an equal dict.
        """
        collection = tskit.TableCollection(1)
        collection.metadata_schema = self.metadata_schema
        payload = self.metadata_example_data()
        collection.metadata = payload
        self.assertDictEqual(collection.metadata, payload)

    def test_bad_metadata(self):
        """
        Metadata with a key the schema does not allow is rejected, and the
        low-level metadata is still empty afterwards.
        """
        invalid = self.metadata_example_data()
        invalid["I really shouldn't be here"] = 6
        collection = tskit.TableCollection(1)
        collection.metadata_schema = self.metadata_schema
        with self.assertRaises(exceptions.MetadataValidationError):
            collection.metadata = invalid
        self.assertEqual(collection.ll_tables.metadata, b"")


class TestTableCollectionPickle(unittest.TestCase):
"""
Tests that we can round-trip table collections through pickle.
Expand Down
24 changes: 24 additions & 0 deletions python/tskit/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -1884,6 +1884,30 @@ def sequence_length(self, sequence_length):
def file_uuid(self):
    # Delegates directly to the low-level table collection.
    low_level = self.ll_tables
    return low_level.file_uuid

@property
def metadata_schema(self) -> metadata.MetadataSchema:
    """
    The :class:`tskit.MetadataSchema` for this TableCollection, parsed
    from the schema held by the low-level tables.
    """
    parse = metadata.parse_metadata_schema
    return parse(self.ll_tables.metadata_schema)

@metadata_schema.setter
def metadata_schema(self, schema: metadata.MetadataSchema) -> None:
    encoded_schema = str(schema)
    # Round-trip the string form through the parser so that an invalid
    # schema is rejected before anything is stored.
    metadata.parse_metadata_schema(encoded_schema)
    self.ll_tables.metadata_schema = encoded_schema

@property
def metadata(self) -> Any:
    """
    The metadata for this TableCollection, decoded using its
    ``metadata_schema``.
    """
    decode = self.metadata_schema.decode_row
    return decode(self.ll_tables.metadata)

@metadata.setter
def metadata(self, metadata: Any) -> None:
    # Validate and encode with the current schema, then store the result
    # on the low-level tables.
    encode = self.metadata_schema.validate_and_encode_row
    self.ll_tables.metadata = encode(metadata)

def asdict(self):
"""
Returns a dictionary representation of this TableCollection.
Expand Down
36 changes: 27 additions & 9 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
import tskit.drawing as drawing
import tskit.exceptions as exceptions
import tskit.formats as formats
import tskit.metadata as metadata
import tskit.metadata as metadata_module
import tskit.provenance as provenance
import tskit.tables as tables
import tskit.util as util
Expand Down Expand Up @@ -2829,19 +2829,21 @@ class _TableMetadataSchemas:
Convenience class for returning schemas
"""

node: metadata.MetadataSchema
edge: metadata.MetadataSchema
site: metadata.MetadataSchema
mutation: metadata.MetadataSchema
migration: metadata.MetadataSchema
individual: metadata.MetadataSchema
population: metadata.MetadataSchema
node: metadata_module.MetadataSchema
edge: metadata_module.MetadataSchema
site: metadata_module.MetadataSchema
mutation: metadata_module.MetadataSchema
migration: metadata_module.MetadataSchema
individual: metadata_module.MetadataSchema
population: metadata_module.MetadataSchema

def __init__(self, ll_tree_sequence):
self._ll_tree_sequence = ll_tree_sequence
metadata_schema_strings = self._ll_tree_sequence.get_table_metadata_schemas()
metadata_schema_instances = {
name: metadata.parse_metadata_schema(getattr(metadata_schema_strings, name))
name: metadata_module.parse_metadata_schema(
getattr(metadata_schema_strings, name)
)
for name in vars(self._TableMetadataSchemas)
if not name.startswith("_")
}
Expand Down Expand Up @@ -3155,6 +3157,22 @@ def sequence_length(self):
def get_sequence_length(self):
    # Delegates directly to the low-level tree sequence.
    low_level = self._ll_tree_sequence
    return low_level.get_sequence_length()

@property
def metadata(self) -> Any:
    """
    The metadata for this TreeSequence, decoded using its
    ``metadata_schema``.
    """
    decode = self.metadata_schema.decode_row
    return decode(self._ll_tree_sequence.get_metadata())

@property
def metadata_schema(self) -> metadata_module.MetadataSchema:
    """
    The :class:`tskit.MetadataSchema` for this TreeSequence, parsed from
    the schema reported by the low-level tree sequence.
    """
    parse = metadata_module.parse_metadata_schema
    return parse(self._ll_tree_sequence.get_metadata_schema())

@property
def num_edges(self):
"""
Expand Down