diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 2b0b042c88..06c36ed45e 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -6575,6 +6575,14 @@ test_empty_tree_kc(void) ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH); tsk_treeseq_free(&ts); + tables.sequence_length = NAN; + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH); + tsk_treeseq_free(&ts); + tables.sequence_length = INFINITY; + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH); + tsk_treeseq_free(&ts); tables.sequence_length = 1.0; ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 4a256df27f..8b92be497c 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -11078,7 +11078,7 @@ tsk_table_collection_check_integrity( | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; } - if (self->sequence_length <= 0) { + if (!tsk_isfinite(self->sequence_length) || self->sequence_length <= 0) { ret = tsk_trace_error(TSK_ERR_BAD_SEQUENCE_LENGTH); goto out; } diff --git a/docs/data-model.md b/docs/data-model.md index 152da15100..01da842dd8 100644 --- a/docs/data-model.md +++ b/docs/data-model.md @@ -356,14 +356,10 @@ required for a valid set of populations. #### Provenance Table -:::{todo} -Document the provenance table. -::: - | Column | Type | Description | | :-------- | ----- | ----------------------------------------------------------------------: | | timestamp | char | Timestamp in [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. | -| record | char | Provenance record. | +| record | char | Provenance record as JSON. | (sec_metadata_definition)= @@ -374,10 +370,16 @@ Each table (excluding provenance) has a metadata column for storing and passing information that tskit does not use or interpret. See {ref}`sec_metadata` for details. The metadata columns are {ref}`binary columns `. -When using the {ref}`sec_text_file_format`, to ensure that metadata can be safely -interchanged, each row is [base 64 encoded](https://en.wikipedia.org/wiki/Base64). -Thus, binary information can be safely printed and exchanged, but may not be -human readable. +When using the {ref}`sec_text_file_format`, metadata values are written as opaque +text. By default, :meth:`TreeSequence.dump_text` will base64-encode metadata values +that are stored as raw bytes (when ``base64_metadata=True``) so that binary data can +be safely printed and exchanged; in this case :func:`tskit.load_text` will base64-decode +the corresponding text fields back to bytes. When metadata has already been decoded +to a structured Python object (for example via a metadata schema), the textual +representation written by :meth:`TreeSequence.dump_text` is the ``repr`` of that +object, and :func:`tskit.load_text` does not attempt to reconstruct the original +structured value from this representation. For reliable metadata round-tripping, +prefer the native binary tree sequence file format over the text formats. The tree sequence itself also has metadata stored as a byte array. @@ -399,6 +401,10 @@ error message. Some more complex requirements may not be detectable at load-time and errors may not occur until certain operations are attempted. These are documented below. +At the tree-sequence level, we require that the coordinate space has a finite, +strictly positive length; that is, the `sequence_length` attribute must be a +finite value greater than zero. + The Python API also provides tools that can transform a collection of tables into a valid collection of tables, so long as they are logically consistent, see {ref}`sec_tables_api_creating_valid_tree_sequence`. @@ -410,7 +416,8 @@ consistent, see {ref}`sec_tables_api_creating_valid_tree_sequence`. Individuals are a basic type in a tree sequence and are not defined with respect to any other tables. Individuals can have a reference to their parent -individuals, if present these references must be valid or null (-1). +individuals, if present these references must be valid or null (-1). An +individual cannot list itself as its own parent. A valid tree sequence does not require individuals to be sorted in any particular order, and sorting a set of tables using {meth}`TableCollection.sort` @@ -424,6 +431,7 @@ using {meth}`TableCollection.sort_individuals`. Given a valid set of individuals and populations, the requirements for each node are: +- `time` must be a finite (non-NaN, non-infinite) value; - `population` must either be null (-1) or refer to a valid population ID; - `individual` must either be null (-1) or refer to a valid individual ID. @@ -443,7 +451,7 @@ has no effect on nodes. Given a valid set of nodes and a sequence length {math}`L`, the simple requirements for each edge are: -- We must have {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`; +- We must have finite coordinates with {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`; - `parent` and `child` must be valid node IDs; - `time[parent]` > `time[child]`; - edges must be unique (i.e., no duplicate edges are allowed). @@ -480,7 +488,7 @@ properties are fulfilled. Given a valid set of nodes and a sequence length {math}`L`, the simple requirements for a valid set of sites are: -- We must have {math}`0 \leq` `position` {math}`< L`; +- We must have a finite coordinate with {math}`0 \leq` `position` {math}`< L`; - `position` values must be unique. For simplicity and algorithmic efficiency, sites must also: @@ -546,19 +554,33 @@ will always fail. Use `tskit.is_unknown_time` to detect unknown values. #### Migration requirements -Given a valid set of nodes and edges, the requirements for a value set of +Given a valid set of nodes and edges, the requirements for a valid set of migrations are: -- `left` and `right` must lie within the tree sequence coordinate space (i.e., - from 0 to `sequence_length`). -- `time` must be strictly between the time of its `node` and the time of any - ancestral node from which that node inherits on the segment `[left, right)`. -- The `population` of any such ancestor matching `source`, if another - `migration` does not intervene. +- `left` and `right` must be finite values that lie within the tree sequence + coordinate space (i.e., from 0 to `sequence_length`), with {math}`0 \leq` + `left` {math}`<` `right` {math}`\leq L`; +- `node` must be a valid node ID; +- if population references are checked, `source` and `dest` must be valid + population IDs; +- `time` must be a finite value. -To enable efficient processing, migrations must also be: +To enable efficient processing, migrations must also be sorted by +nondecreasing `time` value. -- Sorted by nondecreasing `time` value. +Conceptually, a migration records that a segment of ancestry for the given +`node` moves between populations along the tree. In typical demographic +models we expect: + +- `time` to lie strictly between the time of the migrating `node` and the time + of any ancestral node from which that node inherits on the segment + `[left, right)`; +- the `population` of any such ancestor to match the `source` population, + until another `migration` intervenes. + +These conceptual relationships are not currently validated. It is +the responsibility of code that creates migrations to satisfy them where +required. Note in particular that there is no requirement that adjacent migration records should be "squashed". That is, we can have two records `m1` and `m2` @@ -582,8 +604,10 @@ There are no requirements on a population table. The `timestamp` column of a provenance table should be in [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. -The `record` should be valid JSON with structure defined in the Provenance -Schema section (TODO). +The `record` column stores a JSON document describing how and where the tree sequence +was produced. For tree sequences generated by tskit and related tools, this JSON is +expected to conform to the :ref:`provenance schema ` described +in {ref}`sec_provenance`. (sec_table_indexes)= @@ -1148,4 +1172,3 @@ you won't see those parts of the tree sequence that are unrelated to the samples If you need to get those, too, you could either work with the {meth}`TreeSequence.edge_diffs` directly, or iterate over all nodes (instead of over {meth}`Tree.nodes`). - diff --git a/docs/export.md b/docs/export.md index 5616dee055..6dddb73a7c 100644 --- a/docs/export.md +++ b/docs/export.md @@ -319,6 +319,18 @@ the individual IDs are 2 and 3. See the these default labels. ::: +If some individuals have no associated nodes, they are omitted from the +VCF output. By default, only nodes that are marked as samples contribute +to the VCF genotypes; to include non-sample nodes as well (e.g., internal +nodes that have been marked as individuals), set +``include_non_sample_nodes=True`` when calling :meth:`TreeSequence.write_vcf`. + +:::{note} +At present, :meth:`TreeSequence.write_vcf` only supports sites with up to +9 distinct alleles; attempting to write a site with more than 9 alleles +will result in a :class:`ValueError`. +::: + (sec_export_vcf_individual_names)= ### Individual names @@ -369,23 +381,63 @@ id and individual id, and two underscores will throw an error. ### Modifying coordinates -Tskit supports continuous genome coordinates, but VCF only supports discrete -genome positions. Thus, when we convert a tree sequence that has sites -at continuous positions we must discretise the coordinates in some way. - -The ``position_transform`` argument provides a way to flexibly translate -the genomic location of sites in tskit to the appropriate value in VCF. -There are two fundamental differences in the way that tskit and VCF define -genomic coordinates. The first is that tskit uses floating point values -to encode positions, whereas VCF uses integers. Thus, if the tree sequence -contains positions at non-integral locations there is an information loss -incurred by translating to VCF. By default, we round the site positions -to the nearest integer, such that there may be several sites with the -same integer position in the output. The second difference between VCF -and tskit is that VCF is defined to be a 1-based coordinate system, whereas -tskit uses 0-based. However, how coordinates are transformed depends -on the VCF parser, and so we do **not** account for this change in -coordinate system by default. +Tree sequence site positions can be floating point values, whereas VCF +requires positive integers. The ``position_transform`` argument +controls how tskit maps coordinates into VCF. Translating non-integer +positions necessarily loses precision; by default we round to the nearest +integer, so multiple sites may share the same output position. +Furthermore, tskit's coordinate system starts at zero, +whereas the VCF standard requires positions to be positive, +and so a site at position 0 is not valid in the VCF standard. +Because VCF parsers differ, we do **not** do anything to account for this. + +The simplest resolution of this discrepancy in convention between tskit and VCF +positions is deal with any site at position 0 as a special case (for instance, +by discarding or ignoring it). +A different interpretation of this difference between tskit's position +and VCF's POS field +is that they are different coordinate systems: tskit coordinates are +"distance to the right of the left end of the chromosome", +while VCF coordinates are "which number site, counting from the left end +of the chromosome and starting at one". +Under this interpretation, the solution is to supply an explicit +``position_transform`` that adds 1 to the coordinate when outputting +to VCF (or, using the ``"legacy"`` option described below). However, this can +easily lead to off-by-one errors converting between the coordinate systems, +so should only be used if you really are using 0-based coordinates for your +tree sequence. + +:::{warning} +Most VCF tools cannot deal with a POS value of 0. If your tree +sequence contains a site with position 0, this will likely cause an error. +::: + +Internally, the coordinates used in the VCF output are obtained by applying +the ``position_transform`` function to the array of site positions (and, for +the contig length, to the tree sequence :attr:`.TreeSequence.sequence_length`). +This function must return a one-dimensional array of the same length as its +input; otherwise a :class:`ValueError` is raised. In addition to accepting a +callable, tskit also supports the string value ``"legacy"`` here, which +selects the pre-0.2.0 behaviour used by the original VCF exporter: +positions are rounded to the nearest integer, starting at 1, and are forced +to be strictly increasing by incrementing ties. + +The VCF specification does not allow positions to be 0. By default, if any +transformed position is 0, :meth:`TreeSequence.write_vcf` will raise a +:class:`ValueError`. If you wish to retain these records you can either: + +- set ``allow_position_zero=True`` to write such sites anyway; +- mask the offending sites using the ``site_mask`` argument; or +- choose a ``position_transform`` that maps 0 to a valid positive position. + +For example, to shift all coordinates by 1, we could define: + +```{code-cell} +def one_based_positions(positions): + return [int(round(x)) + 1 for x in positions] + +ts.write_vcf(sys.stdout, position_transform=one_based_positions) +``` :::{note} The msprime 0.x legacy API simulates using continuous coordinates. It may @@ -393,7 +445,3 @@ be simpler to update your code to use the msprime 1.0 API (which uses discrete coordinates by default) than to work out how to transform coordinates in a way that is suitable for your application. ::: - -:::{todo} -Provide some examples of using position transform. -::: diff --git a/docs/file-formats.md b/docs/file-formats.md index a78295ee2e..20b09376a6 100644 --- a/docs/file-formats.md +++ b/docs/file-formats.md @@ -43,9 +43,13 @@ e.g. ``python -m kastore ls file.trees``. ### Legacy Versions -Tree sequence files written by older versions of tskit are not readable by -newer versions of tskit. For tskit releases<0.6.2, `tskit upgrade` -will convert older tree sequence files to the latest version. +Tree sequence files are versioned. This version of tskit can read +``.trees`` files produced by earlier releases that use the same *major* +file format version (see ``format/version`` in a kastore listing). +Files written using the pre-kastore HDF5 format (for example, by +msprime < 0.6.0 or tskit < 0.6.2) cannot be read directly. To convert +such legacy files, use the ``tskit upgrade`` command from an older +tskit version (< 0.6.2) to produce a modern ``.trees`` file. (sec_text_file_format)= @@ -234,8 +238,14 @@ ts.dump_text(sites=sys.stdout) The mutation text format must contain the columns `site`, `node` and `derived_state`. The `time`, `parent` and `metadata` columns may also be optionally present (but `parent` must be specified if -more than one mutation occurs at the same site). If `time` is absent -`UNKNOWN_TIME` will be used to fill the column. See the +more than one mutation occurs at the same site). If the `time` column +is absent, the mutation times in the resulting tree sequence are set to +{data}`tskit.UNKNOWN_TIME`, which is a numeric value that behaves like NaN. +Unknown mutation times written out by +{meth}`TreeSequence.dump_text` are represented in the text file by the +literal string ``\"unknown\"`` in the `time` column, and +{func}`tskit.load_text` treats this string as `UNKNOWN_TIME` on input. +See the {ref}`mutation table definitions ` for details on these columns. @@ -280,5 +290,4 @@ ts.dump_text(populations=sys.stdout) ``` The `metadata` contains base64-encoded data (in this case, the strings -`pop1` and `pop1`). - +`pop1` and `pop2`). diff --git a/docs/stats.md b/docs/stats.md index 69b04c7575..75c5d36705 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -35,11 +35,14 @@ Please see the {ref}`tutorial ` for examples of th statistics API in use. :::{warning} -{ref}`sec_data_model_missing_data` is not currently -handled correctly by site statistics defined here, as we always -assign missing data to be equal to the ancestral state. Later -versions will add this behaviour as an option and will account -for the presence of missing data by default. +Site statistics defined here currently treat :ref:`missing data` +in the same way as in earlier versions of tskit: when computing site-based +statistics, isolated samples without mutations directly above them are treated +as carrying the ancestral allele rather than as missing. Future versions of +tskit may expose options to treat missing data differently in statistics; for +now, if you need explicit control over how missing data is handled you should +use the low-level genotype/variant APIs (for example with +``isolated_as_missing=True``) together with your own summary logic. ::: (sec_stats_available)= diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index edecb44bde..316615cc12 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -217,6 +217,35 @@ def test_nodes(self): for a, b in zip(expected, n): assert a.encode("utf8") == b.metadata + @pytest.mark.parametrize( + "base64_metadata,metadata_text,expected", + [(True, "YWJj", b"abc"), (False, "plain", b"plain")], + ) + def test_edges_metadata(self, base64_metadata, metadata_text, expected): + edges = io.StringIO( + f"""\ + left right parent child metadata + 0.0 1.0 2 0,1 {metadata_text} + """ + ) + table = tskit.parse_edges( + edges, strict=False, encoding="utf8", base64_metadata=base64_metadata + ) + assert len(table) == 2 + for row in table: + assert row.metadata == expected + + def test_edges_without_metadata_column(self): + edges = io.StringIO( + """\ + left right parent child + 0.0 1.0 2 3 + """ + ) + table = tskit.parse_edges(edges, strict=False, encoding="utf8") + assert len(table) == 1 + assert table[0].metadata == b"" + def test_sites(self): sites = io.StringIO( """\ diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 356a650545..9fad08cbcd 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -2907,6 +2907,12 @@ def test_round_trip(self): b = tables.tree_sequence() assert a.tables == b.tables + @pytest.mark.parametrize("sequence_length", [np.inf, np.nan]) + def test_nonfinite_sequence_length(self, sequence_length): + tables = tskit.TableCollection(sequence_length=sequence_length) + with pytest.raises(tskit.LibraryError, match="TSK_ERR_BAD_SEQUENCE_LENGTH"): + tables.tree_sequence() + class TestMutationTimeErrors: def test_younger_than_node_below(self): diff --git a/python/tskit/trees.py b/python/tskit/trees.py index a0c7fa23bc..f26912f8ca 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -575,8 +575,8 @@ class Migration(util.Dataclass): """ id: int # noqa A003 """ - The integer ID of this mutation. Varies from 0 to - :attr:`TreeSequence.num_mutations` - 1. + The integer ID of this migration. Varies from 0 to + :attr:`TreeSequence.num_migrations` - 1. """ @@ -774,7 +774,7 @@ def root_threshold(self): calling the :meth:`TreeSequence.trees` iterator. :return: The root threshold. - :rtype: :class:`TreeSequence` + :rtype: int """ return self._ll_tree.get_root_threshold() @@ -885,7 +885,8 @@ def seek(self, position, skip=None): :param float position: The position along the sequence length to seek to. - :raises ValueError: If 0 < position or position >= + :raises ValueError: If ``position`` is less than 0 or ``position`` is greater + than or equal to :attr:`TreeSequence.sequence_length`. """ if position < 0 or position >= self.tree_sequence.sequence_length: @@ -922,7 +923,7 @@ def unrank(num_leaves, rank, *, span=1, branch_length=1) -> Tree: the interval :math:`[0, \\text{span})` and the :attr:`~Tree.tree_sequence` from which the tree is taken will have its :attr:`~tskit.TreeSequence.sequence_length` equal to ``span``. - :param: float branch_length: The minimum length of a branch in this tree. + :param float branch_length: The minimum length of a branch in this tree. :raises ValueError: If the given rank is out of bounds for trees with ``num_leaves`` leaves. """ @@ -3597,7 +3598,7 @@ def parse_nodes(source, strict=True, encoding="utf8", base64_metadata=True, tabl return table -def parse_edges(source, strict=True, table=None): +def parse_edges(source, strict=True, table=None, encoding="utf8", base64_metadata=True): """ Parse the specified file-like object containing a whitespace delimited description of a edge table and returns the corresponding :class:`EdgeTable` @@ -3613,6 +3614,9 @@ def parse_edges(source, strict=True, table=None): False, a relaxed whitespace splitting algorithm is used. :param EdgeTable table: If specified, write the edges into this table. If not, create a new :class:`EdgeTable` instance and return. + :param str encoding: Encoding used for text representation. + :param bool base64_metadata: If True, metadata is encoded using Base64 + encoding; otherwise, as plain text. """ sep = None if strict: @@ -3624,6 +3628,12 @@ def parse_edges(source, strict=True, table=None): right_index = header.index("right") parent_index = header.index("parent") children_index = header.index("child") + metadata_index = None + try: + metadata_index = header.index("metadata") + except ValueError: + pass + default_metadata = b"" for line in source: tokens = line.rstrip("\n").split(sep) if len(tokens) >= 4: @@ -3631,8 +3641,19 @@ def parse_edges(source, strict=True, table=None): right = float(tokens[right_index]) parent = int(tokens[parent_index]) children = tuple(map(int, tokens[children_index].split(","))) + metadata = default_metadata + if metadata_index is not None and metadata_index < len(tokens): + metadata = tokens[metadata_index].encode(encoding) + if base64_metadata: + metadata = base64.b64decode(metadata) for child in children: - table.add_row(left=left, right=right, parent=parent, child=child) + table.add_row( + left=left, + right=right, + parent=parent, + child=child, + metadata=metadata, + ) return table @@ -11012,12 +11033,12 @@ def map_to_vcf_model( return an integer numpy array the same dimension as the input. By default, this is set to ``numpy.round()`` which will round values to the nearest integer. - If neither `name_metadata_key` nor `individual_names` is not specified, the - individual names are set to "tsk_{individual_id}" for each individual. If + If neither `name_metadata_key` nor `individual_names` is specified, the + individual names are set to ``"tsk_{individual_id}"`` for each individual. If no individuals are present, the individual names are set to "tsk_{i}" with `0 <= i < num_sample_nodes/ploidy`. - A Warning are emmitted if any sample nodes do not have an individual ID. + A warning is emitted if any sample nodes do not have an individual ID. :param list individuals: Specific individual IDs to include in the VCF. If not specified and the tree sequence contains individuals, all individuals are