From 82afcde2f4c730fdafdb3e35684f526dc53ea09a Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 17 May 2022 20:02:58 +0100 Subject: [PATCH 1/3] Add mutation.edge for tsk_treeseq accessors. Closes #685 --- c/tests/test_trees.c | 98 ++++++++++++++++++++++++++++++++++++++++++++ c/tskit/tables.h | 3 ++ c/tskit/trees.c | 50 +++++++++++++++++----- 3 files changed, 141 insertions(+), 10 deletions(-) diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index e209c06553..eb1fa6ab10 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -3437,6 +3437,63 @@ test_simplest_chained_map_mutations(void) tsk_treeseq_free(&ts); } +static void +test_simplest_mutation_edges(void) +{ + const char *nodes = "1 0 0\n" + "0 1 0\n" + "0 1 0"; + const char *edges = "0 1 1 0\n" + "1 2 2 0\n"; + const char *sites = "0.5 0\n" + "1.5 0\n"; + const char *mutations = "0 1 1\n" + "0 0 1\n" + "0 2 1\n" + "1 0 1\n" + "1 1 1\n" + "1 2 1\n"; + tsk_treeseq_t ts; + tsk_tree_t tree; + /* We have mutations over roots, samples and just isolated nodes */ + tsk_id_t mutation_edges[] = { -1, 0, -1, 1, -1, -1 }; + tsk_size_t i, j, k, t; + tsk_mutation_t mut; + tsk_site_t site; + int ret; + + tsk_treeseq_from_text(&ts, 2, nodes, edges, NULL, sites, mutations, NULL, NULL, 0); + + CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 2); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); + + for (j = 0; j < tsk_treeseq_get_num_mutations(&ts); j++) { + ret = tsk_treeseq_get_mutation(&ts, (tsk_id_t) j, &mut); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(mut.edge, mutation_edges[j]); + } + + ret = tsk_tree_init(&tree, &ts, 0); + CU_ASSERT_EQUAL(ret, 0); + i = 0; + for (t = 0; t < 2; t++) { + ret = tsk_tree_next(&tree); + CU_ASSERT_EQUAL(ret, TSK_TREE_OK); + for (j = 0; j < tree.sites_length; j++) { + site = tree.sites[j]; + for (k = 0; k < site.mutations_length; k++) { + CU_ASSERT_EQUAL(site.mutations[k].edge, mutation_edges[i]); + i++; + } + } + } + CU_ASSERT_EQUAL(i, 6); + + tsk_tree_free(&tree); + tsk_treeseq_free(&ts); +} + /*======================================================= * Single tree tests. *======================================================*/ @@ -4465,6 +4522,44 @@ test_single_tree_compute_mutation_times(void) tsk_table_collection_free(&tables); } +static void +test_single_tree_mutation_edges(void) +{ + int ret = 0; + tsk_size_t i, j, k; + tsk_treeseq_t ts; + tsk_tree_t tree; + tsk_mutation_t mut; + tsk_site_t site; + tsk_id_t mutation_edges[] = { 2, 4, 0, 0, 1, 2, 3 }; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); + + for (j = 0; j < 7; j++) { + ret = tsk_treeseq_get_mutation(&ts, (tsk_id_t) j, &mut); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(mut.edge, mutation_edges[j]); + } + + ret = tsk_tree_init(&tree, &ts, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_tree_first(&tree); + CU_ASSERT_EQUAL(ret, TSK_TREE_OK); + i = 0; + for (j = 0; j < tree.sites_length; j++) { + site = tree.sites[j]; + for (k = 0; k < site.mutations_length; k++) { + CU_ASSERT_EQUAL(site.mutations[k].edge, mutation_edges[i]); + i++; + } + } + CU_ASSERT_EQUAL(i, 7); + + tsk_tree_free(&tree); + tsk_treeseq_free(&ts); +} + static void test_single_tree_is_descendant(void) { @@ -7206,6 +7301,7 @@ main(int argc, char **argv) { "test_simplest_multiple_root_map_mutations", test_simplest_multiple_root_map_mutations }, { "test_simplest_chained_map_mutations", test_simplest_chained_map_mutations }, + { "test_simplest_mutation_edges", test_simplest_mutation_edges }, /* Single tree tests */ { "test_single_tree_good_records", test_single_tree_good_records }, @@ -7232,6 +7328,7 @@ main(int argc, char **argv) test_single_tree_compute_mutation_parents }, { "test_single_tree_compute_mutation_times", test_single_tree_compute_mutation_times }, + { "test_single_tree_mutation_edges", test_single_tree_mutation_edges }, { "test_single_tree_is_descendant", test_single_tree_is_descendant }, { "test_single_tree_total_branch_length", test_single_tree_total_branch_length }, { "test_single_tree_map_mutations", test_single_tree_map_mutations }, @@ -7250,6 +7347,7 @@ main(int argc, char **argv) test_simplify_keep_input_roots_multi_tree }, { "test_left_to_right_multi_tree", test_left_to_right_multi_tree }, { "test_gappy_multi_tree", test_gappy_multi_tree }, + { "test_tsk_treeseq_bad_records", test_tsk_treeseq_bad_records }, /* multiroot tests */ diff --git a/c/tskit/tables.h b/c/tskit/tables.h index e625f471dc..d484220843 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -155,6 +155,9 @@ typedef struct { const char *metadata; /** @brief Size of the metadata in bytes. */ tsk_size_t metadata_length; + /** @brief The ID of the edge that this mutation lies on, or TSK_NULL + if there is no corresponding edge.*/ + tsk_id_t edge; } tsk_mutation_t; /** diff --git a/c/tskit/trees.c b/c/tskit/trees.c index d951b116f1..745c60dcb2 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -135,6 +135,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self) const tsk_id_t *restrict mutation_site = self->tables->mutations.site; const double *restrict site_position = self->tables->sites.position; bool discrete_sites = true; + tsk_mutation_t *mutation; self->site_mutations_mem = tsk_malloc(num_mutations * sizeof(*self->site_mutations_mem)); @@ -149,10 +150,12 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self) } for (k = 0; k < (tsk_id_t) num_mutations; k++) { - ret = tsk_treeseq_get_mutation(self, k, self->site_mutations_mem + k); + mutation = self->site_mutations_mem + k; + ret = tsk_treeseq_get_mutation(self, k, mutation); if (ret != 0) { goto out; } + mutation->edge = TSK_NULL; } k = 0; for (j = 0; j < (tsk_id_t) num_sites; j++) { @@ -242,45 +245,57 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) { int ret = TSK_ERR_GENERIC; tsk_size_t j, k, tree_index; - tsk_id_t site; + tsk_id_t site_id, edge_id, mutation_id; double tree_left, tree_right; const double sequence_length = self->tables->sequence_length; const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows; + const tsk_id_t num_mutations = (tsk_id_t) self->tables->mutations.num_rows; const tsk_size_t num_edges = self->tables->edges.num_rows; + const tsk_size_t num_nodes = self->tables->nodes.num_rows; const double *restrict site_position = self->tables->sites.position; + const tsk_id_t *restrict mutation_site = self->tables->mutations.site; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_right = self->tables->edges.right; const double *restrict edge_left = self->tables->edges.left; + const tsk_id_t *restrict edge_child = self->tables->edges.child; tsk_size_t num_trees_alloc = self->num_trees + 1; bool discrete_breakpoints = true; + tsk_id_t *node_edge_map = tsk_malloc(num_nodes * sizeof(*node_edge_map)); + tsk_mutation_t *mutation; self->tree_sites_length = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length)); self->tree_sites = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites)); self->breakpoints = tsk_malloc(num_trees_alloc * sizeof(*self->breakpoints)); - if (self->tree_sites == NULL || self->tree_sites_length == NULL - || self->breakpoints == NULL) { + if (node_edge_map == NULL || self->tree_sites == NULL + || self->tree_sites_length == NULL || self->breakpoints == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } tsk_memset( self->tree_sites_length, 0, self->num_trees * sizeof(*self->tree_sites_length)); tsk_memset(self->tree_sites, 0, self->num_trees * sizeof(*self->tree_sites)); + tsk_memset(node_edge_map, TSK_NULL, num_nodes * sizeof(*node_edge_map)); tree_left = 0; tree_right = sequence_length; tree_index = 0; - site = 0; + site_id = 0; + mutation_id = 0; j = 0; k = 0; while (j < num_edges || tree_left < sequence_length) { discrete_breakpoints = discrete_breakpoints && is_discrete(tree_left); self->breakpoints[tree_index] = tree_left; while (k < num_edges && edge_right[O[k]] == tree_left) { + edge_id = O[k]; + node_edge_map[edge_child[edge_id]] = TSK_NULL; k++; } while (j < num_edges && edge_left[I[j]] == tree_left) { + edge_id = I[j]; + node_edge_map[edge_child[edge_id]] = edge_id; j++; } tree_right = sequence_length; @@ -290,21 +305,28 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) if (k < num_edges) { tree_right = TSK_MIN(tree_right, edge_right[O[k]]); } - self->tree_sites[tree_index] = self->tree_sites_mem + site; - while (site < num_sites && site_position[site] < tree_right) { + self->tree_sites[tree_index] = self->tree_sites_mem + site_id; + while (site_id < num_sites && site_position[site_id] < tree_right) { self->tree_sites_length[tree_index]++; - site++; + while ( + mutation_id < num_mutations && mutation_site[mutation_id] == site_id) { + mutation = self->site_mutations_mem + mutation_id; + mutation->edge = node_edge_map[mutation->node]; + mutation_id++; + } + site_id++; } tree_left = tree_right; tree_index++; } - tsk_bug_assert(site == num_sites); + tsk_bug_assert(site_id == num_sites); tsk_bug_assert(tree_index == self->num_trees); self->breakpoints[tree_index] = tree_right; discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right); self->discrete_genome = self->discrete_genome && discrete_breakpoints; ret = 0; out: + tsk_safe_free(node_edge_map); return ret; } @@ -3144,7 +3166,15 @@ int TSK_WARN_UNUSED tsk_treeseq_get_mutation( const tsk_treeseq_t *self, tsk_id_t index, tsk_mutation_t *mutation) { - return tsk_mutation_table_get_row(&self->tables->mutations, index, mutation); + int ret = 0; + + ret = tsk_mutation_table_get_row(&self->tables->mutations, index, mutation); + if (ret != 0) { + goto out; + } + mutation->edge = self->site_mutations_mem[index].edge; +out: + return ret; } int TSK_WARN_UNUSED From ba9e1356fbf28cc0e58927041e1deda67f213628 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 17 May 2022 21:19:43 +0100 Subject: [PATCH 2/3] Clarify semantics of extra fields in site/mutation getters Add tests for getter error conditions also --- c/CHANGELOG.rst | 5 ++++- c/tests/test_trees.c | 35 +++++++++++++++++++++++++++++++++++ c/tskit/tables.c | 1 + c/tskit/tables.h | 14 +++++++++++++- c/tskit/trees.c | 1 - 5 files changed, 53 insertions(+), 3 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 9b221388b2..4a5ea3785f 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -9,7 +9,7 @@ - Change the type of genotypes to ``int32_t``, removing the TSK_16_BIT_GENOTYPES flag option. (:user:`benjeffery`, :issue:`463`, :pr:`2108`) -- ``tsk_variant_t`` now includes its ``tsk_site_t`` rather than pointing to it. +- ``tsk_variant_t`` now includes its ``tsk_site_t`` rather than pointing to it. (:user:`benjeffery`, :issue:`2161`, :pr:`2162`) - Rename TSK_TAKE_TABLES to TSK_TAKE_OWNERSHIP. @@ -37,6 +37,9 @@ - Make dumping of tables and tree sequences to disk a zero-copy operation. (:user:`benjeffery`, :issue:`2111`, :pr:`2124`) +- Add ``edge`` attribute to ``mutation_t`` struct make available in tree sequence. + (:user:`jeromekelleher`, :issue:`685`, :pr:`2279`) + ---------------------- [0.99.15] - 2021-12-07 ---------------------- diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index eb1fa6ab10..5815c2bbcc 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -6652,6 +6652,40 @@ test_tree_errors(void) tsk_treeseq_free(&ts); } +static void +test_treeseq_row_access_errors(void) +{ + int ret; + tsk_table_collection_t tables; + tsk_treeseq_t ts; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + ret = tsk_treeseq_init(&ts, &tables, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_treeseq_get_individual(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_node(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_edge(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_migration(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_site(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_mutation(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_population(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); + ret = tsk_treeseq_get_provenance(&ts, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_PROVENANCE_OUT_OF_BOUNDS); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + static void test_tree_copy_flags(void) { @@ -7402,6 +7436,7 @@ main(int argc, char **argv) /* Misc */ { "test_tree_errors", test_tree_errors }, + { "test_treeseq_row_access_errors", test_treeseq_row_access_errors }, { "test_tree_copy_flags", test_tree_copy_flags }, { "test_genealogical_nearest_neighbours_errors", test_genealogical_nearest_neighbours_errors }, diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 5fb3c1fe38..d916fdee46 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4351,6 +4351,7 @@ tsk_mutation_table_get_row_unsafe( row->metadata_length = self->metadata_offset[index + 1] - self->metadata_offset[index]; row->metadata = self->metadata + self->metadata_offset[index]; + row->edge = TSK_NULL; } int diff --git a/c/tskit/tables.h b/c/tskit/tables.h index d484220843..c46af7f367 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -155,7 +155,7 @@ typedef struct { const char *metadata; /** @brief Size of the metadata in bytes. */ tsk_size_t metadata_length; - /** @brief The ID of the edge that this mutation lies on, or TSK_NULL + /** @brief The ID of the edge that this mutation lies on, or TSK_NULL if there is no corresponding edge.*/ tsk_id_t edge; } tsk_mutation_t; @@ -2397,6 +2397,12 @@ int tsk_site_table_copy( @rst Updates the specified site struct to reflect the values in the specified row. + +This function always sets the ``mutations`` and ``mutations_length`` +fields in the parameter :c:struct:`tsk_site_t` to ``NULL`` and ``0`` respectively. +To get access to the mutations for a particular site, please use the +tree sequence method, :c:func:`tsk_treeseq_get_site`. + Pointers to memory within this struct are handled by the table and should **not** be freed by client code. These pointers are guaranteed to be valid until the next operation that modifies the table (e.g., by adding a new row), but not afterwards. @@ -2731,6 +2737,12 @@ int tsk_mutation_table_copy( @rst Updates the specified mutation struct to reflect the values in the specified row. + +This function always sets the ``edge`` field in parameter +:c:struct:`tsk_mutation_t` to ``TSK_NULL``. To determine the ID of +the edge associated with a particular mutation, please use the +tree sequence method, :c:func:`tsk_treeseq_get_mutation`. + Pointers to memory within this struct are handled by the table and should **not** be freed by client code. These pointers are guaranteed to be valid until the next operation that modifies the table (e.g., by adding a new row), but not afterwards. diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 745c60dcb2..eff547d3b8 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -155,7 +155,6 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self) if (ret != 0) { goto out; } - mutation->edge = TSK_NULL; } k = 0; for (j = 0; j < (tsk_id_t) num_sites; j++) { From afd19b29314a304663d781979e75cde1502531f2 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 18 May 2022 14:58:21 +0100 Subject: [PATCH 3/3] Add Python interface for Mutation.edge --- python/CHANGELOG.rst | 4 +++ python/_tskitmodule.c | 24 +++++++++++-- python/tests/__init__.py | 13 +++++-- python/tests/test_lowlevel.py | 33 ++++++++--------- python/tests/test_topology.py | 68 +++++++++++++++++++++++++++++++++++ python/tskit/trees.py | 20 ++++++++++- 6 files changed, 140 insertions(+), 22 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 78e7ac2cbc..1178169b28 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -17,6 +17,10 @@ - ``tree.mrca`` now takes 2 or more arguments and gives the common ancestor of them all. (:user:`savitakartik`, :issue:`1340`, :pr:`2121`) +- Add a ``edge`` attribute to the ``Mutation`` class that gives the ID of the + edge that the mutation falls on. + (:user:`jeromekelleher`, :issue:`685`, :pr:`2279`). + **Breaking Changes** - The JSON metadata codec now interprets the empty string as an empty object. This means diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 2c9c737fba..feb39b75ab 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -322,7 +322,7 @@ make_metadata(const char *metadata, Py_ssize_t length) } static PyObject * -make_mutation(const tsk_mutation_t *mutation) +make_mutation_row(const tsk_mutation_t *mutation) { PyObject *ret = NULL; PyObject *metadata = NULL; @@ -339,6 +339,24 @@ make_mutation(const tsk_mutation_t *mutation) return ret; } +static PyObject * +make_mutation_object(const tsk_mutation_t *mutation) +{ + PyObject *ret = NULL; + PyObject *metadata = NULL; + + metadata = make_metadata(mutation->metadata, (Py_ssize_t) mutation->metadata_length); + if (metadata == NULL) { + goto out; + } + ret = Py_BuildValue("iis#iOdi", mutation->site, mutation->node, + mutation->derived_state, (Py_ssize_t) mutation->derived_state_length, + mutation->parent, metadata, mutation->time, mutation->edge); +out: + Py_XDECREF(metadata); + return ret; +} + static PyObject * make_mutation_id_list(const tsk_mutation_t *mutations, tsk_size_t length) { @@ -4016,7 +4034,7 @@ MutationTable_get_row(MutationTable *self, PyObject *args) handle_library_error(err); goto out; } - ret = make_mutation(&mutation); + ret = make_mutation_row(&mutation); out: return ret; } @@ -7984,7 +8002,7 @@ TreeSequence_get_mutation(TreeSequence *self, PyObject *args) handle_library_error(err); goto out; } - ret = make_mutation(&record); + ret = make_mutation_object(&record); out: return ret; } diff --git a/python/tests/__init__.py b/python/tests/__init__.py index f5559979f7..bb4151d978 100644 --- a/python/tests/__init__.py +++ b/python/tests/__init__.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2021 Tskit Developers +# Copyright (c) 2018-2022 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -138,7 +138,15 @@ def __init__(self, tree_sequence, breakpoints=None): ll_ts = self._tree_sequence._ll_tree_sequence def make_mutation(id_): - site, node, derived_state, parent, metadata, time = ll_ts.get_mutation(id_) + ( + site, + node, + derived_state, + parent, + metadata, + time, + edge, + ) = ll_ts.get_mutation(id_) return tskit.Mutation( id=id_, site=site, @@ -147,6 +155,7 @@ def make_mutation(id_): derived_state=derived_state, parent=parent, metadata=metadata, + edge=edge, metadata_decoder=tskit.metadata.parse_metadata_schema( ll_ts.get_table_metadata_schemas().mutation ).decode_row, diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 2a62a1ff67..02caa178d4 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1215,22 +1215,20 @@ def test_dump_equality(self, tmp_path): ts2.dump_tables(tc2) assert tc.equals(tc2) - def verify_mutations(self, ts): - mutations = [ts.get_mutation(j) for j in range(ts.get_num_mutations())] - assert ts.get_num_mutations() > 0 - assert len(mutations) == ts.get_num_mutations() - # Check the form of the mutations - for j, (position, nodes, index) in enumerate(mutations): - assert j == index - for node in nodes: + def test_get_mutation_interface(self): + for ts in self.get_example_tree_sequences(): + mutations = [ts.get_mutation(j) for j in range(ts.get_num_mutations())] + assert len(mutations) == ts.get_num_mutations() + # Check the form of the mutations + for packed in mutations: + site, node, derived_state, parent, metadata, time, edge = packed + assert isinstance(site, int) assert isinstance(node, int) - assert node >= 0 - assert node <= ts.get_num_nodes() - assert isinstance(position, float) - assert position > 0 - assert position < ts.get_sequence_length() - # mutations must be sorted by position order. - assert mutations == sorted(mutations) + assert isinstance(derived_state, str) + assert isinstance(parent, int) + assert isinstance(metadata, bytes) + assert isinstance(time, float) + assert isinstance(edge, int) def test_get_edge_interface(self): for ts in self.get_example_tree_sequences(): @@ -2718,12 +2716,13 @@ def test_sites(self): all_tree_sites.extend(tree_sites) for ( position, - _ancestral_state, + ancestral_state, mutations, index, metadata, ) in tree_sites: assert st.get_left() <= position < st.get_right() + assert isinstance(ancestral_state, str) assert index == j assert metadata == b"" for mut_id in mutations: @@ -2734,11 +2733,13 @@ def test_sites(self): parent, metadata, time, + edge, ) = ts.get_mutation(mut_id) assert site == index assert mutation_id == mut_id assert st.get_parent(node) != _tskit.NULL assert metadata == b"" + assert edge != _tskit.NULL mutation_id += 1 j += 1 assert all_tree_sites == all_sites diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index fc15494d68..af0f315215 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -6201,6 +6201,74 @@ def test_many_multiroot_trees_recurrent_mutations(self): self.verify_branch_mutations(ts, mutations_per_branch) +class TestMutationEdge: + def verify_mutation_edge(self, ts): + # print(ts.tables) + for mutation in ts.mutations(): + site = ts.site(mutation.site) + if mutation.edge == tskit.NULL: + edges = [ + edge + for edge in ts.edges() + if edge.left <= site.position < edge.right + and mutation.node == edge.child + ] + assert len(edges) == 0 + else: + edge = ts.edge(mutation.edge) + assert edge.left <= site.position < edge.right + assert edge.child == mutation.node + + for tree in ts.trees(): + for site in tree.sites(): + for mutation in site.mutations: + assert mutation.edge == ts.mutation(mutation.id).edge + if mutation.edge == tskit.NULL: + assert tree.parent(mutation.node) == tskit.NULL + + def verify_branch_mutations(self, ts, mutations_per_branch): + ts = tsutil.insert_branch_mutations(ts, mutations_per_branch) + assert ts.num_mutations > 1 + self.verify_mutation_edge(ts) + + def test_single_tree_one_mutation_per_branch(self): + ts = msprime.simulate(6, random_seed=10) + self.verify_branch_mutations(ts, 1) + + def test_single_tree_two_mutations_per_branch(self): + ts = msprime.simulate(10, random_seed=9) + self.verify_branch_mutations(ts, 2) + + def test_single_tree_three_mutations_per_branch(self): + ts = msprime.simulate(8, random_seed=9) + self.verify_branch_mutations(ts, 3) + + def test_single_multiroot_tree_recurrent_mutations(self): + ts = msprime.simulate(6, random_seed=10) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + for mutations_per_branch in [1, 2, 3]: + self.verify_branch_mutations(ts, mutations_per_branch) + + def test_many_multiroot_trees_recurrent_mutations(self): + ts = msprime.simulate(7, recombination_rate=1, random_seed=10) + assert ts.num_trees > 3 + ts = tsutil.decapitate(ts, ts.num_edges // 2) + for mutations_per_branch in [1, 2, 3]: + self.verify_branch_mutations(ts, mutations_per_branch) + + @pytest.mark.parametrize("n", range(2, 5)) + @pytest.mark.parametrize("mutations_per_branch", range(3)) + def test_balanced_binary_tree(self, n, mutations_per_branch): + ts = tskit.Tree.generate_balanced(4).tree_sequence + # These trees have a handy property + assert all(edge.id == edge.child for edge in ts.edges()) + for mutation in ts.mutations(): + assert mutation.edge == mutation.node + for site in ts.first().sites(): + for mutation in site.mutations: + assert mutation.edge == mutation.node + + class TestMutationTime: """ Tests that mutation time is correctly specified, and that we correctly diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 2282b8f541..11b83d6bdb 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -319,7 +319,16 @@ class Mutation(util.Dataclass): underlying tree sequence data. """ - __slots__ = ["id", "site", "node", "derived_state", "parent", "metadata", "time"] + __slots__ = [ + "id", + "site", + "node", + "derived_state", + "parent", + "metadata", + "time", + "edge", + ] id: int # noqa A003 """ The integer ID of this mutation. Varies from 0 to @@ -363,6 +372,10 @@ class Mutation(util.Dataclass): """ The occurrence time of this mutation. """ + edge: int + """ + The ID of the edge that this mutation is on. + """ # To get default values on slots we define a custom init def __init__( @@ -374,6 +387,7 @@ def __init__( derived_state=None, parent=NULL, metadata=b"", + edge=NULL, ): self.id = id self.site = site @@ -382,6 +396,7 @@ def __init__( self.derived_state = derived_state self.parent = parent self.metadata = metadata + self.edge = edge # We need a custom eq to compare unknown times. def __eq__(self, other): @@ -392,6 +407,7 @@ def __eq__(self, other): and self.node == other.node and self.derived_state == other.derived_state and self.parent == other.parent + and self.edge == other.edge and self.metadata == other.metadata and ( self.time == other.time @@ -5179,6 +5195,7 @@ def mutation(self, id_): parent, metadata, time, + edge, ) = self._ll_tree_sequence.get_mutation(id_) return Mutation( id=id_, @@ -5188,6 +5205,7 @@ def mutation(self, id_): parent=parent, metadata=metadata, time=time, + edge=edge, metadata_decoder=self.table_metadata_schemas.mutation.decode_row, )