From 84a2f4f31d4f57ad4a4a962a7d661c4ae7db7d09 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Tue, 19 Aug 2025 02:11:51 +0100 Subject: [PATCH] Make mutation sort identical to canonical --- c/tests/test_tables.c | 142 +++++++++++++++++++++++++ c/tskit/tables.c | 105 ++++-------------- python/CHANGELOG.rst | 7 ++ python/tests/test_extend_haplotypes.py | 3 +- python/tests/test_highlevel.py | 7 +- python/tests/test_table_transforms.py | 5 + python/tests/test_tables.py | 73 ++++++++++--- python/tests/test_topology.py | 54 +++++++++- python/tests/tsutil.py | 54 ++++------ python/tskit/tables.py | 15 +-- 10 files changed, 316 insertions(+), 149 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 2cff9b7a49..6816e3e7e0 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -8905,6 +8905,7 @@ static void test_sort_tables_errors(void) { int ret; + tsk_id_t ret_id; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_bookmark_t pos; @@ -8968,6 +8969,32 @@ test_sort_tables_errors(void) ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + /* Test TSK_ERR_MUTATION_PARENT_INCONSISTENT */ + ret = tsk_table_collection_clear(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1.0; + + ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, TSK_NULL, NULL, 0); + CU_ASSERT_FATAL(ret_id >= 0); + ret_id = tsk_site_table_add_row(&tables.sites, 0.0, "x", 1, NULL, 0); + CU_ASSERT_FATAL(ret_id >= 0); + + ret_id + = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "a", 1, NULL, 0); + CU_ASSERT_FATAL(ret_id >= 0); + ret_id + = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 3, 0.0, "b", 1, NULL, 0); + CU_ASSERT_FATAL(ret_id >= 0); + ret_id + = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 1, 0.0, "c", 1, NULL, 0); + CU_ASSERT_FATAL(ret_id >= 0); + ret_id + = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "d", 1, NULL, 0); + CU_ASSERT_FATAL(ret_id >= 0); + + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_INCONSISTENT); + tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } @@ -9032,6 +9059,120 @@ test_sort_tables_mutation_times(void) tsk_treeseq_free(&ts); } +static void +test_sort_tables_mutations(void) +{ + int ret; + tsk_table_collection_t tables; + + /* Sorting hierarchy: + * 1. site + * 2. time (when known) + * 3. node_time + * 4. num_descendants: parent mutations first + * 5. node_id + * 6. mutation_id + */ + + const char *sites = "0.0 A\n" + "0.5 T\n" + "0.75 G\n"; + + const char *mutations_unsorted = + /* Test site criterion (primary) - site 1 should come after site 0 */ + "1 0 X -1 0.0\n" /* mut 0: site 1, will be sorted after site 0 mutations */ + "0 0 Y -1 0.0\n" /* mut 1: site 0, will be sorted before site 1 mutations */ + + /* Test time criterion - within same site, earlier time first */ + "0 4 B -1 2.0\n" /* mut 2: site 0, node 4 (time 1.0), time 2.0 (later time) + */ + "0 5 A -1 2.5\n" /* mut 3: site 0, node 5 (time 2.0), time 2.5 (earlier + relative) */ + + /* Test unknown vs known times - unknown times at site 2, fall back to node_time + sorting */ + "2 4 U2 -1\n" /* mut 4: site 2, node 4 (time 1.0), unknown time - falls back + to node_time */ + "2 4 U3 -1\n" /* mut 5: site 2, node 4 (time 1.0), unknown time - should use + mutation_id as tiebreaker */ + "2 5 U1 -1\n" /* mut 6: site 2, node 5 (time 2.0), unknown time - falls back + to node_time */ + + /* Test node_time criterion - same site, same mut time, different node times */ + "0 4 D -1 1.5\n" /* mut 7: site 0, node 4 (time 1.0), mut time 1.5 */ + "0 5 C -1 2.5\n" /* mut 8: site 0, node 5 (time 2.0), mut time 2.5 - same + mut time */ + + /* Test num_descendants criterion with mutation parent-child relationships */ + "0 2 P -1 0.0\n" /* mut 9: site 0, node 2, parent mutation (0 descendants + initially) */ + "0 1 C1 9 0.0\n" /* mut 10: site 0, node 1, child of mut 9 (parent now has + 1+ descendants) */ + "0 1 C2 9 0.0\n" /* mut 11: site 0, node 1, another child of mut 9 (parent + now has 2+ descendants) */ + "0 3 Q -1 0.0\n" /* mut 12: site 0, node 3, no children (0 descendants) */ + "0 0 C3 10 0.0\n" /* mut 13: site 0, node 0, child of mut 10 (making mut 9 a + grandparent) */ + + /* Test node and mutation_id criteria for final tiebreaking */ + "0 0 Z1 -1 0.0\n" /* mut 14: site 0, node 0, no parent, will test node+id + ordering */ + "0 0 Z2 -1 0.0\n"; /* mut 15: site 0, node 0, no parent, later in input = + higher ID */ + + const char *mutations_sorted = + /* Site 0 mutations - known times first, sorted by time */ + "0 5 A -1 2.5\n" + "0 5 C -1 2.5\n" + "0 4 B -1 2.0\n" + "0 4 D -1 1.5\n" + "0 2 P -1 0.0\n" + "0 1 C1 4 0.0\n" + "0 0 Y -1 0.0\n" + "0 0 C3 5 0.0\n" + "0 0 Z1 -1 0.0\n" + "0 0 Z2 -1 0.0\n" + "0 1 C2 4 0.0\n" + "0 3 Q -1 0.0\n" + + /* Site 1 mutations */ + "1 0 X -1 0.0\n" + + /* Site 2 mutations - unknown times, sorted by node_time then other criteria */ + "2 5 U1 -1\n" + "2 4 U2 -1\n" + "2 4 U3 -1\n"; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1.0; + parse_nodes(single_tree_ex_nodes, &tables.nodes); + parse_edges(single_tree_ex_edges, &tables.edges); + + parse_sites(sites, &tables.sites); + CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 3); + + parse_mutations(mutations_unsorted, &tables.mutations); + CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 16); + + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tsk_table_collection_t expected; + ret = tsk_table_collection_init(&expected, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + expected.sequence_length = 1.0; + parse_nodes(single_tree_ex_nodes, &expected.nodes); + parse_edges(single_tree_ex_edges, &expected.edges); + parse_sites(sites, &expected.sites); + parse_mutations(mutations_sorted, &expected.mutations); + + CU_ASSERT_TRUE(tsk_mutation_table_equals(&tables.mutations, &expected.mutations, 0)); + + tsk_table_collection_free(&expected); + tsk_table_collection_free(&tables); +} + static void test_sort_tables_canonical_errors(void) { @@ -11608,6 +11749,7 @@ main(int argc, char **argv) { "test_sort_tables_errors", test_sort_tables_errors }, { "test_sort_tables_individuals", test_sort_tables_individuals }, { "test_sort_tables_mutation_times", test_sort_tables_mutation_times }, + { "test_sort_tables_mutations", test_sort_tables_mutations }, { "test_sort_tables_migrations", test_sort_tables_migrations }, { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, { "test_sort_tables_offsets", test_sort_tables_offsets }, diff --git a/c/tskit/tables.c b/c/tskit/tables.c index c1025ee8a5..6cc1e3c540 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6756,7 +6756,8 @@ typedef struct { typedef struct { tsk_mutation_t mut; int num_descendants; -} mutation_canonical_sort_t; + double node_time; +} mutation_sort_t; typedef struct { tsk_individual_t ind; @@ -6797,39 +6798,30 @@ cmp_site(const void *a, const void *b) static int cmp_mutation(const void *a, const void *b) { - const tsk_mutation_t *ia = (const tsk_mutation_t *) a; - const tsk_mutation_t *ib = (const tsk_mutation_t *) b; - /* Compare mutations by site */ - int ret = (ia->site > ib->site) - (ia->site < ib->site); - /* Within a particular site sort by time if known, then ID. This ensures that - * relative ordering within a site is maintained */ - if (ret == 0 && !tsk_is_unknown_time(ia->time) && !tsk_is_unknown_time(ib->time)) { - ret = (ia->time < ib->time) - (ia->time > ib->time); - } - if (ret == 0) { - ret = (ia->id > ib->id) - (ia->id < ib->id); - } - return ret; -} - -static int -cmp_mutation_canonical(const void *a, const void *b) -{ - const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a; - const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b; + const mutation_sort_t *ia = (const mutation_sort_t *) a; + const mutation_sort_t *ib = (const mutation_sort_t *) b; /* Compare mutations by site */ int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); + + /* Within a particular site sort by time if known */ if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) && !tsk_is_unknown_time(ib->mut.time)) { ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); } + /* Or node times when mutation times are unknown or equal */ + if (ret == 0) { + ret = (ia->node_time < ib->node_time) - (ia->node_time > ib->node_time); + } + /* If node times are equal, sort by number of descendants */ if (ret == 0) { ret = (ia->num_descendants < ib->num_descendants) - (ia->num_descendants > ib->num_descendants); } + /* If number of descendants are equal, sort by node */ if (ret == 0) { ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); } + /* Final tiebreaker: ID */ if (ret == 0) { ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); } @@ -7056,77 +7048,15 @@ tsk_table_sorter_sort_sites(tsk_table_sorter_t *self) static int tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t ret_id, parent, mapped_parent; - tsk_mutation_table_t *mutations = &self->tables->mutations; - tsk_size_t num_mutations = mutations->num_rows; - tsk_mutation_table_t copy; - tsk_mutation_t *sorted_mutations - = tsk_malloc(num_mutations * sizeof(*sorted_mutations)); - tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); - - ret = tsk_mutation_table_copy(mutations, ©, 0); - if (ret != 0) { - goto out; - } - if (mutation_id_map == NULL || sorted_mutations == NULL) { - ret = tsk_trace_error(TSK_ERR_NO_MEMORY); - goto out; - } - - for (j = 0; j < num_mutations; j++) { - tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, sorted_mutations + j); - sorted_mutations[j].site = self->site_id_map[sorted_mutations[j].site]; - } - ret = tsk_mutation_table_clear(mutations); - if (ret != 0) { - goto out; - } - - qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), - cmp_mutation); - - /* Make a first pass through the sorted mutations to build the ID map. */ - for (j = 0; j < num_mutations; j++) { - mutation_id_map[sorted_mutations[j].id] = (tsk_id_t) j; - } - - for (j = 0; j < num_mutations; j++) { - mapped_parent = TSK_NULL; - parent = sorted_mutations[j].parent; - if (parent != TSK_NULL) { - mapped_parent = mutation_id_map[parent]; - } - ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].site, - sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time, - sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length, - sorted_mutations[j].metadata, sorted_mutations[j].metadata_length); - if (ret_id < 0) { - ret = (int) ret_id; - goto out; - } - } - ret = 0; - -out: - tsk_safe_free(mutation_id_map); - tsk_safe_free(sorted_mutations); - tsk_mutation_table_free(©); - return ret; -} - -static int -tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) { int ret = 0; tsk_size_t j; tsk_id_t ret_id, parent, mapped_parent, p; tsk_mutation_table_t *mutations = &self->tables->mutations; + tsk_node_table_t *nodes = &self->tables->nodes; tsk_size_t num_mutations = mutations->num_rows; tsk_mutation_table_t copy; - mutation_canonical_sort_t *sorted_mutations + mutation_sort_t *sorted_mutations = tsk_malloc(num_mutations * sizeof(*sorted_mutations)); tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map)); @@ -7158,6 +7088,7 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) for (j = 0; j < num_mutations; j++) { tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; + sorted_mutations[j].node_time = nodes->time[sorted_mutations[j].mut.node]; } ret = tsk_mutation_table_clear(mutations); if (ret != 0) { @@ -7165,7 +7096,7 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) } qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations), - cmp_mutation_canonical); + cmp_mutation); /* Make a first pass through the sorted mutations to build the ID map. */ for (j = 0; j < num_mutations; j++) { @@ -12245,7 +12176,7 @@ tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t opti if (ret != 0) { goto out; } - sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical; + sorter.sort_mutations = tsk_table_sorter_sort_mutations; sorter.sort_individuals = tsk_table_sorter_sort_individuals_canonical; nodes = tsk_malloc(self->nodes.num_rows * sizeof(*nodes)); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index d9d5191cc5..8b0f352526 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -21,6 +21,13 @@ **Bugfixes** +- In some tables with mutations out-of-order `TableCollection.sort` did not re-order + the mutations so they formed a valid TreeSequence. `TableCollection.sort` and + `TableCollection.canonicalise` now sort mutations by site, then time (if known), + then the mutation's node's time, then number of descendant mutations + (ensuring that parent mutations occur before children), then node, then + their original order in the tables. (:user:`benjeffery`, :pr:`3257`, :issue:`3253`) + - Fix bug in ``TreeSequence.pair_coalescence_counts`` when ``span_normalise=True`` and a window breakpoint falls within an internal missing interval. (:user:`nspope`, :pr:`3176`, :issue:`3175`) diff --git a/python/tests/test_extend_haplotypes.py b/python/tests/test_extend_haplotypes.py index a2c8f59e89..8b12cae53d 100644 --- a/python/tests/test_extend_haplotypes.py +++ b/python/tests/test_extend_haplotypes.py @@ -482,7 +482,7 @@ def extend_haplotypes(ts, max_iter=10): extender = HaplotypeExtender(ts, forwards=forwards) extender.extend_haplotypes() tables.edges.replace_with(extender.edges) - tables.sort() + tables.sort(mutation_start=tables.mutations.num_rows) tables.build_index() ts = tables.tree_sequence() if ts.num_edges == last_num_edges: @@ -493,7 +493,6 @@ def extend_haplotypes(ts, max_iter=10): tables = ts.dump_tables() mutations = _slide_mutation_nodes_up(ts, mutations) tables.mutations.replace_with(mutations) - tables.sort() ts = tables.tree_sequence() return ts diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index f33083bc9b..b66243f60d 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -3396,7 +3396,12 @@ def test_text_record_round_trip(self, ts1): sequence_length=ts1.sequence_length, strict=True, ) - self.verify_approximate_equality(ts1, ts2) + tables1 = ts1.tables.copy() + # load_text performs a `sort`, which changes the order relative to + # the original tree sequence + tables1.sort() + ts1_sorted = tables1.tree_sequence() + self.verify_approximate_equality(ts1_sorted, ts2) def test_empty_files(self): nodes_file = io.StringIO("is_sample\ttime\n") diff --git a/python/tests/test_table_transforms.py b/python/tests/test_table_transforms.py index 672de341cb..5989d342f9 100644 --- a/python/tests/test_table_transforms.py +++ b/python/tests/test_table_transforms.py @@ -596,6 +596,11 @@ def test_genotypes_round_trip(self, ts): @pytest.mark.parametrize("ts", get_example_tree_sequences()) @pytest.mark.parametrize("population", [-1, None]) def test_definition(self, ts, population): + # The python implementation of split_edges performs a sort, + # which changes the order relative to the original tree sequence + tables = ts.dump_tables() + tables.sort() + ts = tables.tree_sequence() time = 0 if ts.num_nodes == 0 else np.median(ts.tables.nodes.time) if ts.num_migrations == 0: ts1 = split_edges_definition(ts, time, population=population) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index bbbd8d7a40..14f7d53524 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -2286,11 +2286,9 @@ def verify_sort_equality(self, tables, seed): tables2 = tables1.copy() tables1.sort() tsutil.py_sort(tables2) - - # TODO - Check the sorted tables are valid ts, currently fails due to mutations - # tables1.tree_sequence() - # tables2.tree_sequence() - + # Check that both are valid tree sequences + tables1.tree_sequence() + tables2.tree_sequence() tables1.assert_equals(tables2) def verify_canonical_equality(self, tables, seed): @@ -2699,7 +2697,7 @@ def test_sort_mutations_stability(self): 1 0 1 -1 1 1 1 -1 0 1 1 -1 - 0 0 1 -1 + 0 1 0 -1 """ ) ts = tskit.load_text( @@ -2717,7 +2715,8 @@ def test_sort_mutations_stability(self): assert len(sites) == 2 assert len(mutations) == 4 assert list(mutations.site) == [0, 0, 1, 1] - assert list(mutations.node) == [1, 0, 0, 1] + assert list(mutations.node) == [1, 1, 0, 1] + assert list(map(chr, mutations.derived_state)) == ["1", "0", "1", "1"] def test_sort_mutations_remap_parent_id(self): nodes = io.StringIO( @@ -2861,6 +2860,37 @@ def test_sort_mutations_time(self): ) assert list(mutations.parent) == [-1, -1, -1, -1, -1, -1, -1, -1, -1] + def test_add_mutations_to_nodes(self): + # Test that adding mutations to random nodes, without parent IDs + # works - this requires the mutations to be sorted by node times + + ts = msprime.sim_mutations( + msprime.sim_ancestry( + 10, sequence_length=100, random_seed=1, recombination_rate=0.01 + ), + rate=1, + random_seed=1, + ) + + # Add some random mutations, and delete some others + tables = ts.dump_tables() + tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME) + np.random.seed(10) + for s in ts.sites(): + tables.mutations.add_row( + site=s.id, node=np.random.randint(ts.num_nodes), derived_state="A" + ) + keep = np.ones(tables.mutations.num_rows, dtype=bool) + keep[0:100] = False + tables.mutations.replace_with(tables.mutations[keep]) + # Remove all the parent IDs + tables.mutations.parent = np.full_like(tables.mutations.parent, tskit.NULL) + assert np.all(tables.mutations.parent == tskit.NULL) + tables.sort() + tables.build_index() + tables.compute_mutation_parents() + tables.tree_sequence() + class TestTablesToTreeSequence: """ @@ -4695,14 +4725,26 @@ def verify_subset(self, tables, nodes): for k, s in zip(sites, subset.sites): ss = tables.sites[k] assert ss == s - assert subset.mutations.num_rows == len(muts) - for k, m in zip(muts, subset.mutations): - mm = tables.mutations[k] - assert mutation_map[mm.parent] == m.parent - assert site_map[mm.site] == m.site - assert node_map[mm.node] == m.node - assert mm.derived_state == m.derived_state - assert mm.metadata == m.metadata + + # subset can reorder the mutations: we need to check we have the same set + def normalize_time(time): + return -42.0 if tskit.is_unknown_time(time) else time + + expected_mutations = { + ( + site_map[tables.mutations[k].site], + node_map[tables.mutations[k].node], + normalize_time(tables.mutations[k].time), + tables.mutations[k].metadata, + ) + for k in muts + } + actual_mutations = { + (m.site, m.node, normalize_time(m.time), m.metadata) + for m in subset.mutations + } + assert len(expected_mutations) == len(actual_mutations) + assert expected_mutations == actual_mutations assert tables.migrations == subset.migrations assert tables.provenances == subset.provenances @@ -4718,6 +4760,7 @@ def test_subset_all(self): # subsetting to everything shouldn't change things except the # individual and population ids in the node tables if there are gaps for tables in self.get_examples(123583): + tables.sort() tables2 = tables.copy() tables2.subset(np.arange(tables.nodes.num_rows)) tables.individuals.clear() diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index 89d72d31fe..e6f8a53446 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -1683,8 +1683,8 @@ def test_single_binary_tree_keep_roots_mutations(self): """ mutations_after = """\ site node derived_state - 0 2 2 0 2 1 + 0 2 2 """ self.verify_simplify( samples=[0, 1], @@ -3409,7 +3409,57 @@ def do_simplify( lib_tables1 = sts.dump_tables() py_tables = new_ts.dump_tables() - py_tables.assert_equals(lib_tables1, ignore_provenance=True) + # Compare all tables except mutations + py_tables_no_mut = py_tables.copy() + lib_tables1_no_mut = lib_tables1.copy() + py_tables_no_mut.mutations.clear() + lib_tables1_no_mut.mutations.clear() + py_tables_no_mut.assert_equals(lib_tables1_no_mut, ignore_provenance=True) + + # For mutations, check functional equivalence by comparing mutation properties + # but handling parent relationships that may differ due to reordering + def normalize_time(time): + return -42.0 if tskit.is_unknown_time(time) else time + + def mutation_signature(m, mutations): + # Create a signature that identifies a mutation by its properties + # and its parent's properties (to handle parent ID remapping) + def make_hashable(metadata): + # Convert unhashable metadata (like dicts) to hashable form + if isinstance(metadata, dict): + return tuple(sorted(metadata.items())) + elif isinstance(metadata, list): + return tuple(metadata) + else: + return metadata + + parent_sig = None + if m.parent != -1 and m.parent < len(mutations): + parent = mutations[m.parent] + parent_sig = ( + parent.site, + parent.node, + parent.derived_state, + make_hashable(parent.metadata), + normalize_time(parent.time), + ) + return ( + m.site, + m.node, + m.derived_state, + make_hashable(m.metadata), + normalize_time(m.time), + parent_sig, + ) + + py_mut_sigs = { + mutation_signature(m, py_tables.mutations) for m in py_tables.mutations + } + lib_mut_sigs = { + mutation_signature(m, lib_tables1.mutations) for m in lib_tables1.mutations + } + + assert py_mut_sigs == lib_mut_sigs assert all(node_map == lib_node_map1) return new_ts, node_map diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 02a3aab36a..242139ac02 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -954,35 +954,28 @@ def cmp_site(i, j, tables): return ret -def cmp_mutation_canonical(i, j, tables, site_order, num_descendants=None): +def cmp_mutation(i, j, tables, site_order, num_descendants=None): site_i = tables.mutations.site[i] site_j = tables.mutations.site[j] ret = site_order[site_i] - site_order[site_j] + # Within a particular site sort by time if known, then node time fallback if ( ret == 0 and (not tskit.is_unknown_time(tables.mutations.time[i])) and (not tskit.is_unknown_time(tables.mutations.time[j])) ): ret = tables.mutations.time[j] - tables.mutations.time[i] + if ret == 0: + # Use node times as fallback when mutation times are unknown or equal + node_time_i = tables.nodes.time[tables.mutations.node[i]] + node_time_j = tables.nodes.time[tables.mutations.node[j]] + ret = node_time_j - node_time_i if ret == 0: ret = num_descendants[j] - num_descendants[i] + # Tiebreaker: node if ret == 0: ret = tables.mutations.node[i] - tables.mutations.node[j] - if ret == 0: - ret = i - j - return ret - - -def cmp_mutation(i, j, tables, site_order): - site_i = tables.mutations.site[i] - site_j = tables.mutations.site[j] - ret = site_order[site_i] - site_order[site_j] - if ( - ret == 0 - and (not tskit.is_unknown_time(tables.mutations.time[i])) - and (not tskit.is_unknown_time(tables.mutations.time[j])) - ): - ret = tables.mutations.time[j] - tables.mutations.time[i] + # Final tiebreaker: ID if ret == 0: ret = i - j return ret @@ -1107,26 +1100,17 @@ def py_sort(tables, canonical=False): sorted_sites = sorted(range(copy.sites.num_rows), key=site_key) site_id_map = {k: j for j, k in enumerate(sorted_sites)} site_order = np.argsort(sorted_sites) - if canonical: - mut_num_descendants = compute_mutation_num_descendants(copy) - mut_key = functools.cmp_to_key( - lambda a, b: cmp_mutation_canonical( - a, - b, - tables=copy, - site_order=site_order, - num_descendants=mut_num_descendants, - ) - ) - else: - mut_key = functools.cmp_to_key( - lambda a, b: cmp_mutation( - a, - b, - tables=copy, - site_order=site_order, - ) + # Canonical sort, and regular sort are the same for mutations + mut_num_descendants = compute_mutation_num_descendants(copy) + mut_key = functools.cmp_to_key( + lambda a, b: cmp_mutation( + a, + b, + tables=copy, + site_order=site_order, + num_descendants=mut_num_descendants, ) + ) sorted_muts = sorted(range(copy.mutations.num_rows), key=mut_key) mut_id_map = {k: j for j, k in enumerate(sorted_muts)} mut_id_map[tskit.NULL] = tskit.NULL diff --git a/python/tskit/tables.py b/python/tskit/tables.py index a8db9e4952..3ba8e8da91 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -3627,10 +3627,10 @@ def sort(self, edge_start=0, *, site_start=0, mutation_start=0): Sites are sorted by position, and sites with the same position retain their relative ordering. - Mutations are sorted by site ID, and within the same site are sorted by time. - Those with equal or unknown time retain their relative ordering. This does not - currently rearrange tables so that mutations occur after their mutation parents, - which is a requirement for valid tree sequences. + Mutations are sorted by site, then time (if known), then the mutation's + node's time, then number of descendant mutations (ensuring that parent + mutations occur before children), then node, then original order in the + tables. Migrations are sorted by ``time``, ``source``, ``dest``, ``left`` and ``node`` values. This defines a total sort order, such that any permutation @@ -3667,9 +3667,10 @@ def canonicalise(self, remove_unreferenced=None): and population tables are sorted by the first node that refers to each (see :meth:`TreeSequence.subset`). Then, the remaining tables are sorted as in :meth:`.sort`, with the modification that mutations are sorted by - site, then time, then number of descendant mutations (ensuring that - parent mutations occur before children), then node, then original order - in the tables. This ensures that any two tables with the same information + site, then time (if known), then the mutation's node's time, then number + of descendant mutations (ensuring that parent mutations occur before + children), then node, then original order in the tables. This ensures + that any two tables with the same information and node order should be identical after canonical sorting (note that no canonical order exists for the node table).