From c26caec152237d3b805965a13d73f351072609c4 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Mon, 15 Feb 2021 16:27:28 +0000 Subject: [PATCH] Sort individuals --- c/CHANGELOG.rst | 2 +- c/tests/test_tables.c | 90 ++++++++++++++++--- c/tskit/core.c | 4 + c/tskit/core.h | 1 + c/tskit/tables.c | 137 +++++++++++++++++++++++++++++ python/CHANGELOG.rst | 2 +- python/tests/test_tables.py | 50 +++++++++-- python/tests/test_utilities.py | 27 ++++++ python/tests/test_wright_fisher.py | 14 +-- python/tests/tsutil.py | 80 +++++++++++++++++ 10 files changed, 378 insertions(+), 29 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 855af88d88..20e00706df 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -5,7 +5,7 @@ **Features** - Add ``parents`` to the individual table to enable recording of pedigrees - (:user:`ivan-krukov`, :user:`benjeffery`, :issue:`852`, :pr:`1125`, :pr:`866`, :pr:`1153`, :pr:`1177`). + (:user:`ivan-krukov`, :user:`benjeffery`, :issue:`852`, :pr:`1125`, :pr:`866`, :pr:`1153`, :pr:`1177`, :pr:`1199`). - Added a ``tsk_table_collection_canonicalse`` method, that allows checking for equality between tables that are equivalent up to reordering (:user:`petrelharp`, :user:`mufernando`, :pr:`1108`). diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 57ade414a3..6d1f9abe58 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -3859,6 +3859,30 @@ test_sort_tables_offsets(void) ret = tsk_table_collection_sort(&tables, &bookmark, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + /* Individuals must either all be sorted or all skipped */ + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + /* Add a parent relation that unsorts the table */ + tables.individuals.parents[0] = 5; + ret = tsk_table_collection_copy(&tables, ©, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.individuals = tables.individuals.num_rows; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0)); + + /* Check that sorting would have had an effect */ + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_table_collection_equals(&tables, ©, 0)); + + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.individuals = tables.individuals.num_rows - 1; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + tsk_table_collection_free(&tables); tsk_table_collection_free(©); tsk_treeseq_free(ts); @@ -3998,12 +4022,7 @@ test_sort_tables_errors(void) ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); - /* Individual, node, population and provenance positions are ignored */ - memset(&pos, 0, sizeof(pos)); - pos.individuals = 1; - ret = tsk_table_collection_sort(&tables, &pos, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - + /* Node, population and provenance positions are ignored */ memset(&pos, 0, sizeof(pos)); pos.nodes = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); @@ -4311,6 +4330,52 @@ test_sort_tables_migrations(void) free(ts); } +static void +test_sort_tables_individuals(void) +{ + int ret; + tsk_table_collection_t tables, copy; + const char *individuals = "1 0.25 2,3 0\n" + "2 0.5 5,-1 1\n" + "3 0.3 -1 2\n" + "4 0.3 -1 3\n" + "5 0.3 3 4\n" + "6 0.3 4 5\n"; + const char *individuals_cycle = "1 0.2 2 0\n" + "2 0.5 0 1\n" + "3 0.3 1 2\n"; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1.0; + parse_individuals(individuals, &tables.individuals); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_INDIVIDUAL_ORDERING); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS); + + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_INDIVIDUAL_ORDERING); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Check that the sort is stable */ + ret = tsk_table_collection_copy(&tables, ©, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); + + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT(tsk_table_collection_equals(&tables, ©, 0)); + + /* Errors on cycle */ + tsk_individual_table_clear(&tables.individuals); + parse_individuals(individuals_cycle, &tables.individuals); + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_PARENT_CYCLE); + + tsk_table_collection_free(&tables); + tsk_table_collection_free(©); +} + static void test_sorter_interface(void) { @@ -6083,7 +6148,6 @@ main(int argc, char **argv) test_link_ancestors_samples_and_ancestors_overlap }, { "test_link_ancestors_multiple_to_single_tree", test_link_ancestors_multiple_to_single_tree }, - { "test_sort_tables_offsets", test_sort_tables_offsets }, { "test_ibd_finder", test_ibd_finder }, { "test_ibd_finder_multiple_trees", test_ibd_finder_multiple_trees }, { "test_ibd_finder_empty_result", test_ibd_finder_empty_result }, @@ -6093,17 +6157,19 @@ main(int argc, char **argv) { "test_ibd_finder_multiple_ibd_paths", test_ibd_finder_multiple_ibd_paths }, { "test_ibd_finder_odd_topologies", test_ibd_finder_odd_topologies }, { "test_ibd_finder_errors", test_ibd_finder_errors }, + { "test_sorter_interface", test_sorter_interface }, + { "test_sort_tables_canonical_errors", test_sort_tables_canonical_errors }, + { "test_sort_tables_canonical", test_sort_tables_canonical }, { "test_sort_tables_drops_indexes", test_sort_tables_drops_indexes }, { "test_sort_tables_edge_metadata", test_sort_tables_edge_metadata }, - { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, + { "test_sort_tables_errors", test_sort_tables_errors }, + { "test_sort_tables_individuals", test_sort_tables_individuals }, { "test_sort_tables_mutation_times", test_sort_tables_mutation_times }, { "test_sort_tables_migrations", test_sort_tables_migrations }, + { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, + { "test_sort_tables_offsets", test_sort_tables_offsets }, { "test_edge_update_invalidates_index", test_edge_update_invalidates_index }, { "test_copy_table_collection", test_copy_table_collection }, - { "test_sort_tables_errors", test_sort_tables_errors }, - { "test_sort_tables_canonical_errors", test_sort_tables_canonical_errors }, - { "test_sort_tables_canonical", test_sort_tables_canonical }, - { "test_sorter_interface", test_sorter_interface }, { "test_dump_unindexed", test_dump_unindexed }, { "test_dump_load_empty", test_dump_load_empty }, { "test_dump_load_unsorted", test_dump_load_unsorted }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 199096b12b..faa3487bac 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -484,6 +484,10 @@ tsk_strerror_internal(int err) case TSK_ERR_INDIVIDUAL_SELF_PARENT: ret = "Individuals cannot be their own parents."; break; + + case TSK_ERR_INDIVIDUAL_PARENT_CYCLE: + ret = "Individuals cannot be their own ancestor."; + break; } return ret; } diff --git a/c/tskit/core.h b/c/tskit/core.h index 00f1152ddc..91a989eb8d 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -326,6 +326,7 @@ not found in the file. /* Individual errors */ #define TSK_ERR_UNSORTED_INDIVIDUALS -1700 #define TSK_ERR_INDIVIDUAL_SELF_PARENT -1701 +#define TSK_ERR_INDIVIDUAL_PARENT_CYCLE -1702 // clang-format on diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 1cb95e5ff5..81666ad515 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4959,6 +4959,121 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) return ret; } +static int +tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self) +{ + int ret = 0; + tsk_id_t i; + tsk_individual_table_t copy; + tsk_individual_t individual; + tsk_individual_table_t *individuals = &self->tables->individuals; + tsk_node_table_t *nodes = &self->tables->nodes; + tsk_size_t num_individuals = individuals->num_rows; + tsk_size_t current_todo = 0; + tsk_size_t todo_insertion_point = 0; + tsk_size_t *incoming_edge_count + = malloc(num_individuals * sizeof(*incoming_edge_count)); + tsk_id_t *individuals_todo + = malloc((num_individuals + 1) * sizeof(*individuals_todo)); + tsk_id_t *new_id_map = malloc(num_individuals * sizeof(*new_id_map)); + + ret = tsk_individual_table_copy(individuals, ©, 0); + if (ret != 0) { + goto out; + } + + if (incoming_edge_count == NULL || individuals_todo == NULL || new_id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + for (i = 0; i < (tsk_id_t) num_individuals; i++) { + incoming_edge_count[i] = 0; + individuals_todo[i] = TSK_NULL; + new_id_map[i] = TSK_NULL; + } + individuals_todo[num_individuals] = TSK_NULL; /* Sentinel value */ + + /* First find the set of individuals that have no children by creating + * an array of incoming edge counts */ + for (i = 0; i < (tsk_id_t) individuals->parents_length; i++) { + if (individuals->parents[i] != TSK_NULL) { + incoming_edge_count[individuals->parents[i]]++; + } + } + /* Use these as the starting points for checking all individuals, + * doing this in reverse makes the sort stable */ + for (i = (tsk_id_t) num_individuals - 1; i >= 0; i--) { + if (incoming_edge_count[i] == 0) { + individuals_todo[todo_insertion_point] = i; + todo_insertion_point++; + } + } + + /* Now emit individuals from the set that have no children, removing their edges + * as we go adding new individuals to the no children set. */ + while (individuals_todo[current_todo] != TSK_NULL) { + tsk_individual_table_get_row_unsafe( + individuals, individuals_todo[current_todo], &individual); + for (i = 0; i < (tsk_id_t) individual.parents_length; i++) { + if (individual.parents[i] != TSK_NULL) { + incoming_edge_count[individual.parents[i]]--; + if (incoming_edge_count[individual.parents[i]] == 0) { + individuals_todo[todo_insertion_point] = individual.parents[i]; + todo_insertion_point++; + } + } + } + current_todo++; + } + + /* Any edges left are parts of cycles */ + for (i = 0; i < (tsk_id_t) num_individuals; i++) { + if (incoming_edge_count[i] > 0) { + ret = TSK_ERR_INDIVIDUAL_PARENT_CYCLE; + goto out; + } + } + + ret = tsk_individual_table_clear(individuals); + if (ret != 0) { + goto out; + } + + /* The sorted individuals are in reverse order */ + for (i = (tsk_id_t) num_individuals - 1; i >= 0; i--) { + tsk_individual_table_get_row_unsafe(©, individuals_todo[i], &individual); + ret = tsk_individual_table_add_row(individuals, individual.flags, + individual.location, individual.location_length, individual.parents, + individual.parents_length, individual.metadata, individual.metadata_length); + if (ret < 0) { + goto out; + } + new_id_map[individuals_todo[i]] = ret; + } + + /* Rewrite the parent ids */ + for (i = 0; i < (tsk_id_t) individuals->parents_length; i++) { + if (individuals->parents[i] != TSK_NULL) { + individuals->parents[i] = new_id_map[individuals->parents[i]]; + } + } + /* Rewrite the node individual ids */ + for (i = 0; i < (tsk_id_t) nodes->num_rows; i++) { + if (nodes->individual[i] != TSK_NULL) { + nodes->individual[i] = new_id_map[nodes->individual[i]]; + } + } + + ret = 0; +out: + tsk_safe_free(incoming_edge_count); + tsk_safe_free(individuals_todo); + tsk_safe_free(new_id_map); + tsk_individual_table_free(©); + return ret; +} + int tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) { @@ -4966,6 +5081,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) tsk_size_t edge_start = 0; tsk_size_t migration_start = 0; bool skip_sites = false; + bool skip_individuals = false; if (start != NULL) { if (start->edges > self->tables->edges.num_rows) { @@ -4989,6 +5105,14 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; goto out; } + + /* Individuals also must be all sorted or skipped entirely */ + if (start->individuals == self->tables->individuals.num_rows) { + skip_individuals = true; + } else if (start->individuals != 0) { + ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; + goto out; + } } /* The indexes will be invalidated, so drop them */ ret = tsk_table_collection_drop_index(self->tables, 0); @@ -5019,6 +5143,12 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) goto out; } } + if (!skip_individuals) { + ret = tsk_table_sorter_sort_individuals(self); + if (ret != 0) { + goto out; + } + } out: return ret; } @@ -9751,6 +9881,13 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, } } + /* Rewrite the individual parent ids */ + for (k = 0; k < (tsk_id_t) self->individuals.parents_length; k++) { + if (self->individuals.parents[k] != TSK_NULL) { + self->individuals.parents[k] = individual_map[self->individuals.parents[k]]; + } + } + ret = 0; out: tsk_safe_free(node_map); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index cce5360afa..45e7267dc2 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -9,7 +9,7 @@ label can be set (:user:`hyanwong`,:issue:`1155`, :issue:`1194`, :pr:`1182`, :pr:`1213`) - Add ``parents`` column to the individual table to allow recording of pedigrees - (:user:`ivan-krukov`, :user:`benjeffery`, :issue:`852`, :pr:`1125`, :pr:`866`, :pr:`1153`, :pr:`1177` :pr:`1192`). + (:user:`ivan-krukov`, :user:`benjeffery`, :issue:`852`, :pr:`1125`, :pr:`866`, :pr:`1153`, :pr:`1177`, :pr:`1192` :pr:`1199`). - Added ``Tree.generate_random_binary`` static method to create random binary trees (:user:`hyanwong`, :user:`jeromekelleher`, :pr:`1037`). diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 310479f13d..c309062f66 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1400,15 +1400,25 @@ def verify_sort_equality(self, tables, seed): tsutil.shuffle_tables( tables1, seed, - shuffle_individuals=False, shuffle_populations=False, ) + tables1.individuals.packset_metadata( + [bytes(str(i), "utf-8") for i in range(tables1.individuals.num_rows)] + ) tables2 = tables1.copy() tables1.sort() tsutil.py_sort(tables2) + + # TODO - Check the sorted tables are valid ts, currently fails due to mutations + # tables1.tree_sequence() + # tables2.tree_sequence() + tsutil.assert_table_collections_equal(tables1, tables2) def verify_canonical_equality(self, tables, seed): + # Migrations not supported + tables.migrations.clear() + for ru in [True, False]: tables1 = tables.copy() tsutil.shuffle_tables( @@ -1427,9 +1437,7 @@ def verify_sort_mutation_consistency(self, orig_tables, seed): mut_map[tables.sites[mut.site].position].append( (mut.node, mut.derived_state, mut.metadata) ) - tsutil.shuffle_tables( - tables, seed, shuffle_individuals=False, shuffle_populations=False - ) + tsutil.shuffle_tables(tables, seed, shuffle_populations=False) for mut in tables.mutations: site = tables.sites[mut.site] assert (mut.node, mut.derived_state, mut.metadata) in mut_map[site.position] @@ -1457,11 +1465,13 @@ def verify_randomise_tables(self, orig_tables, seed): tables.sort() tsutil.assert_table_collections_equal(tables, sorted_tables) - # Now also randomize sites and mutations + # Now also randomize sites, mutations and individuals tables.canonicalise(remove_unreferenced=False) sorted_tables = tables.copy() tsutil.shuffle_tables( - tables, seed=1234, shuffle_populations=False, shuffle_individuals=False + tables, + seed=1234, + shuffle_populations=False, ) tables.canonicalise(remove_unreferenced=False) tsutil.assert_table_collections_equal(tables, sorted_tables) @@ -1471,6 +1481,9 @@ def verify_randomise_tables(self, orig_tables, seed): tables.canonicalise(remove_unreferenced=False) tsutil.assert_table_collections_equal(tables, sorted_tables) + # Check the canonicalised form meets the tree sequence requirements + tables.tree_sequence() + def verify_sort(self, tables, seed): self.verify_sort_equality(tables, seed) self.verify_canonical_equality(tables, seed) @@ -1512,10 +1525,31 @@ def verify_edge_sort_offset(self, ts): assert edges == tables.edges def get_wf_example(self, seed): - tables = wf.wf_sim(6, 3, num_pops=2, seed=seed, num_loci=3) + tables = wf.wf_sim( + 6, + 3, + num_pops=2, + seed=seed, + num_loci=3, + record_migrations=True, + record_individuals=True, + ) tables.sort() ts = tables.tree_sequence() - return tsutil.insert_individuals(ts, ploidy=2) + return ts + + def test_wf_example(self): + tables = wf.wf_sim( + N=6, + ngens=3, + num_pops=2, + mig_rate=1.0, + deep_history=False, + seed=42, + record_migrations=True, + record_individuals=True, + ) + self.verify_sort(tables, 42) def test_single_tree_no_mutations(self): ts = msprime.simulate(10, random_seed=self.random_seed) diff --git a/python/tests/test_utilities.py b/python/tests/test_utilities.py index 1a12a83843..fd7cbac078 100644 --- a/python/tests/test_utilities.py +++ b/python/tests/test_utilities.py @@ -26,6 +26,7 @@ import pytest import tests.tsutil as tsutil +import tskit class TestJukesCantor: @@ -142,3 +143,29 @@ def test_ploidy_2_reversed(self): assert ts.num_individuals == 5 for j, ind in enumerate(ts.individuals()): assert list(ind.nodes) == [samples[2 * j + 1], samples[2 * j]] + + +class TestSortIndividuals: + def test_sort_individuals(self): + tables = tskit.TableCollection() + tables.individuals.add_row(parents=[1], metadata=b"0") + tables.individuals.add_row(parents=[-1], metadata=b"1") + tsutil.sort_individual_table(tables) + assert tables.individuals.metadata.tobytes() == b"10" + + tables = tskit.TableCollection() + tables.individuals.add_row(parents=[2, 3], metadata=b"0") + tables.individuals.add_row(parents=[5], metadata=b"1") + tables.individuals.add_row(parents=[-1], metadata=b"2") + tables.individuals.add_row(parents=[-1], metadata=b"3") + tables.individuals.add_row(parents=[3], metadata=b"4") + tables.individuals.add_row(parents=[4], metadata=b"5") + + tsutil.sort_individual_table(tables) + assert tables.individuals.metadata.tobytes() == b"342501" + + tables = tskit.TableCollection() + tables.individuals.add_row(parents=[1], metadata=b"0") + tables.individuals.add_row(parents=[0], metadata=b"1") + with pytest.raises(ValueError, match="Individual pedigree has cycles"): + tsutil.sort_individual_table(tables) diff --git a/python/tests/test_wright_fisher.py b/python/tests/test_wright_fisher.py index 4d072956d2..72686401d3 100644 --- a/python/tests/test_wright_fisher.py +++ b/python/tests/test_wright_fisher.py @@ -464,18 +464,18 @@ def test_record_individuals(self): tables = wf_sim( N=N, ngens=10, seed=12345, record_individuals=True, deep_history=False ) - tables.sort() assert len(tables.individuals) == len(tables.nodes) + for node_id, individual in enumerate(tables.nodes.individual): + assert node_id == individual + tables.sort() ts = tables.tree_sequence() - for node in ts.nodes(): - assert node.individual == node.id - for tree in ts.trees(): for u in tree.nodes(): + individual = ts.individual(ts.node(u).individual) parent_node = tree.parent(u) - # We already know the individual has the same ID as the node - individual = ts.individual(u) - assert parent_node in individual.parents + if parent_node != tskit.NULL: + parent_individual = ts.individual(ts.node(parent_node).individual) + assert parent_individual.id in individual.parents def get_wf_sims(seed): diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 24c66f9b59..37e09805f7 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -1016,6 +1016,19 @@ def cmp_edge(i, j, tables): return ret +def cmp_migration(i, j, tables): + ret = tables.migrations.time[i] - tables.migrations.time[j] + if ret == 0: + ret = tables.migrations.source[i] - tables.migrations.source[j] + if ret == 0: + ret = tables.migrations.dest[i] - tables.migrations.dest[j] + if ret == 0: + ret = tables.migrations.left[i] - tables.migrations.left[j] + if ret == 0: + ret = tables.migrations.node[i] - tables.migrations.node[j] + return ret + + def compute_mutation_num_descendants(tables): mutations = tables.mutations num_descendants = np.zeros(mutations.num_rows) @@ -1040,6 +1053,7 @@ def py_sort(tables, use_num_descendants=False): tables.edges.clear() tables.sites.clear() tables.mutations.clear() + tables.migrations.clear() edge_key = functools.cmp_to_key(lambda a, b: cmp_edge(a, b, tables=copy)) sorted_edges = sorted(range(copy.edges.num_rows), key=edge_key) site_key = functools.cmp_to_key(lambda a, b: cmp_site(a, b, tables=copy)) @@ -1069,6 +1083,8 @@ def py_sort(tables, use_num_descendants=False): sorted_muts = sorted(range(copy.mutations.num_rows), key=mut_key) mut_id_map = {k: j for j, k in enumerate(sorted_muts)} mut_id_map[tskit.NULL] = tskit.NULL + mig_key = functools.cmp_to_key(lambda a, b: cmp_migration(a, b, tables=copy)) + sorted_migs = sorted(range(copy.migrations.num_rows), key=mig_key) for edge_id in sorted_edges: tables.edges.add_row( copy.edges[edge_id].left, @@ -1091,6 +1107,18 @@ def py_sort(tables, use_num_descendants=False): copy.mutations[mut_id].metadata, copy.mutations[mut_id].time, ) + for mig_id in sorted_migs: + tables.migrations.add_row( + copy.migrations[mig_id].left, + copy.migrations[mig_id].right, + copy.migrations[mig_id].node, + copy.migrations[mig_id].source, + copy.migrations[mig_id].dest, + copy.migrations[mig_id].time, + copy.migrations[mig_id].metadata, + ) + + sort_individual_table(tables) def algorithm_T(ts): @@ -1665,3 +1693,55 @@ def assert_tables_equal(t1, t2, label=""): f"{label}: t1.num_rows {t1.num_rows} != {t2.num_rows} t2.num_rows" ) assert t1 == t2 + + +def sort_individual_table(tables): + """ + Sorts the individual table by parents-before-children. + """ + + individuals = tables.individuals + num_individuals = individuals.num_rows + + # First find the set of individuals that have no children + # by creating an array of incoming edge counts + incoming_edge_count = np.zeros((num_individuals,), np.int64) + for parent in individuals.parents: + if parent != tskit.NULL: + incoming_edge_count[parent] += 1 + + todo = collections.deque() + sorted_order = [] + for individual, num_edges in reversed(list(enumerate(incoming_edge_count))): + if num_edges == 0: + todo.append(individual) + sorted_order.append(individual) + # Now emit individuals from the set that have no children, removing their edges + # as we go adding new individuals to the no children set. + while len(todo) > 0: + individual = todo.popleft() + for parent in individuals[individual].parents: + if parent != tskit.NULL: + incoming_edge_count[parent] -= 1 + if incoming_edge_count[parent] == 0: + todo.append(parent) + sorted_order.append(parent) + + if np.sum(incoming_edge_count) > 0: + raise ValueError("Individual pedigree has cycles") + + ind_id_map = {tskit.NULL: tskit.NULL} + + individuals_copy = tables.copy().individuals + tables.individuals.clear() + for row in reversed(sorted_order): + ind_id_map[row] = tables.individuals.add_row( + flags=individuals_copy[row].flags, + location=individuals_copy[row].location, + parents=individuals_copy[row].parents, + metadata=individuals_copy[row].metadata, + ) + tables.individuals.parents = [ind_id_map[i] for i in tables.individuals.parents] + tables.nodes.individual = [ind_id_map[i] for i in tables.nodes.individual] + + return tables