diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index b82bfa30bd..855af88d88 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -13,6 +13,9 @@ - Removed a previous requirement on ``tsk_table_collection_union``, allowing for unioning of new information both above and below shared history (:user:`petrelharp`, :user:`mufernando`, :pr:`1108`). +- Support migrations in tsk_table_collection_sort. (:user:`jeromekelleher`, + :issue:`22`, :issue:`117`, :pr:`1131`). + **Breaking changes** - Method ``tsk_individual_table_add_row`` has an extra arguments ``parents`` and ``parents_length``. diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 47c146ab12..9ac80f86fe 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -29,6 +29,35 @@ #include #include +static void +reverse_migrations(tsk_table_collection_t *tables) +{ + int ret; + tsk_migration_table_t migrations; + tsk_migration_t migration; + tsk_id_t j; + + /* Easy way to copy the metadata schema */ + ret = tsk_migration_table_copy(&tables->migrations, &migrations, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_migration_table_clear(&migrations); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + for (j = (tsk_id_t) tables->migrations.num_rows - 1; j >= 0; j--) { + ret = tsk_migration_table_get_row(&tables->migrations, j, &migration); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_migration_table_add_row(&migrations, migration.left, migration.right, + migration.node, migration.source, migration.dest, migration.time, + migration.metadata, migration.metadata_length); + CU_ASSERT_FATAL(ret >= 0); + } + + ret = tsk_migration_table_copy(&migrations, &tables->migrations, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tsk_migration_table_free(&migrations); +} + static void reverse_edges(tsk_table_collection_t *tables) { @@ -37,7 +66,10 @@ reverse_edges(tsk_table_collection_t *tables) tsk_edge_t edge; tsk_id_t j; - ret = tsk_edge_table_init(&edges, tables->edges.options); + /* Easy way to copy the metadata schema */ + ret = tsk_edge_table_copy(&tables->edges, &edges, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_edge_table_clear(&edges); CU_ASSERT_EQUAL_FATAL(ret, 0); for (j = (tsk_id_t) tables->edges.num_rows - 1; j >= 0; j--) { @@ -3756,13 +3788,8 @@ test_sort_tables_offsets(void) ret = tsk_treeseq_copy_tables(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_sort(&tables, NULL, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED); - - tsk_migration_table_clear(&tables.migrations); ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - /* Check that setting edge offset = len(edges) does nothing */ reverse_edges(&tables); ret = tsk_table_collection_copy(&tables, ©, 0); @@ -3773,6 +3800,18 @@ test_sort_tables_offsets(void) CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + /* Check that setting migration offset = len(migrations) does nothing */ + reverse_migrations(&tables); + ret = tsk_table_collection_copy(&tables, ©, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.migrations = tables.migrations.num_rows; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); + ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tables.sites.num_rows > 2); @@ -3942,6 +3981,15 @@ test_sort_tables_errors(void) ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); + memset(&pos, 0, sizeof(pos)); + pos.migrations = (tsk_size_t) -1; + ret = tsk_table_collection_sort(&tables, &pos, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); + + pos.migrations = tables.migrations.num_rows + 1; + ret = tsk_table_collection_sort(&tables, &pos, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATION_OUT_OF_BOUNDS); + /* Individual, node, population and provenance positions are ignored */ memset(&pos, 0, sizeof(pos)); pos.individuals = 1; @@ -3963,13 +4011,8 @@ test_sort_tables_errors(void) ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - /* Setting migrations, sites or mutations gives a BAD_PARAM. See + /* Setting sites or mutations gives a BAD_PARAM. See * github.com/tskit-dev/tskit/issues/101 */ - memset(&pos, 0, sizeof(pos)); - pos.migrations = 1; - ret = tsk_table_collection_sort(&tables, &pos, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); - memset(&pos, 0, sizeof(pos)); pos.sites = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); @@ -3980,12 +4023,6 @@ test_sort_tables_errors(void) ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); - /* Migrations are not supported */ - tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 0, 0, NULL, 0); - CU_ASSERT_EQUAL_FATAL(tables.migrations.num_rows, 1); - ret = tsk_table_collection_sort(&tables, NULL, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED); - tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); } @@ -4179,6 +4216,51 @@ test_sort_tables_canonical(void) tsk_table_collection_free(&t1); } +static void +test_sort_tables_migrations(void) +{ + int ret; + tsk_treeseq_t *ts; + tsk_table_collection_t tables, copy; + + ts = caterpillar_tree(13, 1, 1); + ret = tsk_treeseq_copy_tables(ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tables.migrations.num_rows > 0); + + ret = tsk_table_collection_copy(&tables, ©, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); + + reverse_migrations(&tables); + CU_ASSERT_FATAL(!tsk_table_collection_equals(&tables, ©, 0)); + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_migration_table_equals(&tables.migrations, ©.migrations, 0)); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); + + /* Make sure we test the deeper comparison keys. The full key is + * (time, source, dest, left, node) */ + tsk_migration_table_clear(&tables.migrations); + + /* params = left, right, node, source, dest, time */ + tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 0, NULL, 0); + tsk_migration_table_add_row(&tables.migrations, 0, 1, 1, 0, 1, 0, NULL, 0); + ret = tsk_migration_table_copy(&tables.migrations, ©.migrations, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + reverse_migrations(&tables); + CU_ASSERT_FATAL(!tsk_table_collection_equals(&tables, ©, 0)); + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_migration_table_equals(&tables.migrations, ©.migrations, 0)); + + tsk_table_collection_free(&tables); + tsk_table_collection_free(©); + tsk_treeseq_free(ts); + free(ts); +} + static void test_sorter_interface(void) { @@ -5942,6 +6024,7 @@ main(int argc, char **argv) { "test_sort_tables_edge_metadata", test_sort_tables_edge_metadata }, { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, { "test_sort_tables_mutation_times", test_sort_tables_mutation_times }, + { "test_sort_tables_migrations", test_sort_tables_migrations }, { "test_edge_update_invalidates_index", test_edge_update_invalidates_index }, { "test_copy_table_collection", test_copy_table_collection }, { "test_sort_tables_errors", test_sort_tables_errors }, diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 9ebe005838..96584fbb34 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -821,12 +821,7 @@ caterpillar_tree(tsk_size_t n, tsk_size_t num_sites, tsk_size_t num_mutations) strlen(prov_timestamp), prov_record, strlen(prov_record)); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_sort(&tables, 0, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - - /* Add in some mock migrations. Must be done after sort as it doesn't support - * migrations. - * TODO make these consistent with the caterpillar tree topology. */ + /* TODO make these consistent with the caterpillar tree topology. */ for (j = 0; j < n - 1; j++) { m = j % num_metadatas; ret = tsk_migration_table_add_row(&tables.migrations, 0, 1, j, j, j + 1, j + 1.5, @@ -834,6 +829,8 @@ caterpillar_tree(tsk_size_t n, tsk_size_t num_sites, tsk_size_t num_mutations) CU_ASSERT_FATAL(ret >= 0); } + ret = tsk_table_collection_sort(&tables, 0, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_compute_mutation_parents(&tables, 0); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 52d99cfd02..d79a46d726 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019-2020 Tskit Developers + * Copyright (c) 2019-2021 Tskit Developers * Copyright (c) 2017-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -3507,9 +3507,8 @@ tsk_migration_table_dump_text(const tsk_migration_table_t *self, FILE *out) for (j = 0; j < self->num_rows; j++) { metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; err = fprintf(out, "%.3f\t%.3f\t%d\t%d\t%d\t%f\t%.*s\n", self->left[j], - self->right[j], (int) self->node[j], (int) self->source[j], - (int) self->dest[j], self->time[j], metadata_len, - self->metadata + self->metadata_offset[j]); + self->right[j], self->node[j], self->source[j], self->dest[j], self->time[j], + metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { goto out; } @@ -3531,7 +3530,6 @@ tsk_migration_table_equals(const tsk_migration_table_t *self, && memcmp(self->source, other->source, self->num_rows * sizeof(tsk_id_t)) == 0 && memcmp(self->dest, other->dest, self->num_rows * sizeof(tsk_id_t)) == 0 && memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0; - if (!(options & TSK_CMP_IGNORE_METADATA)) { ret = ret && self->metadata_length == other->metadata_length && self->metadata_schema_length == other->metadata_schema_length @@ -4545,6 +4543,17 @@ typedef struct { int num_descendants; } mutation_canonical_sort_t; +typedef struct { + double left; + double right; + tsk_id_t node; + tsk_id_t source; + tsk_id_t dest; + double time; + tsk_size_t metadata_offset; + tsk_size_t metadata_length; +} migration_sort_t; + static int cmp_site(const void *a, const void *b) { @@ -4628,6 +4637,32 @@ cmp_edge(const void *a, const void *b) return ret; } +static int +cmp_migration(const void *a, const void *b) +{ + const migration_sort_t *ca = (const migration_sort_t *) a; + const migration_sort_t *cb = (const migration_sort_t *) b; + + int ret = (ca->time > cb->time) - (ca->time < cb->time); + /* If time values are equal, sort by the source population */ + if (ret == 0) { + ret = (ca->source > cb->source) - (ca->source < cb->source); + /* If the source populations are equal, sort by the dest */ + if (ret == 0) { + ret = (ca->dest > cb->dest) - (ca->dest < cb->dest); + /* If the dest populations are equal, sort by the left coordinate. */ + if (ret == 0) { + ret = (ca->left > cb->left) - (ca->left < cb->left); + /* If everything else is equal, compare by node */ + if (ret == 0) { + ret = (ca->node > cb->node) - (ca->node < cb->node); + } + } + } + } + return ret; +} + static int tsk_table_sorter_sort_edges(tsk_table_sorter_t *self, tsk_size_t start) { @@ -4683,6 +4718,58 @@ tsk_table_sorter_sort_edges(tsk_table_sorter_t *self, tsk_size_t start) return ret; } +static int +tsk_table_sorter_sort_migrations(tsk_table_sorter_t *self, tsk_size_t start) +{ + int ret = 0; + const tsk_migration_table_t *migrations = &self->tables->migrations; + migration_sort_t *m; + tsk_size_t j, k, metadata_offset; + tsk_size_t n = migrations->num_rows - start; + migration_sort_t *sorted_migrations = malloc(n * sizeof(*sorted_migrations)); + char *old_metadata = malloc(migrations->metadata_length); + + if (sorted_migrations == NULL || old_metadata == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + memcpy(old_metadata, migrations->metadata, migrations->metadata_length); + for (j = 0; j < n; j++) { + m = sorted_migrations + j; + k = start + j; + m->left = migrations->left[k]; + m->right = migrations->right[k]; + m->node = migrations->node[k]; + m->source = migrations->source[k]; + m->dest = migrations->dest[k]; + m->time = migrations->time[k]; + m->metadata_offset = migrations->metadata_offset[k]; + m->metadata_length + = migrations->metadata_offset[k + 1] - migrations->metadata_offset[k]; + } + qsort(sorted_migrations, n, sizeof(migration_sort_t), cmp_migration); + /* Copy the migrations back into the table. */ + metadata_offset = 0; + for (j = 0; j < n; j++) { + m = sorted_migrations + j; + k = start + j; + migrations->left[k] = m->left; + migrations->right[k] = m->right; + migrations->node[k] = m->node; + migrations->source[k] = m->source; + migrations->dest[k] = m->dest; + migrations->time[k] = m->time; + memcpy(migrations->metadata + metadata_offset, old_metadata + m->metadata_offset, + m->metadata_length); + migrations->metadata_offset[k] = metadata_offset; + metadata_offset += m->metadata_length; + } +out: + tsk_safe_free(sorted_migrations); + tsk_safe_free(old_metadata); + return ret; +} + static int tsk_table_sorter_sort_sites(tsk_table_sorter_t *self) { @@ -4868,6 +4955,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) { int ret = 0; tsk_size_t edge_start = 0; + tsk_size_t migration_start = 0; bool skip_sites = false; if (start != NULL) { @@ -4876,11 +4964,12 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) goto out; } edge_start = start->edges; - - if (start->migrations != 0) { - ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; + if (start->migrations > self->tables->migrations.num_rows) { + ret = TSK_ERR_MIGRATION_OUT_OF_BOUNDS; goto out; } + migration_start = start->migrations; + /* We only allow sites and mutations to be specified as a way to * skip sorting them entirely. Both sites and mutations must be * equal to the number of rows */ @@ -4897,12 +4986,20 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) if (ret != 0) { goto out; } + if (self->sort_edges != NULL) { ret = self->sort_edges(self, edge_start); if (ret != 0) { goto out; } } + /* Avoid calling sort_migrations in the common case when it's a no-op */ + if (self->tables->migrations.num_rows > 0) { + ret = tsk_table_sorter_sort_migrations(self, migration_start); + if (ret != 0) { + goto out; + } + } if (!skip_sites) { ret = tsk_table_sorter_sort_sites(self); if (ret != 0) { @@ -4924,10 +5021,6 @@ tsk_table_sorter_init( int ret = 0; memset(self, 0, sizeof(tsk_table_sorter_t)); - if (tables->migrations.num_rows != 0) { - ret = TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED; - goto out; - } if (!(options & TSK_NO_CHECK_INTEGRITY)) { ret = tsk_table_collection_check_integrity(tables, 0); if (ret != 0) { diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 7f6fab8b0f..368fa00486 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2020 Tskit Developers +# Copyright (c) 2018-2021 Tskit Developers # Copyright (c) 2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -1690,6 +1690,49 @@ def test_empty_tables(self): assert tables.migrations.num_rows == 0 +class TestSortMigrations: + """ + Tests that migrations are correctly ordered when sorting tables. + """ + + def test_msprime_output_unmodified(self): + pop_configs = [msprime.PopulationConfiguration(5) for _ in range(3)] + migration_matrix = [[0, 1, 1], [1, 0, 1], [1, 1, 0]] + ts = msprime.simulate( + recombination_rate=0.1, + population_configurations=pop_configs, + migration_matrix=migration_matrix, + record_migrations=True, + random_seed=1, + ) + assert ts.num_migrations > 100 + tables = ts.tables.copy() + tables.sort() + assert tables.equals(ts.tables, ignore_provenance=True) + + def test_full_sort_order(self): + tables = tskit.TableCollection(1) + for _ in range(3): + tables.nodes.add_row() + tables.populations.add_row() + for left in [0, 0.5]: + for a_time in range(3): + for node in range(2): + tables.migrations.add_row( + left=left, right=1, node=node, source=0, dest=1, time=a_time + ) + tables.migrations.add_row( + left=left, right=1, node=node, source=1, dest=2, time=a_time + ) + + sorted_list = sorted( + tables.migrations, key=lambda m: (m.time, m.source, m.dest, m.left, m.node) + ) + assert sorted_list != list(tables.migrations) + tables.sort() + assert sorted_list == list(tables.migrations) + + class TestSortMutations: """ Tests that mutations are correctly ordered when sorting tables. diff --git a/python/tskit/tables.py b/python/tskit/tables.py index defa4483ae..cae0760048 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2686,6 +2686,14 @@ def sort(self, edge_start=0): currently rearrange tables so that mutations occur after their mutation parents, which is a requirement for valid tree sequences. + Migrations are sorted by ``time``, ``source``, ``dest``, ``left`` and + ``node`` values. This defines a total sort order, such that any permutation + of a valid migration table will be sorted into the same output order. + Note that this sorting order exceeds the + :ref:`migration sorting requirements ` for a + valid tree sequence, which only requires that migrations are sorted by + time value. + :param int edge_start: The index in the edge table where sorting starts (default=0; must be <= len(edges)). """