From be18b7e511d07539d9be6dc283925f594eaf7394 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 24 May 2022 12:58:29 +0100 Subject: [PATCH] Implement delete_older operation --- c/tests/test_tables.c | 54 +++++ c/tskit/tables.c | 110 ++++++++++ c/tskit/tables.h | 4 +- python/_tskitmodule.c | 27 +++ python/tests/test_lowlevel.py | 8 + python/tests/test_table_transforms.py | 285 ++++++++++++++++++++++++++ python/tskit/tables.py | 24 +++ 7 files changed, 511 insertions(+), 1 deletion(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 310caf23a3..8da440ed33 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -10345,6 +10345,59 @@ test_table_collection_takeset_indexes(void) tsk_treeseq_free(&ts); } +static void +test_table_collection_delete_older(void) +{ + int ret; + tsk_treeseq_t ts; + tsk_table_collection_t t; + + const char *mutations = "0 2 1 -1\n" + "0 2 0 0\n" + "1 0 1 -1\n" + "2 5 1 -1\n"; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + mutations, paper_ex_individuals, NULL, 0); + ret = tsk_treeseq_copy_tables(&ts, &t, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); + + /* Add some migrations */ + tsk_population_table_add_row(&t.populations, NULL, 0); + tsk_population_table_add_row(&t.populations, NULL, 0); + tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 0, 1, 0.05, NULL, 0); + tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 1, 0, 0.09, NULL, 0); + tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 0, 1, 0.10, NULL, 0); + CU_ASSERT_EQUAL(t.migrations.num_rows, 3); + + /* Note: trees 1 and 2 are identical now + * + 0.09┊ 5 ┊ 5 ┊ 5 ┊ + ┊ ┏┻┓ ┊ ┏━┻┓ ┊ ┏━┻┓ ┊ + 0.07┊ ┃ ┃ ┊ ┃ 4 ┊ ┃ 4 ┊ + ┊ ┃ ┃ ┊ ┃ ┏┻┓ ┊ ┃ ┏┻┓ ┊ + 0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ + 0.00 2.00 7.00 10.00 + */ + + ret = tsk_table_collection_delete_older(&t, 0.09, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_treeseq_init(&ts, &t, TSK_TS_INIT_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 2); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 9); + /* Lost the mutation over 5 */ + CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 3); + /* We delete the migration at exactly 0.09. */ + CU_ASSERT_EQUAL(tsk_treeseq_get_num_migrations(&ts), 1); + + tsk_table_collection_free(&t); + tsk_treeseq_free(&ts); +} + int main(int argc, char **argv) { @@ -10466,6 +10519,7 @@ main(int argc, char **argv) { "test_table_collection_clear", test_table_collection_clear }, { "test_table_collection_takeset_indexes", test_table_collection_takeset_indexes }, + { "test_table_collection_delete_older", test_table_collection_delete_older }, { NULL, NULL }, }; diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 5bd62819e4..b650b478bc 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -12008,6 +12008,116 @@ tsk_table_collection_compute_mutation_times( return ret; } +int TSK_WARN_UNUSED +tsk_table_collection_delete_older( + tsk_table_collection_t *self, double time, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + tsk_edge_t edge; + tsk_mutation_t mutation; + tsk_migration_t migration; + tsk_edge_table_t edges; + tsk_mutation_table_t mutations; + tsk_migration_table_t migrations; + const double *restrict node_time = self->nodes.time; + tsk_id_t j, ret_id, parent; + double mutation_time; + tsk_id_t *mutation_map = NULL; + + memset(&edges, 0, sizeof(edges)); + memset(&mutations, 0, sizeof(mutations)); + memset(&migrations, 0, sizeof(migrations)); + + ret = tsk_edge_table_copy(&self->edges, &edges, 0); + if (ret != 0) { + goto out; + } + ret = tsk_edge_table_clear(&self->edges); + if (ret != 0) { + goto out; + } + for (j = 0; j < (tsk_id_t) edges.num_rows; j++) { + tsk_edge_table_get_row_unsafe(&edges, j, &edge); + if (node_time[edge.parent] <= time) { + ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, + edge.parent, edge.child, edge.metadata, edge.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + } + } + /* Calling x_table_free multiple times is safe, so get rid of the + * extra edge table memory as soon as we can. */ + tsk_edge_table_free(&edges); + + mutation_map = tsk_malloc(self->mutations.num_rows * sizeof(*mutation_map)); + if (mutation_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_mutation_table_copy(&self->mutations, &mutations, 0); + if (ret != 0) { + goto out; + } + ret = tsk_mutation_table_clear(&self->mutations); + if (ret != 0) { + goto out; + } + for (j = 0; j < (tsk_id_t) mutations.num_rows; j++) { + tsk_mutation_table_get_row_unsafe(&mutations, j, &mutation); + mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node] + : mutation.time; + mutation_map[j] = TSK_NULL; + if (mutation_time < time) { + ret_id = tsk_mutation_table_add_row(&self->mutations, mutation.site, + mutation.node, mutation.parent, mutation.time, mutation.derived_state, + mutation.derived_state_length, mutation.metadata, + mutation.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + mutation_map[j] = ret_id; + } + } + tsk_mutation_table_free(&mutations); + for (j = 0; j < (tsk_id_t) self->mutations.num_rows; j++) { + parent = self->mutations.parent[j]; + if (parent != TSK_NULL) { + self->mutations.parent[j] = mutation_map[parent]; + } + } + + ret = tsk_migration_table_copy(&self->migrations, &migrations, 0); + if (ret != 0) { + goto out; + } + ret = tsk_migration_table_clear(&self->migrations); + if (ret != 0) { + goto out; + } + for (j = 0; j < (tsk_id_t) migrations.num_rows; j++) { + tsk_migration_table_get_row_unsafe(&migrations, j, &migration); + if (migration.time < time) { + ret_id = tsk_migration_table_add_row(&self->migrations, migration.left, + migration.right, migration.node, migration.source, migration.dest, + migration.time, migration.metadata, migration.metadata_length); + if (ret_id < 0) { + ret = (int) ret_id; + goto out; + } + } + } + tsk_migration_table_free(&migrations); +out: + tsk_edge_table_free(&edges); + tsk_mutation_table_free(&mutations); + tsk_migration_table_free(&migrations); + tsk_safe_free(mutation_map); + return ret; +} + int tsk_table_collection_record_num_rows( const tsk_table_collection_t *self, tsk_bookmark_t *position) diff --git a/c/tskit/tables.h b/c/tskit/tables.h index ec14e90be3..bab3546220 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -4217,7 +4217,9 @@ int tsk_table_collection_deduplicate_sites( int tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options); int tsk_table_collection_compute_mutation_times( - tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)); + tsk_table_collection_t *self, double *random, tsk_flags_t options); +int tsk_table_collection_delete_older( + tsk_table_collection_t *self, double time, tsk_flags_t options); int tsk_table_collection_set_indexes(tsk_table_collection_t *self, tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order); diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 5d8caa7c10..9d8477db71 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -7023,6 +7023,29 @@ TableCollection_canonicalise(TableCollection *self, PyObject *args, PyObject *kw return ret; } +static PyObject * +TableCollection_delete_older(TableCollection *self, PyObject *args) +{ + PyObject *ret = NULL; + int err; + double time; + + if (TableCollection_check_state(self) != 0) { + goto out; + } + if (!PyArg_ParseTuple(args, "d", &time)) { + goto out; + } + err = tsk_table_collection_delete_older(self->tables, time, 0); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue(""); +out: + return ret; +} + static PyObject * TableCollection_compute_mutation_parents(TableCollection *self) { @@ -7527,6 +7550,10 @@ static PyMethodDef TableCollection_methods[] = { .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Returns True if the parameter table collection is equal to this one." }, + { .ml_name = "delete_older", + .ml_meth = (PyCFunction) TableCollection_delete_older, + .ml_flags = METH_VARARGS, + .ml_doc = "Delete edges, mutations and migrations older than this time" }, { .ml_name = "compute_mutation_parents", .ml_meth = (PyCFunction) TableCollection_compute_mutation_parents, .ml_flags = METH_NOARGS, diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 75434aa0b8..2646e0cc24 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -469,6 +469,14 @@ def test_sort_individuals(self): tc = _tskit.TableCollection(1) tc.sort_individuals() + def test_delete_older_bad_args(self): + tc = _tskit.TableCollection(1) + self.get_example_tree_sequence().dump_tables(tc) + with pytest.raises(TypeError): + tc.delete_older() + with pytest.raises(TypeError): + tc.delete_older("1234") + class TestIbd: def test_uninitialised(self): diff --git a/python/tests/test_table_transforms.py b/python/tests/test_table_transforms.py index a7c1e5c319..06f941271e 100644 --- a/python/tests/test_table_transforms.py +++ b/python/tests/test_table_transforms.py @@ -29,12 +29,297 @@ import tests import tskit +import tskit.util as util from tests.test_highlevel import get_example_tree_sequences # ↑ See https://github.com/tskit-dev/tskit/issues/1804 for when # we can remove this. +def delete_older_definition(tables, time): + node_time = tables.nodes.time + edges = tables.edges.copy() + tables.edges.clear() + for edge in edges: + if node_time[edge.parent] <= time: + tables.edges.append(edge) + + mutations = tables.mutations.copy() + # Map of old ID -> new ID + mutation_map = np.full(len(mutations), tskit.NULL, dtype=int) + tables.mutations.clear() + keep = [] + for j, mutation in enumerate(mutations): + mutation_time = ( + node_time[mutation.node] + if util.is_unknown_time(mutation.time) + else mutation.time + ) + if mutation_time < time: + mutation_map[j] = len(keep) + keep.append(mutation) + # Not making assumptions about ordering, so do it in two passes. + for mutation in keep: + if mutation.parent != tskit.NULL: + mutation = mutation.replace(parent=mutation_map[mutation.parent]) + tables.mutations.append(mutation) + + migrations = tables.migrations.copy() + tables.migrations.clear() + for migration in migrations: + if migration.time < time: + tables.migrations.append(migration) + + +class TestDeleteOlderExamples: + @pytest.mark.parametrize("ts", get_example_tree_sequences()) + def test_definition(self, ts): + time = 0 if ts.num_nodes == 0 else np.median(ts.tables.nodes.time) + tables1 = ts.dump_tables() + delete_older_definition(tables1, time) + tables2 = ts.dump_tables() + tables2.delete_older(time) + tables1.assert_equals(tables2, ignore_provenance=True) + + @pytest.mark.parametrize("ts", get_example_tree_sequences()) + def test_mutation_parents(self, ts): + time = 0 if ts.num_nodes == 0 else np.median(ts.tables.nodes.time) + tables1 = ts.dump_tables() + tables1.delete_older(time) + tables2 = tables1.copy() + tables2.build_index() + tables2.compute_mutation_parents() + tables1.assert_equals(tables2, ignore_provenance=True) + + +class TestDeleteOlderSimpleTree: + + # 2.00┊ 4 ┊ + # ┊ ┏━┻┓ ┊ + # 1.00┊ ┃ 3 ┊ + # ┊ ┃ ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + def tables(self): + # Don't cache this because we modify the result! + tree = tskit.Tree.generate_balanced(3, branch_length=1) + return tree.tree_sequence.dump_tables() + + @pytest.mark.parametrize("time", [0, -0.5, -100, 0.01, 0.999]) + def test_before_first_internal_node(self, time): + tables = self.tables() + before = tables.copy() + tables.delete_older(time) + ts = tables.tree_sequence() + assert ts.num_trees == 1 + tree = ts.first() + assert tree.num_roots == 3 + assert list(sorted(tree.roots)) == [0, 1, 2] + assert before.nodes.equals(tables.nodes[: len(before.nodes)]) + assert len(tables.edges) == 0 + + @pytest.mark.parametrize("time", [1, 1.01, 1.5, 1.999]) + def test_t1_to_2(self, time): + # + # 2.00┊ ┊ + # ┊ ┊ + # 1.00┊ 3 ┊ + # ┊ ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tables = self.tables() + before = tables.copy() + tables.delete_older(time) + ts = tables.tree_sequence() + assert ts.num_trees == 1 + tree = ts.first() + assert tree.num_roots == 2 + assert list(sorted(tree.roots)) == [0, 3] + assert len(tables.nodes) == 5 + assert before.nodes.equals(tables.nodes) + + @pytest.mark.parametrize("time", [2, 2.5, 1e9]) + def test_t2(self, time): + tables = self.tables() + before = tables.copy() + tables.delete_older(time) + tables.assert_equals(before, ignore_provenance=True) + + +class TestDeleteOlderSimpleTreeMutationExamples: + def test_single_mutation_no_time(self): + # 2.00┊ 4 ┊ + # ┊ ┏━┻┓ ┊ + # 1.00┊ ┃ 3 ┊ + # ┊ x ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tree = tskit.Tree.generate_balanced(3, branch_length=1) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0, "A") + tables.mutations.add_row(site=0, node=0, derived_state="T", metadata=b"1234") + + tables.delete_older(1) + # 2.00┊ ┊ + # ┊ ┊ + # 1.00┊ 3 ┊ + # ┊ x ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + assert len(tables.nodes) == 5 + mut = tables.mutations[0] + assert mut.node == 0 + assert mut.derived_state == "T" + assert mut.metadata == b"1234" + assert tskit.is_unknown_time(mut.time) + + def test_single_mutation_before_time(self): + # 2.00┊ 4 ┊ + # ┊ x━┻┓ ┊ + # 1.00┊ ┃ 3 ┊ + # ┊ ┃ ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tree = tskit.Tree.generate_balanced(3, branch_length=1) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0, "A") + tables.mutations.add_row( + site=0, node=0, time=1.5, derived_state="T", metadata=b"1234" + ) + tables.delete_older(1) + # 2.00┊ ┊ + # ┊ ┊ + # 1.00┊ 3 ┊ + # ┊ ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + assert len(tables.nodes) == 5 + assert len(tables.mutations) == 0 + + def test_single_mutation_at_time(self): + # 2.00┊ 4 ┊ + # ┊ ┏━┻┓ ┊ + # 1.00┊ x 3 ┊ + # ┊ ┃ ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tree = tskit.Tree.generate_balanced(3, branch_length=1) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0, "A") + tables.mutations.add_row( + site=0, node=0, time=1, derived_state="T", metadata=b"1234" + ) + + tables.delete_older(1) + # 2.00┊ ┊ + # ┊ ┊ + # 1.00┊ 3 ┊ + # ┊ ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + assert len(tables.nodes) == 5 + assert len(tables.mutations) == 0 + + def test_multi_mutation_no_time(self): + # 2.00┊ 4 ┊ + # ┊ ┏━┻┓ ┊ + # 1.00┊ x 3 ┊ + # ┊ x ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tree = tskit.Tree.generate_balanced(3, branch_length=1) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0, "A") + tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=0, node=0, parent=0, derived_state="G") + before = tables.copy() + + tables.delete_older(1) + # 2.00┊ 4 ┊ + # ┊ ┊ + # ┊ 3 ┊ + # ┊ x ┃ ┊ + # ┊ x ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tables.mutations.assert_equals(before.mutations) + + def test_multi_mutation_out_of_order(self): + # 2.00┊ 4 ┊ + # ┊ ┏━┻┓ ┊ + # 1.00┊ x 3 ┊ + # ┊ x ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tree = tskit.Tree.generate_balanced(3, branch_length=1) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0, "A") + tables.mutations.add_row(site=0, node=0, parent=1, derived_state="G") + tables.mutations.add_row(site=0, node=0, derived_state="T") + before = tables.copy() + with pytest.raises(tskit.LibraryError, match="PARENT_AFTER_CHILD"): + tables.tree_sequence() + + tables.delete_older(1) + # 2.00┊ 4 ┊ + # ┊ ┊ + # ┊ 3 ┊ + # ┊ x ┃ ┊ + # ┊ x ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tables.mutations.assert_equals(before.mutations) + + def test_mutation_not_on_branch(self): + tables = tskit.TableCollection(1) + tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + tables.sites.add_row(0, "A") + tables.mutations.add_row(site=0, node=0, derived_state="T") + before = tables.copy() + tables.delete_older(0.01) + tables.assert_equals(before, ignore_provenance=True) + + +class TestDeleteOlderSimpleTreeMigrationExamples: + @tests.cached_example + def ts(self): + # 2.00┊ 4 ┊ + # ┊ o━┻┓ ┊ + # 1.00┊ o 3 ┊ + # ┊ o ┏┻┓ ┊ + # 0.00┊ 0 1 2 ┊ + # 0 1 + tree = tskit.Tree.generate_balanced(3, branch_length=1) + tables = tree.tree_sequence.dump_tables() + tables.populations.add_row() + tables.populations.add_row() + tables.migrations.add_row(source=0, dest=1, node=0, time=0.5, left=0, right=1) + tables.migrations.add_row(source=1, dest=0, node=0, time=1.0, left=0, right=1) + tables.migrations.add_row(source=0, dest=1, node=0, time=1.5, left=0, right=1) + tables.compute_mutation_parents() + ts = tables.tree_sequence() + return ts + + def test_t099(self): + tables = self.ts().dump_tables() + tables.delete_older(0.99) + assert len(tables.migrations) == 1 + assert tables.migrations[0].time == 0.5 + + def test_t1(self): + tables = self.ts().dump_tables() + tables.delete_older(1) + assert len(tables.migrations) == 1 + assert tables.migrations[0].time == 0.5 + + @pytest.mark.parametrize("time", [1.51, 2.0, 2.5]) + def test_older(self, time): + tables = self.ts().dump_tables() + before = tables.copy() + tables.delete_older(time) + tables.migrations.assert_equals(before.migrations) + + def split_edges_definition(ts, time, *, flags=None, population=None, metadata=None): tables = ts.dump_tables() if ts.num_migrations > 0: diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 9c30d0728a..8ac2e6a1fc 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -3862,6 +3862,30 @@ def trim(self, record_provenance=True): record=json.dumps(provenance.get_provenance_dict(parameters)) ) + def delete_older(self, time): + """ + Deletes edge, mutation and migration information at least as old as + the specified time. + + For the purposes of this method, an edge covers the times from the child node + up until the *parent* node, so that any any edge with parent node time > ``time`` + will be removed. + + Any mutation whose time is >= ``time`` will be removed. A mutation's time + is its associated ``time`` value, or the time of its node if the + mutation's time was marked as unknown (:data:`UNKNOWN_TIME`). + + Any migration with time >= ``time`` will be removed. + + The node table is not affected by this operation. + + .. note:: This method does not have any specific sorting requirements + and will maintain mutation parent mappings. + + :param float time: The cutoff time. + """ + self._ll_tables.delete_older(time) + def clear( self, clear_provenance=False,