Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -10155,6 +10155,80 @@ test_table_collection_clear(void)
| TSK_CLEAR_TS_METADATA_AND_SCHEMA);
}

static void
test_table_collection_decapitate(void)
{
int ret;
tsk_treeseq_t ts;
tsk_table_collection_t t;

tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
ret = tsk_treeseq_copy_tables(&ts, &t, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts);

/* Add some migrations */
tsk_population_table_add_row(&t.populations, NULL, 0);
tsk_population_table_add_row(&t.populations, NULL, 0);
tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 0, 1, 0.05, NULL, 0);
tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 1, 0, 0.09, NULL, 0);
tsk_migration_table_add_row(&t.migrations, 0, 10, 0, 0, 1, 0.10, NULL, 0);
CU_ASSERT_EQUAL(t.migrations.num_rows, 3);

/* NOTE: haven't worked out the exact IDs on the branches here, just
* for illustration.
0.09┊ 9 5 10 ┊ 9 5 ┊11 5 ┊
┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊
0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊
┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊
0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊
0.00 2.00 7.00 10.00
*/

ret = tsk_table_collection_decapitate(&t, 0.09, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_treeseq_init(&ts, &t, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3);
CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 12);
/* Lost the mutation over 5 */
CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 2);
/* We keep the migration at exactly 0.09. */
CU_ASSERT_EQUAL(tsk_treeseq_get_num_migrations(&ts), 2);

tsk_table_collection_free(&t);
tsk_treeseq_free(&ts);
}

static void
test_table_collection_decapitate_errors(void)
{
int ret;
tsk_treeseq_t ts;
tsk_table_collection_t t;

tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
paper_ex_mutations, paper_ex_individuals, NULL, 0);
ret = tsk_treeseq_copy_tables(&ts, &t, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts);

/* This should be caught later when we try to index */
reverse_edges(&t);
ret = tsk_table_collection_decapitate(&t, 0.09, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGES_NOT_SORTED_CHILD);

/* This should be caught immediately on entry to the function */
t.sequence_length = -1;
ret = tsk_table_collection_decapitate(&t, 0.09, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);

tsk_table_collection_free(&t);
}

static void
test_table_collection_takeset_indexes(void)
{
Expand Down Expand Up @@ -10317,6 +10391,9 @@ main(int argc, char **argv)
test_table_collection_union_middle_merge },
{ "test_table_collection_union_errors", test_table_collection_union_errors },
{ "test_table_collection_clear", test_table_collection_clear },
{ "test_table_collection_decapitate", test_table_collection_decapitate },
{ "test_table_collection_decapitate_errors",
test_table_collection_decapitate_errors },
{ "test_table_collection_takeset_indexes",
test_table_collection_takeset_indexes },
{ NULL, NULL },
Expand Down
127 changes: 127 additions & 0 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -12646,6 +12646,133 @@ tsk_table_collection_union(tsk_table_collection_t *self,
return ret;
}

int
tsk_table_collection_decapitate(
tsk_table_collection_t *self, double time, tsk_flags_t TSK_UNUSED(options))
{
int ret = 0;
tsk_edge_t edge;
tsk_mutation_t mutation;
tsk_migration_t migration;
tsk_edge_table_t edges;
tsk_mutation_table_t mutations;
tsk_migration_table_t migrations;
const double *restrict node_time = self->nodes.time;
tsk_id_t j, ret_id;
double mutation_time;

memset(&edges, 0, sizeof(edges));
memset(&mutations, 0, sizeof(mutations));
memset(&migrations, 0, sizeof(migrations));

/* Note: perhaps should be stricter about what we accept here in terms
* of sorting, but compute_mutation_parents below will check anyway and
* so we're safe while we're calling that.
*/
ret = (int) tsk_table_collection_check_integrity(self, 0);
if (ret != 0) {
goto out;
}

ret = tsk_edge_table_copy(&self->edges, &edges, 0);
if (ret != 0) {
goto out;
}
/* Note: we are assuming below that the tables are sorted, so we could save
* a bit of time and memory here by truncating part-way through the
* edges. */
ret = tsk_edge_table_clear(&self->edges);
if (ret != 0) {
goto out;
}
for (j = 0; j < (tsk_id_t) edges.num_rows; j++) {
tsk_edge_table_get_row_unsafe(&edges, j, &edge);
if (node_time[edge.child] < time) {
if (time < node_time[edge.parent]) {
ret_id = tsk_node_table_add_row(
&self->nodes, 0, time, TSK_NULL, TSK_NULL, NULL, 0);
if (ret_id < 0) {
ret = (int) ret_id;
goto out;
}
edge.parent = ret_id;
}
ret_id = tsk_edge_table_add_row(&self->edges, edge.left, edge.right,
edge.parent, edge.child, edge.metadata, edge.metadata_length);
if (ret_id < 0) {
ret = (int) ret_id;
goto out;
}
}
}
/* Calling x_table_free multiple times is safe, so get rid of the
* extra edge table memory as soon as we can. */
tsk_edge_table_free(&edges);

ret = tsk_mutation_table_copy(&self->mutations, &mutations, 0);
if (ret != 0) {
goto out;
}
ret = tsk_mutation_table_clear(&self->mutations);
if (ret != 0) {
goto out;
}
for (j = 0; j < (tsk_id_t) mutations.num_rows; j++) {
tsk_mutation_table_get_row_unsafe(&mutations, j, &mutation);
mutation_time = tsk_is_unknown_time(mutation.time) ? node_time[mutation.node]
: mutation.time;
if (mutation_time < time) {
/* Set the mutation parent to NULL, and recalculate below */
ret_id = tsk_mutation_table_add_row(&self->mutations, mutation.site,
mutation.node, TSK_NULL, mutation.time, mutation.derived_state,
mutation.derived_state_length, mutation.metadata,
mutation.metadata_length);
if (ret_id < 0) {
ret = (int) ret_id;
goto out;
}
}
}
tsk_mutation_table_free(&mutations);

ret = tsk_migration_table_copy(&self->migrations, &migrations, 0);
if (ret != 0) {
goto out;
}
ret = tsk_migration_table_clear(&self->migrations);
if (ret != 0) {
goto out;
}
for (j = 0; j < (tsk_id_t) migrations.num_rows; j++) {
tsk_migration_table_get_row_unsafe(&migrations, j, &migration);
if (migration.time <= time) {
ret_id = tsk_migration_table_add_row(&self->migrations, migration.left,
migration.right, migration.node, migration.source, migration.dest,
migration.time, migration.metadata, migration.metadata_length);
if (ret_id < 0) {
ret = (int) ret_id;
goto out;
}
}
}
tsk_migration_table_free(&migrations);

ret = tsk_table_collection_build_index(self, 0);
if (ret != 0) {
goto out;
}
ret = tsk_table_collection_compute_mutation_parents(self, 0);
if (ret != 0) {
goto out;
}

out:
tsk_edge_table_free(&edges);
tsk_mutation_table_free(&mutations);
tsk_migration_table_free(&migrations);
return ret;
}

static int
cmp_edge_cl(const void *a, const void *b)
{
Expand Down
5 changes: 5 additions & 0 deletions c/tskit/tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -4247,6 +4247,11 @@ int tsk_table_collection_compute_mutation_parents(
int tsk_table_collection_compute_mutation_times(
tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options));

/* Not documenting this because we may want to pass through default values
* for the new nodes (in particular the metadata) in the future */
int tsk_table_collection_decapitate(
tsk_table_collection_t *self, double time, tsk_flags_t options);

int tsk_reference_sequence_init(tsk_reference_sequence_t *self, tsk_flags_t options);
int tsk_reference_sequence_free(tsk_reference_sequence_t *self);
bool tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self);
Expand Down
1 change: 1 addition & 0 deletions docs/python-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ which perform the same actions but modify the {class}`TableCollection` in place.
TreeSequence.delete_intervals
TreeSequence.delete_sites
TreeSequence.trim
TreeSequence.decapitate
```

(sec_python_api_tree_sequences_ibd)=
Expand Down
7 changes: 6 additions & 1 deletion python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

**Changes**

- ``VcfWriter.write`` now prints the site ID of variants in the ID field of the output VCF files.
- Add ``TableCollection.decapitate`` and ``TreeSequence.decapitate`` operations
to remove information from that data model that is older than a specific time.
(:user:`jeromekelleher`, :issue:`2236`, :pr:`2240`)

- ``VcfWriter.write`` now prints the site ID of variants in the ID field of the
output VCF files.
(:user:`roohy`, :issue:`2103`, :pr:`2107`)

- Make dumping of tables and tree sequences to disk a zero-copy operation.
Expand Down
27 changes: 27 additions & 0 deletions python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -7004,6 +7004,29 @@ TableCollection_compute_mutation_parents(TableCollection *self)
return ret;
}

static PyObject *
TableCollection_decapitate(TableCollection *self, PyObject *args)
{
PyObject *ret = NULL;
int err;
double time;

if (TableCollection_check_state(self) != 0) {
goto out;
}
if (!PyArg_ParseTuple(args, "d", &time)) {
goto out;
}
err = tsk_table_collection_decapitate(self->tables, time, 0);
if (err != 0) {
handle_library_error(err);
goto out;
}
ret = Py_BuildValue("");
out:
return ret;
}

static PyObject *
TableCollection_compute_mutation_times(TableCollection *self)
{
Expand Down Expand Up @@ -7489,6 +7512,10 @@ static PyMethodDef TableCollection_methods[] = {
.ml_flags = METH_VARARGS | METH_KEYWORDS,
.ml_doc
= "Returns True if the parameter table collection is equal to this one." },
{ .ml_name = "decapitate",
.ml_meth = (PyCFunction) TableCollection_decapitate,
.ml_flags = METH_VARARGS,
.ml_doc = "Removes information older than the specified time." },
{ .ml_name = "compute_mutation_parents",
.ml_meth = (PyCFunction) TableCollection_compute_mutation_parents,
.ml_flags = METH_NOARGS,
Expand Down
2 changes: 1 addition & 1 deletion python/tests/test_drawing.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def get_multiroot_tree(self):

def get_mutations_over_roots_tree(self):
ts = msprime.simulate(15, random_seed=1)
ts = tsutil.decapitate(ts, 20)
ts = ts.decapitate(ts.tables.nodes.time[-1] / 2)
tables = ts.dump_tables()
delta = 1.0 / (ts.num_nodes + 1)
x = 0
Expand Down
6 changes: 4 additions & 2 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1702,7 +1702,8 @@ def test_non_ascii_missing_data_char(self, missing_data_char):
class TestAlignmentExamples:
@pytest.mark.parametrize("ts", get_example_discrete_genome_tree_sequences())
def test_defaults(self, ts):
if any(tree.num_roots > 1 for tree in ts.trees()):
has_missing_data = np.any(ts.genotype_matrix() == -1)
if has_missing_data:
with pytest.raises(ValueError, match="1896"):
list(ts.alignments())
else:
Expand All @@ -1720,7 +1721,8 @@ def test_defaults(self, ts):
@pytest.mark.parametrize("ts", get_example_discrete_genome_tree_sequences())
def test_reference_sequence(self, ts):
ref = tskit.random_nucleotides(ts.sequence_length, seed=1234)
if any(tree.num_roots > 1 for tree in ts.trees()):
has_missing_data = np.any(ts.genotype_matrix() == -1)
if has_missing_data:
with pytest.raises(ValueError, match="1896"):
list(ts.alignments(reference_sequence=ref))
else:
Expand Down
6 changes: 3 additions & 3 deletions python/tests/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,11 +266,11 @@ def get_decapitated_examples():
Returns example tree sequences in which the oldest edges have been removed.
"""
ts = msprime.simulate(10, random_seed=1234)
yield tsutil.decapitate(ts, ts.num_edges // 2)
yield ts.decapitate(ts.tables.nodes.time[-1] / 2)

ts = msprime.simulate(20, recombination_rate=1, random_seed=1234)
assert ts.num_trees > 2
yield tsutil.decapitate(ts, ts.num_edges // 4)
yield ts.decapitate(ts.tables.nodes.time[-1] / 4)


def get_example_tree_sequences(back_mutations=True, gaps=True, internal_samples=True):
Expand Down Expand Up @@ -3622,7 +3622,7 @@ def test_copy_tracked_samples(self):

def test_copy_multiple_roots(self):
ts = msprime.simulate(20, recombination_rate=2, length=3, random_seed=42)
ts = tsutil.decapitate(ts, ts.num_edges // 2)
ts = ts.decapitate(np.max(ts.tables.nodes.time) / 2)
for root_threshold in [1, 2, 100]:
tree = tskit.Tree(ts, root_threshold=root_threshold)
copy = tree.copy()
Expand Down
13 changes: 13 additions & 0 deletions python/tests/test_lowlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,19 @@ def test_union_bad_args(self):
with pytest.raises(ValueError):
tc.union(tc2, np.array([[1], [2]], dtype="int32"))

def test_decapitate_bad_args(self):
tc = _tskit.TableCollection(1)
self.get_example_tree_sequence().dump_tables(tc)
with pytest.raises(TypeError):
tc.decapitate()
with pytest.raises(TypeError):
tc.decapitate("1234")

def test_decapitate_error(self):
tc = _tskit.TableCollection(-1)
with pytest.raises(_tskit.LibraryError, match="Sequence length"):
tc.decapitate(0)

def test_equals_bad_args(self):
ts = msprime.simulate(10, random_seed=1242)
tc = ts.tables._ll_tables
Expand Down
Loading