From f2225fc2f896954af5600ffab87c4df964314997 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 19 Aug 2020 11:04:22 +0100 Subject: [PATCH 1/5] Add keep_input_roots to simplify. Also updates the internal algorithm to remove ancestry that has been processed. --- appveyor.yml | 1 + c/tests/test_trees.c | 109 ++++++ c/tskit/tables.c | 283 +++++++++++--- c/tskit/tables.h | 5 + python/_tskitmodule.c | 24 +- python/requirements/CI/requirements.txt | 1 + python/requirements/development.txt | 1 + python/tests/simplify.py | 163 ++++++-- python/tests/test_lowlevel.py | 4 + python/tests/test_topology.py | 496 ++++++++++++++++++------ python/tests/tsutil.py | 26 ++ python/tskit/tables.py | 5 + python/tskit/trees.py | 5 + 13 files changed, 901 insertions(+), 222 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index df10a7d887..c6e751acc5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -29,6 +29,7 @@ build_script: - cmd: python -m pip install newick - cmd: python -m pip install python_jsonschema_objects - cmd: python -m pip install xmlunittest + - cmd: python -m pip install portion - cmd: python -m nose -vs --processes=%NUMBER_OF_PROCESSORS% --process-timeout=5000 after_test: diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index c387f06abe..47d772b832 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -3640,6 +3640,7 @@ test_single_tree_iter_depths(void) tsk_tree_free(&tree); tsk_treeseq_free(&ts); } + static void test_single_tree_simplify(void) { @@ -3697,6 +3698,55 @@ test_single_tree_simplify(void) tsk_table_collection_free(&tables); } +static void +test_single_tree_simplify_debug(void) +{ + tsk_treeseq_t ts, simplified; + tsk_id_t samples[] = { 0, 1 }; + int ret; + FILE *save_stdout = stdout; + FILE *tmp = fopen(_tmp_file_name, "w"); + + CU_ASSERT_FATAL(tmp != NULL); + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); + + stdout = tmp; + ret = tsk_treeseq_simplify(&ts, samples, 2, TSK_DEBUG, &simplified, NULL); + stdout = save_stdout; + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(ftell(tmp) > 0); + + fclose(tmp); + tsk_treeseq_free(&ts); + tsk_treeseq_free(&simplified); +} + +static void +test_single_tree_simplify_keep_input_roots(void) +{ + tsk_treeseq_t ts; + tsk_table_collection_t tables; + tsk_id_t samples[] = { 0, 1 }; + int ret; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); + verify_simplify(&ts); + ret = tsk_treeseq_copy_tables(&ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_table_collection_simplify(&tables, samples, 2, TSK_KEEP_INPUT_ROOTS, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(tables.nodes.num_rows, 4); + CU_ASSERT_EQUAL(tables.edges.num_rows, 3); + CU_ASSERT_EQUAL(tables.sites.num_rows, 3); + CU_ASSERT_EQUAL(tables.mutations.num_rows, 4); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + static void test_single_tree_simplify_no_sample_nodes(void) { @@ -4283,6 +4333,60 @@ test_nonbinary_multi_tree(void) tsk_treeseq_free(&ts); } +static void +test_simplify_keep_input_roots_multi_tree(void) +{ + + /* + 0.25┊ 8 ┊ ┊ ┊ + ┊ ┏━┻━┓ ┊ ┊ ┊ + 0.20┊ ┃ ┃ ┊ ┊ 7 ┊ + ┊ ┃ ┃ ┊ ┊ ┏━┻━┓ ┊ + 0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ + ┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ + 0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊ + ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊ + 0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊ + ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊ + 0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ + 0.00 2.00 7.00 10.00 + + Simplifies to + + 0.25┊ 4 ┊ ┊ ┊ + ┊ ┃ ┊ ┊ ┊ + 0.20┊ ┃ ┊ ┊ 3 ┊ + ┊ ┃ ┊ ┊ ┏┻┓ ┊ + 0.17┊ 2 ┊ 2 ┊ ┃ ┃ ┊ + ┊ ┏┻┓ ┊ ┏┻┓ ┊ ┃ ┃ ┊ + 0.00┊ 0 1 ┊ 0 1 ┊ 0 1 ┊ + 0.00 2.00 7.00 10.00 + + */ + int ret = 0; + // clang-format off + tsk_id_t parents[] = { + 2, 2, 4, -1, -1, + 2, 2, -1, -1, -1, + 3, 3, -1, -1, -1, + }; + // clang-format on + uint32_t num_trees = 3; + + tsk_id_t samples[] = { 0, 3 }; + tsk_treeseq_t ts, simplified; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL, 0); + tsk_treeseq_dump(&ts, "tmp.trees", 0); + ret = tsk_treeseq_simplify(&ts, samples, 2, TSK_KEEP_INPUT_ROOTS, &simplified, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + verify_trees(&simplified, num_trees, parents); + + tsk_treeseq_free(&ts); + tsk_treeseq_free(&simplified); +} + static void test_left_to_right_multi_tree(void) { @@ -5839,6 +5943,9 @@ main(int argc, char **argv) { "test_single_tree_iter_times", test_single_tree_iter_times }, { "test_single_tree_iter_depths", test_single_tree_iter_depths }, { "test_single_tree_simplify", test_single_tree_simplify }, + { "test_single_tree_simplify_debug", test_single_tree_simplify_debug }, + { "test_single_tree_simplify_keep_input_roots", + test_single_tree_simplify_keep_input_roots }, { "test_single_tree_simplify_no_sample_nodes", test_single_tree_simplify_no_sample_nodes }, { "test_single_tree_simplify_null_samples", @@ -5859,6 +5966,8 @@ main(int argc, char **argv) { "test_internal_sample_multi_tree", test_internal_sample_multi_tree }, { "test_internal_sample_simplified_multi_tree", test_internal_sample_simplified_multi_tree }, + { "test_simplify_keep_input_roots_multi_tree", + test_simplify_keep_input_roots_multi_tree }, { "test_left_to_right_multi_tree", test_left_to_right_multi_tree }, { "test_gappy_multi_tree", test_gappy_multi_tree }, { "test_tsk_treeseq_bad_records", test_tsk_treeseq_bad_records }, diff --git a/c/tskit/tables.c b/c/tskit/tables.c index c697e32e09..4a6f26084f 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4547,6 +4547,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) { int ret = 0; tsk_size_t edge_start = 0; + bool skip_sites = false; if (start != NULL) { if (start->edges > self->tables->edges.num_rows) { @@ -4555,10 +4556,17 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) } edge_start = start->edges; - /* For now, if migrations, sites or mutations are non-zero we get an error. - * This should be fixed: https://github.com/tskit-dev/tskit/issues/101 - */ - if (start->migrations != 0 || start->sites != 0 || start->mutations != 0) { + if (start->migrations != 0) { + ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; + goto out; + } + /* We only allow sites and mutations to be specified as a way to + * skip sorting them entirely. Both sites and mutations must be + * equal to the number of rows */ + if (start->sites == self->tables->sites.num_rows + && start->mutations == self->tables->mutations.num_rows) { + skip_sites = true; + } else { ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; goto out; } @@ -4574,13 +4582,15 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) goto out; } } - ret = tsk_table_sorter_sort_sites(self); - if (ret != 0) { - goto out; - } - ret = tsk_table_sorter_sort_mutations(self); - if (ret != 0) { - goto out; + if (!skip_sites) { + ret = tsk_table_sorter_sort_sites(self); + if (ret != 0) { + goto out; + } + ret = tsk_table_sorter_sort_mutations(self); + if (ret != 0) { + goto out; + } } out: return ret; @@ -4697,6 +4707,7 @@ typedef struct { /* When reducing topology, we need a map positions to their corresponding * sites.*/ double *position_lookup; + int64_t edge_sort_offset; } simplifier_t; static int @@ -5434,11 +5445,13 @@ simplifier_print_state(simplifier_t *self, FILE *out) fprintf(out, "--simplifier state--\n"); fprintf(out, "options:\n"); - fprintf( - out, "\tfilter_unreferenced_sites: %d\n", !!(self->options & TSK_FILTER_SITES)); - fprintf(out, "\treduce_to_site_topology : %d\n", + fprintf(out, "\tfilter_unreferenced_sites : %d\n", + !!(self->options & TSK_FILTER_SITES)); + fprintf(out, "\treduce_to_site_topology : %d\n", !!(self->options & TSK_REDUCE_TO_SITE_TOPOLOGY)); - fprintf(out, "\tkeep_unary : %d\n", !!(self->options & TSK_KEEP_UNARY)); + fprintf(out, "\tkeep_unary : %d\n", !!(self->options & TSK_KEEP_UNARY)); + fprintf(out, "\tkeep_input_roots : %d\n", + !!(self->options & TSK_KEEP_INPUT_ROOTS)); fprintf(out, "===\nInput tables\n==\n"); tsk_table_collection_print_state(&self->input_tables, out); @@ -5740,6 +5753,25 @@ simplifier_init_sites(simplifier_t *self) return ret; } +static void +simplifier_map_mutations( + simplifier_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) +{ + mutation_id_list_t *m_node; + double position; + tsk_id_t site; + + m_node = self->node_mutation_list_map_head[input_id]; + while (m_node != NULL) { + site = self->input_tables.mutations.site[m_node->mutation]; + position = self->input_tables.sites.position[site]; + if (left <= position && position < right) { + self->mutation_node_map[m_node->mutation] = output_id; + } + m_node = m_node->next; + } +} + static int TSK_WARN_UNUSED simplifier_add_ancestry( simplifier_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) @@ -5770,6 +5802,7 @@ simplifier_add_ancestry( self->ancestor_map_tail[input_id] = x; } } + simplifier_map_mutations(self, input_id, left, right, output_id); out: return ret; } @@ -5881,11 +5914,11 @@ simplifier_init(simplifier_t *self, tsk_id_t *samples, size_t num_samples, } memset( self->node_id_map, 0xff, self->input_tables.nodes.num_rows * sizeof(tsk_id_t)); - ret = simplifier_init_samples(self, samples); + ret = simplifier_init_sites(self); if (ret != 0) { goto out; } - ret = simplifier_init_sites(self); + ret = simplifier_init_samples(self, samples); if (ret != 0) { goto out; } @@ -5895,6 +5928,7 @@ simplifier_init(simplifier_t *self, tsk_id_t *samples, size_t num_samples, goto out; } } + self->edge_sort_offset = TSK_NULL; out: return ret; } @@ -6058,13 +6092,92 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) return ret; } +/* Extract the ancestry for the specified input node over the specified + * interval and queue it up for merging. + */ +static int TSK_WARN_UNUSED +simplifier_extract_ancestry( + simplifier_t *self, double left, double right, tsk_id_t input_id) +{ + int ret = 0; + tsk_segment_t *x = self->ancestor_map_head[input_id]; + tsk_segment_t y; /* y is the segment that has been removed */ + tsk_segment_t *x_head, *x_tail, *x_prev, *seg_left, *seg_right; + + x_head = NULL; + x_prev = NULL; + while (x != NULL) { + if (x->right > left && right > x->left) { + y.left = TSK_MAX(x->left, left); + y.right = TSK_MIN(x->right, right); + y.node = x->node; + ret = simplifier_enqueue_segment(self, y.left, y.right, y.node); + if (ret != 0) { + goto out; + } + seg_left = NULL; + seg_right = NULL; + if (x->left != y.left) { + seg_left = simplifier_alloc_segment(self, x->left, y.left, x->node); + if (seg_left == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + if (x_prev == NULL) { + x_head = seg_left; + } else { + x_prev->next = seg_left; + } + x_prev = seg_left; + } + if (x->right != y.right) { + x->left = y.right; + seg_right = x; + } else { + seg_right = x->next; + // TODO free x + } + if (x_prev == NULL) { + x_head = seg_right; + } else { + x_prev->next = seg_right; + } + x = seg_right; + } else { + if (x_prev == NULL) { + x_head = x; + } + x_prev = x; + x = x->next; + } + } + + x = x_head; + x_tail = x_head; + while (x != NULL) { + x_tail = x; + if (x->next != NULL) { + assert(x->right <= x->next->left); + if (x->next->left == x->right && x->node == x->next->node) { + // Squash out (and free) the x.next segment. + x->right = x->next->right; + x->next = x->next->next; + } + } + x = x->next; + } + self->ancestor_map_head[input_id] = x_head; + self->ancestor_map_tail[input_id] = x_tail; +out: + return ret; +} + static int TSK_WARN_UNUSED simplifier_process_parent_edges( simplifier_t *self, tsk_id_t parent, size_t start, size_t end) { int ret = 0; size_t j; - tsk_segment_t *x; const tsk_edge_table_t *input_edges = &self->input_tables.edges; tsk_id_t child; double left, right; @@ -6076,15 +6189,9 @@ simplifier_process_parent_edges( child = input_edges->child[j]; left = input_edges->left[j]; right = input_edges->right[j]; - // printf("C: %i, L: %f, R: %f\n", child, left, right); - for (x = self->ancestor_map_head[child]; x != NULL; x = x->next) { - if (x->right > left && right > x->left) { - ret = simplifier_enqueue_segment( - self, TSK_MAX(x->left, left), TSK_MIN(x->right, right), x->node); - if (ret != 0) { - goto out; - } - } + ret = simplifier_extract_ancestry(self, left, right, child); + if (ret != 0) { + goto out; } } /* We can now merge the ancestral segments for the parent */ @@ -6096,38 +6203,6 @@ simplifier_process_parent_edges( return ret; } -static int TSK_WARN_UNUSED -simplifier_map_mutation_nodes(simplifier_t *self) -{ - int ret = 0; - tsk_segment_t *seg; - mutation_id_list_t *m_node; - size_t input_node; - tsk_id_t site; - double position; - - for (input_node = 0; input_node < self->input_tables.nodes.num_rows; input_node++) { - seg = self->ancestor_map_head[input_node]; - m_node = self->node_mutation_list_map_head[input_node]; - /* Co-iterate over the segments and mutations; mutations must be listed - * in increasing order of site position */ - while (seg != NULL && m_node != NULL) { - site = self->input_tables.mutations.site[m_node->mutation]; - position = self->input_tables.sites.position[site]; - if (seg->left <= position && position < seg->right) { - self->mutation_node_map[m_node->mutation] = seg->node; - m_node = m_node->next; - } else if (position >= seg->right) { - seg = seg->next; - } else { - assert(position < seg->left); - m_node = m_node->next; - } - } - } - return ret; -} - static int TSK_WARN_UNUSED simplifier_output_sites(simplifier_t *self) { @@ -6339,6 +6414,81 @@ simplifier_finalise_references(simplifier_t *self) return ret; } +static void +simplifier_set_edge_sort_offset(simplifier_t *self, double youngest_root_time) +{ + const tsk_edge_table_t edges = self->tables->edges; + const double *node_time = self->tables->nodes.time; + tsk_size_t offset; + + for (offset = 0; offset < edges.num_rows; offset++) { + if (node_time[edges.parent[offset]] >= youngest_root_time) { + break; + } + } + self->edge_sort_offset = offset; +} + +static int TSK_WARN_UNUSED +simplifier_sort_edges(simplifier_t *self) +{ + /* designated initialisers are guaranteed to set any missing fields to + * zero, so we don't need to set the rest of them. */ + tsk_bookmark_t bookmark = { + .edges = (tsk_size_t) self->edge_sort_offset, + .sites = self->tables->sites.num_rows, + .mutations = self->tables->mutations.num_rows, + }; + assert(self->edge_sort_offset >= 0); + return tsk_table_collection_sort(self->tables, &bookmark, 0); +} + +static int TSK_WARN_UNUSED +simplifier_insert_input_roots(simplifier_t *self) +{ + int ret = 0; + tsk_id_t input_id, output_id; + tsk_segment_t *x; + size_t num_flushed_edges; + double youngest_root_time = DBL_MAX; + const double *node_time = self->tables->nodes.time; + + for (input_id = 0; input_id < (tsk_id_t) self->input_tables.nodes.num_rows; + input_id++) { + x = self->ancestor_map_head[input_id]; + if (x != NULL) { + output_id = self->node_id_map[input_id]; + if (output_id == TSK_NULL) { + ret = simplifier_record_node(self, input_id, false); + if (ret < 0) { + goto out; + } + output_id = ret; + } + youngest_root_time = TSK_MIN(youngest_root_time, node_time[output_id]); + while (x != NULL) { + if (x->node != output_id) { + ret = simplifier_record_edge(self, x->left, x->right, x->node); + if (ret != 0) { + goto out; + } + simplifier_map_mutations(self, input_id, x->left, x->right, x->node); + } + x = x->next; + } + ret = simplifier_flush_edges(self, output_id, &num_flushed_edges); + if (ret != 0) { + goto out; + } + } + } + if (youngest_root_time != DBL_MAX) { + simplifier_set_edge_sort_offset(self, youngest_root_time); + } +out: + return ret; +} + static int TSK_WARN_UNUSED simplifier_run(simplifier_t *self, tsk_id_t *node_map) { @@ -6367,9 +6517,11 @@ simplifier_run(simplifier_t *self, tsk_id_t *node_map) goto out; } } - ret = simplifier_map_mutation_nodes(self); - if (ret != 0) { - goto out; + if (self->options & TSK_KEEP_INPUT_ROOTS) { + ret = simplifier_insert_input_roots(self); + if (ret != 0) { + goto out; + } } ret = simplifier_output_sites(self); if (ret != 0) { @@ -6384,6 +6536,13 @@ simplifier_run(simplifier_t *self, tsk_id_t *node_map) memcpy(node_map, self->node_id_map, self->input_tables.nodes.num_rows * sizeof(tsk_id_t)); } + if (self->edge_sort_offset != TSK_NULL) { + assert(self->options & TSK_KEEP_INPUT_ROOTS); + ret = simplifier_sort_edges(self); + if (ret != 0) { + goto out; + } + } out: return ret; } diff --git a/c/tskit/tables.h b/c/tskit/tables.h index cf1c6af232..2f4a6247fa 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -659,6 +659,7 @@ typedef struct _tsk_table_sorter_t { #define TSK_FILTER_INDIVIDUALS (1 << 2) #define TSK_REDUCE_TO_SITE_TOPOLOGY (1 << 3) #define TSK_KEEP_UNARY (1 << 4) +#define TSK_KEEP_INPUT_ROOTS (1 << 5) /* Flags for check_integrity */ #define TSK_CHECK_EDGE_ORDERING (1 << 0) @@ -2456,6 +2457,10 @@ TSK_KEEP_UNARY By default simplify removes unary nodes (i.e., nodes with exactly one child) along the path from samples to root. If this option is specified such unary nodes will be preserved in the output. +TSK_KEEP_INPUT_ROOTS + By default simplify removes all topology ancestral the MRCAs of the samples. + This option inserts edges from these MRCAs back to the roots of the input + trees. .. note:: Migrations are currently not supported by simplify, and an error will be raised if we attempt call simplify on a table collection with greater diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index a98f42b9bb..d02c5257fd 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -818,7 +818,7 @@ table_read_offset_array(PyObject *input, size_t *num_rows, size_t length, bool c } static const char * -parse_metadata_schema_arg(PyObject *arg, Py_ssize_t* metadata_schema_length) +parse_metadata_schema_arg(PyObject *arg, Py_ssize_t* metadata_schema_length) { const char *ret = NULL; if (arg == NULL) { @@ -1607,7 +1607,7 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea if (node_array == NULL) { goto out; } - + time_data = NULL; if (time_input != Py_None) { time_array = table_read_column_array(time_input, NPY_FLOAT64, &num_rows, true); @@ -6427,14 +6427,15 @@ TableCollection_simplify(TableCollection *self, PyObject *args, PyObject *kwds) int filter_individuals = false; int filter_populations = false; int keep_unary = false; + int keep_input_roots = false; int reduce_to_site_topology = false; static char *kwlist[] = { "samples", "filter_sites", "filter_populations", "filter_individuals", - "reduce_to_site_topology", "keep_unary", NULL}; + "reduce_to_site_topology", "keep_unary", "keep_input_roots", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|iiiii", kwlist, + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|iiiiii", kwlist, &samples, &filter_sites, &filter_populations, &filter_individuals, - &reduce_to_site_topology, &keep_unary)) { + &reduce_to_site_topology, &keep_unary, &keep_input_roots)) { goto out; } samples_array = (PyArrayObject *) PyArray_FROMANY(samples, NPY_INT32, 1, 1, @@ -6459,6 +6460,9 @@ TableCollection_simplify(TableCollection *self, PyObject *args, PyObject *kwds) if (keep_unary) { options |= TSK_KEEP_UNARY; } + if (keep_input_roots) { + options |= TSK_KEEP_INPUT_ROOTS; + } /* Allocate a new array to hold the node map. */ dims = self->tables->nodes.num_rows; @@ -7120,17 +7124,17 @@ TreeSequence_get_site(TreeSequence *self, PyObject *args) return ret; } -static PyObject * +static PyObject * TreeSequence_get_metadata(TreeSequence * self) { return PyBytes_FromStringAndSize( - self->tree_sequence->tables->metadata, + self->tree_sequence->tables->metadata, self->tree_sequence->tables->metadata_length); } -static PyObject * +static PyObject * TreeSequence_get_metadata_schema(TreeSequence * self) { return make_Py_Unicode_FromStringAndLength( - self->tree_sequence->tables->metadata_schema, + self->tree_sequence->tables->metadata_schema, self->tree_sequence->tables->metadata_schema_length); } @@ -11224,7 +11228,7 @@ PyInit__tskit(void) PyObject *unknown_time = PyFloat_FromDouble(TSK_UNKNOWN_TIME); PyModule_AddObject(module, "UNKNOWN_TIME", unknown_time); - + /* Node flags */ PyModule_AddIntConstant(module, "NODE_IS_SAMPLE", TSK_NODE_IS_SAMPLE); /* Tree flags */ diff --git a/python/requirements/CI/requirements.txt b/python/requirements/CI/requirements.txt index d0171eac44..c233bc45b9 100644 --- a/python/requirements/CI/requirements.txt +++ b/python/requirements/CI/requirements.txt @@ -15,6 +15,7 @@ networkx==2.4 newick==1.0.0 nose==1.3.7 numpy==1.19.1 +portion==2.1.1 pre-commit==2.6.0 pyparsing==2.4.7 pysam==0.16.0.1 diff --git a/python/requirements/development.txt b/python/requirements/development.txt index d4e2011c90..b51dc6a2e6 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -15,6 +15,7 @@ networkx newick nose numpy +portion pre-commit pyparsing pysam diff --git a/python/tests/simplify.py b/python/tests/simplify.py index bfd9c387d5..af694927b8 100644 --- a/python/tests/simplify.py +++ b/python/tests/simplify.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2015-2018 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -26,6 +26,7 @@ import sys import numpy as np +import portion import tskit @@ -108,6 +109,7 @@ def __init__( filter_populations=True, filter_individuals=True, keep_unary=False, + keep_input_roots=False, ): self.ts = ts self.n = len(sample) @@ -117,6 +119,7 @@ def __init__( self.filter_populations = filter_populations self.filter_individuals = filter_individuals self.keep_unary = keep_unary + self.keep_input_roots = keep_input_roots self.num_mutations = ts.num_mutations self.input_sites = list(ts.sites()) self.A_head = [None for _ in range(ts.num_nodes)] @@ -126,9 +129,7 @@ def __init__( self.node_id_map = np.zeros(ts.num_nodes, dtype=np.int32) - 1 self.mutation_node_map = [-1 for _ in range(self.num_mutations)] self.samples = set(sample) - for sample_id in sample: - output_id = self.record_node(sample_id, is_sample=True) - self.add_ancestry(sample_id, 0, self.sequence_length, output_id) + self.sort_offset = -1 # We keep a map of input nodes to mutations. self.mutation_map = [[] for _ in range(ts.num_nodes)] position = ts.tables.sites.position @@ -137,6 +138,9 @@ def __init__( for mutation_id in range(ts.num_mutations): site_position = position[site[mutation_id]] self.mutation_map[node[mutation_id]].append((site_position, mutation_id)) + for sample_id in sample: + output_id = self.record_node(sample_id, is_sample=True) + self.add_ancestry(sample_id, 0, self.sequence_length, output_id) self.position_lookup = None if self.reduce_to_site_topology: self.position_lookup = np.hstack([[0], position, [self.sequence_length]]) @@ -242,6 +246,18 @@ def print_state(self): print(self.tables) self.check_state() + def map_mutations(self, left, right, input_id, output_id): + """ + Map any mutations for the input node ID on the + interval to it's output ID. + """ + assert output_id != -1 + # TODO we should probably remove these as they are used. + # Or else, binary search the list so it's quick. + for x, mutation_id in self.mutation_map[input_id]: + if left <= x < right: + self.mutation_node_map[mutation_id] = output_id + def add_ancestry(self, input_id, left, right, node): tail = self.A_tail[input_id] if tail is None: @@ -256,6 +272,8 @@ def add_ancestry(self, input_id, left, right, node): tail.next = x self.A_tail[input_id] = x + self.map_mutations(left, right, input_id, node) + def merge_labeled_ancestors(self, S, input_id): """ All ancestry segments in S come together into a new parent. @@ -303,6 +321,63 @@ def merge_labeled_ancestors(self, S, input_id): if num_edges == 0 and not is_sample: self.rewind_node(input_id, output_id) + def extract_ancestry(self, edge): + S = [] + x = self.A_head[edge.child] + + x_head = None + x_prev = None + while x is not None: + if x.right > edge.left and edge.right > x.left: + y = Segment(max(x.left, edge.left), min(x.right, edge.right), x.node) + # print("snip", y) + S.append(y) + assert x.left <= y.left + assert x.right >= y.right + seg_left = None + seg_right = None + if x.left != y.left: + seg_left = Segment(x.left, y.left, x.node) + if x_prev is None: + x_head = seg_left + else: + x_prev.next = seg_left + x_prev = seg_left + if x.right != y.right: + x.left = y.right + seg_right = x + else: + # Free x + seg_right = x.next + if x_prev is None: + x_head = seg_right + else: + x_prev.next = seg_right + x = seg_right + else: + if x_prev is None: + x_head = x + x_prev = x + x = x.next + + # We could probably do the squashing and tail tracking + # as part of the pass above, but keeping it simpler for now. + self.A_head[edge.child] = x_head + x = x_head + x_tail = x_head + while x is not None: + x_tail = x + if x.next is not None: + assert x.right <= x.next.left + if x.next.left == x.right and x.node == x.next.node: + # Squash out the x.next segment + x.right = x.next.right + x.next = x.next.next + x = x.next + self.A_tail[edge.child] = x_tail + + return S + def process_parent_edges(self, edges): """ Process all of the edges for a given parent. @@ -311,17 +386,9 @@ def process_parent_edges(self, edges): parent = edges[0].parent S = [] for edge in edges: - x = self.A_head[edge.child] - while x is not None: - if x.right > edge.left and edge.right > x.left: - y = Segment( - max(x.left, edge.left), min(x.right, edge.right), x.node - ) - S.append(y) - x = x.next + S.extend(self.extract_ancestry(edge)) self.merge_labeled_ancestors(S, parent) self.check_state() - # self.print_state() def finalise_sites(self): # Build a map from the old mutation IDs to new IDs. Any mutation that @@ -364,22 +431,6 @@ def finalise_sites(self): metadata=site.metadata, ) - def map_mutation_nodes(self): - for input_node in range(len(self.mutation_map)): - mutations = self.mutation_map[input_node] - seg = self.A_head[input_node] - m_index = 0 - while seg is not None and m_index < len(mutations): - x, mutation_id = mutations[m_index] - if seg.left <= x < seg.right: - self.mutation_node_map[mutation_id] = seg.node - m_index += 1 - elif x >= seg.right: - seg = seg.next - else: - assert x < seg.left - m_index += 1 - def finalise_references(self): input_populations = self.ts.tables.populations population_id_map = np.arange(len(input_populations) + 1, dtype=np.int32) @@ -432,8 +483,39 @@ def finalise_references(self): # We don't support migrations for now. We'll need to remap these as well. assert self.ts.num_migrations == 0 + def insert_input_roots(self): + youngest_root_time = np.inf + for input_id in range(len(self.node_id_map)): + x = self.A_head[input_id] + if x is not None: + output_id = self.node_id_map[input_id] + if output_id == -1: + output_id = self.record_node(input_id) + while x is not None: + if x.node != output_id: + self.record_edge(x.left, x.right, output_id, x.node) + self.map_mutations(x.left, x.right, input_id, x.node) + x = x.next + self.flush_edges() + root_time = self.tables.nodes.time[output_id] + if root_time < youngest_root_time: + youngest_root_time = root_time + # We have to sort the edge table from the point where the edges + # for the youngest root would be inserted. + # Note: it would be nicer to do the sort here, but we have to + # wait until the finalise_references method has been called to + # make sure all the populations etc have been setup. + node_time = self.tables.nodes.time + edge_parent = self.tables.edges.parent + offset = 0 + while ( + offset < len(self.tables.edges) + and node_time[edge_parent[offset]] < youngest_root_time + ): + offset += 1 + self.sort_offset = offset + def simplify(self): - # self.print_state() if self.ts.num_edges > 0: all_edges = list(self.ts.edges()) edges = all_edges[:1] @@ -443,14 +525,18 @@ def simplify(self): edges = [] edges.append(e) self.process_parent_edges(edges) - # self.print_state() - self.map_mutation_nodes() + if self.keep_input_roots: + self.insert_input_roots() self.finalise_sites() self.finalise_references() + if self.sort_offset != -1: + self.tables.sort(edge_start=self.sort_offset) ts = self.tables.tree_sequence() return ts, self.node_id_map def check_state(self): + # print("CHECK_STATE") + all_ancestry = [] num_nodes = len(self.A_head) for j in range(num_nodes): head = self.A_head[j] @@ -460,17 +546,27 @@ def check_state(self): else: x = head while x.next is not None: + assert x.right <= x.next.left x = x.next assert x == tail - x = head.next + x = head while x is not None: assert x.left < x.right + all_ancestry.append(portion.openclosed(x.left, x.right)) if x.next is not None: assert x.right <= x.next.left # We should also not have any squashable segments. if x.right == x.next.left: assert x.node != x.next.node x = x.next + # Make sure we haven't lost ancestry. + if len(all_ancestry) > 0: + union = all_ancestry[0] + for interval in all_ancestry[1:]: + union = union.union(interval) + assert union.atomic + assert union.lower == 0 + assert union.upper == self.sequence_length class AncestorMap: @@ -604,6 +700,7 @@ def flush_edges(self): return num_edges def check_state(self): + num_nodes = len(self.A_head) for j in range(num_nodes): head = self.A_head[j] diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index ab42a27e23..0a872044a2 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -236,6 +236,10 @@ def test_simplify_bad_args(self): tc.simplify() with self.assertRaises(ValueError): tc.simplify("asdf") + with self.assertRaises(TypeError): + tc.simplify([0, 1], keep_unary="sdf") + with self.assertRaises(TypeError): + tc.simplify([0, 1], keep_input_roots="sdf") with self.assertRaises(TypeError): tc.simplify([0, 1], filter_populations="x") with self.assertRaises(_tskit.LibraryError): diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index 5067b0fdd1..e9aee63b39 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2018-2019 Tskit Developers +# Copyright (c) 2018-2020 Tskit Developers # Copyright (c) 2016-2017 University of Oxford # # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -23,6 +23,7 @@ """ Test cases for the supported topological variations and operations. """ +import functools import io import itertools import json @@ -276,6 +277,83 @@ def c_kc_distance(tree1, tree2, lambda_=0): return norm_kc_vectors(vecs1, vecs2, lambda_) +class ExampleTopologyMixin: + """ + Some example topologies for tests cases. + """ + + def test_single_coalescent_tree(self): + ts = msprime.simulate(10, random_seed=1, length=10) + self.verify(ts) + + def test_coalescent_trees(self): + ts = msprime.simulate(8, recombination_rate=5, random_seed=1, length=2) + self.assertGreater(ts.num_trees, 2) + self.verify(ts) + + def test_coalescent_trees_internal_samples(self): + ts = msprime.simulate(8, recombination_rate=5, random_seed=10, length=2) + self.assertGreater(ts.num_trees, 2) + self.verify(tsutil.jiggle_samples(ts)) + + def test_coalescent_trees_all_samples(self): + ts = msprime.simulate(8, recombination_rate=5, random_seed=10, length=2) + self.assertGreater(ts.num_trees, 2) + tables = ts.dump_tables() + flags = np.zeros_like(tables.nodes.flags) + tskit.NODE_IS_SAMPLE + tables.nodes.flags = flags + self.verify(tables.tree_sequence()) + + def test_wright_fisher_trees_unsimplified(self): + tables = wf.wf_sim(10, 5, deep_history=False, seed=2) + tables.sort() + ts = tables.tree_sequence() + self.verify(ts) + + def test_wright_fisher_trees_simplified(self): + tables = wf.wf_sim(10, 5, deep_history=False, seed=1) + tables.sort() + ts = tables.tree_sequence() + ts = ts.simplify() + self.verify(ts) + + def test_wright_fisher_trees_simplified_one_gen(self): + tables = wf.wf_sim(10, 1, deep_history=False, seed=1) + tables.sort() + ts = tables.tree_sequence() + ts = ts.simplify() + self.verify(ts) + + def test_nonbinary_trees(self): + demographic_events = [ + msprime.SimpleBottleneck(time=1.0, population=0, proportion=0.95) + ] + ts = msprime.simulate( + 20, + recombination_rate=10, + mutation_rate=5, + demographic_events=demographic_events, + random_seed=7, + ) + found = False + for e in ts.edgesets(): + if len(e.children) > 2: + found = True + self.assertTrue(found) + self.verify(ts) + + def test_many_multiroot_trees(self): + ts = msprime.simulate(7, recombination_rate=1, random_seed=10) + self.assertGreater(ts.num_trees, 3) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + self.verify(ts) + + def test_multiroot_tree(self): + ts = msprime.simulate(15, random_seed=10) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + self.verify(ts) + + class TestKCMetric(unittest.TestCase): """ Tests on the KC metric distances. @@ -1234,6 +1312,7 @@ def validate_trees(self, n): for seed in range(1, 10): ts1 = msprime.simulate(n, random_seed=seed, recombination_rate=1) ts2 = msprime.simulate(n, random_seed=seed + 1, recombination_rate=1) + self.verify_same_kc(ts2, ts1) self.verify_same_kc(ts1, ts2) self.verify_same_kc(ts1, ts1) # Test sequences with equal breakpoints @@ -2711,6 +2790,7 @@ def verify_simplify( self, samples, filter_sites=True, + keep_input_roots=False, nodes_before=None, edges_before=None, sites_before=None, @@ -2747,18 +2827,20 @@ def verify_simplify( sequence_length=before.sequence_length, ) after = ts.dump_tables() - + # Make sure it's a valid tree sequence ts = before.tree_sequence() - # Make sure it's a valid topology. We want to be sure we evaluate the - # whole iterator - for t in ts.trees(): - self.assertTrue(t is not None) - before.simplify(samples=samples, filter_sites=filter_sites) + before.simplify( + samples=samples, + filter_sites=filter_sites, + keep_input_roots=keep_input_roots, + ) if debug: print("before") print(before) + print(before.tree_sequence().draw_text()) print("after") print(after) + print(after.tree_sequence().draw_text()) self.assertEqual(before, after) def test_unsorted_edges(self): @@ -2865,6 +2947,47 @@ def test_single_binary_tree_no_sample_nodes(self): edges_after=edges_after, ) + def test_single_binary_tree_keep_input_root(self): + # + # 2 4 + # / \ + # 1 3 \ + # / \ \ + # 0 (0)(1) (2) + nodes_before = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 1 0 + 3 0 1 + 4 0 2 + """ + edges_before = """\ + left right parent child + 0 1 3 0,1 + 0 1 4 2,3 + """ + nodes_after = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 0 1 + 3 0 2 + """ + edges_after = """\ + left right parent child + 0 1 2 0,1 + 0 1 3 2 + """ + self.verify_simplify( + samples=[0, 1], + nodes_before=nodes_before, + edges_before=edges_before, + nodes_after=nodes_after, + edges_after=edges_after, + keep_input_roots=True, + ) + def test_single_binary_tree_internal_sample(self): # # 2 4 @@ -3031,6 +3154,75 @@ def test_single_binary_tree_simple_mutations(self): mutations_after=mutations_after, ) + def test_single_binary_tree_keep_roots_mutations(self): + # 3 5 + # m0 / \ + # 2 4 \ + # m1 / \ \ + # 1 3 \ \ + # / \ \ \ + # 0 (0) (1) 2 6 + nodes_before = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 0 0 + 3 0 1 + 4 0 2 + 5 0 3 + 6 0 0 + """ + edges_before = """\ + left right parent child + 0 1 3 0,1 + 0 1 4 2,3 + 0 1 5 4,6 + """ + sites_before = """\ + id position ancestral_state + 0 0.1 0 + """ + mutations_before = """\ + site node derived_state + 0 3 2 + 0 4 1 + """ + + # We sample 0 and 2 + nodes_after = """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 0 1 + 3 0 3 + """ + edges_after = """\ + left right parent child + 0 1 2 0,1 + 0 1 3 2 + """ + sites_after = """\ + id position ancestral_state + 0 0.1 0 + """ + mutations_after = """\ + site node derived_state + 0 2 2 + 0 2 1 + """ + self.verify_simplify( + samples=[0, 1], + nodes_before=nodes_before, + edges_before=edges_before, + sites_before=sites_before, + mutations_before=mutations_before, + nodes_after=nodes_after, + edges_after=edges_after, + sites_after=sites_after, + mutations_after=mutations_after, + keep_input_roots=True, + ) + def test_overlapping_edges(self): nodes = """\ id is_sample time @@ -4510,40 +4702,9 @@ def test_partial_overlap_contradictory_children(self): tskit.load_text(nodes=nodes, edges=edges, strict=False) -class TestSimplify(unittest.TestCase): +class SimplifyTestBase(unittest.TestCase): """ - Tests that the implementations of simplify() do what they are supposed to. - """ - - random_seed = 23 - # - # 8 - # / \ - # / \ - # / \ - # 7 \ - # / \ 6 - # / 5 / \ - # / / \ / \ - # 4 0 1 2 3 - small_tree_ex_nodes = """\ - id is_sample population time - 0 1 0 0.00000000000000 - 1 1 0 0.00000000000000 - 2 1 0 0.00000000000000 - 3 1 0 0.00000000000000 - 4 1 0 0.00000000000000 - 5 0 0 0.14567111023387 - 6 0 0 0.21385545626353 - 7 0 0 0.43508024345063 - 8 0 0 1.60156352971203 - """ - small_tree_ex_edges = """\ - id left right parent child - 0 0.00000000 1.00000000 5 0,1 - 1 0.00000000 1.00000000 6 2,3 - 2 0.00000000 1.00000000 7 4,5 - 3 0.00000000 1.00000000 8 6,7 + Base class for simplify tests. """ def do_simplify( @@ -4555,6 +4716,7 @@ def do_simplify( filter_populations=True, filter_individuals=True, keep_unary=False, + keep_input_roots=False, ): """ Runs the Python test implementation of simplify. @@ -4568,6 +4730,7 @@ def do_simplify( filter_populations=filter_populations, filter_individuals=filter_individuals, keep_unary=keep_unary, + keep_input_roots=keep_input_roots, ) new_ts, node_map = s.simplify() if compare_lib: @@ -4577,6 +4740,7 @@ def do_simplify( filter_individuals=filter_individuals, filter_populations=filter_populations, keep_unary=keep_unary, + keep_input_roots=keep_input_roots, map_nodes=True, ) lib_tables1 = sts.dump_tables() @@ -4586,6 +4750,7 @@ def do_simplify( samples, filter_sites=filter_sites, keep_unary=keep_unary, + keep_input_roots=keep_input_roots, filter_individuals=filter_individuals, filter_populations=filter_populations, ) @@ -4606,6 +4771,43 @@ def do_simplify( self.assertTrue(all(node_map == lib_node_map)) return new_ts, node_map + +class TestSimplify(SimplifyTestBase): + """ + Tests that the implementations of simplify() do what they are supposed to. + """ + + random_seed = 23 + # + # 8 + # / \ + # / \ + # / \ + # 7 \ + # / \ 6 + # / 5 / \ + # / / \ / \ + # 4 0 1 2 3 + small_tree_ex_nodes = """\ + id is_sample population time + 0 1 0 0.00000000000000 + 1 1 0 0.00000000000000 + 2 1 0 0.00000000000000 + 3 1 0 0.00000000000000 + 4 1 0 0.00000000000000 + 5 0 0 0.14567111023387 + 6 0 0 0.21385545626353 + 7 0 0 0.43508024345063 + 8 0 0 1.60156352971203 + """ + small_tree_ex_edges = """\ + id left right parent child + 0 0.00000000 1.00000000 5 0,1 + 1 0.00000000 1.00000000 6 2,3 + 2 0.00000000 1.00000000 7 4,5 + 3 0.00000000 1.00000000 8 6,7 + """ + def verify_no_samples(self, ts, keep_unary=False): """ Zero out the flags column and verify that we get back the correct @@ -5419,6 +5621,143 @@ def test_many_trees_recurrent_mutations_internal_samples(self): self.verify_simplify_haplotypes(ts, samples, keep_unary=keep) +class TestSimplifyKeepInputRoots(SimplifyTestBase, ExampleTopologyMixin): + """ + Tests for the keep_input_roots option to simplify. + """ + + def verify(self, ts): + # Called by the examples in ExampleTopologyMixin + samples = ts.samples() + self.verify_keep_input_roots(ts, samples[:2]) + # self.verify_keep_input_roots(ts, samples[:3]) + # self.verify_keep_input_roots(ts, samples[:-1]) + # self.verify_keep_input_roots(ts, samples) + + def verify_keep_input_roots(self, ts, samples): + ts_with_roots, node_map = self.do_simplify( + ts, samples, keep_input_roots=True, filter_sites=False, compare_lib=True + ) + new_to_input_map = { + value: key for key, value in enumerate(node_map) if value != tskit.NULL + } + for (left, right), input_tree, tree_with_roots in tsutil.coiterate( + ts, ts_with_roots + ): + input_roots = input_tree.roots + self.assertGreater(len(tree_with_roots.roots), 0) + for root in tree_with_roots.roots: + # Check that the roots in the current + input_root = new_to_input_map[root] + self.assertIn(input_root, input_roots) + input_node = ts.node(input_root) + new_node = ts_with_roots.node(root) + self.assertEqual(new_node.time, input_node.time) + self.assertEqual(new_node.population, input_node.population) + self.assertEqual(new_node.individual, input_node.individual) + self.assertEqual(new_node.metadata, input_node.metadata) + # This should only be marked as a sample if it's an + # element of the samples list. + self.assertEqual(new_node.is_sample(), input_root in samples) + # Find the MRCA of the samples below this root. + root_samples = list(tree_with_roots.samples(root)) + mrca = functools.reduce(tree_with_roots.mrca, root_samples) + if mrca != root: + # If the MRCA is not equal to the root, then there should + # be a unary branch joining them. + self.assertEqual(tree_with_roots.parent(mrca), root) + self.assertEqual(tree_with_roots.children(root), (mrca,)) + + # Any mutations that were on the path from the old MRCA + # to the root should be mapped to this node. + u = new_to_input_map[mrca] + root_path = [] + while u != tskit.NULL: + root_path.append(u) + u = input_tree.parent(u) + input_sites = {site.position: site for site in input_tree.sites()} + new_sites = { + site.position: site for site in tree_with_roots.sites() + } + positions = set(input_sites.keys()) & set(new_sites.keys()) + for position in positions: + self.assertTrue(left <= position < right) + new_site = new_sites[position] + # We assume the metadata contains a unique key for each mutation. + new_mutations = { + mut.metadata: mut for mut in new_site.mutations + } + input_site = input_sites[position] + for input_mutation in input_site.mutations: + if input_mutation.node in root_path: + # The same mutation should exist and be mapped to the + # mrca + new_mutation = new_mutations[input_mutation.metadata] + # We have turned filter sites off, so sites should + # be comparable + self.assertEqual(new_mutation.site, input_mutation.site) + self.assertEqual( + new_mutation.derived_state, + input_mutation.derived_state, + ) + self.assertEqual(new_mutation.node, mrca) + return ts_with_roots + + def test_many_trees(self): + ts = msprime.simulate(5, recombination_rate=1, random_seed=10) + self.assertGreater(ts.num_trees, 3) + for num_samples in range(1, ts.num_samples): + for samples in itertools.combinations(ts.samples(), num_samples): + self.verify_keep_input_roots(ts, samples) + + def test_many_trees_internal_samples(self): + ts = msprime.simulate(5, recombination_rate=1, random_seed=10) + ts = tsutil.jiggle_samples(ts) + self.assertGreater(ts.num_trees, 3) + for num_samples in range(1, ts.num_samples): + for samples in itertools.combinations(ts.samples(), num_samples): + self.verify_keep_input_roots(ts, samples) + + def test_many_multiroot_trees(self): + ts = msprime.simulate(7, recombination_rate=1, random_seed=10) + self.assertGreater(ts.num_trees, 3) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + for num_samples in range(1, ts.num_samples): + for samples in itertools.combinations(ts.samples(), num_samples): + self.verify_keep_input_roots(ts, samples) + + def test_wright_fisher_unsimplified(self): + num_generations = 10 + tables = wf.wf_sim(10, num_generations, deep_history=False, seed=2) + tables.sort() + ts = tables.tree_sequence() + simplified = self.verify_keep_input_roots(ts, ts.samples()) + roots = set() + for tree in simplified.trees(): + for root in tree.roots: + roots.add(root) + self.assertEqual(tree.time(root), num_generations) + init_nodes = np.where(simplified.tables.nodes.time == num_generations)[0] + self.assertEqual(set(init_nodes), roots) + + def test_single_tree_recurrent_mutations(self): + ts = msprime.simulate(6, random_seed=10) + for mutations_per_branch in [1, 2, 3]: + ts = tsutil.insert_branch_mutations(ts, mutations_per_branch) + for num_samples in range(1, ts.num_samples): + for samples in itertools.combinations(ts.samples(), num_samples): + self.verify_keep_input_roots(ts, samples) + + def test_many_trees_recurrent_mutations(self): + ts = msprime.simulate(5, recombination_rate=1, random_seed=8) + self.assertGreater(ts.num_trees, 2) + for mutations_per_branch in [1, 2, 3]: + ts = tsutil.insert_branch_mutations(ts, mutations_per_branch) + for num_samples in range(1, ts.num_samples): + for samples in itertools.combinations(ts.samples(), num_samples): + self.verify_keep_input_roots(ts, samples) + + class TestMapToAncestors(unittest.TestCase): """ Tests the AncestorMap class. @@ -6066,83 +6405,6 @@ def test_coalescent_trees(self): self.assertRaises(StopIteration, next, new_trees) -class ExampleTopologyMixin: - """ - Some example topologies for tests cases. - """ - - def test_single_coalescent_tree(self): - ts = msprime.simulate(10, random_seed=1, length=10) - self.verify(ts) - - def test_coalescent_trees(self): - ts = msprime.simulate(8, recombination_rate=5, random_seed=1, length=2) - self.assertGreater(ts.num_trees, 2) - self.verify(ts) - - def test_coalescent_trees_internal_samples(self): - ts = msprime.simulate(8, recombination_rate=5, random_seed=10, length=2) - self.assertGreater(ts.num_trees, 2) - self.verify(tsutil.jiggle_samples(ts)) - - def test_coalescent_trees_all_samples(self): - ts = msprime.simulate(8, recombination_rate=5, random_seed=10, length=2) - self.assertGreater(ts.num_trees, 2) - tables = ts.dump_tables() - flags = np.zeros_like(tables.nodes.flags) + tskit.NODE_IS_SAMPLE - tables.nodes.flags = flags - self.verify(tables.tree_sequence()) - - def test_wright_fisher_trees_unsimplified(self): - tables = wf.wf_sim(10, 5, deep_history=False, seed=2) - tables.sort() - ts = tables.tree_sequence() - self.verify(ts) - - def test_wright_fisher_trees_simplified(self): - tables = wf.wf_sim(10, 5, deep_history=False, seed=1) - tables.sort() - ts = tables.tree_sequence() - ts = ts.simplify() - self.verify(ts) - - def test_wright_fisher_trees_simplified_one_gen(self): - tables = wf.wf_sim(10, 1, deep_history=False, seed=1) - tables.sort() - ts = tables.tree_sequence() - ts = ts.simplify() - self.verify(ts) - - def test_nonbinary_trees(self): - demographic_events = [ - msprime.SimpleBottleneck(time=1.0, population=0, proportion=0.95) - ] - ts = msprime.simulate( - 20, - recombination_rate=10, - mutation_rate=5, - demographic_events=demographic_events, - random_seed=7, - ) - found = False - for e in ts.edgesets(): - if len(e.children) > 2: - found = True - self.assertTrue(found) - self.verify(ts) - - def test_many_multiroot_trees(self): - ts = msprime.simulate(7, recombination_rate=1, random_seed=10) - self.assertGreater(ts.num_trees, 3) - ts = tsutil.decapitate(ts, ts.num_edges // 2) - self.verify(ts) - - def test_multiroot_tree(self): - ts = msprime.simulate(15, random_seed=10) - ts = tsutil.decapitate(ts, ts.num_edges // 2) - self.verify(ts) - - class TestSampleLists(unittest.TestCase, ExampleTopologyMixin): """ Tests for the sample lists algorithm. diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 53f358b9e8..647fbbf7b4 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -123,11 +123,13 @@ def insert_branch_mutations(ts, mutations_per_branch=1): parent = mutation[v] for _ in range(mutations_per_branch): state[u] = (state[u] + 1) % 2 + metadata = f"{len(tables.mutations)}".encode() mutation[u] = tables.mutations.add_row( site=site, node=u, derived_state=str(state[u]), parent=parent, + metadata=metadata, ) parent = mutation[u] add_provenance(tables.provenances, "insert_branch_mutations") @@ -1288,3 +1290,27 @@ def genealogical_nearest_neighbours(ts, focal, reference_sets): L[L == 0] = 1 A /= L.reshape((len(focal), 1)) return A + + +def coiterate(ts1, ts2, **kwargs): + """ + Returns an iterator over the pairs of trees for each distinct + interval in the specified pair of tree sequences. + """ + if ts1.sequence_length != ts2.sequence_length: + raise ValueError("Tree sequences must be equal length.") + L = ts1.sequence_length + trees1 = ts1.trees(**kwargs) + trees2 = ts2.trees(**kwargs) + tree1 = next(trees1) + tree2 = next(trees2) + right = 0 + while right != L: + left = right + right = min(tree1.interval[1], tree2.interval[1]) + yield (left, right), tree1, tree2 + # Advance + if tree1.interval[1] == right: + tree1 = next(trees1, None) + if tree2.interval[1] == right: + tree2 = next(trees2, None) diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 8668afaa28..79317bb966 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2206,6 +2206,7 @@ def simplify( filter_individuals=True, filter_sites=True, keep_unary=False, + keep_input_roots=False, ): """ Simplifies the tables in place to retain only the information necessary @@ -2252,6 +2253,9 @@ def simplify( :param bool keep_unary: If True, any unary nodes (i.e. nodes with exactly one child) that exist on the path from samples to root will be preserved in the output. (Default: False) + :param bool keep_input_roots: If True, insert edges from the MRCAs of the + samples to the roots in the input trees. If False, no topology older + than the MRCAs of the samples will be included. (Default: False) :return: A numpy array mapping node IDs in the input tables to their corresponding node IDs in the output tables. :rtype: numpy.ndarray (dtype=np.int32) @@ -2277,6 +2281,7 @@ def simplify( filter_populations=filter_populations, reduce_to_site_topology=reduce_to_site_topology, keep_unary=keep_unary, + keep_input_roots=keep_input_roots, ) def link_ancestors(self, samples, ancestors): diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 9c53572826..f7478e8f3e 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -4420,6 +4420,7 @@ def simplify( filter_sites=True, record_provenance=True, keep_unary=False, + keep_input_roots=False, ): """ Returns a simplified tree sequence that retains only the history of @@ -4482,6 +4483,9 @@ def simplify( :param bool keep_unary: If True, any unary nodes (i.e. nodes with exactly one child) that exist on the path from samples to root will be preserved in the output. (Default: False) + :param bool keep_input_roots: If True, insert edges from the MRCAs of the + samples to the roots in the input trees. If False, no topology older + than the MRCAs of the samples will be included. (Default: False) :return: The simplified tree sequence, or (if ``map_nodes`` is True) a tuple consisting of the simplified tree sequence and a numpy array mapping source node IDs to their corresponding IDs in the new tree @@ -4500,6 +4504,7 @@ def simplify( filter_individuals=filter_individuals, filter_sites=filter_sites, keep_unary=keep_unary, + keep_input_roots=keep_input_roots, ) if record_provenance: # TODO move this into tables.simplify. See issue #374 From dc90ef58584ebbf9b01d9383a78cbe67ca1f5412 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 25 Aug 2020 14:05:03 -0700 Subject: [PATCH 2/5] stick test stick test --- c/tskit/tables.c | 3 +- python/tests/test_topology.py | 70 +++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 4a6f26084f..fe36967dbd 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4566,7 +4566,8 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) if (start->sites == self->tables->sites.num_rows && start->mutations == self->tables->mutations.num_rows) { skip_sites = true; - } else { + } else if (start->sites != 0 + || start->mutations != 0) { ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; goto out; } diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index e9aee63b39..ae0341fd2f 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -3223,6 +3223,76 @@ def test_single_binary_tree_keep_roots_mutations(self): keep_input_roots=True, ) + def test_map_mutations_with_and_without_roots(self): + nodes_before = """\ + id is_sample time + 0 1 0 + 1 0 1 + """ + edges_before = """\ + left right parent child + 0 2 1 0 + """ + sites = """\ + id position ancestral_state + 0 1.0 0 + """ + mutations_before = """\ + site node derived_state + 0 0 2 + 0 1 1 + """ + # expected result without keep_input_roots + nodes_after = """\ + id is_sample time + 0 1 0 + """ + edges_after = """\ + left right parent child + """ + mutations_after = """\ + site node derived_state + 0 0 2 + 0 0 1 + """ + # expected result with keep_input_roots + nodes_after_keep = nodes_before + edges_after_keep = """\ + left right parent child + 0 2 1 0 + """ + mutations_after_keep = """\ + site node derived_state + 0 0 2 + 0 0 1 + """ + self.verify_simplify( + samples=[0], + nodes_before=nodes_before, + edges_before=edges_before, + sites_before=sites, + mutations_before=mutations_before, + nodes_after=nodes_after, + edges_after=edges_after, + sites_after=sites, + mutations_after=mutations_after, + keep_input_roots=False, + debug=True + ) + self.verify_simplify( + samples=[0], + nodes_before=nodes_before, + edges_before=edges_before, + sites_before=sites, + mutations_before=mutations_before, + nodes_after=nodes_after_keep, + edges_after=edges_after_keep, + sites_after=sites, + mutations_after=mutations_after_keep, + keep_input_roots=True, + debug=True + ) + def test_overlapping_edges(self): nodes = """\ id is_sample time From b2326151c544874aff23bec681990fc910a17283 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 25 Aug 2020 14:32:16 -0700 Subject: [PATCH 3/5] make sure no sites are skipped --- c/tskit/tables.c | 3 +-- python/tests/test_topology.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/c/tskit/tables.c b/c/tskit/tables.c index fe36967dbd..c10cc88254 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4566,8 +4566,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) if (start->sites == self->tables->sites.num_rows && start->mutations == self->tables->mutations.num_rows) { skip_sites = true; - } else if (start->sites != 0 - || start->mutations != 0) { + } else if (start->sites != 0 || start->mutations != 0) { ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; goto out; } diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index ae0341fd2f..72744a72c3 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -3277,7 +3277,6 @@ def test_map_mutations_with_and_without_roots(self): sites_after=sites, mutations_after=mutations_after, keep_input_roots=False, - debug=True ) self.verify_simplify( samples=[0], @@ -3290,7 +3289,6 @@ def test_map_mutations_with_and_without_roots(self): sites_after=sites, mutations_after=mutations_after_keep, keep_input_roots=True, - debug=True ) def test_overlapping_edges(self): @@ -5745,11 +5743,18 @@ def verify_keep_input_roots(self, ts, samples): while u != tskit.NULL: root_path.append(u) u = input_tree.parent(u) - input_sites = {site.position: site for site in input_tree.sites()} + input_sites = { + site.position: site + for site in input_tree.sites() + if site.position >= left and site.position < right + } new_sites = { - site.position: site for site in tree_with_roots.sites() + site.position: site + for site in tree_with_roots.sites() + if site.position >= left and site.position < right } - positions = set(input_sites.keys()) & set(new_sites.keys()) + self.assertEqual(set(input_sites.keys()), set(new_sites.keys())) + positions = input_sites.keys() for position in positions: self.assertTrue(left <= position < right) new_site = new_sites[position] From 0f65728d9ffd61ae5c7958a3b633651bd7a633fb Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 26 Aug 2020 10:59:19 +0100 Subject: [PATCH 4/5] Clarify the semantics of skipping sort on sites/mutations. --- c/tests/test_tables.c | 78 ++++++++++++++++++++++++++++++++++- c/tskit/core.c | 5 ++- c/tskit/tables.c | 2 +- c/tskit/tables.h | 8 +++- python/tests/test_topology.py | 8 ++-- 5 files changed, 92 insertions(+), 9 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index a23d335b2e..bacae64fa2 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -2978,6 +2978,81 @@ test_copy_table_collection(void) tsk_treeseq_free(&ts); } +static void +test_sort_tables_offsets(void) +{ + int ret; + tsk_treeseq_t *ts; + tsk_table_collection_t tables, copy; + tsk_bookmark_t bookmark; + + ts = caterpillar_tree(10, 5, 5); + ret = tsk_treeseq_copy_tables(ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED); + + tsk_migration_table_clear(&tables.migrations); + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Check that setting edge offset = len(edges) does nothing */ + reverse_edges(&tables); + ret = tsk_table_collection_copy(&tables, ©, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.edges = tables.edges.num_rows; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©)); + + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tables.sites.num_rows > 2); + CU_ASSERT_FATAL(tables.mutations.num_rows > 2); + + /* Check that setting mutation and site offset = to the len + * of the tables leaves them untouched. */ + reverse_mutations(&tables); + /* Swap the positions of the first two sites, as a quick way + * to disorder the site table */ + tables.sites.position[0] = tables.sites.position[1]; + tables.sites.position[1] = 0; + ret = tsk_table_collection_copy(&tables, ©, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.sites = tables.sites.num_rows; + bookmark.mutations = tables.mutations.num_rows; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©)); + + /* Anything other than len(table) leads to an error for sites + * and mutations, and we can't specify one without the other. */ + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.sites = tables.sites.num_rows; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.mutations = tables.mutations.num_rows; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + + memset(&bookmark, 0, sizeof(bookmark)); + bookmark.sites = tables.sites.num_rows - 1; + bookmark.mutations = tables.mutations.num_rows - 1; + ret = tsk_table_collection_sort(&tables, &bookmark, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + + tsk_table_collection_free(&tables); + tsk_table_collection_free(©); + tsk_treeseq_free(ts); + free(ts); +} + + static void test_sort_tables_drops_indexes_with_options(tsk_flags_t tc_options) { @@ -3128,7 +3203,7 @@ test_sort_tables_errors(void) memset(&pos, 0, sizeof(pos)); pos.migrations = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); memset(&pos, 0, sizeof(pos)); pos.sites = 1; @@ -4552,6 +4627,7 @@ main(int argc, char **argv) test_link_ancestors_samples_and_ancestors_overlap }, { "test_link_ancestors_multiple_to_single_tree", test_link_ancestors_multiple_to_single_tree }, + { "test_sort_tables_offsets", test_sort_tables_offsets }, { "test_sort_tables_drops_indexes", test_sort_tables_drops_indexes }, { "test_sort_tables_edge_metadata", test_sort_tables_edge_metadata }, { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 1b1b27b589..f3455708c0 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -351,8 +351,9 @@ tsk_strerror_internal(int err) ret = "Migrations not currently supported by this operation"; break; case TSK_ERR_SORT_OFFSET_NOT_SUPPORTED: - ret = "Specifying position for mutation, sites or migrations is not " - "supported"; + ret = "Sort offsets for sites and mutations must be either 0 " + "or the length of the respective tables. Intermediate values " + "are not supported"; break; case TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED: ret = "Only binary mutations are supported for this operation"; diff --git a/c/tskit/tables.c b/c/tskit/tables.c index c10cc88254..39cdcf5ce0 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4557,7 +4557,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) edge_start = start->edges; if (start->migrations != 0) { - ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; + ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; goto out; } /* We only allow sites and mutations to be specified as a way to diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 2f4a6247fa..bd90670bec 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -2383,8 +2383,12 @@ Positions in tables that are not sorted (``individual``, ``node``, ``population` and ``provenance``) are ignored and can be set to arbitrary values. .. warning:: The current implementation only supports specifying a start - position for the ``edge`` table. Specifying a non-zero ``migration``, - ``site`` or ``mutation`` start position results in an error. + position for the ``edge`` table and in a limited form for the + ``site`` and ``mutation`` tables. Specifying a non-zero ``migration``, + start position results in an error. The start positions for the + ``site`` and ``mutation`` tables can either be 0 or the length of the + respective tables, allowing these tables to either be fully sorted, or + not sorted at all. The table collection will always be unindexed after sort successfully completes. diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index 72744a72c3..cb0eda6f9c 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -5698,9 +5698,9 @@ def verify(self, ts): # Called by the examples in ExampleTopologyMixin samples = ts.samples() self.verify_keep_input_roots(ts, samples[:2]) - # self.verify_keep_input_roots(ts, samples[:3]) - # self.verify_keep_input_roots(ts, samples[:-1]) - # self.verify_keep_input_roots(ts, samples) + self.verify_keep_input_roots(ts, samples[:3]) + self.verify_keep_input_roots(ts, samples[:-1]) + self.verify_keep_input_roots(ts, samples) def verify_keep_input_roots(self, ts, samples): ts_with_roots, node_map = self.do_simplify( @@ -5762,6 +5762,8 @@ def verify_keep_input_roots(self, ts, samples): new_mutations = { mut.metadata: mut for mut in new_site.mutations } + # Just make sure the metadata is actually unique. + self.assertEqual(len(new_mutations), len(new_site.mutations)) input_site = input_sites[position] for input_mutation in input_site.mutations: if input_mutation.node in root_path: From 6f885de2ebed9028e23bc10da8367e0627cb432b Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 26 Aug 2020 20:55:39 +0100 Subject: [PATCH 5/5] Removed the squash loop from simplifier_extract_ancestry. --- c/CHANGELOG.rst | 10 +++++-- c/tests/test_tables.c | 1 - c/tests/test_trees.c | 64 ++++++++++++++++++++++++++++++++++++++++ c/tskit/tables.c | 18 ++--------- python/CHANGELOG.rst | 4 +++ python/tests/simplify.py | 20 +++---------- 6 files changed, 82 insertions(+), 35 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 33aa7af2ea..391fd72c2c 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -6,13 +6,19 @@ - The macro ``TSK_IMPUTE_MISSING_DATA`` is renamed to ``TSK_ISOLATED_NOT_MISSING`` +**New features** + +- Add a ``TSK_KEEP_INPUT_ROOTS`` option to simplify which, if enabled, adds edges + from the MRCAs of samples in the simplified tree sequence back to the roots + in the input tree sequence (:user:`jeromekelleher`, :issue:`775`, :pr:`782`). + --------------------- [0.99.4] - 2020-08-12 --------------------- **Note** -- The ``TSK_VERSION_PATCH`` macro was incorrectly set to ``4`` for 0.99.3, so both +- The ``TSK_VERSION_PATCH`` macro was incorrectly set to ``4`` for 0.99.3, so both 0.99.4 and 0.99.3 have the same value. **Changes** @@ -70,7 +76,7 @@ - New methods to perform set operations on table collections. ``tsk_table_collection_subset`` subsets and reorders table collections by nodes - (:user:`mufernando`, :user:`petrelharp`, :pr:`663`, :pr:`690`). + (:user:`mufernando`, :user:`petrelharp`, :pr:`663`, :pr:`690`). ``tsk_table_collection_union`` forms the node-wise union of two table collections (:user:`mufernando`, :user:`petrelharp`, :issue:`381`, :pr:`623`). diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index bacae64fa2..0776a0f5a9 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -3052,7 +3052,6 @@ test_sort_tables_offsets(void) free(ts); } - static void test_sort_tables_drops_indexes_with_options(tsk_flags_t tc_options) { diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 47d772b832..20a2c70e70 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -2566,6 +2566,69 @@ test_simplest_reduce_site_topology(void) tsk_table_collection_free(&tables); } +static void +test_simplest_simplify_defragment(void) +{ + const char *nodes = "0 2 -1\n" + "0 2 -1\n" + "0 2 -1\n" + "0 2 -1\n" + "0 2 -1\n" + "0 2 -1\n" + "0 1 -1\n" + "0 1 -1\n" + "0 1 -1\n" + "0 1 -1\n" + "0 1 -1\n" + "0 1 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n" + "1 0 -1\n"; + const char *edges = "0.00000000 0.20784841 8 12\n" + "0.00000000 0.42202433 8 15\n" + "0.00000000 0.63541014 8 16\n" + "0.42202433 1.00000000 9 15\n" + "0.00000000 1.00000000 9 17\n" + "0.00000000 1.00000000 10 14\n" + "0.20784841 1.00000000 11 12\n" + "0.00000000 1.00000000 11 13\n" + "0.63541014 1.00000000 11 16\n" + "0.00000000 1.00000000 0 10\n" + "0.62102072 1.00000000 1 9\n" + "0.00000000 1.00000000 1 11\n" + "0.00000000 0.26002984 2 6\n" + "0.26002984 1.00000000 2 6\n" + "0.00000000 0.62102072 2 9\n" + "0.55150554 1.00000000 3 8\n" + "0.00000000 1.00000000 4 7\n" + "0.00000000 0.55150554 5 8\n"; + + tsk_id_t samples[] = { 12, 13, 14, 15, 16, 17 }; + tsk_table_collection_t tables; + int ret; + + /* This was the simplest example I could find that exercised the + * inner loops of the simplifier_extract_ancestry function */ + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + parse_nodes(nodes, &tables.nodes); + CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 18); + parse_edges(edges, &tables.edges); + CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 18); + + ret = tsk_table_collection_simplify(&tables, samples, 6, 0, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_EQUAL(tables.nodes.num_rows, 10); + CU_ASSERT_EQUAL(tables.edges.num_rows, 10); + + tsk_table_collection_free(&tables); +} + static void test_simplest_population_filter(void) { @@ -5915,6 +5978,7 @@ main(int argc, char **argv) { "test_simplest_overlapping_unary_edges_internal_samples_simplify", test_simplest_overlapping_unary_edges_internal_samples_simplify }, { "test_simplest_reduce_site_topology", test_simplest_reduce_site_topology }, + { "test_simplest_simplify_defragment", test_simplest_simplify_defragment }, { "test_simplest_population_filter", test_simplest_population_filter }, { "test_simplest_individual_filter", test_simplest_individual_filter }, { "test_simplest_map_mutations", test_simplest_map_mutations }, diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 39cdcf5ce0..f05c4f84bf 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6102,7 +6102,7 @@ simplifier_extract_ancestry( int ret = 0; tsk_segment_t *x = self->ancestor_map_head[input_id]; tsk_segment_t y; /* y is the segment that has been removed */ - tsk_segment_t *x_head, *x_tail, *x_prev, *seg_left, *seg_right; + tsk_segment_t *x_head, *x_prev, *seg_left, *seg_right; x_head = NULL; x_prev = NULL; @@ -6152,22 +6152,8 @@ simplifier_extract_ancestry( } } - x = x_head; - x_tail = x_head; - while (x != NULL) { - x_tail = x; - if (x->next != NULL) { - assert(x->right <= x->next->left); - if (x->next->left == x->right && x->node == x->next->node) { - // Squash out (and free) the x.next segment. - x->right = x->next->right; - x->next = x->next->next; - } - } - x = x->next; - } self->ancestor_map_head[input_id] = x_head; - self->ancestor_map_tail[input_id] = x_tail; + self->ancestor_map_tail[input_id] = x_prev; out: return ret; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 61b50340c1..f952d2d60b 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -126,6 +126,10 @@ SVG drawing improvements and many others. - Add ``tree_sequence.to_macs()`` function to convert tree sequence to MACS format (:user:`winni2k`, :pr:`727`) +- Add a ``keep_input_roots`` option to simplify which, if enabled, adds edges + from the MRCAs of samples in the simplified tree sequence back to the roots + in the input tree sequence (:user:`jeromekelleher`, :issue:`775`, :pr:`782`). + **Bugfixes** - :issue:`453` - Fix LibraryError when ``tree.newick()`` is called with large node time diff --git a/python/tests/simplify.py b/python/tests/simplify.py index af694927b8..2428f1c95a 100644 --- a/python/tests/simplify.py +++ b/python/tests/simplify.py @@ -359,23 +359,11 @@ def extract_ancestry(self, edge): x_head = x x_prev = x x = x.next - - # We could probably do the squashing and tail tracking - # as part of the pass above, but keeping it simpler for now. + # Note - we had some code to defragment segments in the output + # chain here, but couldn't find an example where it needed to + # be called. So, looks like squashing isn't necessary here. self.A_head[edge.child] = x_head - x = x_head - x_tail = x_head - while x is not None: - x_tail = x - if x.next is not None: - assert x.right <= x.next.left - if x.next.left == x.right and x.node == x.next.node: - # Squash out the x.next segment - x.right = x.next.right - x.next = x.next.next - x = x.next - self.A_tail[edge.child] = x_tail - + self.A_tail[edge.child] = x_prev return S def process_parent_edges(self, edges):