From f677e3627a4f648c9d959ec669cc71e336192c8e Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 21 Aug 2020 17:48:00 +0100 Subject: [PATCH 1/2] Change diff iter types to tsk_ equivalents --- c/tests/test_trees.c | 2 +- c/tskit/trees.c | 16 ++++++++-------- c/tskit/trees.h | 10 +++++----- python/_tskitmodule.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 20a2c70e70..e8b021dc7e 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -175,7 +175,7 @@ verify_individual_nodes(tsk_treeseq_t *ts) } static void -verify_trees(tsk_treeseq_t *ts, uint32_t num_trees, tsk_id_t *parents) +verify_trees(tsk_treeseq_t *ts, tsk_size_t num_trees, tsk_id_t *parents) { int ret; tsk_id_t u, j, v; diff --git a/c/tskit/trees.c b/c/tskit/trees.c index a91a4718d8..c8cd16f26b 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4383,7 +4383,7 @@ tsk_diff_iter_init(tsk_diff_iter_t *self, tsk_treeseq_t *tree_sequence) self->insertion_index = 0; self->removal_index = 0; self->tree_left = 0; - self->tree_index = (size_t) -1; + self->tree_index = -1; self->edge_list_nodes = malloc(self->num_edges * sizeof(*self->edge_list_nodes)); if (self->edge_list_nodes == NULL) { ret = TSK_ERR_NO_MEMORY; @@ -4420,7 +4420,7 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, const double sequence_length = self->tree_sequence->tables->sequence_length; double left = self->tree_left; double right = sequence_length; - size_t next_edge_list_node = 0; + tsk_size_t next_edge_list_node = 0; tsk_treeseq_t *s = self->tree_sequence; tsk_edge_list_node_t *out_head = NULL; tsk_edge_list_node_t *out_tail = NULL; @@ -4429,7 +4429,7 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, tsk_edge_list_node_t *w = NULL; tsk_edge_list_t edges_out; tsk_edge_list_t edges_in; - size_t num_trees = tsk_treeseq_get_num_trees(s); + tsk_size_t num_trees = tsk_treeseq_get_num_trees(s); const tsk_edge_table_t *edges = &s->tables->edges; const tsk_id_t *insertion_order = s->tables->indexes.edge_insertion_order; const tsk_id_t *removal_order = s->tables->indexes.edge_removal_order; @@ -4437,9 +4437,9 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, memset(&edges_out, 0, sizeof(edges_out)); memset(&edges_in, 0, sizeof(edges_in)); - if (self->tree_index + 1 < num_trees) { + if (self->tree_index + 1 < (tsk_id_t) num_trees) { /* First we remove the stale records */ - while (self->removal_index < self->num_edges + while (self->removal_index < (tsk_id_t) self->num_edges && left == edges->right[removal_order[self->removal_index]]) { k = removal_order[self->removal_index]; assert(next_edge_list_node < self->num_edges); @@ -4469,7 +4469,7 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, edges_out.tail = out_tail; /* Now insert the new records */ - while (self->insertion_index < self->num_edges + while (self->insertion_index < (tsk_id_t) self->num_edges && left == edges->left[insertion_order[self->insertion_index]]) { k = insertion_order[self->insertion_index]; assert(next_edge_list_node < self->num_edges); @@ -4499,10 +4499,10 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, edges_in.tail = in_tail; right = sequence_length; - if (self->insertion_index < self->num_edges) { + if (self->insertion_index < (tsk_id_t) self->num_edges) { right = TSK_MIN(right, edges->left[insertion_order[self->insertion_index]]); } - if (self->removal_index < self->num_edges) { + if (self->removal_index < (tsk_id_t) self->num_edges) { right = TSK_MIN(right, edges->right[removal_order[self->removal_index]]); } self->tree_index++; diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 1a41da1d86..69226eb586 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -198,13 +198,13 @@ typedef struct { } tsk_edge_list_t; typedef struct { - size_t num_nodes; - size_t num_edges; + tsk_size_t num_nodes; + tsk_size_t num_edges; double tree_left; tsk_treeseq_t *tree_sequence; - size_t insertion_index; - size_t removal_index; - size_t tree_index; + tsk_id_t insertion_index; + tsk_id_t removal_index; + tsk_id_t tree_index; tsk_edge_list_node_t *edge_list_nodes; } tsk_diff_iter_t; diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index d02c5257fd..99fa2437e2 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -9892,7 +9892,7 @@ TreeDiffIterator_next(TreeDiffIterator *self) PyObject *value = NULL; int err; double left, right; - size_t list_size, j; + tsk_size_t list_size, j; tsk_edge_list_node_t *record; tsk_edge_list_t records_out, records_in; From e91b43637408b44563aee46e83246eeb612d3fdd Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 21 Aug 2020 20:14:01 +0100 Subject: [PATCH 2/2] Add include_terminal param to edge diff iterator --- c/CHANGELOG.rst | 2 + c/tests/test_trees.c | 109 +++++++++++++++------ c/tests/testlib.c | 170 +++++++++++++++++++++++++-------- c/tests/testlib.h | 8 ++ c/tskit/haplotype_matching.c | 2 +- c/tskit/trees.c | 12 ++- c/tskit/trees.h | 7 +- python/CHANGELOG.rst | 3 + python/_tskitmodule.c | 14 ++- python/tests/test_highlevel.py | 21 ++++ python/tskit/trees.py | 16 ++-- 11 files changed, 276 insertions(+), 88 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index c2c917dde2..97355f9a4c 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -2,6 +2,8 @@ [0.99.7] - 2020-XX-XX --------------------- +- Added ``TSK_INCLUDE_TERMINAL`` option to ``tsk_diff_iter_init`` to output the last edges + at the end of a tree sequence (:user:`hyanwong`, :issue:`783`, :pr:`787`) --------------------- [0.99.6] - 2020-09-04 diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index e8b021dc7e..e705613e1c 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -378,16 +378,16 @@ verify_tree_next_prev(tsk_treeseq_t *ts) } static void -verify_tree_diffs(tsk_treeseq_t *ts) +verify_tree_diffs(tsk_treeseq_t *ts, tsk_flags_t options) { - int ret; + int ret, valid_tree; tsk_diff_iter_t iter; tsk_tree_t tree; tsk_edge_list_node_t *record; tsk_edge_list_t records_out, records_in; - size_t num_nodes = tsk_treeseq_get_num_nodes(ts); - size_t j, num_trees; - double left, right; + tsk_size_t num_nodes = tsk_treeseq_get_num_nodes(ts); + tsk_size_t j, num_trees; + double lft, rgt; tsk_id_t *parent = malloc(num_nodes * sizeof(tsk_id_t)); tsk_id_t *child = malloc(num_nodes * sizeof(tsk_id_t)); tsk_id_t *sib = malloc(num_nodes * sizeof(tsk_id_t)); @@ -400,17 +400,17 @@ verify_tree_diffs(tsk_treeseq_t *ts) child[j] = TSK_NULL; sib[j] = TSK_NULL; } - ret = tsk_diff_iter_init(&iter, ts); + ret = tsk_diff_iter_init(&iter, ts, options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_init(&tree, ts, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_tree_first(&tree); - CU_ASSERT_EQUAL_FATAL(ret, 1); + valid_tree = tsk_tree_first(&tree); + CU_ASSERT_EQUAL_FATAL(valid_tree, 1); tsk_diff_iter_print_state(&iter, _devnull); num_trees = 0; - while ((ret = tsk_diff_iter_next(&iter, &left, &right, &records_out, &records_in)) - == 1) { + while ( + (ret = tsk_diff_iter_next(&iter, &lft, &rgt, &records_out, &records_in)) == 1) { tsk_diff_iter_print_state(&iter, _devnull); num_trees++; /* Update forwards */ @@ -420,9 +420,11 @@ verify_tree_diffs(tsk_treeseq_t *ts) for (record = records_in.head; record != NULL; record = record->next) { parent[record->edge.child] = record->edge.parent; } - /* Now check against the sparse tree iterator. */ - for (j = 0; j < num_nodes; j++) { - CU_ASSERT_EQUAL(parent[j], tree.parent[j]); + if (valid_tree) { + /* Now check against the sparse tree iterator. */ + for (j = 0; j < num_nodes; j++) { + CU_ASSERT_EQUAL(parent[j], tree.parent[j]); + } } /* Update backwards */ for (record = records_out.tail; record != NULL; record = record->prev) { @@ -431,21 +433,34 @@ verify_tree_diffs(tsk_treeseq_t *ts) for (record = records_in.tail; record != NULL; record = record->prev) { parent[record->edge.child] = record->edge.parent; } - /* Now check against the sparse tree iterator. */ - for (j = 0; j < num_nodes; j++) { - CU_ASSERT_EQUAL(parent[j], tree.parent[j]); - } - CU_ASSERT_EQUAL(tree.left, left); - CU_ASSERT_EQUAL(tree.right, right); - ret = tsk_tree_next(&tree); - if (num_trees < tsk_treeseq_get_num_trees(ts)) { - CU_ASSERT_EQUAL(ret, 1); + if (valid_tree) { + /* Now check against the sparse tree iterator. */ + for (j = 0; j < num_nodes; j++) { + CU_ASSERT_EQUAL(parent[j], tree.parent[j]); + } + CU_ASSERT_EQUAL(tree.left, lft); + CU_ASSERT_EQUAL(tree.right, rgt); + valid_tree = tsk_tree_next(&tree); + if (num_trees < tsk_treeseq_get_num_trees(ts)) { + CU_ASSERT_EQUAL(valid_tree, 1); + } else { + CU_ASSERT_EQUAL(valid_tree, 0); + } } else { - CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_TRUE_FATAL(options & TSK_INCLUDE_TERMINAL); + for (j = 0; j < num_nodes; j++) { + CU_ASSERT_EQUAL(parent[j], -1); + } + CU_ASSERT_EQUAL(lft, tsk_treeseq_get_sequence_length(ts)); + CU_ASSERT_EQUAL(rgt, tsk_treeseq_get_sequence_length(ts)); } } - CU_ASSERT_EQUAL(num_trees, tsk_treeseq_get_num_trees(ts)); - CU_ASSERT_EQUAL_FATAL(ret, 0); + if (options & TSK_INCLUDE_TERMINAL) { + CU_ASSERT_EQUAL(num_trees, tsk_treeseq_get_num_trees(ts) + 1); + } else { + CU_ASSERT_EQUAL(num_trees, tsk_treeseq_get_num_trees(ts)); + } + CU_ASSERT_EQUAL_FATAL(valid_tree, 0); ret = tsk_diff_iter_free(&iter); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_tree_free(&tree); @@ -4597,7 +4612,8 @@ test_simple_diff_iter(void) tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, NULL, NULL, paper_ex_individuals, NULL, 0); - verify_tree_diffs(&ts); + verify_tree_diffs(&ts, 0); + verify_tree_diffs(&ts, TSK_INCLUDE_TERMINAL); ret = tsk_treeseq_free(&ts); CU_ASSERT_EQUAL(ret, 0); @@ -4611,7 +4627,8 @@ test_nonbinary_diff_iter(void) tsk_treeseq_from_text(&ts, 100, nonbinary_ex_nodes, nonbinary_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); - verify_tree_diffs(&ts); + verify_tree_diffs(&ts, 0); + verify_tree_diffs(&ts, TSK_INCLUDE_TERMINAL); ret = tsk_treeseq_free(&ts); CU_ASSERT_EQUAL(ret, 0); @@ -4625,7 +4642,8 @@ test_unary_diff_iter(void) tsk_treeseq_from_text( &ts, 10, unary_ex_nodes, unary_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); - verify_tree_diffs(&ts); + verify_tree_diffs(&ts, 0); + verify_tree_diffs(&ts, TSK_INCLUDE_TERMINAL); ret = tsk_treeseq_free(&ts); CU_ASSERT_EQUAL(ret, 0); @@ -4639,7 +4657,38 @@ test_internal_sample_diff_iter(void) tsk_treeseq_from_text(&ts, 10, internal_sample_ex_nodes, internal_sample_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); - verify_tree_diffs(&ts); + verify_tree_diffs(&ts, 0); + verify_tree_diffs(&ts, TSK_INCLUDE_TERMINAL); + + ret = tsk_treeseq_free(&ts); + CU_ASSERT_EQUAL(ret, 0); +} + +static void +test_multiroot_diff_iter(void) +{ + int ret; + tsk_treeseq_t ts; + + tsk_treeseq_from_text(&ts, 10, multiroot_ex_nodes, multiroot_ex_edges, NULL, NULL, + NULL, NULL, NULL, 0); + verify_tree_diffs(&ts, 0); + verify_tree_diffs(&ts, TSK_INCLUDE_TERMINAL); + + ret = tsk_treeseq_free(&ts); + CU_ASSERT_EQUAL(ret, 0); +} + +static void +test_empty_diff_iter(void) +{ + int ret; + tsk_treeseq_t ts; + + tsk_treeseq_from_text( + &ts, 10, empty_ex_nodes, empty_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); + verify_tree_diffs(&ts, 0); + verify_tree_diffs(&ts, TSK_INCLUDE_TERMINAL); ret = tsk_treeseq_free(&ts); CU_ASSERT_EQUAL(ret, 0); @@ -6041,6 +6090,8 @@ main(int argc, char **argv) { "test_nonbinary_diff_iter", test_nonbinary_diff_iter }, { "test_unary_diff_iter", test_unary_diff_iter }, { "test_internal_sample_diff_iter", test_internal_sample_diff_iter }, + { "test_multiroot_diff_iter", test_multiroot_diff_iter }, + { "test_empty_diff_iter", test_empty_diff_iter }, /* Sample sets */ { "test_simple_sample_sets", test_simple_sample_sets }, diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 59b5bd17e6..5a4b6552c4 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -27,7 +27,7 @@ char *_tmp_file_name; FILE *_devnull; -/* Simple single tree example. */ +/*** Simple single tree example. ***/ const char *single_tree_ex_nodes = /* 6 */ "1 0 -1 -1\n" /* / \ */ "1 0 -1 -1\n" /* / \ */ @@ -51,7 +51,21 @@ const char *single_tree_ex_mutations "2 2 1 -1\n" "2 3 1 -1\n"; -/* Example from the PLOS paper */ +/*** Example from the PLOS paper ***/ +/* +0.25┊ 8 ┊ ┊ ┊ + ┊ ┏━┻━┓ ┊ ┊ ┊ +0.20┊ ┃ ┃ ┊ ┊ 7 ┊ + ┊ ┃ ┃ ┊ ┊ ┏━┻━┓ ┊ +0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ + ┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ +0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊ + ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊ +0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊ + ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊ +0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ + 0.00 2.00 7.00 10.00 +*/ const char *paper_ex_nodes = "1 0 -1 0\n" "1 0 -1 0\n" "1 0 -1 1\n" @@ -80,7 +94,20 @@ const char *paper_ex_mutations = "0 2 1\n" const char *paper_ex_individuals = "0 0.2,1.5\n" "0 0.0,0.0\n"; -/* An example of a nonbinary tree sequence */ +/*** An example of a nonbinary tree sequence ***/ +/* +0.41┊ 12 ┊ 12 ┊ + ┊ ┃ ┊ ┃ ┊ +0.28┊ ┃ ┊ ┃ ┊ + ┊ ┃ ┊ ┃ ┊ +0.13┊ 10 ┊ 10 ┊ + ┊ ┏━╋━┓ ┊ ┏┻┓ ┊ +0.07┊ ┃ ┃ ┃ ┊ ┃ ┃ ┊ + ┊ ┃ ┃ ┃ ┊ ┃ ┃ ┊ +0.01┊ ┃ ┃ ┃ ┊ ┃ ┃ ┊ + ┊ ┃ ┃ ┃ ┊ ┃ ┃ ┊ +0.00┊ 0 1 2 3 4 5 7 6 ┊ 0 1 2 3 4 7 5 6 ┊ +*/ const char *nonbinary_ex_nodes = "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" @@ -108,7 +135,21 @@ const char *nonbinary_ex_sites = "1 0\n" const char *nonbinary_ex_mutations = "0 2 1\n" "1 11 1"; -/* An example of a tree sequence with unary nodes. */ +/*** An example of a tree sequence with unary nodes. ***/ +/* +0.25┊ 8 ┊ 8 ┊ ┊ + ┊ ┏━┻━┓ ┊ ┃ ┊ ┊ +0.20┊ ┃ 7 ┊ ┃ ┊ 7 ┊ + ┊ ┃ ┃ ┊ ┃ ┊ ┏━┻━┓ ┊ +0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊ + ┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ +0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊ + ┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊ +0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊ + ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊ +0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊ + 0.00 2.00 7.00 10.00 +*/ const char *unary_ex_nodes = "1 0 0 -1\n" "1 0 0 -1\n" "1 0 0 -1\n" @@ -129,7 +170,7 @@ const char *unary_ex_edges = "2 10 4 2,3\n" "0 7 8 6\n" "0 2 8 7\n"; -/* We make one mutation for each tree, over unary nodes if this exist */ +/* We make one mutation for each tree, over unary nodes if they exist */ const char *unary_ex_sites = "1.0 0\n" "4.5 0\n" "8.5 0\n"; @@ -137,45 +178,25 @@ const char *unary_ex_mutations = "0 2 1\n" "1 6 1\n" "2 5 1\n"; -/* An example of a tree sequence with internally sampled nodes. */ +/*** An example of a tree sequence with internally sampled nodes. ***/ -/* TODO: find a way to draw these side-by-side */ /* - 7 -+-+-+ -| 5 -| +-++ -| | 4 -| | +++ -| | | 3 -| | | -| 1 2 -| -0 - - 8 -+-+-+ -| 5 -| +-++ -| | 4 -| | +++ -3 | | | - | | | - 1 2 | - | - 0 - - 6 -+-+-+ -| 5 -| +-++ -| | 4 -| | +++ -| | | 3 -| | | -| 1 2 -| -0 +1.20┊ ┊ 8 ┊ ┊ + ┊ ┊ ┏━┻━┓ ┊ ┊ +1.00┊ 7 ┊ ┃ ┃ ┊ ┊ + ┊ ┏━┻━┓ ┊ ┃ ┃ ┊ ┊ +0.70┊ ┃ ┃ ┊ ┃ ┃ ┊ 6 ┊ + ┊ ┃ ┃ ┊ ┃ ┃ ┊ ┏━┻━┓ ┊ +0.50┊ ┃ 5 ┊ 5 ┃ ┊ ┃ 5 ┊ + ┊ ┃ ┏━┻┓ ┊ ┏┻━┓ ┃ ┊ ┃ ┏━┻┓ ┊ +0.40┊ ┃ ┃ 4 ┊ 4 ┃ ┃ ┊ ┃ ┃ 4 ┊ + ┊ ┃ ┃ ┏┻┓ ┊ ┏┻┓ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ +0.20┊ ┃ ┃ ┃ 3 ┊ ┃ ┃ ┃ 3 ┊ ┃ ┃ ┃ 3 ┊ + ┊ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┊ +0.10┊ ┃ 1 2 ┊ ┃ 2 1 ┊ ┃ 1 2 ┊ + ┊ ┃ ┊ ┃ ┊ ┃ ┊ +0.00┊ 0 ┊ 0 ┊ 0 ┊ + 0.00 2.00 8.00 10.00 */ const char *internal_sample_ex_nodes = "1 0.0 0 -1\n" @@ -203,6 +224,71 @@ const char *internal_sample_ex_mutations = "0 2 1\n" "1 5 1\n" "2 5 1\n"; +/*** An example of a tree sequence with multiple roots. ***/ +/* +0.90┊ ┊ 11 ┊ ┊ + ┊ ┊ ┏┻┓ ┊ ┊ +0.80┊ 10 ┊ ┃ ┃ ┊ ┊ + ┊ ┏┻┓ ┊ ┃ ┃ ┊ ┊ +0.40┊ ┃ ┃ 9 ┊ ┃ ┃ 9 ┊ 9 ┊ + ┊ ┃ ┃ ┏━┻┓ ┊ ┃ ┃ ┏━┻━┓ ┊ ┏━┻━━┓ ┊ +0.30┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 8 ┃ ┊ ┃ 8 ┊ + ┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┃ ┊ ┃ ┏┻┓ ┊ +0.20┊ ┃ ┃ ┃ 7 ┊ ┃ ┃ ┃ ┃ 7 ┊ 7 ┃ ┃ ┊ + ┊ ┃ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┃ ┃ ┏┻┓ ┊ ┏┻━┓ ┃ ┃ ┊ +0.10┊ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┃ ┃ ┃ ┊ 6 ┃ ┃ ┃ ┊ + ┊ ┃ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┃ ┃ ┃ ┃ ┊ ┏┻┓ ┃ ┃ ┃ ┊ +0.00┊ 0 1 2 3 4 5 ┊ 0 5 1 2 3 4 ┊ 0 3 4 1 2 5 ┊ + 0.00 4.00 8.00 10.00 +*/ +const char *multiroot_ex_nodes = "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "0 0.1 0 -1\n" + "0 0.2 0 -1\n" + "0 0.3 0 -1\n" + "0 0.4 0 -1\n" + "0 0.8 0 -1\n" + "0 0.9 0 -1\n"; +const char *multiroot_ex_edges = "8 10 6 0,3\n" + "0 8 7 3\n" + "0 10 7 4\n" + "8 10 7 6\n" + "4 10 8 1,2\n" + "0 4 9 2\n" + "0 10 9 7\n" + "4 10 9 8\n" + "0 4 10 0,1\n" + "4 8 11 0,5\n"; + +/* We make one mutation over each root node */ +const char *multiroot_ex_sites = "1.0 0\n" + "2.0 0\n" + "3.0 0\n" + "5.0 0\n" + "6.0 0\n" + "8.0 0\n" + "9.0 0\n"; +const char *multiroot_ex_mutations = "0 10 1\n" + "1 9 1\n" + "2 5 1\n" + "3 11 1\n" + "4 9 1\n" + "5 9 1\n" + "6 5 1\n"; + +/*** An example of a empty tree sequence. ***/ +const char *empty_ex_nodes = "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n" + "1 0.0 0 -1\n"; +const char *empty_ex_edges = ""; + /* Simple utilities to parse text so we can write declaritive * tests. This is not intended as a robust general input mechanism. */ diff --git a/c/tests/testlib.h b/c/tests/testlib.h index 3f2d1fd9ac..e70a0f37bf 100644 --- a/c/tests/testlib.h +++ b/c/tests/testlib.h @@ -75,6 +75,14 @@ extern const char *internal_sample_ex_edges; extern const char *internal_sample_ex_sites; extern const char *internal_sample_ex_mutations; +extern const char *multiroot_ex_nodes; +extern const char *multiroot_ex_edges; +extern const char *multiroot_ex_sites; +extern const char *multiroot_ex_mutations; + +extern const char *empty_ex_nodes; +extern const char *empty_ex_edges; + extern const char *paper_ex_nodes; extern const char *paper_ex_edges; extern const char *paper_ex_sites; diff --git a/c/tskit/haplotype_matching.c b/c/tskit/haplotype_matching.c index c791c9ec93..5711df933d 100644 --- a/c/tskit/haplotype_matching.c +++ b/c/tskit/haplotype_matching.c @@ -240,7 +240,7 @@ tsk_ls_hmm_reset(tsk_ls_hmm_t *self) /* This is safe because we've already zero'd out the memory. */ tsk_diff_iter_free(&self->diffs); - ret = tsk_diff_iter_init(&self->diffs, self->tree_sequence); + ret = tsk_diff_iter_init(&self->diffs, self->tree_sequence, false); if (ret != 0) { goto out; } diff --git a/c/tskit/trees.c b/c/tskit/trees.c index c8cd16f26b..33129b9f68 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4371,7 +4371,8 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, * ======================================================== */ int TSK_WARN_UNUSED -tsk_diff_iter_init(tsk_diff_iter_t *self, tsk_treeseq_t *tree_sequence) +tsk_diff_iter_init( + tsk_diff_iter_t *self, tsk_treeseq_t *tree_sequence, tsk_flags_t options) { int ret = 0; @@ -4384,6 +4385,10 @@ tsk_diff_iter_init(tsk_diff_iter_t *self, tsk_treeseq_t *tree_sequence) self->removal_index = 0; self->tree_left = 0; self->tree_index = -1; + self->last_index = (tsk_id_t) tsk_treeseq_get_num_trees(tree_sequence); + if (options & TSK_INCLUDE_TERMINAL) { + self->last_index = self->last_index + 1; + } self->edge_list_nodes = malloc(self->num_edges * sizeof(*self->edge_list_nodes)); if (self->edge_list_nodes == NULL) { ret = TSK_ERR_NO_MEMORY; @@ -4429,7 +4434,6 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, tsk_edge_list_node_t *w = NULL; tsk_edge_list_t edges_out; tsk_edge_list_t edges_in; - tsk_size_t num_trees = tsk_treeseq_get_num_trees(s); const tsk_edge_table_t *edges = &s->tables->edges; const tsk_id_t *insertion_order = s->tables->indexes.edge_insertion_order; const tsk_id_t *removal_order = s->tables->indexes.edge_removal_order; @@ -4437,7 +4441,7 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right, memset(&edges_out, 0, sizeof(edges_out)); memset(&edges_in, 0, sizeof(edges_in)); - if (self->tree_index + 1 < (tsk_id_t) num_trees) { + if (self->tree_index + 1 < self->last_index) { /* First we remove the stale records */ while (self->removal_index < (tsk_id_t) self->num_edges && left == edges->right[removal_order[self->removal_index]]) { @@ -4943,7 +4947,7 @@ tsk_treeseq_kc_distance( if (ret != 0) { goto out; } - ret = tsk_diff_iter_init(&diff_iters[i], treeseqs[i]); + ret = tsk_diff_iter_init(&diff_iters[i], treeseqs[i], false); if (ret != 0) { goto out; } diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 69226eb586..7d8282c832 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -56,6 +56,9 @@ extern "C" { #define TSK_DIR_FORWARD 1 #define TSK_DIR_REVERSE -1 + +/* For the edge diff iterator */ +#define TSK_INCLUDE_TERMINAL (1 << 0) // clang-format on /** @@ -205,6 +208,7 @@ typedef struct { tsk_id_t insertion_index; tsk_id_t removal_index; tsk_id_t tree_index; + tsk_id_t last_index; tsk_edge_list_node_t *edge_list_nodes; } tsk_diff_iter_t; @@ -412,7 +416,8 @@ int tsk_tree_kc_distance( /* Diff iterator */ /****************************************************************************/ -int tsk_diff_iter_init(tsk_diff_iter_t *self, tsk_treeseq_t *tree_sequence); +int tsk_diff_iter_init( + tsk_diff_iter_t *self, tsk_treeseq_t *tree_sequence, tsk_flags_t options); int tsk_diff_iter_free(tsk_diff_iter_t *self); int tsk_diff_iter_next(tsk_diff_iter_t *self, double *left, double *right, tsk_edge_list_t *edges_out, tsk_edge_list_t *edges_in); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 835b1ad77e..36b5d6544b 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -7,6 +7,9 @@ - Genomic intervals returned by python functions are now namedtuples, allowing ``.left`` ``.right`` and ``.span`` usage (:user:`hyanwong`, :issue:`784`, :pr:`786`, :pr:`811`) +- Added ``include_terminal`` parameter to edge diffs iterator, to output the last edges + at the end of a tree sequence (:user:`hyanwong`, :issue:`783`, :pr:`787`) + - :issue:`832` - Add ``metadata_bytes`` method to allow access to raw TableCollection metadata (:user:`benjeffery`, :pr:`842`) diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 99fa2437e2..4526aac3b3 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -9852,15 +9852,20 @@ TreeDiffIterator_init(TreeDiffIterator *self, PyObject *args, PyObject *kwds) { int ret = -1; int err; - static char *kwlist[] = {"tree_sequence", NULL}; + static char *kwlist[] = {"tree_sequence", "include_terminal", NULL}; TreeSequence *tree_sequence; + int include_terminal = 0; + tsk_flags_t options = 0; self->tree_diff_iterator = NULL; self->tree_sequence = NULL; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!", kwlist, - &TreeSequenceType, &tree_sequence)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|p", kwlist, + &TreeSequenceType, &tree_sequence, &include_terminal)) { goto out; } + if (include_terminal) { + options |= TSK_INCLUDE_TERMINAL; + } self->tree_sequence = tree_sequence; Py_INCREF(self->tree_sequence); if (TreeSequence_check_tree_sequence(self->tree_sequence) != 0) { @@ -9873,7 +9878,8 @@ TreeDiffIterator_init(TreeDiffIterator *self, PyObject *args, PyObject *kwds) } memset(self->tree_diff_iterator, 0, sizeof(tsk_diff_iter_t)); err = tsk_diff_iter_init(self->tree_diff_iterator, - self->tree_sequence->tree_sequence); + self->tree_sequence->tree_sequence, + options); if (err != 0) { handle_library_error(err); goto out; diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 6264f78a25..d83f053f0a 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -658,6 +658,27 @@ def test_edge_diffs(self): for ts in get_example_tree_sequences(): self.verify_edge_diffs(ts) + def test_edge_diffs_include_terminal(self): + for ts in get_example_tree_sequences(): + edges = set() + i = 0 + breakpoints = list(ts.breakpoints()) + for (left, right), e_out, e_in in ts.edge_diffs(include_terminal=True): + self.assertEqual(left, breakpoints[i]) + if i == ts.num_trees: + # Last iteration, right==left==sequence_length + self.assertEqual(left, ts.sequence_length) + self.assertEqual(right, ts.sequence_length) + else: + self.assertEqual(right, breakpoints[i + 1]) + for e in e_out: + edges.remove(e.id) + for e in e_in: + edges.add(e.id) + i += 1 + self.assertEqual(i, ts.num_trees + 1) + self.assertEqual(len(edges), 0) + def verify_edgesets(self, ts): """ Verifies that the edgesets we return are equivalent to the original edges. diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 5dd22f6420..ac7b9bcc5a 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -3474,7 +3474,7 @@ def edgesets(self): edgeset.children = sorted(children[edgeset.parent]) yield edgeset - def edge_diffs(self): + def edge_diffs(self, include_terminal=False): """ Returns an iterator over all the edges that are inserted and removed to build the trees as we move from left-to-right along the tree sequence. @@ -3492,15 +3492,17 @@ def edge_diffs(self): descending parent time, parent id, then child_id). This means that within each list, edges with the same parent appear consecutively. - This iterator terminates after the final interval in the tree sequence - (i.e. it does not report a final removal of all remaining edges); the - number of iterations is thus always equal to the number of trees in the - tree sequence. - + :param bool include_terminal: If False (default), the iterator terminates + after the final interval in the tree sequence (i.e. it does not + report a final removal of all remaining edges), and the number + of iterations will be equal to the number of trees in the tree + sequence. If True, an additional iteration takes place, with the last + ``edges_out`` value reporting all the edges contained in the final + tree (with both ``left`` and ``right`` equal to the sequence length). :return: An iterator over the (interval, edges_out, edges_in) tuples. :rtype: :class:`collections.abc.Iterable` """ - iterator = _tskit.TreeDiffIterator(self._ll_tree_sequence) + iterator = _tskit.TreeDiffIterator(self._ll_tree_sequence, include_terminal) metadata_decoder = self.table_metadata_schemas.edge.decode_row for interval, edge_tuples_out, edge_tuples_in in iterator: edges_out = [Edge(*(e + (metadata_decoder,))) for e in edge_tuples_out]