From 929f4d2e512cc52db687a06499654f80d5bda858 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 18 Jan 2021 14:59:11 +0000 Subject: [PATCH] Add KEEP_UNARY_IN_INDIVIDUALS option to simplify() --- c/CHANGELOG.rst | 4 ++ c/tests/test_trees.c | 98 +++++++++++++++++++++++++++++++++++++++++++- c/tskit/core.c | 5 +++ c/tskit/core.h | 3 ++ c/tskit/tables.c | 17 +++++++- c/tskit/tables.h | 4 ++ 6 files changed, 129 insertions(+), 2 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index bc9b3ef277..9e432ac7a5 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -6,6 +6,10 @@ **Features** +- Add ``TSK_KEEP_UNARY_IN_INDIVIDUALS`` flag to simplify, which allows the user to + keep unary nodes only if they belong to a tabled individual. This is useful for + simplification in forwards simulations (:user:`hyanwong`, :issue:`1113`, :pr:`1119`). + **Bugfixes** diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 9c7490936f..d0a677ae59 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -940,11 +940,22 @@ test_simplest_records(void) CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); + ret = tsk_treeseq_simplify(&ts, sample_ids, 2, + TSK_KEEP_UNARY | TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE); + tsk_treeseq_free(&simplified); + ret = tsk_treeseq_simplify(&ts, sample_ids, 2, TSK_KEEP_UNARY, &simplified, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); + ret = tsk_treeseq_simplify( + &ts, sample_ids, 2, TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); + tsk_treeseq_free(&simplified); + tsk_treeseq_free(&ts); } @@ -978,6 +989,12 @@ test_simplest_nonbinary_records(void) CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); + ret = tsk_treeseq_simplify( + &ts, sample_ids, 4, TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); + tsk_treeseq_free(&simplified); + tsk_treeseq_free(&ts); } @@ -993,7 +1010,7 @@ test_simplest_unary_records(void) const char *edges = "0 1 2 0\n" "0 1 3 1\n" "0 1 4 2,3\n"; - tsk_treeseq_t ts, simplified; + tsk_treeseq_t ts, simplified, simplified_other; tsk_id_t sample_ids[] = { 0, 1 }; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -1019,6 +1036,84 @@ test_simplest_unary_records(void) CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); tsk_treeseq_free(&simplified); + ret = tsk_treeseq_simplify( + &ts, sample_ids, 2, TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, simplified.tables, 0)); + ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified_other, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE( + tsk_table_collection_equals(simplified.tables, simplified_other.tables, 0)); + tsk_treeseq_free(&simplified); + tsk_treeseq_free(&simplified_other); + + tsk_treeseq_free(&ts); +} + +static void +test_simplest_unary_with_individuals(void) +{ + int ret; + const char *nodes = "1 0 0 -1\n" + "1 0 0 0\n" + "0 1 0 -1\n" + "0 1 0 1\n" + "0 2 0 -1\n" + "0 3 0 -1\n" + "0 3 0 2\n" + "0 1 0 -1\n" + "0 1 0 3\n" + "0 0 0 -1\n" + "0 0 0 4\n" + "0 1 0 3\n"; + const char *edges = "0 2 2 0\n" + "0 2 3 1\n" + "2 3 7 0\n" + "2 3 8 1,9\n" + "2 3 11 10\n" + "0 2 4 2,3\n" + "0 1 5 4\n" + "1 2 6 4\n"; + const char *individuals = "0 0.5\n" + "0 1.5,3.1\n" + "0 2.1\n" + "0 3.2\n" + "0 4.2\n"; + const char *nodes_expect = "1 0 0 -1\n" + "1 0 0 0\n" + "0 1 0 1\n" + "0 1 0 3\n" + "0 2 0 -1\n" + "0 3 0 2\n"; + const char *edges_expect = "0 2 2 1\n" + "2 3 3 1\n" + "0 2 4 0,2\n" + "1 2 5 4\n"; + const char *individuals_expect = "0 0.5\n" + "0 1.5,3.1\n" + "0 2.1\n" + "0 3.2\n"; + tsk_treeseq_t ts, simplified, expected; + tsk_id_t sample_ids[] = { 0, 1 }; + + tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, NULL, NULL, individuals, NULL, 0); + tsk_treeseq_from_text(&expected, 3, nodes_expect, edges_expect, NULL, NULL, NULL, + individuals_expect, NULL, 0); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2); + CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 12); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_individuals(&ts), 5); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_populations(&ts), 1); + + ret = tsk_treeseq_simplify(&ts, sample_ids, 2, + TSK_KEEP_UNARY_IN_INDIVIDUALS | TSK_FILTER_INDIVIDUALS, &simplified, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(simplified.tables, expected.tables, 0)); + tsk_treeseq_free(&simplified); + + tsk_treeseq_free(&expected); tsk_treeseq_free(&ts); } @@ -5987,6 +6082,7 @@ main(int argc, char **argv) { "test_simplest_records", test_simplest_records }, { "test_simplest_nonbinary_records", test_simplest_nonbinary_records }, { "test_simplest_unary_records", test_simplest_unary_records }, + { "test_simplest_unary_with_individuals", test_simplest_unary_with_individuals }, { "test_simplest_non_sample_leaf_records", test_simplest_non_sample_leaf_records }, { "test_simplest_degenerate_multiple_root_records", diff --git a/c/tskit/core.c b/c/tskit/core.c index 99d284e1dd..6348a21849 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -470,6 +470,11 @@ tsk_strerror_internal(int err) case TSK_ERR_DUPLICATE_SAMPLE_PAIRS: ret = "There are duplicate sample pairs."; break; + + /* Simplify errors */ + case TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE: + ret = "You cannot specify both KEEP_UNARY and KEEP_UNARY_IN_INDIVDUALS."; + break; } return ret; } diff --git a/c/tskit/core.h b/c/tskit/core.h index 3cbcd89096..ec45570a6d 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -320,6 +320,9 @@ not found in the file. #define TSK_ERR_NO_SAMPLE_PAIRS -1500 #define TSK_ERR_DUPLICATE_SAMPLE_PAIRS -1501 +/* Simplify errors */ +#define TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE -1600 + // clang-format on /* This bit is 0 for any errors originating from kastore */ diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 629389713f..3f3164c87b 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6094,6 +6094,8 @@ simplifier_print_state(simplifier_t *self, FILE *out) fprintf(out, "\tkeep_unary : %d\n", !!(self->options & TSK_KEEP_UNARY)); fprintf(out, "\tkeep_input_roots : %d\n", !!(self->options & TSK_KEEP_INPUT_ROOTS)); + fprintf(out, "\tkeep_unary_in_individuals : %d\n", + !!(self->options & TSK_KEEP_UNARY_IN_INDIVIDUALS)); fprintf(out, "===\nInput tables\n==\n"); tsk_table_collection_print_state(&self->input_tables, out); @@ -6635,8 +6637,16 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id) double left, right, prev_right; tsk_id_t ancestry_node; tsk_id_t output_id = self->node_id_map[input_id]; + bool is_sample = output_id != TSK_NULL; - bool keep_unary = !!(self->options & TSK_KEEP_UNARY); + bool keep_unary = false; + if (self->options & TSK_KEEP_UNARY) { + keep_unary = true; + } + if ((self->options & TSK_KEEP_UNARY_IN_INDIVIDUALS) + && (self->tables->nodes.individual[input_id] != TSK_NULL)) { + keep_unary = true; + } if (is_sample) { /* Free up the existing ancestry mapping. */ @@ -8546,6 +8556,11 @@ tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samp /* Avoid calling to simplifier_free with uninit'd memory on error branches */ memset(&simplifier, 0, sizeof(simplifier_t)); + if ((options & TSK_KEEP_UNARY) && (options & TSK_KEEP_UNARY_IN_INDIVIDUALS)) { + ret = TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE; + goto out; + } + /* For now we don't bother with edge metadata, but it can easily be * implemented. */ if (self->edges.metadata_length > 0) { diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 6c5e8da9e1..e31d5e45a1 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -693,6 +693,7 @@ typedef struct { #define TSK_REDUCE_TO_SITE_TOPOLOGY (1 << 3) #define TSK_KEEP_UNARY (1 << 4) #define TSK_KEEP_INPUT_ROOTS (1 << 5) +#define TSK_KEEP_UNARY_IN_INDIVIDUALS (1 << 6) /* Flags for check_integrity */ #define TSK_CHECK_EDGE_ORDERING (1 << 0) @@ -2671,6 +2672,9 @@ TSK_KEEP_INPUT_ROOTS By default simplify removes all topology ancestral the MRCAs of the samples. This option inserts edges from these MRCAs back to the roots of the input trees. +TSK_KEEP_UNARY_IN_INDIVDUALS + This acts like TSK_KEEP_UNARY (and is mutually exclusive with that flag). It + keeps unary nodes, but only if the unary node is referenced from an individual. .. note:: Migrations are currently not supported by simplify, and an error will be raised if we attempt call simplify on a table collection with greater