diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index f14137600a..0ece01fc32 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -4595,6 +4595,31 @@ test_unequal_samples_kc(void) tsk_tree_free(&other_t); } +static void +test_unary_nodes_kc(void) +{ + const char *nodes = "1 0 0\n" + "1 0 0\n" + "0 1 0\n" + "0 2 0"; + const char *edges = "0 1 2 0,1\n" + "0 1 3 2"; + tsk_treeseq_t ts; + tsk_tree_t t; + int ret; + double result = 0; + + tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL); + ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_tree_first(&t); + CU_ASSERT_EQUAL_FATAL(ret, 1); + ret = tsk_tree_kc_distance(&t, &t, 0, &result); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNARY_NODES); + tsk_treeseq_free(&ts); + tsk_tree_free(&t); +} + /*======================================================= * Miscellaneous tests. *======================================================*/ @@ -5306,6 +5331,7 @@ main(int argc, char **argv) { "test_internal_samples_kc", test_internal_samples_kc }, { "test_unequal_sample_size_kc", test_unequal_sample_size_kc }, { "test_unequal_samples_kc", test_unequal_samples_kc }, + { "test_unary_nodes_kc", test_unary_nodes_kc }, /* Misc */ { "test_tree_errors", test_tree_errors }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 6df4f5aa27..cbea2d4d7c 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -394,6 +394,9 @@ tsk_strerror_internal(int err) case TSK_ERR_MULTIPLE_ROOTS: ret = "Trees with multiple roots not supported."; break; + case TSK_ERR_UNARY_NODES: + ret = "Unsimplified trees with unary nodes are not supported."; + break; /* Haplotype matching errors */ case TSK_ERR_NULL_VITERBI_MATRIX: diff --git a/c/tskit/core.h b/c/tskit/core.h index 7e85427a4c..14238d6f13 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -237,6 +237,7 @@ the file. #define TSK_ERR_SAMPLES_NOT_EQUAL -1201 #define TSK_ERR_INTERNAL_SAMPLES -1202 #define TSK_ERR_MULTIPLE_ROOTS -1203 +#define TSK_ERR_UNARY_NODES -1204 /* Haplotype matching errors */ #define TSK_ERR_NULL_VITERBI_MATRIX -1300 diff --git a/c/tskit/trees.c b/c/tskit/trees.c index a4933082bb..881bd249d0 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4593,8 +4593,9 @@ norm_kc_vectors(kc_vectors *self, kc_vectors *other, double lambda) int tsk_tree_kc_distance(tsk_tree_t *self, tsk_tree_t *other, double lambda, double *result) { - tsk_id_t u, n, i; + tsk_id_t u, n, num_nodes, i, left_child; kc_vectors vecs[2]; + tsk_tree_t *t; tsk_tree_t *trees[2] = { self, other }; const tsk_id_t *samples = self->tree_sequence->samples; const tsk_id_t *other_samples = other->tree_sequence->samples; @@ -4617,6 +4618,18 @@ tsk_tree_kc_distance(tsk_tree_t *self, tsk_tree_t *other, double lambda, double goto out; } + for (i = 0; i < 2; i++) { + t = trees[i]; + num_nodes = (tsk_id_t) tsk_treeseq_get_num_nodes(t->tree_sequence); + for (u = 0; u < num_nodes; u++) { + left_child = t->left_child[u]; + if (left_child != TSK_NULL && left_child == t->right_child[u]) { + ret = TSK_ERR_UNARY_NODES; + goto out; + } + } + } + n = (tsk_id_t) self->tree_sequence->num_samples; for (i = 0; i < n; i++) { if (samples[i] != other_samples[i]) { diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index d28f663999..6b160fbada 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,9 @@ In development **New features** +- Remove support for ``kc_distance`` on trees with unary nodes + (:user:`daniel-goldstein`, :`508`) + - Improve Kendall-Colijn tree distance algorithm to operate in O(n^2) time instead of O(n^2 * log(n)) where n is the number of samples (:user:`daniel-goldstein`, :pr:`490`) diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index 116ab1f4a3..2bffcc2384 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -137,6 +137,10 @@ def naive_kc_distance(tree1, tree2, lambda_=0): for u in samples: if not tree1.is_leaf(u) or not tree2.is_leaf(u): raise ValueError("Internal samples not supported") + for tree in [tree1, tree2]: + for u in tree.nodes(): + if tree.num_children(u) == 1: + raise ValueError("Unary nodes are not supported") n = samples.shape[0] N = (n * (n - 1)) // 2 @@ -272,6 +276,11 @@ def c_kc_distance(tree1, tree2, lambda_=0): for u in samples: if not tree1.is_leaf(u) or not tree2.is_leaf(u): raise ValueError("Internal samples not supported") + for tree in [tree1, tree2]: + for u in range(tree.tree_sequence.num_nodes): + left_child = tree.left_child(u) + if left_child != tskit.NULL and left_child == tree.right_child(u): + raise ValueError("Unary nodes are not supported") n = tree1.tree_sequence.num_samples vecs1 = KCVectors(n) @@ -327,6 +336,16 @@ def test_different_samples_error(self): self.assertRaises(ValueError, c_kc_distance, tree1, tree2) self.assertRaises(_tskit.LibraryError, tree1.kc_distance, tree2) + unsimplified_ts = msprime.simulate( + 10, random_seed=1, recombination_rate=10, record_full_arg=True + ) + trees = unsimplified_ts.trees(sample_lists=True) + tree1 = next(trees) + tree2 = next(trees) + self.assertRaises(ValueError, naive_kc_distance, tree1, tree2) + self.assertRaises(ValueError, c_kc_distance, tree1, tree2) + self.assertRaises(_tskit.LibraryError, tree1.kc_distance, tree2) + def validate_trees(self, n): for seed in range(1, 10): ts1 = msprime.simulate(n, random_seed=seed)