diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index b6d516a22e..c89b9d32e9 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -4383,6 +4383,44 @@ test_internal_sample_sample_sets(void) tsk_treeseq_free(&ts); } +static void +test_non_sample_leaf_sample_lists(void) +{ + const char *nodes = "1 0 0\n" + "0 0 0\n" + "1 2 0\n"; + const char *edges = "0 1 2 0,1\n"; + const tsk_id_t left_sample[3] = { 0, -1, 1 }; + const tsk_id_t right_sample[3] = { 0, -1, 0 }; + const tsk_id_t next_sample[2] = { -1, 0 }; + const tsk_id_t samples[2] = { 0, 2 }; + const tsk_id_t sample_index_map[3] = { 0, -1, 1 }; + tsk_treeseq_t ts; + tsk_tree_t t; + tsk_id_t i; + int ret; + + tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL); + + ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_tree_first(&t); + CU_ASSERT_EQUAL_FATAL(ret, 1); + + for (i = 0; i < 3; i++) { + CU_ASSERT_EQUAL_FATAL(left_sample[i], t.left_sample[i]); + CU_ASSERT_EQUAL_FATAL(right_sample[i], t.right_sample[i]); + CU_ASSERT_EQUAL_FATAL(sample_index_map[i], ts.sample_index_map[i]); + } + for (i = 0; i < 2; i++) { + CU_ASSERT_EQUAL_FATAL(next_sample[i], t.next_sample[i]); + CU_ASSERT_EQUAL_FATAL(samples[i], t.samples[i]); + } + + tsk_treeseq_free(&ts); + tsk_tree_free(&t); +} + /*======================================================= * KC Distance tests. *=======================================================*/ @@ -4644,7 +4682,37 @@ test_internal_samples_kc(void) ret = tsk_tree_first(&t); CU_ASSERT_EQUAL_FATAL(ret, 1); ret = tsk_tree_kc_distance(&t, &t, 0, &result); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_INTERNAL_SAMPLES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); + tsk_tree_free(&t); +} + +static void +test_non_sample_leaf_kc(void) +{ + const char *nodes = "1 0 0\n" + "0 0 0\n" + "0 1 0\n"; + const char *edges = "0 1 2 0,1\n"; + tsk_treeseq_t ts; + tsk_tree_t t; + int ret; + double result = 0; + + tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL); + + ret = tsk_treeseq_kc_distance(&ts, &ts, 0, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL_FATAL(result, 0.0); + + ret = tsk_tree_init(&t, &ts, TSK_SAMPLE_LISTS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_tree_first(&t); + CU_ASSERT_EQUAL_FATAL(ret, 1); + ret = tsk_tree_kc_distance(&t, &t, 0, &result); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL_FATAL(result, 0.0); + tsk_treeseq_free(&ts); tsk_tree_free(&t); } @@ -5647,6 +5715,7 @@ main(int argc, char **argv) { "test_simple_sample_sets", test_simple_sample_sets }, { "test_nonbinary_sample_sets", test_nonbinary_sample_sets }, { "test_internal_sample_sample_sets", test_internal_sample_sample_sets }, + { "test_non_sample_leaf_sample_lists", test_non_sample_leaf_sample_lists }, /* KC distance tests */ { "test_single_tree_kc", test_single_tree_kc }, @@ -5656,6 +5725,7 @@ main(int argc, char **argv) { "test_nonbinary_tree_kc", test_nonbinary_tree_kc }, { "test_nonzero_samples_kc", test_nonzero_samples_kc }, { "test_internal_samples_kc", test_internal_samples_kc }, + { "test_non_sample_leaf_kc", test_non_sample_leaf_kc }, { "test_unequal_sample_size_kc", test_unequal_sample_size_kc }, { "test_unequal_samples_kc", test_unequal_samples_kc }, { "test_unary_nodes_kc", test_unary_nodes_kc }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 605c6bd11e..0424cfcba9 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -390,9 +390,6 @@ tsk_strerror_internal(int err) case TSK_ERR_SAMPLES_NOT_EQUAL: ret = "Samples must be identical in trees to compare."; break; - case TSK_ERR_INTERNAL_SAMPLES: - ret = "Internal samples are not supported."; - break; case TSK_ERR_MULTIPLE_ROOTS: ret = "Trees with multiple roots not supported."; break; diff --git a/c/tskit/core.h b/c/tskit/core.h index 1c220d4131..bfedf31b12 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -241,11 +241,10 @@ not found in the file. /* Distance metric errors */ #define TSK_ERR_SAMPLE_SIZE_MISMATCH -1200 #define TSK_ERR_SAMPLES_NOT_EQUAL -1201 -#define TSK_ERR_INTERNAL_SAMPLES -1202 -#define TSK_ERR_MULTIPLE_ROOTS -1203 -#define TSK_ERR_UNARY_NODES -1204 -#define TSK_ERR_SEQUENCE_LENGTH_MISMATCH -1205 -#define TSK_ERR_NO_SAMPLE_LISTS -1206 +#define TSK_ERR_MULTIPLE_ROOTS -1202 +#define TSK_ERR_UNARY_NODES -1203 +#define TSK_ERR_SEQUENCE_LENGTH_MISMATCH -1204 +#define TSK_ERR_NO_SAMPLE_LISTS -1205 /* Haplotype matching errors */ #define TSK_ERR_NULL_VITERBI_MATRIX -1300 diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 69735ec005..28e45f445d 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4529,7 +4529,7 @@ kc_vectors_free(kc_vectors *self) } static inline void -update_kc_vectors_single_leaf( +update_kc_vectors_single_sample( tsk_treeseq_t *ts, kc_vectors *kc_vecs, tsk_id_t u, double time) { const tsk_id_t *sample_index_map = ts->sample_index_map; @@ -4543,25 +4543,19 @@ static inline void update_kc_vectors_all_pairs(tsk_tree_t *tree, kc_vectors *kc_vecs, tsk_id_t u, tsk_id_t v, tsk_size_t depth, double time) { - tsk_id_t leaf1_index, leaf2_index, leaf1, leaf2, n1, n2, tmp, pair_index; - tsk_treeseq_t *ts = tree->tree_sequence; - const tsk_id_t *restrict samples = ts->samples; + tsk_id_t sample1_index, sample2_index, n1, n2, tmp, pair_index; const tsk_id_t *restrict left_sample = tree->left_sample; const tsk_id_t *restrict right_sample = tree->right_sample; const tsk_id_t *restrict next_sample = tree->next_sample; - const tsk_id_t *restrict sample_index_map = ts->sample_index_map; tsk_size_t *restrict kc_m = kc_vecs->m; double *restrict kc_M = kc_vecs->M; - leaf1_index = left_sample[u]; - while (true) { - leaf1 = samples[leaf1_index]; - leaf2_index = left_sample[v]; - while (true) { - leaf2 = samples[leaf2_index]; - - n1 = sample_index_map[leaf1]; - n2 = sample_index_map[leaf2]; + sample1_index = left_sample[u]; + while (sample1_index != TSK_NULL) { + sample2_index = left_sample[v]; + while (sample2_index != TSK_NULL) { + n1 = sample1_index; + n2 = sample2_index; if (n1 > n2) { tmp = n1; n1 = n2; @@ -4574,15 +4568,15 @@ update_kc_vectors_all_pairs(tsk_tree_t *tree, kc_vectors *kc_vecs, tsk_id_t u, kc_m[pair_index] = depth; kc_M[pair_index] = time; - if (leaf2_index == right_sample[v]) { + if (sample2_index == right_sample[v]) { break; } - leaf2_index = next_sample[leaf2_index]; + sample2_index = next_sample[sample2_index]; } - if (leaf1_index == right_sample[u]) { + if (sample1_index == right_sample[u]) { break; } - leaf1_index = next_sample[leaf1_index]; + sample1_index = next_sample[sample1_index]; } } @@ -4620,22 +4614,21 @@ fill_kc_vectors(tsk_tree_t *t, kc_vectors *kc_vecs) depth = stack[stack_top].depth; stack_top--; - if (t->left_child[u] == TSK_NULL) { - if (u == root) { - time = 0; - } else { - time = times[t->parent[u]] - times[u]; - } - update_kc_vectors_single_leaf(ts, kc_vecs, u, time); - } else { + if (tsk_tree_is_sample(t, u)) { + time = tsk_tree_branch_length(t, u); + update_kc_vectors_single_sample(ts, kc_vecs, u, time); + } + + /* Don't bother going deeper if there are no samples under this node */ + if (t->left_sample[u] != TSK_NULL) { for (c1 = t->left_child[u]; c1 != TSK_NULL; c1 = t->right_sib[c1]) { stack_top++; stack[stack_top].node = c1; stack[stack_top].depth = depth + 1; for (c2 = t->right_sib[c1]; c2 != TSK_NULL; c2 = t->right_sib[c2]) { - update_kc_vectors_all_pairs( - t, kc_vecs, c1, c2, depth, times[root] - times[u]); + time = times[root] - times[u]; + update_kc_vectors_all_pairs(t, kc_vecs, c1, c2, depth, time); } } } @@ -4718,7 +4711,7 @@ check_kc_distance_samples_inputs(tsk_treeseq_t *self, tsk_treeseq_t *other) int tsk_tree_kc_distance(tsk_tree_t *self, tsk_tree_t *other, double lambda, double *result) { - tsk_id_t n, i, j, u, num_samples; + tsk_id_t n, i; kc_vectors vecs[2]; tsk_tree_t *trees[2] = { self, other }; int ret = 0; @@ -4732,17 +4725,6 @@ tsk_tree_kc_distance(tsk_tree_t *self, tsk_tree_t *other, double lambda, double goto out; } for (i = 0; i < 2; i++) { - num_samples = (tsk_id_t) trees[i]->tree_sequence->num_samples; - for (j = 0; j < num_samples; j++) { - u = trees[i]->tree_sequence->samples[j]; - if (trees[i]->left_child[u] != TSK_NULL) { - /* It's probably possible to support this, but it's too awkward - * to deal with and seems like a fairly niche requirement. */ - ret = TSK_ERR_INTERNAL_SAMPLES; - goto out; - } - } - ret = check_kc_distance_tree_inputs(trees[i]); if (ret != 0) { goto out; @@ -4789,7 +4771,7 @@ check_kc_distance_tree_sequence_inputs(tsk_treeseq_t *self, tsk_treeseq_t *other } static void -update_kc_pairs_with_leaf(tsk_tree_t *self, kc_vectors *kc, tsk_id_t leaf, +update_kc_pair_with_sample(tsk_tree_t *self, kc_vectors *kc, tsk_id_t sample, tsk_size_t *depths, double root_time) { tsk_id_t c, p, sib; @@ -4797,14 +4779,16 @@ update_kc_pairs_with_leaf(tsk_tree_t *self, kc_vectors *kc, tsk_id_t leaf, tsk_size_t depth; double *times = self->tree_sequence->tables->nodes.time; - for (c = leaf, p = self->parent[leaf]; p != TSK_NULL; c = p, p = self->parent[p]) { + c = sample; + for (p = self->parent[sample]; p != TSK_NULL; p = self->parent[p]) { time = root_time - times[p]; depth = depths[p]; for (sib = self->left_child[p]; sib != TSK_NULL; sib = self->right_sib[sib]) { if (sib != c) { - update_kc_vectors_all_pairs(self, kc, leaf, sib, depth, time); + update_kc_vectors_all_pairs(self, kc, sample, sib, depth, time); } } + c = p; } } @@ -4830,14 +4814,13 @@ update_kc_subtree_state( stack_top--; if (tsk_tree_is_sample(t, v)) { - update_kc_pairs_with_leaf(t, kc, v, depths, root_time); - } else { - for (c = t->left_child[v]; c != TSK_NULL; c = t->right_sib[c]) { - if (depths[c] != 0) { - depths[c] = depths[v] + 1; - stack_top++; - stack[stack_top] = c; - } + update_kc_pair_with_sample(t, kc, v, depths, root_time); + } + for (c = t->left_child[v]; c != TSK_NULL; c = t->right_sib[c]) { + if (depths[c] != 0) { + depths[c] = depths[v] + 1; + stack_top++; + stack[stack_top] = c; } } } @@ -4889,7 +4872,7 @@ update_kc_incremental(tsk_tree_t *self, kc_vectors *kc, tsk_edge_list_t *edges_o if (tsk_tree_is_sample(self, u)) { time = tsk_tree_branch_length(self, u); - update_kc_vectors_single_leaf(self->tree_sequence, kc, u, time); + update_kc_vectors_single_sample(self->tree_sequence, kc, u, time); } } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 01433cdb58..1e87777a8d 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -12,6 +12,9 @@ In development **New features** +- Add support for trees with internal samples for the Kendall-Colijn tree distance + metric. (:user:`daniel-goldstein`, :pr:`610`) + - Add background shading to SVG tree sequences to reflect tree position along the sequence (:user:`hyanwong`, :pr:`563`) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index b3264591f3..6bbad578b8 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1976,8 +1976,7 @@ def test_kc_distance_errors(self): t2 = _tskit.Tree(ts1, options=_tskit.SAMPLE_LISTS) # If we don't seek to a specific tree, it has multiple roots (i.e., it's # in the null state). This fails because we don't accept multiple roots. - with self.assertRaises(_tskit.LibraryError): - t1.get_kc_distance(t2, 0) + self.verify_kc_library_error(t2, t2) # Different numbers of samples fail. ts2 = self.get_example_tree_sequence(11) @@ -1991,7 +1990,7 @@ def test_kc_distance_errors(self): t2.first() self.verify_kc_library_error(t1, t2) - # Internal samples cause errors. + # Unary nodes cause errors. tables = _tskit.TableCollection(1.0) tables.nodes.add_row(flags=1) tables.nodes.add_row(flags=1, time=1) diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index c241bb62bc..8ab49cbd7a 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -134,9 +134,6 @@ def naive_kc_distance(tree1, tree2, lambda_=0): raise ValueError("Trees must have the same samples") if not len(tree1.roots) == len(tree2.roots) == 1: raise ValueError("Trees must have one root") - for u in samples: - if not tree1.is_leaf(u) or not tree2.is_leaf(u): - raise ValueError("Internal samples not supported") for tree in [tree1, tree2]: for u in tree.nodes(): if tree.num_children(u) == 1: @@ -191,26 +188,20 @@ def fill_kc_vectors(tree, kc_vecs): stack = [(tree.root, 0)] while len(stack) > 0: u, depth = stack.pop() - if tree.is_leaf(u): - time = 0 if u is root else tree.branch_length(u) + if tree.is_sample(u): + time = tree.branch_length(u) update_kc_vectors_single_leaf(kc_vecs, u, time, sample_index_map) - else: - c1 = tree.left_child(u) - while c1 != tskit.NULL: - stack.append((c1, depth + 1)) - c2 = tree.right_sib(c1) - while c2 != tskit.NULL: - update_kc_vectors_all_pairs( - tree, - kc_vecs, - c1, - c2, - depth, - tree.time(root) - tree.time(u), - sample_index_map, - ) - c2 = tree.right_sib(c2) - c1 = tree.right_sib(c1) + + c1 = tree.left_child(u) + while c1 != tskit.NULL: + stack.append((c1, depth + 1)) + c2 = tree.right_sib(c1) + while c2 != tskit.NULL: + update_kc_vectors_all_pairs( + tree, kc_vecs, c1, c2, depth, tree.time(root) - tree.time(u), + ) + c2 = tree.right_sib(c2) + c1 = tree.right_sib(c1) def update_kc_vectors_single_leaf(kc_vecs, u, time, sample_index_map): @@ -219,30 +210,23 @@ def update_kc_vectors_single_leaf(kc_vecs, u, time, sample_index_map): kc_vecs.M[kc_vecs.N + u_index] = time -def update_kc_vectors_all_pairs(tree, kc_vecs, c1, c2, depth, time, sample_index_map): - leaves = tree.tree_sequence.samples() - c1_left = tree.left_sample(c1) - c1_right = tree.right_sample(c1) - leaf1_index = c1_left +def update_kc_vectors_all_pairs(tree, kc_vecs, c1, c2, depth, time): + s1_index = tree.left_sample(c1) while True: - c2_left = tree.left_sample(c2) - c2_right = tree.right_sample(c2) - leaf2_index = c2_left + s2_index = tree.left_sample(c2) while True: - leaf1 = leaves[leaf1_index] - leaf2 = leaves[leaf2_index] - update_kc_vectors_pair(kc_vecs, leaf1, leaf2, depth, time, sample_index_map) - if leaf2_index == c2_right: + update_kc_vectors_pair(kc_vecs, s1_index, s2_index, depth, time) + if s2_index == tree.right_sample(c2): break - leaf2_index = tree.next_sample(leaf2_index) - if leaf1_index == c1_right: + s2_index = tree.next_sample(s2_index) + if s1_index == tree.right_sample(c1): break - leaf1_index = tree.next_sample(leaf1_index) + s1_index = tree.next_sample(s1_index) -def update_kc_vectors_pair(kc_vecs, u, v, depth, time, sample_index_map): - n1 = int(min(sample_index_map[u], sample_index_map[v])) - n2 = int(max(sample_index_map[u], sample_index_map[v])) +def update_kc_vectors_pair(kc_vecs, n1, n2, depth, time): + if n1 > n2: + n1, n2 = n2, n1 pair_index = n2 - n1 - 1 + (-1 * n1 * (n1 - 2 * kc_vecs.n + 1)) // 2 kc_vecs.m[pair_index] = depth @@ -267,14 +251,13 @@ def c_kc_distance(tree1, tree2, lambda_=0): Written without Python features to aid writing C implementation. """ samples = tree1.tree_sequence.samples() + if tree1.tree_sequence.num_samples != tree2.tree_sequence.num_samples: + raise ValueError("Trees must have the same samples") for sample1, sample2 in zip(samples, tree2.tree_sequence.samples()): if sample1 != sample2: raise ValueError("Trees must have the same samples") if not len(tree1.roots) == len(tree2.roots) == 1: raise ValueError("Trees must have one root") - for u in samples: - if not tree1.is_leaf(u) or not tree2.is_leaf(u): - raise ValueError("Internal samples not supported") for tree in [tree1, tree2]: for u in range(tree.tree_sequence.num_nodes): left_child = tree.left_child(u) @@ -874,12 +857,104 @@ def test_internal_samples(self): t1 = next(ts2.trees(sample_lists=True)) t2 = next(ts2.trees(sample_lists=True)) - with self.assertRaises(ValueError): - naive_kc_distance(t1, t2) - with self.assertRaises(ValueError): - c_kc_distance(t1, t2) - with self.assertRaises(_tskit.LibraryError): - t1.kc_distance(t2) + naive_kc_distance(t1, t2) + c_kc_distance(t1, t2) + t1.kc_distance(t2) + + def test_root_sample(self): + tables1 = tskit.TableCollection(sequence_length=1.0) + tables1.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + only_root = next(tables1.tree_sequence().trees(sample_lists=True)) + self.assertEqual(only_root.kc_distance(only_root), 0) + self.assertEqual(only_root.kc_distance(only_root, lambda_=1), 0) + + def test_non_sample_leaf(self): + tables = tskit.TableCollection(sequence_length=1.0) + c1 = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + c2 = tables.nodes.add_row(time=0) + p = tables.nodes.add_row(time=1) + tables.edges.add_row(left=0, right=1, parent=p, child=c1) + tables.edges.add_row(left=0, right=1, parent=p, child=c2) + ts = tables.tree_sequence() + tree = next(ts.trees(sample_lists=True)) + self.assertEqual(ts.kc_distance(ts), 0) + self.assertEqual(tree.kc_distance(tree), 0) + + # mirrored + tables = tskit.TableCollection(sequence_length=1.0) + c1 = tables.nodes.add_row(time=0) + c2 = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + p = tables.nodes.add_row(time=1) + tables.edges.add_row(left=0, right=1, parent=p, child=c1) + tables.edges.add_row(left=0, right=1, parent=p, child=c2) + ts = tables.tree_sequence() + tree = next(ts.trees(sample_lists=True)) + self.assertEqual(ts.kc_distance(ts), 0) + self.assertEqual(tree.kc_distance(tree), 0) + + def test_ignores_subtrees_with_no_samples(self): + nodes_1 = io.StringIO( + """\ + id is_sample time population individual metadata + 0 0 0.000000 0 -1 + 1 0 0.000000 0 -1 + 2 0 0.000000 0 -1 + 3 1 0.000000 0 -1 + 4 0 0.000000 0 -1 + 5 0 0.000000 0 -1 + 6 1 1.000000 0 -1 + 7 1 2.000000 0 -1 + 8 0 2.000000 0 -1 + 9 0 3.000000 0 -1 + """ + ) + edges_1 = io.StringIO( + """\ + left right parent child + 0.000000 1.000000 6 0 + 0.000000 1.000000 6 1 + 0.000000 1.000000 7 2 + 0.000000 1.000000 7 6 + 0.000000 1.000000 8 4 + 0.000000 1.000000 8 5 + 0.000000 1.000000 9 3 + 0.000000 1.000000 9 7 + 0.000000 1.000000 9 8 + """ + ) + redundant = tskit.load_text( + nodes_1, edges_1, sequence_length=1, strict=False, base64_metadata=False + ) + + nodes_2 = io.StringIO( + """\ + id is_sample time population individual metadata + 0 0 0.000000 0 -1 + 1 0 0.000000 0 -1 + 2 0 0.000000 0 -1 + 3 1 0.000000 0 -1 + 4 0 0.000000 0 -1 + 5 0 0.000000 0 -1 + 6 1 1.000000 0 -1 + 7 1 2.000000 0 -1 + 8 0 2.000000 0 -1 + 9 0 3.000000 0 -1 + """ + ) + edges_2 = io.StringIO( + """\ + left right parent child + 0.000000 1.000000 7 2 + 0.000000 1.000000 7 6 + 0.000000 1.000000 9 3 + 0.000000 1.000000 9 7 + """ + ) + simplified = tskit.load_text( + nodes_2, edges_2, sequence_length=1, strict=False, base64_metadata=False + ) + self.assertEqual(redundant.kc_distance(simplified, 0), 0) + self.assertEqual(redundant.kc_distance(simplified, 1), 0) def ts_kc_distance(ts1, ts2, lambda_=0): @@ -1008,9 +1083,7 @@ def update_kc_pairs_with_leaf(tree, kc, leaf, sample_index_map, depths): depth = depths[p] for sibling in tree.children(p): if sibling != c: - update_kc_vectors_all_pairs( - tree, kc, leaf, sibling, depth, time, sample_index_map - ) + update_kc_vectors_all_pairs(tree, kc, leaf, sibling, depth, time) c, p = p, tree.parent(p) @@ -1353,6 +1426,72 @@ def test_remove_root(self): distance = (math.sqrt(8) * 5 + math.sqrt(6) * 5) / 10 self.verify_result(ts_1, ts_2, 0, distance) + def test_ignores_subtrees_with_no_samples(self): + nodes_1 = io.StringIO( + """\ + id is_sample time population individual metadata + 0 0 0.000000 0 -1 + 1 0 0.000000 0 -1 + 2 0 0.000000 0 -1 + 3 1 0.000000 0 -1 + 4 0 0.000000 0 -1 + 5 0 0.000000 0 -1 + 6 1 1.000000 0 -1 + 7 1 2.000000 0 -1 + 8 0 2.000000 0 -1 + 9 0 3.000000 0 -1 + """ + ) + edges_1 = io.StringIO( + """\ + left right parent child + 0.000000 1.000000 6 0 + 0.000000 1.000000 6 1 + 0.000000 1.000000 7 2 + 0.000000 1.000000 7 6 + 0.000000 1.000000 8 4 + 0.000000 1.000000 8 5 + 0.000000 1.000000 9 3 + 0.000000 1.000000 9 7 + 0.000000 1.000000 9 8 + """ + ) + redundant = tskit.load_text( + nodes_1, edges_1, sequence_length=1, strict=False, base64_metadata=False + ) + + nodes_2 = io.StringIO( + """\ + id is_sample time population individual metadata + 0 0 0.000000 0 -1 + 1 0 0.000000 0 -1 + 2 0 0.000000 0 -1 + 3 1 0.000000 0 -1 + 4 0 0.000000 0 -1 + 5 0 0.000000 0 -1 + 6 1 1.000000 0 -1 + 7 1 2.000000 0 -1 + 8 0 2.000000 0 -1 + 9 0 3.000000 0 -1 + """ + ) + edges_2 = io.StringIO( + """\ + left right parent child + 0.000000 1.000000 7 2 + 0.000000 1.000000 7 6 + 0.000000 1.000000 9 3 + 0.000000 1.000000 9 7 + """ + ) + simplified = tskit.load_text( + nodes_2, edges_2, sequence_length=1, strict=False, base64_metadata=False + ) + t1 = next(redundant.trees(sample_lists=True)) + t2 = next(simplified.trees(sample_lists=True)) + self.assertEqual(t1.kc_distance(t2, 0), 0) + self.assertEqual(t1.kc_distance(t2, 1), 0) + class TestOverlappingSegments(unittest.TestCase): """ diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 93f0aad45e..0aa2409760 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2163,7 +2163,9 @@ def kc_distance(self, other, lambda_=0.0): `_ for details. The trees we are comparing to must have identical lists of sample - nodes (i.e., the same IDs in the same order). + nodes (i.e., the same IDs in the same order). The metric operates on + samples, not leaves, so internal samples are treated identically to + sample tips. Subtrees with no samples do not contribute to the metric. :param Tree other: The other tree to compare to. :param float lambda_: The KC metric lambda parameter determining the