diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 5ddf8a1efb..c6c2089023 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -4268,6 +4269,7 @@ get_smallest_set_bit(uint64_t v) uint64_t t = 1; int8_t r = 0; + assert(v != 0); while ((v & t) == 0) { t <<= 1; r++; @@ -4275,9 +4277,14 @@ get_smallest_set_bit(uint64_t v) return r; } +#define HARTIGAN_MAX_ALLELES 64 + /* This interface is experimental. In the future, we should provide the option to * use a general cost matrix, in which case we'll use the Sankoff algorithm. For * now this is unused. + * + * The algorithm used here is Hartigan parsimony, "Minimum Mutation Fits to a + * Given Tree", Biometrics 1973. */ int TSK_WARN_UNUSED tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, @@ -4286,42 +4293,50 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, tsk_state_transition_t **r_transitions) { int ret = 0; + struct stack_elem { + tsk_id_t node; + tsk_id_t transition_parent; + int8_t state; + }; const tsk_size_t num_samples = self->tree_sequence->num_samples; const tsk_size_t num_nodes = self->num_nodes; const tsk_id_t *restrict left_child = self->left_child; const tsk_id_t *restrict right_sib = self->right_sib; const tsk_id_t *restrict parent = self->parent; const tsk_flags_t *restrict node_flags = self->tree_sequence->tables->nodes.flags; - uint64_t root_state; - int8_t *restrict state = malloc(num_nodes * sizeof(*state)); - uint64_t *restrict A = calloc(num_nodes, sizeof(*A)); - tsk_id_t *restrict stack = malloc(num_nodes * sizeof(*stack)); - tsk_id_t *restrict parent_stack = malloc(num_nodes * sizeof(*parent_stack)); - tsk_id_t postorder_parent, transition_parent, root, u, v; + uint64_t optimal_root_set; + uint64_t *restrict optimal_set = calloc(num_nodes, sizeof(*optimal_set)); + tsk_id_t *restrict postorder_stack = malloc(num_nodes * sizeof(*postorder_stack)); + struct stack_elem *restrict preorder_stack + = malloc(num_nodes * sizeof(*preorder_stack)); + tsk_id_t postorder_parent, root, u, v; /* The largest possible number of transitions is one over every sample */ tsk_state_transition_t *transitions = malloc(num_samples * sizeof(*transitions)); - tsk_size_t j; - int8_t ancestral_state; + int8_t allele, ancestral_state; int stack_top; - tsk_size_t num_transitions; + struct stack_elem s; + tsk_size_t j, num_transitions, max_allele_count; + tsk_size_t allele_count[HARTIGAN_MAX_ALLELES]; tsk_size_t non_missing = 0; + int8_t num_alleles = 0; - if (A == NULL || stack == NULL || parent_stack == NULL || state == NULL + if (optimal_set == NULL || preorder_stack == NULL || postorder_stack == NULL || transitions == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } for (j = 0; j < num_samples; j++) { - if (genotypes[j] >= 64 || genotypes[j] < TSK_MISSING_DATA) { + if (genotypes[j] >= HARTIGAN_MAX_ALLELES || genotypes[j] < TSK_MISSING_DATA) { ret = TSK_ERR_BAD_GENOTYPE; goto out; } u = self->tree_sequence->samples[j]; if (genotypes[j] == TSK_MISSING_DATA) { /* All bits set */ - A[u] = UINT64_MAX; + optimal_set[u] = UINT64_MAX; } else { - A[u] = set_bit(A[u], genotypes[j]); + optimal_set[u] = set_bit(optimal_set[u], genotypes[j]); + num_alleles = TSK_MAX(genotypes[j], num_alleles); non_missing++; } } @@ -4330,79 +4345,92 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, ret = TSK_ERR_GENOTYPES_ALL_MISSING; goto out; } + num_alleles++; - root_state = 0; for (root = self->left_root; root != TSK_NULL; root = self->right_sib[root]) { /* Do a post order traversal */ - stack[0] = root; + postorder_stack[0] = root; stack_top = 0; postorder_parent = TSK_NULL; while (stack_top >= 0) { - u = stack[stack_top]; + u = postorder_stack[stack_top]; if (left_child[u] != TSK_NULL && u != postorder_parent) { for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { stack_top++; - stack[stack_top] = v; + postorder_stack[stack_top] = v; } } else { stack_top--; postorder_parent = parent[u]; /* Visit u */ + memset(allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count)); + for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { + for (allele = 0; allele < num_alleles; allele++) { + allele_count[allele] += bit_is_set(optimal_set[v], allele); + } + } if (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) { - /* Get the intersection of the child sets */ - A[u] = UINT64_MAX; /* Set all the bits for u */ - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - A[u] &= A[v]; + max_allele_count = 0; + for (allele = 0; allele < num_alleles; allele++) { + max_allele_count + = TSK_MAX(max_allele_count, allele_count[allele]); } - if (A[u] == 0) { - /* Intersection is empty, so get the union */ - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - A[u] |= A[v]; + for (allele = 0; allele < num_alleles; allele++) { + if (allele_count[allele] == max_allele_count) { + optimal_set[u] = set_bit(optimal_set[u], allele); } } } } } - root_state |= A[root]; } - ancestral_state = get_smallest_set_bit(root_state); + + optimal_root_set = 0; + /* TODO it's annoying that this is essentially the same as the + * visit function above. It would be nice if we had an extra + * node that was the parent of all roots, then the algorithm + * would work as-is */ + memset(allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count)); + for (root = self->left_root; root != TSK_NULL; root = right_sib[root]) { + for (allele = 0; allele < num_alleles; allele++) { + allele_count[allele] += bit_is_set(optimal_set[root], allele); + } + } + max_allele_count = 0; + for (allele = 0; allele < num_alleles; allele++) { + max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]); + } + for (allele = 0; allele < num_alleles; allele++) { + if (allele_count[allele] == max_allele_count) { + optimal_root_set = set_bit(optimal_root_set, allele); + } + } + ancestral_state = get_smallest_set_bit(optimal_root_set); num_transitions = 0; for (root = self->left_root; root != TSK_NULL; root = self->right_sib[root]) { - transition_parent = TSK_NULL; - state[root] = ancestral_state; - if (!bit_is_set(A[root], ancestral_state)) { - state[root] = get_smallest_set_bit(A[root]); - transitions[num_transitions].node = root; - transitions[num_transitions].parent = TSK_NULL; - transitions[num_transitions].state = state[root]; - transition_parent = (tsk_id_t) num_transitions; - num_transitions++; - } /* Do a preorder traversal */ - stack[0] = root; - parent_stack[0] = transition_parent; + preorder_stack[0].node = root; + preorder_stack[0].state = ancestral_state; + preorder_stack[0].transition_parent = TSK_NULL; stack_top = 0; while (stack_top >= 0) { - u = stack[stack_top]; - transition_parent = parent_stack[stack_top]; + s = preorder_stack[stack_top]; stack_top--; - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { + if (!bit_is_set(optimal_set[s.node], s.state)) { + s.state = get_smallest_set_bit(optimal_set[s.node]); + transitions[num_transitions].node = s.node; + transitions[num_transitions].parent = s.transition_parent; + transitions[num_transitions].state = s.state; + s.transition_parent = (tsk_id_t) num_transitions; + num_transitions++; + } + for (v = left_child[s.node]; v != TSK_NULL; v = right_sib[v]) { stack_top++; - stack[stack_top] = v; - state[v] = state[u]; - if (!bit_is_set(A[v], state[u])) { - state[v] = get_smallest_set_bit(A[v]); - transitions[num_transitions].node = v; - transitions[num_transitions].parent = transition_parent; - transitions[num_transitions].state = state[v]; - parent_stack[stack_top] = (tsk_id_t) num_transitions; - num_transitions++; - } else { - parent_stack[stack_top] = transition_parent; - } + s.node = v; + preorder_stack[stack_top] = s; } } } @@ -4414,17 +4442,14 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, out: tsk_safe_free(transitions); /* Cannot safe_free because of 'restrict' */ - if (A != NULL) { - free(A); - } - if (stack != NULL) { - free(stack); + if (optimal_set != NULL) { + free(optimal_set); } - if (parent_stack != NULL) { - free(parent_stack); + if (preorder_stack != NULL) { + free(preorder_stack); } - if (state != NULL) { - free(state); + if (postorder_stack != NULL) { + free(postorder_stack); } return ret; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index ef0cf3701c..563a3955fa 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -65,6 +65,11 @@ of arrays. (:user:`benjeffery`, :issue:`1025`, :pr:`1029`) +- The ``map_mutations`` method previously used the Fitch parsimony method, but this + does not produce parsimonious results on non-binary trees. We now now use the + Hartigan parsimony algorithm, which does. (:user:`jeromekelleher`, + :issue:`987`, :pr:`1030`). + **Breaking changes** - The argument to ``ts.dump`` and ``tskit.load`` has been renamed `file` from `path`. diff --git a/python/tests/test_parsimony.py b/python/tests/test_parsimony.py index fd775a060b..f5c0df52d8 100644 --- a/python/tests/test_parsimony.py +++ b/python/tests/test_parsimony.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2019 Tskit Developers +# Copyright (c) 2019-2020 Tskit Developers # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -25,6 +25,7 @@ import io import itertools +import attr import Bio.Phylo.TreeConstruction import msprime import numpy as np @@ -132,6 +133,8 @@ def fitch_map_mutations(tree, genotypes, alleles): else: A[u] = 1 for u in tree.nodes(order="postorder"): + if tree.num_children(u) > 2: + raise ValueError("Fitch parsimony is for binary trees only") if not tree.is_sample(u): A[u] = 1 for v in tree.children(u): @@ -178,6 +181,76 @@ def fitch_map_mutations(tree, genotypes, alleles): return alleles[ancestral_state], mutations +def hartigan_map_mutations(tree, genotypes, alleles): + """ + Returns a Hartigan parsimony reconstruction for the specified set of genotypes. + The reconstruction is specified by returning the ancestral state and a + list of mutations on the tree. Each mutation is a (node, parent, state) + triple, where node is the node over which the transition occurs, the + parent is the index of the parent transition above it on the tree (or -1 + if there is none) and state is the new state. + """ + genotypes = np.array(genotypes) + not_missing = genotypes != -1 + if np.sum(not_missing) == 0: + raise ValueError("Must have at least one non-missing genotype") + num_alleles = np.max(genotypes[not_missing]) + 1 + num_nodes = tree.tree_sequence.num_nodes + + # use a numpy array of 0/1 values to represent the set of states + # to make the code as similar as possible to the C implementation. + optimal_set = np.zeros((num_nodes + 1, num_alleles), dtype=np.int8) + for allele, u in zip(genotypes, tree.tree_sequence.samples()): + if allele != -1: + optimal_set[u, allele] = 1 + else: + optimal_set[u] = 1 + + allele_count = np.zeros(num_alleles, dtype=int) + for root in tree.roots: + for u in tree.nodes(root, order="postorder"): + allele_count[:] = 0 + for v in tree.children(u): + for j in range(num_alleles): + allele_count[j] += optimal_set[v, j] + if not tree.is_sample(u): + max_allele_count = np.max(allele_count) + optimal_set[u, allele_count == max_allele_count] = 1 + + allele_count[:] = 0 + for v in tree.roots: + for j in range(num_alleles): + allele_count[j] += optimal_set[v, j] + max_allele_count = np.max(allele_count) + optimal_root_set = np.zeros(num_alleles, dtype=int) + optimal_root_set[allele_count == max_allele_count] = 1 + ancestral_state = np.argmax(optimal_root_set) + + @attr.s + class StackElement: + node = attr.ib() + state = attr.ib() + mutation_parent = attr.ib() + + mutations = [] + for root in tree.roots: + stack = [StackElement(root, ancestral_state, -1)] + while len(stack) > 0: + s = stack.pop() + if optimal_set[s.node, s.state] == 0: + s.state = np.argmax(optimal_set[s.node]) + mutation = tskit.Mutation( + node=s.node, + derived_state=alleles[s.state], + parent=s.mutation_parent, + ) + s.mutation_parent = len(mutations) + mutations.append(mutation) + for v in tree.children(s.node): + stack.append(StackElement(v, s.state, s.mutation_parent)) + return alleles[ancestral_state], mutations + + def reconstruct_states(tree, genotypes, S, cost_matrix): """ Given the specified observations for the samples and tree score @@ -395,7 +468,7 @@ def verify(self, ts): variant.genotypes, variant.alleles ) assert ancestral_state1 == ancestral_state2 - assert transitions1 == transitions2 + assert len(transitions1) == len(transitions2) # The Sankoff algorithm doesn't recontruct the state in the same way. # Just a limitation of the implementation. ancestral_state3, transitions3 = sankoff_map_mutations( @@ -450,10 +523,11 @@ class TestParsimonyBase: def do_map_mutations(self, tree, genotypes, alleles=None, compare_lib=True): if alleles is None: alleles = [str(j) for j in range(max(genotypes) + 1)] - ancestral_state, transitions = fitch_map_mutations(tree, genotypes, alleles) + ancestral_state, transitions = hartigan_map_mutations(tree, genotypes, alleles) if compare_lib: ancestral_state1, transitions1 = tree.map_mutations(genotypes, alleles) assert ancestral_state == ancestral_state1 + assert len(transitions) == len(transitions1) assert transitions == transitions1 return ancestral_state, transitions @@ -495,31 +569,11 @@ def verify(self, ts): ): assert h1 == h2 - # Run again without the mutation parent to make sure we're doing it - # properly. - tables2 = ts.dump_tables() - tables2.sites.clear() - tables2.mutations.clear() - G = ts.genotype_matrix(isolated_as_missing=False) - alleles = [v.alleles for v in ts.variants()] - for tree in ts.trees(): - for site in tree.sites(): - ancestral_state, mutations = tree.map_mutations( - G[site.id], alleles[site.id] - ) - site_id = tables2.sites.add_row(site.position, ancestral_state) - for mutation in mutations: - tables2.mutations.add_row( - site_id, - node=mutation.node, - time=mutation.time, - derived_state=mutation.derived_state, - ) - tables2.sort() - tables2.build_index() - tables2.compute_mutation_parents() - assert tables.sites == tables2.sites - assert tables.mutations == tables2.mutations + # Make sure we're computing the parent correctly. + tables2 = tables.copy() + nulled = np.zeros_like(tables.mutations.parent) - 1 + tables2.mutations.parent = nulled + assert np.array_equal(tables.mutations.parent, tables.mutations.parent) def test_infinite_sites_n3(self): ts = msprime.simulate(3, mutation_rate=3, random_seed=3) @@ -573,6 +627,12 @@ def test_jukes_cantor_n50_internal_samples(self): ts = tsutil.jukes_cantor(ts, 5, 2, seed=2) self.verify(tsutil.jiggle_samples(ts)) + def test_jukes_cantor_balanced_ternary_internal_samples(self): + tree = tskit.Tree.generate_balanced(27, arity=3) + ts = tsutil.jukes_cantor(tree.tree_sequence, 5, 2, seed=1) + assert ts.num_sites > 1 + self.verify(tsutil.jiggle_samples(ts)) + def test_infinite_sites_n20_multiroot(self): ts = msprime.simulate(20, mutation_rate=3, random_seed=3) self.verify(tsutil.decapitate(ts, ts.num_edges // 2)) @@ -583,12 +643,58 @@ def test_jukes_cantor_n15_multiroot(self): ts = tsutil.jukes_cantor(ts, 15, 2, seed=3) self.verify(ts) + def test_jukes_cantor_balanced_ternary_multiroot(self): + ts = tskit.Tree.generate_balanced(50, arity=3).tree_sequence + ts = tsutil.decapitate(ts, ts.num_edges // 3) + ts = tsutil.jukes_cantor(ts, 15, 2, seed=3) + self.verify(ts) + assert ts.num_sites > 1 + self.verify(tsutil.jiggle_samples(ts)) + def test_jukes_cantor_n50_multiroot(self): ts = msprime.simulate(50, random_seed=1) ts = tsutil.decapitate(ts, ts.num_edges // 2) ts = tsutil.jukes_cantor(ts, 5, 2, seed=2) self.verify(ts) + def test_jukes_cantor_root_polytomy_n5(self): + tree = tskit.Tree.unrank(5, (1, 0)) + ts = tsutil.jukes_cantor(tree.tree_sequence, 5, 2, seed=1) + assert ts.num_sites > 2 + self.verify(ts) + + def test_jukes_cantor_leaf_polytomy_n5(self): + tree = tskit.Tree.unrank(5, (7, 0)) + ts = tsutil.jukes_cantor(tree.tree_sequence, 5, 2, seed=1) + assert ts.num_sites > 2 + self.verify(ts) + + @pytest.mark.parametrize( + "tree_builder", [tskit.Tree.generate_balanced, tskit.Tree.generate_comb] + ) + @pytest.mark.parametrize("n", [2, 5, 10]) + def test_many_states_binary(self, tree_builder, n): + tree = tree_builder(n) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0.5, "0") + for j in range(1, n): + tables.mutations.add_row(0, derived_state=str(j), node=j) + ts = tables.tree_sequence() + assert np.array_equal(ts.genotype_matrix(), [np.arange(n, dtype=np.int8)]) + self.verify(tables.tree_sequence()) + + @pytest.mark.parametrize("arity", [2, 3, 4]) + @pytest.mark.parametrize("n", [2, 5, 10]) + def test_many_states_arity(self, n, arity): + tree = tskit.Tree.generate_balanced(n, arity=arity) + tables = tree.tree_sequence.dump_tables() + tables.sites.add_row(0.5, "0") + for j in range(1, n): + tables.mutations.add_row(0, derived_state=str(j), node=j) + ts = tables.tree_sequence() + assert np.array_equal(ts.genotype_matrix(), [np.arange(n, dtype=np.int8)]) + self.verify(tables.tree_sequence()) + class TestParsimonyRoundTripMissingData(TestParsimonyRoundTrip): """ @@ -635,56 +741,81 @@ class TestParsimonyMissingData(TestParsimonyBase): Tests that we correctly map_mutations when we have missing data. """ - def test_all_missing(self): - for n in range(2, 10): - ts = msprime.simulate(n, random_seed=2) - tree = ts.first() + @pytest.mark.parametrize("n", range(2, 10)) + def test_all_missing(self, n): + ts = msprime.simulate(n, random_seed=2) + tree = ts.first() + genotypes = np.zeros(n, dtype=np.int8) - 1 + alleles = ["0", "1"] + with pytest.raises(ValueError): + fitch_map_mutations(tree, genotypes, alleles) + with pytest.raises(tskit.LibraryError): + tree.map_mutations(genotypes, alleles) + + @pytest.mark.parametrize("n", range(2, 10)) + def test_one_non_missing(self, n): + ts = msprime.simulate(n, random_seed=2) + tree = ts.first() + for j in range(n): genotypes = np.zeros(n, dtype=np.int8) - 1 - alleles = ["0", "1"] - with pytest.raises(ValueError): - fitch_map_mutations(tree, genotypes, alleles) - with pytest.raises(tskit.LibraryError): - tree.map_mutations(genotypes, alleles) - - def test_one_non_missing(self): - for n in range(2, 10): - ts = msprime.simulate(n, random_seed=2) - tree = ts.first() - for j in range(n): - genotypes = np.zeros(n, dtype=np.int8) - 1 - genotypes[j] = 0 - ancestral_state, transitions = self.do_map_mutations( - tree, genotypes, ["0", "1"] - ) - assert ancestral_state == "0" - assert len(transitions) == 0 + genotypes[j] = 0 + ancestral_state, transitions = self.do_map_mutations( + tree, genotypes, ["0", "1"] + ) + assert ancestral_state == "0" + assert len(transitions) == 0 - def test_many_states_half_missing(self): - for n in range(2, 20): - ts = msprime.simulate(n, random_seed=2) - tree = ts.first() + @pytest.mark.parametrize("arity", range(2, 10)) + def test_one_non_missing_balanced(self, arity): + n = 40 + tree = tskit.Tree.generate_balanced(n, arity=arity) + for j in range(n): genotypes = np.zeros(n, dtype=np.int8) - 1 - genotypes[0 : n // 2] = np.arange(n // 2, dtype=int) - alleles = [str(j) for j in range(n)] + genotypes[j] = 0 + ancestral_state, transitions = self.do_map_mutations( + tree, genotypes, ["0", "1"] + ) + assert ancestral_state == "0" + assert len(transitions) == 0 + + @pytest.mark.parametrize("n", range(2, 10)) + def test_many_states_half_missing(self, n): + ts = msprime.simulate(n, random_seed=2) + tree = ts.first() + genotypes = np.zeros(n, dtype=np.int8) - 1 + genotypes[0 : n // 2] = np.arange(n // 2, dtype=int) + alleles = [str(j) for j in range(n)] + ancestral_state, transitions = self.do_map_mutations(tree, genotypes, alleles) + assert ancestral_state == "0" + assert len(transitions) == max(0, n // 2 - 1) + + @pytest.mark.parametrize("n", range(2, 10)) + def test_one_missing(self, n): + ts = msprime.simulate(n, random_seed=2) + tree = ts.first() + alleles = [str(j) for j in range(2)] + for j in range(n): + genotypes = np.zeros(n, dtype=np.int8) - 1 + genotypes[j] = 0 ancestral_state, transitions = self.do_map_mutations( tree, genotypes, alleles ) assert ancestral_state == "0" - assert len(transitions) == max(0, n // 2 - 1) - - def test_one_missing(self): - for n in range(2, 10): - ts = msprime.simulate(n, random_seed=2) - tree = ts.first() - alleles = [str(j) for j in range(2)] - for j in range(n): - genotypes = np.zeros(n, dtype=np.int8) - 1 - genotypes[j] = 0 - ancestral_state, transitions = self.do_map_mutations( - tree, genotypes, alleles - ) - assert ancestral_state == "0" - assert len(transitions) == 0 + assert len(transitions) == 0 + + @pytest.mark.parametrize("arity", range(2, 10)) + def test_one_missing_balanced(self, arity): + n = 40 + tree = tskit.Tree.generate_balanced(n, arity=arity) + alleles = [str(j) for j in range(2)] + for j in range(n): + genotypes = np.zeros(n, dtype=np.int8) - 1 + genotypes[j] = 0 + ancestral_state, transitions = self.do_map_mutations( + tree, genotypes, alleles + ) + assert ancestral_state == "0" + assert len(transitions) == 0 def test_one_missing_derived_state(self): tables = felsenstein_tables() @@ -832,6 +963,219 @@ def test_multi_mutation_missing_data(self): assert transitions[1] == tskit.Mutation(node=1, parent=0, derived_state="2") +class TestParsimonyExamplesPolytomy(TestParsimonyBase): + """ + Some examples on a given non-binary tree. + """ + + # 9 + # ┏━┻━━┓ + # 7 8 + # ┏━┻━┓ ┏┻┓ + # 6 ┃ ┃ ┃ + # ┏━╋━┓ ┃ ┃ ┃ + # 0 2 4 5 1 3 + + nodes = io.StringIO( + """\ + id is_sample time + 0 1 0 + 1 1 0 + 2 1 0 + 3 1 0 + 4 1 0 + 5 1 0 + 6 0 1 + 7 0 2 + 8 0 2 + 9 0 3 + """ + ) + edges = io.StringIO( + """\ + left right parent child + 0 1 6 0,2,4 + 0 1 7 6,5 + 0 1 8 1,3 + 0 1 9 7,8 + """ + ) + + tree = tskit.load_text( + nodes=nodes, + edges=edges, + strict=False, + ).first() + + def test_all_zeros(self): + genotypes = [0, 0, 0, 0, 0, 0] + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 0 + + def test_mutation_over_8(self): + genotypes = [0, 1, 0, 1, 0, 0] + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 1 + assert transitions[0] == tskit.Mutation(node=8, derived_state="1") + + def test_mutation_over_6(self): + genotypes = [1, 0, 1, 0, 1, 0] + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 1 + assert transitions[0] == tskit.Mutation(node=6, derived_state="1") + + def test_mutation_over_0_5(self): + # Bug reported in https://github.com/tskit-dev/tskit/issues/987 + genotypes = [1, 0, 0, 0, 0, 1] + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=0, derived_state="1") + assert transitions[1] == tskit.Mutation(node=5, derived_state="1") + + def test_mutation_over_7_back_mutation_4(self): + genotypes = [1, 0, 1, 0, 0, 1] + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=7, derived_state="1") + assert transitions[1] == tskit.Mutation(node=4, derived_state="0", parent=0) + + +class TestParsimonyExamplesStar(TestParsimonyBase): + """ + Some examples on star topologies. + """ + + @pytest.mark.parametrize("n", range(3, 8)) + def test_two_states_freq_n_minus_1(self, n): + tree = tskit.Tree.generate_star(n) + genotypes = np.zeros(n, dtype=np.int8) + genotypes[0] = 1 + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 1 + assert transitions[0] == tskit.Mutation(node=0, derived_state="1") + + genotypes[:] = 1 + genotypes[0] = 0 + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "1" + assert len(transitions) == 1 + assert transitions[0] == tskit.Mutation(node=0, derived_state="0") + + @pytest.mark.parametrize("n", range(5, 10)) + def test_two_states_freq_n_minus_2(self, n): + tree = tskit.Tree.generate_star(n) + genotypes = np.zeros(n, dtype=np.int8) + genotypes[0:2] = 1 + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=1, derived_state="1") + assert transitions[1] == tskit.Mutation(node=0, derived_state="1") + + genotypes[:] = 1 + genotypes[0:2] = 0 + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "1" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=1, derived_state="0") + assert transitions[1] == tskit.Mutation(node=0, derived_state="0") + + @pytest.mark.parametrize("n", range(5, 10)) + def test_three_states_freq_n_minus_2(self, n): + tree = tskit.Tree.generate_star(n) + genotypes = np.zeros(n, dtype=np.int8) + genotypes[0] = 1 + genotypes[1] = 2 + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=1, derived_state="2") + assert transitions[1] == tskit.Mutation(node=0, derived_state="1") + + @pytest.mark.parametrize("n", range(2, 10)) + def test_n_states(self, n): + tree = tskit.Tree.generate_star(n) + genotypes = np.arange(n, dtype=np.int8) + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == n - 1 + + @pytest.mark.parametrize("n", range(3, 10)) + def test_missing_data(self, n): + tree = tskit.Tree.generate_star(n) + genotypes = np.zeros(n, dtype=np.int8) + genotypes[0] = tskit.MISSING_DATA + genotypes[1] = 1 + ancestral_state, transitions = self.do_map_mutations(tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 1 + assert transitions[0] == tskit.Mutation(node=1, derived_state="1") + + +class TestParsimonyExamplesBalancedTernary(TestParsimonyBase): + """ + Some examples on a given non-binary tree. + """ + + tree = tskit.Tree.generate_balanced(27, arity=3) + # 39 + # ┏━━━━━━━━━━━━━━━━━━━━━┳━┻━━━━━━━━━━━━━━━━━━━━━━━━┓ + # 30 34 38 + # ┏━━━━━╋━━━━━┓ ┏━━━━━━━━╋━━━━━━━━┓ ┏━━━━━━━━╋━━━━━━━━┓ + # 27 28 29 31 32 33 35 36 37 + # ┏━╋━┓ ┏━╋━┓ ┏━╋━┓ ┏━━╋━━┓ ┏━━╋━━┓ ┏━━╋━━┓ ┏━━╋━━┓ ┏━━╋━━┓ ┏━━╋━━┓ + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 + + def test_mutation_over_27_29(self): + genotypes = np.zeros(27, dtype=int) + genotypes[0:3] = 1 + genotypes[6:9] = 1 + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 2 + # the algorithm chooses a back mutation instead + assert transitions[0] == tskit.Mutation(node=30, derived_state="1") + assert transitions[1] == tskit.Mutation(node=28, derived_state="0", parent=0) + + def test_three_clades(self): + genotypes = np.zeros(27, dtype=int) + genotypes[9:18] = 1 + genotypes[18:27] = 2 + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=38, derived_state="2") + assert transitions[1] == tskit.Mutation(node=34, derived_state="1") + + def test_nonzero_ancestral_state(self): + genotypes = np.ones(27, dtype=int) + genotypes[0] = 0 + genotypes[26] = 0 + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "1" + assert len(transitions) == 2 + assert transitions[0] == tskit.Mutation(node=26, derived_state="0") + assert transitions[1] == tskit.Mutation(node=0, derived_state="0") + + def test_many_states(self): + genotypes = np.arange(27, dtype=int) + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 26 + + def test_least_parsimonious(self): + genotypes = [0, 1, 2] * 9 + ancestral_state, transitions = self.do_map_mutations(self.tree, genotypes) + assert ancestral_state == "0" + assert len(transitions) == 18 + + class TestReconstructAllTuples: """ Tests that the parsimony algorithm correctly round-trips all possible @@ -904,3 +1248,15 @@ def test_simple_n5_k4(self): def test_simple_n6_k3(self): ts = msprime.simulate(6, random_seed=4) self.verify(ts, 3) + + def test_root_polytomy_n5_k4(self): + tree = tskit.Tree.unrank(5, (1, 0)) + self.verify(tree.tree_sequence, 4) + + def test_leaf_polytomy_n5_k4(self): + tree = tskit.Tree.unrank(5, (7, 0)) + self.verify(tree.tree_sequence, 4) + + def test_leaf_polytomy_n5_k5(self): + tree = tskit.Tree.unrank(5, (7, 0)) + self.verify(tree.tree_sequence, 5)