From 1fabda7237b36faecda40780eaa1a83abcdf002a Mon Sep 17 00:00:00 2001 From: Brian Zhang Date: Mon, 20 Apr 2020 12:57:49 +0100 Subject: [PATCH] Add minimum lexicographic order traversal for nodes in a Tree We add a new traversal order in the Tree.nodes() function called "minlex_postorder", described in #389. Additional changes in this commit: * Update "timeasc" and "timedesc" traversal orders to fall back to sorting by ID * Add explicit tests for traversal orders * Add documentation for all traversal orders. This closes #401. --- python/CHANGELOG.rst | 5 + python/tests/__init__.py | 26 ++++++ python/tests/test_highlevel.py | 18 +++- python/tests/test_topology.py | 162 ++++++++++++++++++++++++++++++++- python/tests/test_util.py | 4 +- python/tskit/trees.py | 128 +++++++++++++++++++++++++- 6 files changed, 329 insertions(+), 14 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index f856915ce1..d2acda4a69 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,11 @@ In development **New features** +- Add an optional node traversal order in ``tskit.Tree`` that uses the minimum + lexicographic order of leaf nodes visited. This ordering (``"minlex_postorder"``) + adds more determinism because it constrains the order in which children of + a node are visited (:user:`brianzhang01`, :pr:`411`). 
+ - Add ``_repr_html_`` to tables, so that jupyter notebooks render them as html tables (:user:`benjeffery`, :pr:`514`) diff --git a/python/tests/__init__.py b/python/tests/__init__.py index 6134936d74..d5546d86f2 100644 --- a/python/tests/__init__.py +++ b/python/tests/__init__.py @@ -110,6 +110,30 @@ def _levelorder_nodes(self, u, l, level): for c in self.children(u): self._levelorder_nodes(c, l, level + 1) + def _minlex_postorder_nodes(self, u, l): + l.extend(self._minlex_postorder_nodes_helper(u)[1]) + + def _minlex_postorder_nodes_helper(self, u): + """ + For a given input ID u, this function returns a tuple whose first value + is the minimum leaf node ID under node u, and whose second value is + a list containing the minlex postorder for the subtree rooted at node u. + The first value is needed for sorting, and the second value is what + finally gets returned. + """ + children = self.children(u) + if len(children) > 0: + children_return = [self._minlex_postorder_nodes_helper(c) for c in children] + # sorts by first value, which is the minimum leaf node ID + children_return.sort() + minlex_postorder = [] + for _, child_minlex_postorder in children_return: + minlex_postorder.extend(child_minlex_postorder) + minlex_postorder.extend([u]) + return (children_return[0][0], minlex_postorder) + else: + return (u, [u]) + def nodes(self, root=None, order="preorder"): roots = [root] if root is None: @@ -127,6 +151,8 @@ def nodes(self, root=None, order="preorder"): # Nested list comprehension flattens node_list in order self._levelorder_nodes(u, node_list, 0) node_list = iter([i for level in node_list for i in level]) + elif order == "minlex_postorder": + self._minlex_postorder_nodes(u, node_list) else: raise ValueError("order not supported") yield from node_list diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 2de877956a..146f76166e 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -1888,9 +1888,8 @@ def 
test_traversals(self): tree = next(ts.trees()) self.verify_traversals(tree) - # To verify time-ordered traversal we can't use the method used for the - # other traversals above, it checks for one-to-one correspondence. - # As more than one ordering is valid for time, we do it separately here + # Verify time-ordered traversals separately, because the PythonTree + # class does not contain time information at the moment for root in tree.roots: time_ordered = tree.nodes(root, order="timeasc") t = tree.time(next(time_ordered)) @@ -1909,7 +1908,13 @@ def verify_traversals(self, tree): t1 = tree t2 = tests.PythonTree.from_tree(t1) self.assertEqual(list(t1.nodes()), list(t2.nodes())) - orders = ["inorder", "postorder", "levelorder", "breadthfirst"] + orders = [ + "inorder", + "postorder", + "levelorder", + "breadthfirst", + "minlex_postorder", + ] if tree.num_roots == 1: self.assertRaises(ValueError, list, t1.nodes(order="bad order")) self.assertEqual(list(t1.nodes()), list(t1.nodes(t1.get_root()))) @@ -1946,7 +1951,10 @@ def verify_traversals(self, tree): list(t2.nodes(root, order=test_order)), ) all_nodes.extend(t1.nodes(root, order=test_order)) - self.assertEqual(all_nodes, list(t1.nodes(order=test_order))) + # minlex_postorder reorders the roots, so this last test is + # not appropriate + if test_order != "minlex_postorder": + self.assertEqual(all_nodes, list(t1.nodes(order=test_order))) def test_total_branch_length(self): # Note: this definition works when we have no non-sample branches. diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index 2bffcc2384..2884d615bb 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -1952,6 +1952,162 @@ def test_nonbinary_tree_sequence_permuted_nodes(self): self.verify_permuted_nodes(ts) +class TestTraversalOrder(unittest.TestCase): + """ + Tests node traversal orders. 
+ """ + + # + # 9 10 + # / \ / \ + # / \ / 8 + # / \ / / \ + # 7 \ / / \ + # / \ 6 / / 6 + # / 5 / \ / 5 / \ + # / / \ / \ / / \ / \ + # 4 0 1 2 3 4 0 1 2 3 + # + # 0 ------------------ 0.5 ------------------ 1.0 + nodes = """\ + id is_sample population time + 0 1 0 0.00000000000000 + 1 1 0 0.00000000000000 + 2 1 0 0.00000000000000 + 3 1 0 0.00000000000000 + 4 1 0 0.00000000000000 + 5 0 0 0.14567111023387 + 6 0 0 0.21385545626353 + 7 0 0 0.43508024345063 + 8 0 0 0.60156352971203 + 9 0 0 0.90000000000000 + 10 0 0 1.20000000000000 + """ + edges = """\ + id left right parent child + 0 0.00000000 1.00000000 5 0,1 + 1 0.00000000 1.00000000 6 2,3 + 2 0.00000000 0.50000000 7 4,5 + 3 0.50000000 1.00000000 8 5,6 + 4 0.00000000 0.50000000 9 6,7 + 5 0.50000000 1.00000000 10 4,8 + """ + node_order_results = { + "preorder": [[9, 6, 2, 3, 7, 4, 5, 0, 1], [10, 4, 8, 5, 0, 1, 6, 2, 3]], + "inorder": [[2, 6, 3, 9, 4, 7, 0, 5, 1], [4, 10, 0, 5, 1, 8, 2, 6, 3]], + "postorder": [[2, 3, 6, 4, 0, 1, 5, 7, 9], [4, 0, 1, 5, 2, 3, 6, 8, 10]], + "levelorder": [[9, 6, 7, 2, 3, 4, 5, 0, 1], [10, 4, 8, 5, 6, 0, 1, 2, 3]], + "breadthfirst": [[9, 6, 7, 2, 3, 4, 5, 0, 1], [10, 4, 8, 5, 6, 0, 1, 2, 3]], + "timeasc": [[0, 1, 2, 3, 4, 5, 6, 7, 9], [0, 1, 2, 3, 4, 5, 6, 8, 10]], + "timedesc": [[9, 7, 6, 5, 4, 3, 2, 1, 0], [10, 8, 6, 5, 4, 3, 2, 1, 0]], + "minlex_postorder": [[0, 1, 5, 4, 7, 2, 3, 6, 9], [0, 1, 5, 2, 3, 6, 8, 4, 10]], + } + + def test_traversal_order(self): + ts = tskit.load_text( + nodes=io.StringIO(self.nodes), edges=io.StringIO(self.edges), strict=False + ) + for test_order, expected_result in self.node_order_results.items(): + tree_orders = [] + for tree in ts.trees(): + tree_orders.append(list(tree.nodes(order=test_order))) + self.assertEqual(tree_orders, expected_result) + + def test_polytomy_inorder(self): + """ + If there are N children, current inorder traversal first visits + floor(N/2) children, then the parent, then the remaining children. 
+ Here we explicitly test that behaviour. + """ + # + # __4__ + # / / \ \ + # 0 1 2 3 + # + nodes_polytomy_4 = """\ + id is_sample population time + 0 1 0 0.00000000000000 + 1 1 0 0.00000000000000 + 2 1 0 0.00000000000000 + 3 1 0 0.00000000000000 + 4 0 0 1.00000000000000 + """ + edges_polytomy_4 = """\ + id left right parent child + 0 0.00000000 1.00000000 4 0,1,2,3 + """ + # + # __5__ + # / /|\ \ + # 0 1 2 3 4 + # + nodes_polytomy_5 = """\ + id is_sample population time + 0 1 0 0.00000000000000 + 1 1 0 0.00000000000000 + 2 1 0 0.00000000000000 + 3 1 0 0.00000000000000 + 4 1 0 0.00000000000000 + 5 0 0 1.00000000000000 + """ + edges_polytomy_5 = """\ + id left right parent child + 0 0.00000000 1.00000000 5 0,1,2,3,4 + """ + for nodes_string, edges_string, expected_result in [ + [nodes_polytomy_4, edges_polytomy_4, [[0, 1, 4, 2, 3]]], + [nodes_polytomy_5, edges_polytomy_5, [[0, 1, 5, 2, 3, 4]]], + ]: + ts = tskit.load_text( + nodes=io.StringIO(nodes_string), + edges=io.StringIO(edges_string), + strict=False, + ) + tree_orders = [] + for tree in ts.trees(): + tree_orders.append(list(tree.nodes(order="inorder"))) + self.assertEqual(tree_orders, expected_result) + + def test_minlex_postorder_multiple_roots(self): + # + # 10 8 9 11 + # / \ / \ / \ / \ + # 5 3 2 4 6 7 1 0 + # + nodes_string = """\ + id is_sample population time + 0 1 0 0.00000000000000 + 1 1 0 0.00000000000000 + 2 1 0 0.00000000000000 + 3 1 0 0.00000000000000 + 4 1 0 0.00000000000000 + 5 1 0 0.00000000000000 + 6 1 0 0.00000000000000 + 7 1 0 0.00000000000000 + 8 0 0 1.00000000000000 + 9 0 0 1.00000000000000 + 10 0 0 1.00000000000000 + 11 0 0 1.00000000000000 + """ + edges_string = """\ + id left right parent child + 0 0.00000000 1.00000000 8 2,4 + 1 0.00000000 1.00000000 9 6,7 + 2 0.00000000 1.00000000 10 5,3 + 3 0.00000000 1.00000000 11 1,0 + """ + expected_result = [[0, 1, 11, 2, 4, 8, 3, 5, 10, 6, 7, 9]] + ts = tskit.load_text( + nodes=io.StringIO(nodes_string), + edges=io.StringIO(edges_string), + 
strict=False, + ) + tree_orders = [] + for tree in ts.trees(): + tree_orders.append(list(tree.nodes(order="minlex_postorder"))) + self.assertEqual(tree_orders, expected_result) + + class TestSimplifyExamples(TopologyTestCase): """ Tests for simplify where we write out the input and expected output @@ -2598,7 +2754,7 @@ def test_simplest_non_degenerate_case(self): t = next(ts_simplified.trees()) self.assertEqual(t.parent_dict, {0: 4, 1: 4, 2: 5, 3: 5}) - def test_two_reducable_trees(self): + def test_two_reducible_trees(self): # We have n = 4 and two trees, with some unary nodes and non-sample leaves nodes = io.StringIO( """\ @@ -2682,8 +2838,8 @@ def test_two_reducable_trees(self): self.assertEqual(sites[-1].position, 0.4) self.assertEqual(t.parent_dict, {0: 4, 1: 4, 2: 5, 3: 5}) - def test_one_reducable_tree(self): - # We have n = 4 and two trees. One tree is reducable and the other isn't. + def test_one_reducible_tree(self): + # We have n = 4 and two trees. One tree is reducible and the other isn't. nodes = io.StringIO( """\ id is_sample time diff --git a/python/tests/test_util.py b/python/tests/test_util.py index 471b67159f..20e85aa513 100644 --- a/python/tests/test_util.py +++ b/python/tests/test_util.py @@ -121,7 +121,9 @@ def test_nonrectangular_input(self): ] for dtype in self.dtypes_to_test: for bad_input in bad_inputs: - with self.assertRaises(TypeError): + # On some platforms and Python / numpy versions, a ValueError + # occurs instead + with self.assertRaises((TypeError, ValueError)): util.safe_np_int_cast(bad_input, dtype) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 47f1cc8eba..56fd05052f 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -1564,13 +1564,83 @@ def _levelorder_traversal(self, u): yield v def _timeasc_traversal(self, u): - yield from sorted(self.nodes(u, order="levelorder"), key=self.time) + """ + Sorts by increasing time but falls back to increasing ID for equal times. 
+ """ + yield from sorted( + self.nodes(u, order="levelorder"), key=lambda u: (self.time(u), u) + ) def _timedesc_traversal(self, u): + """ + Sorts by decreasing time but falls back to decreasing ID for equal times. + """ yield from sorted( - self.nodes(u, order="levelorder"), key=self.time, reverse=True + self.nodes(u, order="levelorder"), + key=lambda u: (self.time(u), u), + reverse=True, ) + def _minlex_postorder_traversal(self, u): + """ + Postorder traversal that visits leaves in minimum lexicographic order. + + Minlex stands for minimum lexicographic. We wish to visit a tree in such + a way that the leaves visited, when their IDs are listed out, have + minimum lexicographic order. This is a useful ordering for drawing + multiple Trees of a TreeSequence, as it leads to more consistency + between adjacent Trees. + """ + # We skip perf optimisations here (compared to _preorder_traversal and + # _postorder_traversal) as this ordering is unlikely to be used in perf + # sensitive applications + stack = collections.deque([u]) + parent = NULL + + # We compute a dictionary mapping from internal node ID to min leaf ID + # under the node, using a first postorder traversal + min_leaf_dict = {} + while len(stack) > 0: + v = stack[-1] + children = [] if v == parent else self.children(v) + if children: + # The first time visiting a node, we push its children onto the stack. + # reversed is not strictly necessary, but it gives the postorder + # we would intuitively expect. 
+ stack.extend(reversed(children)) + else: + # The second time visiting a node, we record its min leaf ID + # underneath, pop it, and update the parent variable + if v != parent: + # at a leaf node + min_leaf_dict[v] = v + else: + # at a parent after finishing all its children + min_leaf_dict[v] = min([min_leaf_dict[c] for c in self.children(v)]) + parent = self.get_parent(v) + stack.pop() + + # Now we do a second postorder traversal + stack.clear() + stack.extend([u]) + parent = NULL + while len(stack) > 0: + v = stack[-1] + children = [] if v == parent else self.children(v) + if children: + # The first time visiting a node, we push onto the stack its children + # in order of reverse min leaf ID under each child. This guarantees + # that the earlier children visited have smaller min leaf ID, + # which is equivalent to the minlex condition. + stack.extend( + sorted(children, key=lambda u: min_leaf_dict[u], reverse=True) + ) + else: + # The second time visiting a node, we pop and yield it, and + # we update the parent variable + parent = self.get_parent(v) + yield stack.pop() + def nodes(self, root=None, order="preorder"): """ Returns an iterator over the node IDs in this tree. If the root parameter @@ -1582,10 +1652,45 @@ def nodes(self, root=None, order="preorder"): Unlike the :meth:`TreeSequence.nodes` method, this iterator produces integer node IDs, not :class:`Node` objects. + The currently implemented traversal orders are: + + - 'preorder': starting at root, yield the current node, then recurse + and do a preorder on each child of the current node. See also `Wikipedia + `__. + - 'inorder': starting at root, assuming binary trees, recurse and do + an inorder on the first child, then yield the current node, then + recurse and do an inorder on the second child. In the case of ``n`` + child nodes (not necessarily 2), the first ``n // 2`` children are + visited in the first stage, and the remaining ``n - n // 2`` children + are visited in the second stage. 
See also `Wikipedia <https://en.wikipedia.org/wiki/Tree_traversal#In-order_(LNR)>`__. + - 'postorder': starting at root, recurse and do a postorder on each + child of the current node, then yield the current node. See also + `Wikipedia + <https://en.wikipedia.org/wiki/Tree_traversal#Post-order_(LRN)>`__. + - 'levelorder' ('breadthfirst'): visit the nodes under root (including + the root) in increasing order of their depth from root. See also + `Wikipedia + <https://en.wikipedia.org/wiki/Tree_traversal#Breadth-first_search>`__. + - 'timeasc': visits the nodes in order of increasing time, falling back to + increasing ID if times are equal. + - 'timedesc': visits the nodes in order of decreasing time, falling back to + decreasing ID if times are equal. + - 'minlex_postorder': a usual postorder has ambiguity in the order in + which children of a node are visited. We constrain this by outputting + a postorder such that the leaves visited, when their IDs are + listed out, have minimum `lexicographic order + <https://en.wikipedia.org/wiki/Lexicographical_order>`__ out of all valid + traversals. This traversal is useful for drawing multiple trees of + a ``TreeSequence``, as it leads to more consistency between adjacent + trees. Note that internal non-leaf nodes are not counted in + assessing the lexicographic order. + :param int root: The root of the subtree we are traversing. :param str order: The traversal ordering. Currently 'preorder', 'inorder', 'postorder', 'levelorder' ('breadthfirst'), 'timeasc' and - 'timedesc' are supported. + 'timedesc' and 'minlex_postorder' are supported. :return: An iterator over the node IDs in the tree in some traversal order. 
:rtype: collections.abc.Iterable, int """ @@ -1597,6 +1702,7 @@ def nodes(self, root=None, order="preorder"): "breadthfirst": self._levelorder_traversal, "timeasc": self._timeasc_traversal, "timedesc": self._timedesc_traversal, + "minlex_postorder": self._minlex_postorder_traversal, } try: iterator = methods[order] @@ -1605,8 +1711,20 @@ def nodes(self, root=None, order="preorder"): roots = [root] if root is None: roots = self.roots - for u in roots: - yield from iterator(u) + if order == "minlex_postorder" and len(roots) > 1: + # we need to visit the roots in minlex order as well + # we first visit all the roots and then sort by the min value + root_values = [] + for u in roots: + root_minlex_postorder = list(iterator(u)) + min_value = root_minlex_postorder[0] + root_values.append([min_value, root_minlex_postorder]) + root_values.sort() + for _, nodes_for_root in root_values: + yield from nodes_for_root + else: + for u in roots: + yield from iterator(u) # TODO make this a bit less embarrassing by using an iterative method. def __build_newick(self, node, precision, node_labels):