From 8f3f2bd6f933a868fe4221c233370cef927949bd Mon Sep 17 00:00:00 2001 From: saunack Date: Tue, 21 Apr 2020 19:24:37 +0530 Subject: [PATCH 1/2] Added nexus output functionality in TreeSequence and corresponding tests Combined newick and nexus test files to test_phylo_formats.py --- .../{test_newick.py => test_phylo_formats.py} | 114 ++++++++++++++---- python/tskit/trees.py | 32 +++++ 2 files changed, 123 insertions(+), 23 deletions(-) rename python/tests/{test_newick.py => test_phylo_formats.py} (69%) diff --git a/python/tests/test_newick.py b/python/tests/test_phylo_formats.py similarity index 69% rename from python/tests/test_newick.py rename to python/tests/test_phylo_formats.py index c288a74568..8534549666 100644 --- a/python/tests/test_newick.py +++ b/python/tests/test_phylo_formats.py @@ -23,40 +23,21 @@ """ Tests for the newick output feature. """ +import itertools import unittest import msprime import newick +from Bio.Nexus import Nexus -class TestNewick(unittest.TestCase): +class TreeExamples(unittest.TestCase): """ - Tests that the newick output has the properties that we need using - external Newick parser. + Generates trees for testing the phylo format outputs. """ random_seed = 155 - def verify_newick_topology(self, tree, root=None, node_labels=None): - if root is None: - root = tree.root - ns = tree.newick(precision=16, root=root, node_labels=node_labels) - if node_labels is None: - leaf_labels = {u: str(u + 1) for u in tree.leaves(root)} - else: - leaf_labels = {u: node_labels[u] for u in tree.leaves(root)} - newick_tree = newick.loads(ns)[0] - leaf_names = newick_tree.get_leaf_names() - self.assertEqual(sorted(leaf_names), sorted(leaf_labels.values())) - for u in tree.leaves(root): - name = leaf_labels[u] - node = newick_tree.get_node(name) - while u != root: - self.assertAlmostEqual(node.length, tree.branch_length(u)) - node = node.ancestor - u = tree.parent(u) - self.assertIsNone(node.ancestor) - def get_nonbinary_example(self): ts = msprime.simulate( sample_size=20, @@ -94,6 +75,36 @@ def get_multiroot_example(self): ) return tables.tree_sequence() + def get_single_tree(self): + return msprime.simulate(10, random_seed=2) + + +class TestNewick(TreeExamples): + """ + Tests that the newick output has the properties that we need using + external Newick parser. + """ + + def verify_newick_topology(self, tree, root=None, node_labels=None): + if root is None: + root = tree.root + ns = tree.newick(precision=16, root=root, node_labels=node_labels) + if node_labels is None: + leaf_labels = {u: str(u + 1) for u in tree.leaves(root)} + else: + leaf_labels = {u: node_labels[u] for u in tree.leaves(root)} + newick_tree = newick.loads(ns)[0] + leaf_names = newick_tree.get_leaf_names() + self.assertEqual(sorted(leaf_names), sorted(leaf_labels.values())) + for u in tree.leaves(root): + name = leaf_labels[u] + node = newick_tree.get_node(name) + while u != root: + self.assertAlmostEqual(node.length, tree.branch_length(u)) + node = node.ancestor + u = tree.parent(u) + self.assertIsNone(node.ancestor) + def test_nonbinary_tree(self): ts = self.get_nonbinary_example() for t in ts.trees(): @@ -146,3 +157,60 @@ def test_single_node_label(self): [n.name for n in root.walk()], [labels[tree.root]] + [None for _ in range(len(list(tree.nodes())) - 1)], ) + + +class TestNexus(TreeExamples): + """ + Tests that the nexus output has the properties that we need using + external Nexus parser. + """ + + def verify_tree(self, nexus_tree, tree): + root = tree.root + self.assertEqual(len(nexus_tree.get_terminals()), tree.num_samples()) + for u in nexus_tree.get_terminals(): + leaf_label = nexus_tree.node(u).data.taxon + leaf_data = leaf_label.split("_") + self.assertEqual(len(leaf_data), 3) + t_id = int(leaf_data[2]) + self.assertTrue(tree.is_leaf(t_id)) + node = nexus_tree.node(u) + while t_id != root: + self.assertAlmostEqual(node.data.branchlength, tree.branch_length(t_id)) + ancestor_id = node.get_prev() + node = nexus_tree.node(ancestor_id) + t_id = tree.parent(t_id) + self.assertIsNone(node.get_prev()) + + def verify_nexus_topology(self, treeseq): + nexus = treeseq.nexus(precision=16) + nexus_treeseq = Nexus.Nexus(nexus) + self.assertEqual(treeseq.num_trees, len(nexus_treeseq.trees)) + for tree, nexus_tree in itertools.zip_longest( + treeseq.trees(), nexus_treeseq.trees + ): + name = nexus_tree.name + split_name = name.split("_") + self.assertEqual(len(split_name), 2) + start = float(split_name[0][4:]) + end = float(split_name[1]) + self.assertAlmostEqual(tree.interval[0], start) + self.assertAlmostEqual(tree.interval[1], end) + + self.verify_tree(nexus_tree, tree) + + def test_binary_tree(self): + ts = self.get_binary_example() + self.verify_nexus_topology(ts) + + def test_nonbinary_example(self): + ts = self.get_nonbinary_example() + self.verify_nexus_topology(ts) + + def test_single_tree(self): + ts = self.get_single_tree() + self.verify_nexus_topology(ts) + + def test_multiroot(self): + ts = self.get_multiroot_example() + self.assertRaises(ValueError, ts.nexus) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 7626a57f77..41d42a0675 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5468,3 +5468,35 @@ def newick_trees(self, precision=3, breakpoints=None, Ne=1): "This method is no longer supported. Please use the Tree.newick" " method instead" ) + + def nexus(self, precision=14): + """ + Returns a nexus encoding `_ + of this tree. Trees along the sequence are listed sequentially in + the TREES block. The tree spanning the interval :math:`[x, y)`` is + given the name "tree_x_y". Spatial positions are written at the + specified precision. + + :param int precision: The numerical precision with which branch lengths + and tree positions are printed. + :return: A nexus representation of the TreeSequence. + :rtype: str + """ + node_labels = {node.id: f"tsk_{node.flags}_{node.id}" for node in self.nodes()} + + s = "#NEXUS\n" + s += "BEGIN TAXA;\n" + s += "TAXLABELS " + s += ",".join(node_labels[node.id] for node in self.nodes()) + ";\n" + s += "END;\n" + + s += "BEGIN TREES;\n" + for tree in self.trees(): + start_interval = "{0:.{1}f}".format(tree.interval[0], precision) + end_interval = "{0:.{1}f}".format(tree.interval[1], precision) + newick = tree.newick(precision, node_labels=node_labels) + s += f"\tTREE tree{start_interval}_{end_interval} = {newick}\n" + + s += "END;\n" + print(s) + return s From ae05b3d40f83b874d07c1433bdf1c66bcca9913e Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 5 May 2020 16:55:36 +0100 Subject: [PATCH 2/2] Add some extra tests and update CHANGELOG. --- python/CHANGELOG.rst | 3 ++ python/tests/test_phylo_formats.py | 56 +++++++++++++++++------- python/tskit/trees.py | 70 ++++++++++++++++-------------- 3 files changed, 81 insertions(+), 48 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index d6332d9839..46294f9069 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,9 @@ In development **New features** +- Add a ``nexus`` function that outputs a tree sequence in Nexus format + (:user:`saunack`, :pr:`550`). + - Add extension of Kendall-Colijn tree distance metric for tree sequences computed by ``TreeSequence.kc_distance`` (:user:`daniel-goldstein`, :pr:`548`) diff --git a/python/tests/test_phylo_formats.py b/python/tests/test_phylo_formats.py index 8534549666..5556536e21 100644 --- a/python/tests/test_phylo_formats.py +++ b/python/tests/test_phylo_formats.py @@ -30,6 +30,9 @@ import newick from Bio.Nexus import Nexus +import tskit +from tests import tsutil + class TreeExamples(unittest.TestCase): """ @@ -165,22 +168,30 @@ class TestNexus(TreeExamples): external Nexus parser. """ - def verify_tree(self, nexus_tree, tree): - root = tree.root - self.assertEqual(len(nexus_tree.get_terminals()), tree.num_samples()) - for u in nexus_tree.get_terminals(): - leaf_label = nexus_tree.node(u).data.taxon - leaf_data = leaf_label.split("_") - self.assertEqual(len(leaf_data), 3) - t_id = int(leaf_data[2]) - self.assertTrue(tree.is_leaf(t_id)) - node = nexus_tree.node(u) - while t_id != root: - self.assertAlmostEqual(node.data.branchlength, tree.branch_length(t_id)) - ancestor_id = node.get_prev() - node = nexus_tree.node(ancestor_id) - t_id = tree.parent(t_id) - self.assertIsNone(node.get_prev()) + def verify_tree(self, nexus_tree, tsk_tree): + self.assertEqual(len(nexus_tree.get_terminals()), tsk_tree.num_samples()) + + bio_node_map = {} + for node_id in nexus_tree.all_ids(): + bio_node = nexus_tree.node(node_id) + bio_node_map[bio_node.data.taxon] = bio_node + + for u in tsk_tree.nodes(): + node = tsk_tree.tree_sequence.node(u) + label = f"tsk_{node.id}_{node.flags}" + bio_node = bio_node_map.pop(label) + self.assertAlmostEqual( + bio_node.data.branchlength, tsk_tree.branch_length(u) + ) + if tsk_tree.parent(u) == tskit.NULL: + self.assertEqual(bio_node.prev, None) + else: + bio_node_parent = nexus_tree.node(bio_node.prev) + parent = tsk_tree.tree_sequence.node(tsk_tree.parent(u)) + self.assertEqual( + bio_node_parent.data.taxon, f"tsk_{parent.id}_{parent.flags}" + ) + self.assertEqual(len(bio_node_map), 0) def verify_nexus_topology(self, treeseq): nexus = treeseq.nexus(precision=16) @@ -214,3 +225,16 @@ def test_single_tree(self): def test_multiroot(self): ts = self.get_multiroot_example() self.assertRaises(ValueError, ts.nexus) + + def test_many_trees(self): + ts = msprime.simulate(4, recombination_rate=2, random_seed=123) + self.verify_nexus_topology(ts) + + def test_many_trees_sequence_length(self): + ts = msprime.simulate(4, length=10, recombination_rate=0.2, random_seed=13) + self.verify_nexus_topology(ts) + + def test_internal_samples(self): + ts = msprime.simulate(8, random_seed=2) + ts = tsutil.jiggle_samples(ts) + self.verify_nexus_topology(ts) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 41d42a0675..9356d659a8 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -3800,6 +3800,44 @@ def write_vcf( ) writer.write(output) + def nexus(self, precision=14): + """ + Returns a `nexus encoding `_ + of this tree sequence. Trees along the sequence are listed sequentially in + the TREES block. The tree spanning the interval :math:`[x, y)`` is + given the name "tree_x_y". Spatial positions are written at the + specified precision. + + Nodes in the tree sequence are identified by the taxon labels of the + form ``f"tsk_{node.id}_{node.flags}"``, such that a node with ``id=5`` + and ``flags=1`` will have the label ``"tsk_5_1"`` (please see the + :ref:`data model ` section for details + on the interpretation of node ID and flags values). These labels are + listed for all nodes in the tree sequence in the ``TAXLABELS`` block. + + :param int precision: The numerical precision with which branch lengths + and tree positions are printed. + :return: A nexus representation of this TreeSequence. + :rtype: str + """ + node_labels = {node.id: f"tsk_{node.id}_{node.flags}" for node in self.nodes()} + + s = "#NEXUS\n" + s += "BEGIN TAXA;\n" + s += "TAXLABELS " + s += ",".join(node_labels[node.id] for node in self.nodes()) + ";\n" + s += "END;\n" + + s += "BEGIN TREES;\n" + for tree in self.trees(): + start_interval = "{0:.{1}f}".format(tree.interval[0], precision) + end_interval = "{0:.{1}f}".format(tree.interval[1], precision) + newick = tree.newick(precision, node_labels=node_labels) + s += f"\tTREE tree{start_interval}_{end_interval} = {newick}\n" + + s += "END;\n" + return s + def simplify( self, samples=None, @@ -5468,35 +5506,3 @@ def newick_trees(self, precision=3, breakpoints=None, Ne=1): "This method is no longer supported. Please use the Tree.newick" " method instead" ) - - def nexus(self, precision=14): - """ - Returns a nexus encoding `_ - of this tree. Trees along the sequence are listed sequentially in - the TREES block. The tree spanning the interval :math:`[x, y)`` is - given the name "tree_x_y". Spatial positions are written at the - specified precision. - - :param int precision: The numerical precision with which branch lengths - and tree positions are printed. - :return: A nexus representation of the TreeSequence. - :rtype: str - """ - node_labels = {node.id: f"tsk_{node.flags}_{node.id}" for node in self.nodes()} - - s = "#NEXUS\n" - s += "BEGIN TAXA;\n" - s += "TAXLABELS " - s += ",".join(node_labels[node.id] for node in self.nodes()) + ";\n" - s += "END;\n" - - s += "BEGIN TREES;\n" - for tree in self.trees(): - start_interval = "{0:.{1}f}".format(tree.interval[0], precision) - end_interval = "{0:.{1}f}".format(tree.interval[1], precision) - newick = tree.newick(precision, node_labels=node_labels) - s += f"\tTREE tree{start_interval}_{end_interval} = {newick}\n" - - s += "END;\n" - print(s) - return s