diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index d6332d9839..46294f9069 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,9 @@ In development **New features** +- Add a ``nexus`` function that outputs a tree sequence in Nexus format + (:user:`saunack`, :pr:`550`). + - Add extension of Kendall-Colijn tree distance metric for tree sequences computed by ``TreeSequence.kc_distance`` (:user:`daniel-goldstein`, :pr:`548`) diff --git a/python/tests/test_newick.py b/python/tests/test_phylo_formats.py similarity index 63% rename from python/tests/test_newick.py rename to python/tests/test_phylo_formats.py index c288a74568..5556536e21 100644 --- a/python/tests/test_newick.py +++ b/python/tests/test_phylo_formats.py @@ -23,40 +23,24 @@ """ Tests for the newick output feature. """ +import itertools import unittest import msprime import newick +from Bio.Nexus import Nexus +import tskit +from tests import tsutil -class TestNewick(unittest.TestCase): + +class TreeExamples(unittest.TestCase): """ - Tests that the newick output has the properties that we need using - external Newick parser. + Generates trees for testing the phylo format outputs. """ random_seed = 155 - def verify_newick_topology(self, tree, root=None, node_labels=None): - if root is None: - root = tree.root - ns = tree.newick(precision=16, root=root, node_labels=node_labels) - if node_labels is None: - leaf_labels = {u: str(u + 1) for u in tree.leaves(root)} - else: - leaf_labels = {u: node_labels[u] for u in tree.leaves(root)} - newick_tree = newick.loads(ns)[0] - leaf_names = newick_tree.get_leaf_names() - self.assertEqual(sorted(leaf_names), sorted(leaf_labels.values())) - for u in tree.leaves(root): - name = leaf_labels[u] - node = newick_tree.get_node(name) - while u != root: - self.assertAlmostEqual(node.length, tree.branch_length(u)) - node = node.ancestor - u = tree.parent(u) - self.assertIsNone(node.ancestor) - def get_nonbinary_example(self): ts = msprime.simulate( sample_size=20, @@ -94,6 +78,36 @@ def get_multiroot_example(self): ) return tables.tree_sequence() + def get_single_tree(self): + return msprime.simulate(10, random_seed=2) + + +class TestNewick(TreeExamples): + """ + Tests that the newick output has the properties that we need using + external Newick parser. + """ + + def verify_newick_topology(self, tree, root=None, node_labels=None): + if root is None: + root = tree.root + ns = tree.newick(precision=16, root=root, node_labels=node_labels) + if node_labels is None: + leaf_labels = {u: str(u + 1) for u in tree.leaves(root)} + else: + leaf_labels = {u: node_labels[u] for u in tree.leaves(root)} + newick_tree = newick.loads(ns)[0] + leaf_names = newick_tree.get_leaf_names() + self.assertEqual(sorted(leaf_names), sorted(leaf_labels.values())) + for u in tree.leaves(root): + name = leaf_labels[u] + node = newick_tree.get_node(name) + while u != root: + self.assertAlmostEqual(node.length, tree.branch_length(u)) + node = node.ancestor + u = tree.parent(u) + self.assertIsNone(node.ancestor) + def test_nonbinary_tree(self): ts = self.get_nonbinary_example() for t in ts.trees(): @@ -146,3 +160,81 @@ def test_single_node_label(self): [n.name for n in root.walk()], [labels[tree.root]] + [None for _ in range(len(list(tree.nodes())) - 1)], ) + + +class TestNexus(TreeExamples): + """ + Tests that the nexus output has the properties that we need using + external Nexus parser. + """ + + def verify_tree(self, nexus_tree, tsk_tree): + self.assertEqual(len(nexus_tree.get_terminals()), tsk_tree.num_samples()) + + bio_node_map = {} + for node_id in nexus_tree.all_ids(): + bio_node = nexus_tree.node(node_id) + bio_node_map[bio_node.data.taxon] = bio_node + + for u in tsk_tree.nodes(): + node = tsk_tree.tree_sequence.node(u) + label = f"tsk_{node.id}_{node.flags}" + bio_node = bio_node_map.pop(label) + self.assertAlmostEqual( + bio_node.data.branchlength, tsk_tree.branch_length(u) + ) + if tsk_tree.parent(u) == tskit.NULL: + self.assertEqual(bio_node.prev, None) + else: + bio_node_parent = nexus_tree.node(bio_node.prev) + parent = tsk_tree.tree_sequence.node(tsk_tree.parent(u)) + self.assertEqual( + bio_node_parent.data.taxon, f"tsk_{parent.id}_{parent.flags}" + ) + self.assertEqual(len(bio_node_map), 0) + + def verify_nexus_topology(self, treeseq): + nexus = treeseq.nexus(precision=16) + nexus_treeseq = Nexus.Nexus(nexus) + self.assertEqual(treeseq.num_trees, len(nexus_treeseq.trees)) + for tree, nexus_tree in itertools.zip_longest( + treeseq.trees(), nexus_treeseq.trees + ): + name = nexus_tree.name + split_name = name.split("_") + self.assertEqual(len(split_name), 2) + start = float(split_name[0][4:]) + end = float(split_name[1]) + self.assertAlmostEqual(tree.interval[0], start) + self.assertAlmostEqual(tree.interval[1], end) + + self.verify_tree(nexus_tree, tree) + + def test_binary_tree(self): + ts = self.get_binary_example() + self.verify_nexus_topology(ts) + + def test_nonbinary_example(self): + ts = self.get_nonbinary_example() + self.verify_nexus_topology(ts) + + def test_single_tree(self): + ts = self.get_single_tree() + self.verify_nexus_topology(ts) + + def test_multiroot(self): + ts = self.get_multiroot_example() + self.assertRaises(ValueError, ts.nexus) + + def test_many_trees(self): + ts = msprime.simulate(4, recombination_rate=2, random_seed=123) + self.verify_nexus_topology(ts) + + def test_many_trees_sequence_length(self): + ts = msprime.simulate(4, length=10, recombination_rate=0.2, random_seed=13) + self.verify_nexus_topology(ts) + + def test_internal_samples(self): + ts = msprime.simulate(8, random_seed=2) + ts = tsutil.jiggle_samples(ts) + self.verify_nexus_topology(ts) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 7626a57f77..9356d659a8 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -3800,6 +3800,44 @@ def write_vcf( ) writer.write(output) + def nexus(self, precision=14): + """ + Returns a `nexus encoding `_ + of this tree sequence. Trees along the sequence are listed sequentially in + the TREES block. The tree spanning the interval :math:`[x, y)`` is + given the name "tree_x_y". Spatial positions are written at the + specified precision. + + Nodes in the tree sequence are identified by the taxon labels of the + form ``f"tsk_{node.id}_{node.flags}"``, such that a node with ``id=5`` + and ``flags=1`` will have the label ``"tsk_5_1"`` (please see the + :ref:`data model ` section for details + on the interpretation of node ID and flags values). These labels are + listed for all nodes in the tree sequence in the ``TAXLABELS`` block. + + :param int precision: The numerical precision with which branch lengths + and tree positions are printed. + :return: A nexus representation of this TreeSequence. + :rtype: str + """ + node_labels = {node.id: f"tsk_{node.id}_{node.flags}" for node in self.nodes()} + + s = "#NEXUS\n" + s += "BEGIN TAXA;\n" + s += "TAXLABELS " + s += ",".join(node_labels[node.id] for node in self.nodes()) + ";\n" + s += "END;\n" + + s += "BEGIN TREES;\n" + for tree in self.trees(): + start_interval = "{0:.{1}f}".format(tree.interval[0], precision) + end_interval = "{0:.{1}f}".format(tree.interval[1], precision) + newick = tree.newick(precision, node_labels=node_labels) + s += f"\tTREE tree{start_interval}_{end_interval} = {newick}\n" + + s += "END;\n" + return s + def simplify( self, samples=None,