tskit-dev · mergify · Aug 30, 2022 · Aug 26, 2022 · petrelharp · Aug 26, 2022
diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst
@@ -2,6 +2,11 @@
 [0.5.3] - 2022-XX-XX
 --------------------
 
+**Fixes**
+
+ - ``ts.subset()`` produces valid tree sequences even if nodes are shuffled
+   out of time order (:user:`hyanwong`, :pr:`2479`, :issue:`2473`)
+
 **Changes**
 
  - Single statistics computed with ``TreeSequence.general_stat`` are now

diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py
@@ -1949,6 +1949,27 @@ def test_simplify_migrations_fails(self):
         with pytest.raises(_tskit.LibraryError):
             ts.simplify()
 
+    def test_subset_reverse_all_nodes(self):
+        ts = tskit.Tree.generate_comb(5).tree_sequence
+        assert np.all(ts.samples() == np.arange(ts.num_samples))
+        flipped_ids = np.flip(np.arange(ts.num_nodes))
+        new_ts = ts.subset(flipped_ids)
+        assert set(new_ts.samples()) == set(flipped_ids[np.arange(ts.num_samples)])
+        r1 = ts.first().rank()
+        r2 = new_ts.first().rank()
+        assert r1.shape == r2.shape
+        assert r1.label != r2.label
+
+    def test_subset_reverse_internal_nodes(self):
+        ts = tskit.Tree.generate_balanced(5).tree_sequence
+        internal_nodes = np.ones(ts.num_nodes, dtype=bool)
+        internal_nodes[ts.samples()] = False
+        node_ids = np.arange(ts.num_nodes)
+        node_ids[internal_nodes] = np.flip(node_ids[internal_nodes])
+        new_ts = ts.subset(node_ids)
+        assert np.any(new_ts.nodes_time != ts.nodes_time)
+        assert new_ts.first().rank() == ts.first().rank()
+
     def test_deprecated_apis(self):
         ts = msprime.simulate(10, random_seed=1)
         assert ts.get_ll_tree_sequence() == ts.ll_tree_sequence

diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -6694,12 +6694,12 @@ def subset(
         of the retained nodes.  Note that this does *not* retain
         the ancestry of these nodes - for that, see :meth:`.simplify`.
 
-        This has the side effect of reordering the nodes, individuals, and
-        populations in the tree sequence: the nodes in the new tree sequence
-        will be in the order provided in ``nodes``, and both individuals and
-        populations will be ordered by the earliest retained node that refers
-        to them. (However, ``reorder_populations`` may be set to False
-        to keep the population table unchanged.)
+        This has the side effect that it may change the order of the nodes,
+        individuals, populations, and migrations in the tree sequence: the nodes
+        in the new tree sequence will be in the order provided in ``nodes``, and
+        both individuals and populations will be ordered by the earliest retained
+        node that refers to them. (However, ``reorder_populations`` may be set to
+        False to keep the population table unchanged.)
 
         By default, the method removes all individuals and populations not
         referenced by any nodes, and all sites not referenced by any mutations.
@@ -6734,6 +6734,7 @@ def subset(
             reorder_populations=reorder_populations,
             remove_unreferenced=remove_unreferenced,
         )
+        tables.sort()
         return tables.tree_sequence()
 
     def union(