From ce3ae679b2afb003b173dd5bdc748159bb0c01ec Mon Sep 17 00:00:00 2001
From: Yan Wong
Date: Mon, 12 Aug 2019 17:42:09 +0100
Subject: [PATCH 1/7] Change simplify to use keep_unary=True, rationalise param order, and document

---
 evaluation.py        |  10 +++--
 tsinfer/eval_util.py |   2 +
 tsinfer/inference.py | 100 ++++++++++++++++++++++++++-----------------
 3 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/evaluation.py b/evaluation.py
index f07ef87d..e32c1545 100644
--- a/evaluation.py
+++ b/evaluation.py
@@ -373,9 +373,13 @@ def edge_plot(ts, filename):
     for tree in ts.trees():
         left, right = tree.interval
         for u in tree.nodes():
-            for c in tree.children(u):
-                lines.append([(left, c), (right, c)])
-                colours.append(pallete[unrank(tree.samples(c), n)])
+            children = tree.children(u)
+            # Don't bother plotting unary nodes, which will all have the same
+            # samples under them as their next non-unary descendant
+            if len(children) > 1:
+                for c in children:
+                    lines.append([(left, c), (right, c)])
+                    colours.append(pallete[unrank(tree.samples(c), n)])

     lc = mc.LineCollection(lines, linewidths=2, colors=colours)
     fig, ax = plt.subplots()
diff --git a/tsinfer/eval_util.py b/tsinfer/eval_util.py
index 57b49c0a..5532a35c 100644
--- a/tsinfer/eval_util.py
+++ b/tsinfer/eval_util.py
@@ -705,6 +705,8 @@ def run_perfect_inference(
        num_threads=num_threads, extended_checks=extended_checks,
        progress_monitor=progress_monitor,
        stabilise_node_ordering=time_chunking and not path_compression)
+    # to compare against the original, we need to remove unary nodes from the inferred TS
+    inferred_ts = inferred_ts.simplify(keep_unary=False, filter_sites=False)
     return ts, inferred_ts

diff --git a/tsinfer/inference.py b/tsinfer/inference.py
index 35d46a04..b9659551 100644
--- a/tsinfer/inference.py
+++ b/tsinfer/inference.py
@@ -139,10 +139,10 @@ def verify(samples, tree_sequence, progress_monitor=None):


 def infer(
-        sample_data, progress_monitor=None, num_threads=0, path_compression=True,
-        simplify=True, engine=constants.C_ENGINE):
+        sample_data, num_threads=0, path_compression=True, simplify=True,
+        engine=constants.C_ENGINE, progress_monitor=None):
     """
-    infer(sample_data, num_threads=0)
+    infer(sample_data, num_threads=0, path_compression=True, simplify=True)

     Runs the full :ref:`inference pipeline ` on the specified
     :class:`SampleData` instance and returns the inferred
     :class:`tskit.TreeSequence` object.

     :param SampleData sample_data: The input :class:`SampleData` instance
         representing the observed data that we wish to make inferences from.
     :param int num_threads: The number of worker threads to use in parallelised
         sections of the algorithm. If <= 0, do not spawn any threads and use
         simpler sequential algorithms (default).
+    :param bool path_compression: Whether to merge edges that share identical
+        paths (essentially taking advantage of shared recombination breakpoints).
+    :param bool simplify: Whether to remove extra tree nodes and edges that are not
+        on a path between the root and any of the samples. To do so, the final tree
+        sequence is simplified by applying the :meth:`tskit.TreeSequence.simplify` method
+        with ``keep_unary`` set to True (default = ``True``).
     :returns: The :class:`tskit.TreeSequence` object inferred from the input
         sample data.
:rtype: tskit.TreeSequence """ ancestor_data = generate_ancestors( - sample_data, engine=engine, progress_monitor=progress_monitor, - num_threads=num_threads) + sample_data, num_threads=num_threads, + engine=engine, progress_monitor=progress_monitor,) ancestors_ts = match_ancestors( sample_data, ancestor_data, engine=engine, num_threads=num_threads, path_compression=path_compression, progress_monitor=progress_monitor) @@ -171,8 +177,8 @@ def infer( def generate_ancestors( - sample_data, num_threads=0, progress_monitor=None, engine=constants.C_ENGINE, - **kwargs): + sample_data, num_threads=0, path=None, + engine=constants.C_ENGINE, progress_monitor=None, **kwargs): """ generate_ancestors(sample_data, num_threads=0, path=None, **kwargs) @@ -186,21 +192,23 @@ def generate_ancestors( ancestor_data = tsinfer.generate_ancestors(sample_data, path="mydata.ancestors") - All other keyword arguments are passed to the :class:`AncestorData` constructor, + Other keyword arguments are passed to the :class:`AncestorData` constructor, which may be used to control the storage properties of the generated file. :param SampleData sample_data: The :class:`SampleData` instance that we are genering putative ancestors from. :param int num_threads: The number of worker threads to use. If < 1, use a simpler synchronous algorithm. + :param str path: The path of the file to store the sample data. If None, + the information is stored in memory and not persistent. :rtype: AncestorData :returns: The inferred ancestors stored in an :class:`AncestorData` instance. """ progress_monitor = _get_progress_monitor(progress_monitor) - with formats.AncestorData(sample_data, **kwargs) as ancestor_data: + with formats.AncestorData(sample_data, path=path, **kwargs) as ancestor_data: generator = AncestorsGenerator( - sample_data, ancestor_data, progress_monitor, engine=engine, - num_threads=num_threads) + sample_data, ancestor_data, num_threads=num_threads, + engine=engine, progress_monitor=progress_monitor) generator.add_sites() generator.run() ancestor_data.record_provenance("generate-ancestors") @@ -208,8 +216,8 @@ def generate_ancestors( def match_ancestors( - sample_data, ancestor_data, progress_monitor=None, num_threads=0, - path_compression=True, extended_checks=False, engine=constants.C_ENGINE): + sample_data, ancestor_data, num_threads=0, path_compression=True, + extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None): """ match_ancestors(sample_data, ancestor_data, num_threads=0, path_compression=True) @@ -225,24 +233,25 @@ def match_ancestors( a history for. :param int num_threads: The number of match worker threads to use. If this is <= 0 then a simpler sequential algorithm is used (default). - :param bool path_compression: Should we try to merge edges that share identical - paths (essentially taking advantage of shared recombination breakpoints) + :param bool path_compression: Whether to merge edges that share identical + paths (essentially taking advantage of shared recombination breakpoints). :return: The ancestors tree sequence representing the inferred history of the set of ancestors. 
:rtype: tskit.TreeSequence """ matcher = AncestorMatcher( - sample_data, ancestor_data, engine=engine, - progress_monitor=progress_monitor, path_compression=path_compression, - num_threads=num_threads, extended_checks=extended_checks) + sample_data, ancestor_data, num_threads=num_threads, + path_compression=path_compression, extended_checks=extended_checks, + engine=engine, progress_monitor=progress_monitor) return matcher.match_ancestors() def augment_ancestors( - sample_data, ancestors_ts, indexes, progress_monitor=None, num_threads=0, - path_compression=True, extended_checks=False, engine=constants.C_ENGINE): + sample_data, ancestors_ts, indexes, num_threads=0, path_compression=True, + extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None): """ - augment_ancestors(sample_data, ancestors_ts, indexes, num_threads=0, simplify=True) + augment_ancestors(sample_data, ancestors_ts, indexes, num_threads=0, + path_compression=True) Runs the sample matching :ref:`algorithm ` on the specified :class:`SampleData` instance and ancestors tree sequence, @@ -260,32 +269,34 @@ def augment_ancestors( tree sequence. :param int num_threads: The number of match worker threads to use. If this is <= 0 then a simpler sequential algorithm is used (default). + :param bool path_compression: Whether to merge edges that share identical + paths (essentially taking advantage of shared recombination breakpoints). :return: The specified ancestors tree sequence augmented with copying paths for the specified sample. :rtype: tskit.TreeSequence """ manager = SampleMatcher( - sample_data, ancestors_ts, path_compression=path_compression, - engine=engine, progress_monitor=progress_monitor, num_threads=num_threads, - extended_checks=extended_checks) + sample_data, ancestors_ts, num_threads=num_threads, + path_compression=path_compression, extended_checks=extended_checks, + engine=engine, progress_monitor=progress_monitor + ) manager.match_samples(indexes) ts = manager.get_augmented_ancestors_tree_sequence(indexes) return ts def match_samples( - sample_data, ancestors_ts, progress_monitor=None, num_threads=0, - path_compression=True, simplify=True, extended_checks=False, - stabilise_node_ordering=False, engine=constants.C_ENGINE): + sample_data, ancestors_ts, num_threads=0, path_compression=True, simplify=True, + extended_checks=False, stabilise_node_ordering=False, engine=constants.C_ENGINE, + progress_monitor=None): """ - match_samples(sample_data, ancestors_ts, num_threads=0, simplify=True) + match_samples(sample_data, ancestors_ts, num_threads=0, path_compression=True, + simplify=True) Runs the sample matching :ref:`algorithm ` on the specified :class:`SampleData` instance and ancestors tree sequence, returning the final :class:`tskit.TreeSequence` instance containing - the full inferred history for all samples and sites. If ``simplify`` is - True (the default) run :meth:`tskit.TreeSequence.simplify` on the - inferred tree sequence to ensure that it is in canonical form. + the full inferred history for all samples and sites. :param SampleData sample_data: The :class:`SampleData` instance representing the input data. @@ -294,14 +305,20 @@ def match_samples( history among ancestral ancestral haplotypes. :param int num_threads: The number of match worker threads to use. If this is <= 0 then a simpler sequential algorithm is used (default). + :param bool path_compression: Whether to merge edges that share identical + paths (essentially taking advantage of shared recombination breakpoints). 
+    :param bool simplify: Whether to remove extra tree nodes and edges that are not
+        on a path between the root and any of the samples. To do so, the final tree
+        sequence is simplified by applying the :meth:`tskit.TreeSequence.simplify` method
+        with ``keep_unary`` set to True (default = ``True``).
     :return: The tree sequence representing the inferred history of the sample.
     :rtype: tskit.TreeSequence
     """
     manager = SampleMatcher(
-        sample_data, ancestors_ts, path_compression=path_compression,
-        engine=engine, progress_monitor=progress_monitor, num_threads=num_threads,
-        extended_checks=extended_checks)
+        sample_data, ancestors_ts, num_threads=num_threads,
+        path_compression=path_compression, extended_checks=extended_checks,
+        engine=engine, progress_monitor=progress_monitor)
     manager.match_samples()
     ts = manager.finalise(
         simplify=simplify, stabilise_node_ordering=stabilise_node_ordering)
@@ -313,7 +330,8 @@ class AncestorsGenerator(object):
     Manages the process of building ancestors.
     """
     def __init__(
-            self, sample_data, ancestor_data, progress_monitor, engine, num_threads=0):
+            self, sample_data, ancestor_data, num_threads=0,
+            engine=constants.C_ENGINE, progress_monitor=None):
         self.sample_data = sample_data
         self.ancestor_data = ancestor_data
         self.progress_monitor = progress_monitor
@@ -453,8 +471,8 @@ def run(self):

 class Matcher(object):
     def __init__(
-            self, sample_data, num_threads=1, engine=constants.C_ENGINE,
-            path_compression=True, progress_monitor=None, extended_checks=False):
+            self, sample_data, num_threads=1, path_compression=True,
+            extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None):
         self.sample_data = sample_data
         self.num_threads = num_threads
         self.path_compression = path_compression
@@ -1106,8 +1124,9 @@ def finalise(self, simplify=True, stabilise_node_ordering=False):
         logger.info("Finalising tree sequence")
         ts = self.get_samples_tree_sequence()
         if simplify:
-            logger.info("Running simplify on {} nodes and {} edges".format(
-                ts.num_nodes, ts.num_edges))
+            logger.info(
+                "Running simplify(keep_unary=True) on {} nodes and {} edges".format(
+                    ts.num_nodes, ts.num_edges))
             if stabilise_node_ordering:
                 # Ensure all the node times are distinct so that they will have
                 # stable IDs after simplifying. This could possibly also be done
@@ -1122,7 +1141,8 @@ def finalise(self, simplify=True, stabilise_node_ordering=False):
                 tables.nodes.set_columns(flags=tables.nodes.flags, time=time)
                 tables.sort()
                 ts = tables.tree_sequence()
-            ts = ts.simplify(samples=self.sample_ids, filter_sites=False)
+            ts = ts.simplify(
+                samples=self.sample_ids, filter_sites=False, keep_unary=True)
             logger.info("Finished simplify; now have {} nodes and {} edges".format(
                 ts.num_nodes, ts.num_edges))
         return ts

From a74d021028144b9f9890746529e1f8e93da3ecbb Mon Sep 17 00:00:00 2001
From: Yan Wong
Date: Wed, 14 Aug 2019 15:06:41 +0100
Subject: [PATCH 2/7] Make inference unittests work with tskit 0.2.0 missing data output

---
 tests/test_inference.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_inference.py b/tests/test_inference.py
index 166bdca6..52ece125 100644
--- a/tests/test_inference.py
+++ b/tests/test_inference.py
@@ -311,7 +311,7 @@ class TestZeroInferenceSites(unittest.TestCase):
     Tests for the degenerate case in which we have no inference sites.
""" def verify(self, genotypes): - genotypes = np.array(genotypes, dtype=np.int8) + genotypes = np.array(genotypes, dtype=np.uint8) m = genotypes.shape[0] with tsinfer.SampleData(sequence_length=m + 1) as sample_data: for j in range(m): @@ -610,13 +610,13 @@ def verify_inserted_ancestors(self, ts): with tsinfer.SampleData(sequence_length=ts.sequence_length) as sample_data: for v in ts.variants(): sample_data.add_site(v.position, v.genotypes, v.alleles) - ancestor_data = tsinfer.AncestorData(sample_data) tsinfer.build_simulated_ancestors(sample_data, ancestor_data, ts) ancestor_data.finalise() - A = np.zeros( - (ancestor_data.num_sites, ancestor_data.num_ancestors), dtype=np.uint8) + A = np.full( + (ancestor_data.num_sites, ancestor_data.num_ancestors), + tskit.MISSING_DATA, dtype=np.int8) start = ancestor_data.ancestors_start[:] end = ancestor_data.ancestors_end[:] ancestors = ancestor_data.ancestors_haplotype[:] From 77d365e4d3142d6c59b8d4102ccbfda75bb1d630 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Wed, 14 Aug 2019 15:42:57 +0100 Subject: [PATCH 3/7] Update tskit requirements (now specifies a prerelease 0.2.0 version) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cd526248..3469d0a8 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ def finalize_options(self): "tqdm", "humanize", "daiquiri", - "tskit", + "tskit>=0.2.0a3", "numcodecs>=0.6", "zarr>=2.2", "lmdb", From 82f6b50c9da5dc0af74d4544173b416e112e3ef5 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 19 Aug 2019 12:12:22 +0100 Subject: [PATCH 4/7] Add docs explaining keep_unary in simplify=True --- docs/inference.rst | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/docs/inference.rst b/docs/inference.rst index 4fbff4db..6a269242 100644 --- a/docs/inference.rst +++ b/docs/inference.rst @@ -198,13 +198,18 @@ The final phase of a ``tsinfer`` inference consists of a number steps: may be clades of ancestral state samples which would allow us to encode the data with fewer back mutations.** -3. Reduce the resulting tree sequence to a canonical form by - `simplifying it - `_. +3. Remove ancestral paths that do not lead to any of the samples by + `simplifying + `_ + the final tree sequence. When simplifying, we keep non-branching ("unary") + nodes, as they indicate ancestors which we have actively inferred, and + for technical reasons keeping unary ancestors can also lead to better + compression. Note that this means that not every internal node in the + inferred tree sequence will correspond to a coalescent event. .. todo:: 1. Describe path compression here and above in the ancestors section - 2. Describe the structure of the outpt tree sequences; how the + 2. Describe the structure of the output tree sequences; how the nodes are mapped, what the time values mean, etc. From 8ade99ce3461ffcfa4e317df54a8ccf8b8c5d65c Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 19 Aug 2019 12:52:39 +0100 Subject: [PATCH 5/7] Added a brief description about path compression --- docs/inference.rst | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/docs/inference.rst b/docs/inference.rst index 6a269242..e25215bb 100644 --- a/docs/inference.rst +++ b/docs/inference.rst @@ -161,6 +161,19 @@ given time, and an ancestor can copy from any older ancestor. For each ancestor, we find a path through older ancestors that minimises the number of recombination events. 
+As well as minimising recombination events by finding the best path, we can also
+minimise events by looking for *shared recombination breakpoints*. A shared
+breakpoint exists if a set of children share a breakpoint in the same position,
+and they also have identical parents to the left of the breakpoint and identical
+parents to the right of the breakpoint. Rather than supposing that these
+children experienced multiple identical recombination events in parallel, we can
+reduce the number of ancestral recombination events by postulating a "synthetic
+ancestor" with this breakpoint, existing at a slightly older point
+in time, from whom all the children are descended at this genomic position. We
+call the algorithm used to implement this addition to the ancestral copying
+paths, "path compression".
+
+
 .. todo:: Schematic of the ancestors copying process.

 The copying path for each ancestor then describes its ancestry at every
@@ -183,7 +196,9 @@ The final phase of a ``tsinfer`` inference consists of a number steps:
 1. The first (and usually most time-consuming) is to find copying paths
    for our sample haplotypes through the ancestors.
    Each copying path corresponds to a set of tree sequence edges in precisely the same
-   way as for ancestors.
+   way as for ancestors, and the path compression algorithm can be equally
+   applied here.
+
 2. As we only use a subset of the available sites for inference
    (excluding by default any sites that are fixed or singletons)

From 52e974014a6f71eba0cbb6f91615f468ac8fe418 Mon Sep 17 00:00:00 2001
From: Yan Wong
Date: Mon, 19 Aug 2019 13:05:41 +0100
Subject: [PATCH 6/7] Remove reference to msprime & GSL, and note Windows testing

---
 docs/installation.rst | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/installation.rst b/docs/installation.rst
index 33d06bba..1a5f3094 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -4,8 +4,8 @@
 Installation
 ############

-Python 3.3 or newer is required for ``tsinfer``. Any Unix-like platform should work
-(``tsinfer`` is tested on Linux and OSX).
+Python 3.3 or newer is required for ``tsinfer``. Any Unix-like platform should
+work (``tsinfer`` is tested on Linux, OS X, and Windows).

 Please use ``pip`` to install,
 e.g.::
@@ -33,8 +33,6 @@ first using `venv `_::

     (tsinfer-venv) $ pip install tsinfer
     (tsinfer-venv) $ tsinfer --help

-.. _sec_installation_installation_problems:
-
 ****************
 Potential issues
 ****************

From 899ebf551235ccc193950e0b798c2eb69f0d4f1b Mon Sep 17 00:00:00 2001
From: Jerome Kelleher
Date: Mon, 2 Sep 2019 12:19:22 +0100
Subject: [PATCH 7/7] Minor clarifications.

---
 docs/api.rst            |  3 +--
 docs/inference.rst      |  4 ++--
 docs/installation.rst   |  2 +-
 setup.py                |  2 +-
 tests/test_inference.py | 13 ++++++++++---
 tsinfer/inference.py    |  6 +++---
 6 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index ec46feaf..0f293d2b 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -42,8 +42,7 @@ Running inference

 .. autofunction:: tsinfer.match_samples

-.. todo::
-    1. Add documentation for path compression here.
+.. autofunction:: tsinfer.augment_ancestors

 ++++++++++
 Exceptions
diff --git a/docs/inference.rst b/docs/inference.rst
index e25215bb..dea8bc4e 100644
--- a/docs/inference.rst
+++ b/docs/inference.rst
@@ -197,7 +197,7 @@ The final phase of a ``tsinfer`` inference consists of a number steps:
    for our sample haplotypes through the ancestors.
Each copying path corresponds to a set of tree sequence edges in precisely the same way as for ancestors, and the path compression algorithm can be equally - applied here. + applied here. 2. As we only use a subset of the available sites for inference @@ -213,7 +213,7 @@ The final phase of a ``tsinfer`` inference consists of a number steps: may be clades of ancestral state samples which would allow us to encode the data with fewer back mutations.** -3. Remove ancestral paths that do not lead to any of the samples by +3. Remove ancestral paths that do not lead to any of the samples by `simplifying `_ the final tree sequence. When simplifying, we keep non-branching ("unary") diff --git a/docs/installation.rst b/docs/installation.rst index 1a5f3094..204ac029 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -4,7 +4,7 @@ Installation ############ -Python 3.3 or newer is required for ``tsinfer``. Any Unix-like platform should +Python 3.5 or newer is required for ``tsinfer``. Any Unix-like platform should work (``tsinfer`` is tested on Linux, OS X, and Windows). Please use ``pip`` to install, diff --git a/setup.py b/setup.py index 3469d0a8..d39942c8 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ def finalize_options(self): "tqdm", "humanize", "daiquiri", - "tskit>=0.2.0a3", + "tskit>=0.2.1", "numcodecs>=0.6", "zarr>=2.2", "lmdb", diff --git a/tests/test_inference.py b/tests/test_inference.py index 52ece125..e1f34523 100644 --- a/tests/test_inference.py +++ b/tests/test_inference.py @@ -311,7 +311,7 @@ class TestZeroInferenceSites(unittest.TestCase): Tests for the degenerate case in which we have no inference sites. """ def verify(self, genotypes): - genotypes = np.array(genotypes, dtype=np.uint8) + genotypes = np.array(genotypes, dtype=np.int8) m = genotypes.shape[0] with tsinfer.SampleData(sequence_length=m + 1) as sample_data: for j in range(m): @@ -628,8 +628,7 @@ def verify_inserted_ancestors(self, ts): tsinfer.check_ancestors_ts(ancestors_ts) self.assertEqual(ancestor_data.num_sites, ancestors_ts.num_sites) self.assertEqual(ancestor_data.num_ancestors, ancestors_ts.num_samples) - self.assertTrue(np.array_equal( - ancestors_ts.genotype_matrix(impute_missing_data=True), A)) + self.assertTrue(np.array_equal(ancestors_ts.genotype_matrix(), A)) inferred_ts = tsinfer.match_samples( sample_data, ancestors_ts, engine=engine) self.assertTrue(np.array_equal( @@ -1230,6 +1229,14 @@ def verify(self, ts): list(ts2.samples()), list(range(ts2.num_nodes - n, ts2.num_nodes))) + # Check that we're calling simplify with the correct arguments. 
+ ts2 = tsinfer.infer(sd, simplify=False).simplify(keep_unary=True) + t1 = ts1.dump_tables() + t2 = ts2.dump_tables() + t1.provenances.clear() + t2.provenances.clear() + self.assertEqual(t1, t2) + def test_single_tree(self): ts = msprime.simulate(5, random_seed=1, mutation_rate=2) self.verify(ts) diff --git a/tsinfer/inference.py b/tsinfer/inference.py index b9659551..bd44d3e1 100644 --- a/tsinfer/inference.py +++ b/tsinfer/inference.py @@ -250,8 +250,8 @@ def augment_ancestors( sample_data, ancestors_ts, indexes, num_threads=0, path_compression=True, extended_checks=False, engine=constants.C_ENGINE, progress_monitor=None): """ - augment_ancestors(sample_data, ancestors_ts, indexes, num_threads=0, - path_compression=True) + augment_ancestors(sample_data, ancestors_ts, indexes, num_threads=0,\ + path_compression=True) Runs the sample matching :ref:`algorithm ` on the specified :class:`SampleData` instance and ancestors tree sequence, @@ -290,7 +290,7 @@ def match_samples( extended_checks=False, stabilise_node_ordering=False, engine=constants.C_ENGINE, progress_monitor=None): """ - match_samples(sample_data, ancestors_ts, num_threads=0, path_compression=True, + match_samples(sample_data, ancestors_ts, num_threads=0, path_compression=True,\ simplify=True) Runs the sample matching :ref:`algorithm `