From 664c498cec8ebbad4418f42008d53ed372313d7b Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Mon, 25 Nov 2019 20:29:02 +0000 Subject: [PATCH] Add ability to output missing data in haplotypes --- python/CHANGELOG.rst | 7 +++ python/tests/test_genotypes.py | 25 ++++++++-- python/tskit/trees.py | 91 ++++++++++++++++++++-------------- 3 files changed, 82 insertions(+), 41 deletions(-) diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index c75d136d11..3fe42a08ae 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,13 @@ In development **New features** +- Allow sites with missing data to be output by the `haplotypes` method, by + default replacing with ``-``. Errors are no longer raised for missing data + with `impute_missing_data=False`; the error types returned for bad alleles + (e.g. multiletter or non-ascii) have also changed from `_tskit.LibraryError` + to TypeError, or ValueError if the missing data character clashes + (:user:`hyanwong`, :pr:`426`). + **Bugfixes** -------------------- diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index db57b132a9..3362240a47 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -549,12 +549,26 @@ def test_acgt_mutations(self): H = [h.replace("0", "A").replace("1", "T") for h in ts.haplotypes()] self.assertEqual(H, list(tsp.haplotypes())) - def test_multiletter_mutations(self): + def test_fails_multiletter_mutations(self): ts = msprime.simulate(10, random_seed=2) tables = ts.tables tables.sites.add_row(0, "ACTG") tsp = tables.tree_sequence() - self.assertRaises(exceptions.LibraryError, list, tsp.haplotypes()) + self.assertRaises(TypeError, list, tsp.haplotypes()) + + def test_fails_deletion_mutations(self): + ts = msprime.simulate(10, random_seed=2) + tables = ts.tables + tables.sites.add_row(0, "") + tsp = tables.tree_sequence() + self.assertRaises(TypeError, list, tsp.haplotypes()) + + def test_nonascii_mutations(self): + ts = msprime.simulate(10, random_seed=2) + tables = ts.tables + tables.sites.add_row(0, chr(169)) # Copyright symbol + tsp = tables.tree_sequence() + self.assertRaises(TypeError, list, tsp.haplotypes()) def test_recurrent_mutations_over_samples(self): ts = msprime.simulate(10, random_seed=2) @@ -591,12 +605,15 @@ def test_back_mutations(self): ts = tsutil.insert_branch_mutations(base_ts, mutations_per_branch=j) self.verify_tree_sequence(ts) - def test_fails_missing_data(self): + def test_missing_data(self): tables = tskit.TableCollection(1.0) tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) tables.sites.add_row(0.5, "A") ts = tables.tree_sequence() - self.assertRaises(exceptions.LibraryError, list, ts.haplotypes()) + self.assertRaises(ValueError, list, ts.haplotypes(missing_data_character="A")) + for c in ("-", ".", "a"): + h = list(ts.haplotypes(missing_data_character=c)) + self.assertEqual(h, [c, c]) h = list(ts.haplotypes(impute_missing_data=True)) self.assertEqual(h, ["A", "A"]) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 6cc865e65f..bc30357ccf 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2831,33 +2831,36 @@ def trees( sample_lists=sample_lists) return TreeIterator(tree) - def haplotypes(self, impute_missing_data=False): - """ - Returns an iterator over the haplotypes resulting from the trees - and mutations in this tree sequence as a string. - The iterator returns a total of :math:`n` strings, each of which - contains :math:`s` characters (:math:`n` is the sample size - returned by :attr:`tskit.TreeSequence.num_samples` and - :math:`s` is the number of sites returned by - :attr:`tskit.TreeSequence.num_sites`). The first - string returned is the haplotype for sample `0`, and so on. - For a given haplotype ``h``, the value of ``h[j]`` is the observed - allelic state at site ``j``. + def haplotypes(self, impute_missing_data=False, missing_data_character="-"): + """ + Returns an iterator over the strings of haplotypes that result from + the trees and mutations in this tree sequence. Each haplotype string + is guaranteed to be of the same length. A tree sequence with + :math:`n` samples and :math:`s` sites will return a total of :math:`n` + strings of :math:`s` alleles concatenated together, where an allele + consists of a single ascii character (tree sequences that include alleles + which are not a single character in length, or where the character is + non-ascii, will raise an error). The first string returned is the + haplotype for sample ``0``, and so on. + + The alleles at each site must be represented by single byte characters, + (i.e. variants must be single nucleotide polymorphisms, or SNPs), hence + the strings returned will all be of length :math:`s`, and for a haplotype + ``h``, the value of ``h[j]`` will be the observed allelic state + at site ``j``. + + If ``impute_missing_data`` is False, + :ref:`missing data` will be represented + in the string by the ``missing_data_character``. If + instead it is set to True, missing data will be imputed such that all + isolated samples are assigned the ancestral state (unless they have + mutations directly above them, in which case they will take the most recent + derived mutational state for that node). This was the default + behaviour in versions prior to 0.2.0. See also the :meth:`.variants` iterator for site-centric access to sample genotypes. - This method is only supported for single-letter alleles. - - As :ref:`missing data` cannot be - represented directly in the haplotypes array, an error will be raised - if missing data is present. However, if ``impute_missing_data`` set - to True, missing data will be imputed such that all isolated samples - are assigned the ancestral state (unless they have mutations directly - above them, in which case they will take the most recent derived - mutational state for that node). This was the default behaviour in - versions prior to 0.2.0. - .. warning:: For large datasets, this method can consume a **very large** amount of memory! To output all the sample data, it is more efficient to iterate @@ -2871,25 +2874,39 @@ def haplotypes(self, impute_missing_data=False): :param bool impute_missing_data: If True, the allele assigned to any isolated samples is the ancestral state; that is, we impute missing data as the ancestral state. Default: False. + :param str missing_data_character: A single ascii character that will + be used to represent missing data. + If any normal allele contains this character, an error is raised. + Default: '-'. :rtype: collections.abc.Iterable - :raises: LibraryError if called on a tree sequence containing - multiletter alleles. - :raises: LibraryError if missing data is present and impute_missing_data - is False + :raises: TypeError if the ``missing_data_character`` or any of the alleles + at a site or the are not a single ascii character. + :raises: ValueError + if the ``missing_data_character`` exists in one of the alleles """ H = np.empty((self.num_samples, self.num_sites), dtype=np.int8) + missing_int8 = ord(missing_data_character.encode('ascii')) for var in self.variants(impute_missing_data=impute_missing_data): - if var.has_missing_data: - raise _tskit.LibraryError( - "Generated haplotypes contain missing data, which cannot be " - "represented. Either generate variants (which support missing " - "data) or use the impute missing data option.") - alleles = np.empty(len(var.alleles), dtype=np.int8) + alleles = np.full(len(var.alleles), missing_int8, dtype=np.int8) for i, allele in enumerate(var.alleles): - if len(allele) > 1: - raise _tskit.LibraryError( - "Cannot produce haplotypes from multi-letter alleles") - alleles[i] = ord(allele.encode('ascii')) + if allele is not None: + if len(allele) != 1: + raise TypeError( + "Multi-letter allele or deletion detected at site {}" + .format(var.site.id)) + try: + ascii_allele = allele.encode('ascii') + except UnicodeEncodeError: + raise TypeError( + "Non-ascii character in allele at site {}" + .format(var.site.id)) + allele_int8 = ord(ascii_allele) + if allele_int8 == missing_int8: + raise ValueError( + "The missing data character '{}' clashes with an " + "existing allele at site {}" + .format(missing_data_character, var.site.id)) + alleles[i] = allele_int8 H[:, var.site.id] = alleles[var.genotypes] for h in H: yield h.tostring().decode('ascii')