tskit-dev · hyanwong · Nov 25, 2019 · Nov 25, 2019
diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py
@@ -25,6 +25,7 @@
 import unittest
 import itertools
 import random
+import warnings
 
 import numpy as np
 import msprime
@@ -552,9 +553,29 @@ def test_acgt_mutations(self):
     def test_multiletter_mutations(self):
         ts = msprime.simulate(10, random_seed=2)
         tables = ts.tables
-        tables.sites.add_row(0, "ACTG")
+        site0 = tables.sites.add_row(0.0, "AT")
+        site1 = tables.sites.add_row(0.1, "T")  # This should raise a warning if mut > 1
+        site2 = tables.sites.add_row(0.2, "A")  # This should raise a warning if mut > 1
+        tables.mutations.add_row(site0, 0, "")
+        tables.mutations.add_row(site1, 1, "GGT")
+        tables.mutations.add_row(site2, 2, "ATCG")
         tsp = tables.tree_sequence()
-        self.assertRaises(exceptions.LibraryError, list, tsp.haplotypes())
+        with warnings.catch_warnings(record=True) as w:
+            hap = list(tsp.haplotypes())
+            self.assertEqual(len(w), 2)
+            for warning in w:
+                self.assertTrue(issubclass(warning.category, UserWarning))
+        self.assertTrue(hap[0] == "--" + "---" + "----")
+        self.assertTrue(hap[1] == "AT" + "GGT" + "----")
+        self.assertTrue(hap[2] == "AT" + "---" + "ATCG")
+        self.assertTrue(all([h == "AT" + "---" + "----" for h in hap[3:]]))
+
+    def test_nonascii_mutations(self):
+        ts = msprime.simulate(10, random_seed=2)
+        tables = ts.tables
+        tables.sites.add_row(0, chr(169))  # Copyright symbol
+        tsp = tables.tree_sequence()
+        self.assertRaises(TypeError, list, tsp.haplotypes())
 
     def test_recurrent_mutations_over_samples(self):
         ts = msprime.simulate(10, random_seed=2)
@@ -591,12 +612,15 @@ def test_back_mutations(self):
             ts = tsutil.insert_branch_mutations(base_ts, mutations_per_branch=j)
             self.verify_tree_sequence(ts)
 
-    def test_fails_missing_data(self):
+    def test_missing_data(self):
         tables = tskit.TableCollection(1.0)
         tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
         tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
         tables.sites.add_row(0.5, "A")
         ts = tables.tree_sequence()
-        self.assertRaises(exceptions.LibraryError, list, ts.haplotypes())
+        self.assertRaises(ValueError, list, ts.haplotypes(missing_data_character="A"))
+        for c in ("-", ".", "a"):
+            h = list(ts.haplotypes(missing_data_character=c))
+            self.assertEqual(h, [c, c])
         h = list(ts.haplotypes(impute_missing_data=True))
         self.assertEqual(h, ["A", "A"])
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -2827,33 +2827,49 @@ def trees(
             sample_lists=sample_lists)
         return TreeIterator(tree)
 
-    def haplotypes(self, impute_missing_data=False):
-        """
-        Returns an iterator over the haplotypes resulting from the trees
-        and mutations in this tree sequence as a string.
-        The iterator returns a total of :math:`n` strings, each of which
-        contains :math:`s` characters (:math:`n` is the sample size
-        returned by :attr:`tskit.TreeSequence.num_samples` and
-        :math:`s` is the number of sites returned by
-        :attr:`tskit.TreeSequence.num_sites`). The first
-        string returned is the haplotype for sample `0`, and so on.
-        For a given haplotype ``h``, the value of ``h[j]`` is the observed
-        allelic state at site ``j``.
+    def haplotypes(self, impute_missing_data=False, missing_data_character="-"):
+        """
+        Returns an iterator over the strings of haplotypes that result from
+        the trees and mutations in this tree sequence. Each haplotype string
+        is guaranteed to be of the same length. A tree sequence with
+        :math:`n` samples and :math:`s` sites will return a total of :math:`n`
+        strings of :math:`s` alleles concatenated together, where an allele
+        can consist of zero or more ascii characters (tree sequences that
+        include alleles containing non-ascii characters will raise an error).
+        The first string returned is the haplotype for sample ``0``, and so on.
+
+        Each allele is represented by a number of characters given by the maximum
+        length (in bytes) of any of the alleles at that site. In the simplest
+        instance, the alleles at each site are represented by single byte characters
+        such as when all variants are single nucleotide polymorphisms (SNPs). In
+        this case the strings returned will all be of length :math:`s`, and for a
+        haplotype ``h``, the value of ``h[j]`` is the observed allelic state
+        at site ``j``. If instead, one site contains, say, an indel of
+        length 10 (e.g. if the ancestral state is the empty string and a mutation
+        exists at that site to a derived site of "ATATATATAT"), then the returned
+        strings will each be of length :math:`s` + 9.
+
+        Zero-length alleles (i.e. deletions) are represented in the string by
+        the ``missing_data_character`` repeated :math:`c` times, where :math:`c`
+        is the maximum allele length (in bytes) at that site. Any alleles of
+        length greater than 0 but less than :math:`c`, which could therefore
+        be aligned at different positions in the string, are *not emitted*, but
+        are instead replaced with a string consisting of the
+        ``missing_data_character``, as if they were a deletion, and a warning is
+        given.
+
+        If ``impute_missing_data`` is False,
+        :ref:`missing data<sec_data_model_missing_data>` will be also represented
+        in the string by :math:`c` repeats of the ``missing_data_character``. If
+        instead it is set to True, missing data will be imputed such that all
+        isolated samples are assigned the ancestral state (unless they have
+        mutations directly above them, in which case they will take the most recent
+        derived  mutational state for that node). This was the default
+        behaviour in versions prior to 0.2.0.
 
         See also the :meth:`.variants` iterator for site-centric access
         to sample genotypes.
 
-        This method is only supported for single-letter alleles.
-
-        As :ref:`missing data<sec_data_model_missing_data>` cannot be
-        represented directly in the haplotypes array, an error will be raised
-        if missing data is present. However, if ``impute_missing_data`` set
-        to True, missing data will be imputed such that all isolated samples
-        are assigned the ancestral state (unless they have mutations directly
-        above them, in which case they will take the most recent derived
-        mutational state for that node). This was the default behaviour in
-        versions prior to 0.2.0.
-
         .. warning::
             For large datasets, this method can consume a **very large** amount of
             memory! To output all the sample data, it is more efficient to iterate
@@ -2867,26 +2883,74 @@ def haplotypes(self, impute_missing_data=False):
         :param bool impute_missing_data: If True, the allele assigned to any
             isolated samples is the ancestral state; that is, we impute
             missing data as the ancestral state. Default: False.
+        :param str missing_data_character: A single ascii character that will
+            be used to represent missing data, deletions, and other alleles that
+            are shorter in length than the maximum allele length at each site.
+            If any normal allele contains this character, an error is raised.
+            Default: '-'.
         :rtype: collections.abc.Iterable
-        :raises: LibraryError if called on a tree sequence containing
-            multiletter alleles.
-        :raises: LibraryError if missing data is present and impute_missing_data
-            is False
-        """
-        H = np.empty((self.num_samples, self.num_sites), dtype=np.int8)
-        for var in self.variants(impute_missing_data=impute_missing_data):
-            if var.has_missing_data:
-                raise _tskit.LibraryError(
-                    "Generated haplotypes contain missing data, which cannot be "
-                    "represented. Either generate variants (which support missing "
-                    "data) or use the impute missing data option.")
-            alleles = np.empty(len(var.alleles), dtype=np.int8)
+        :raises: TypeError if the missing_data_character is not a single ascii
+            character, or if any allele contains non-ascii characters
+        :raises: ValueError
+            if the ``missing_data_character`` exists in one of the alleles
+        """
+        def sitewise_allele_max_bytes(ts):
+            """
+            Return the maximum length in bytes of all the alleles at each site.
+
+            This is equivalent to, but much faster then, the following code
+            ``np.array([max([len(s.encode('utf-8')) for s in v.alleles])
+                for v in ts.variants()])``
+
+            NB: this requires mutations to be sorted by site ID, which is a necessary
+            requirement of a tree sequence, hence this needs to be a ts function
+            not a tables function.
+            """
+            site_mutations_max_bytes = np.zeros((ts.num_sites), dtype=np.uint32)
+            if ts.num_mutations > 0:
+                mutations_bytes = np.diff(ts.tables.mutations.derived_state_offset[:])
+                site_increment = np.append(1, np.diff(ts.tables.mutations.site[:]))
+                site_id = np.cumsum(site_increment[site_increment > 0]) - 1
+                site_breakpoints = np.where(site_increment > 0)[0]
+                site_mutations_max_bytes[site_id] = np.maximum.reduceat(
+                    mutations_bytes, site_breakpoints)
+            ancestral_state_max_bytes = np.diff(ts.tables.sites.ancestral_state_offset)
+            return np.maximum(site_mutations_max_bytes, ancestral_state_max_bytes)
+
+        max_allele_bytes = sitewise_allele_max_bytes(self)
+        output_length_bytes = np.sum(max_allele_bytes)
+        H = np.empty((self.num_samples, output_length_bytes), dtype=np.int8)
+        missing_ascii = missing_data_character.encode('ascii')
+        missing_int8 = ord(missing_ascii)
+        curr_position = 0
+        for max_bytes, var in zip(
+                max_allele_bytes,
+                self.variants(impute_missing_data=impute_missing_data)):
+            # Fill by missing data character by default
+            alleles = np.full((len(var.alleles), max_bytes), missing_int8, dtype=np.int8)
             for i, allele in enumerate(var.alleles):
-                if len(allele) > 1:
-                    raise _tskit.LibraryError(
-                        "Cannot produce haplotypes from multi-letter alleles")
-                alleles[i] = ord(allele.encode('ascii'))
-            H[:, var.site.id] = alleles[var.genotypes]
+                if allele is not None and len(allele) > 0:
+                    try:
+                        ascii_allele = allele.encode('ascii')
+                    except UnicodeEncodeError:
+                        raise TypeError(
+                            "Non-ascii character in allele at site {}"
+                            .format(var.site.id))
+                    assert len(ascii_allele) <= max_bytes
+                    if len(ascii_allele) < max_bytes:
+                        warnings.warn(
+                            "Site {} (position {}) has alleles of different lengths. "
+                            "Treating any shorter than {} bytes as missing."
+                            .format(var.site.id, var.site.position, max_bytes))
+                    else:
+                        if missing_ascii in ascii_allele:
+                            raise ValueError(
+                                "The missing data character '{}' clashes with an "
+                                "existing allele at site {}"
+                                .format(missing_data_character, var.site.id))
+                        alleles[i, :] = np.frombuffer(ascii_allele, dtype=np.int8)
+            H[:, curr_position:(curr_position + max_bytes)] = alleles[var.genotypes, :]
+            curr_position += max_bytes
         for h in H:
             yield h.tostring().decode('ascii')