From d55e5430ccc9b66e219ba8ad9002680e83edde94 Mon Sep 17 00:00:00 2001
From: Yan Wong <yan@yanwong.me>
Date: Mon, 25 Nov 2019 20:29:02 +0000
Subject: [PATCH 1/2] Add ability to output missing data in haplotypes

---
 python/tests/test_genotypes.py | 16 ++++--
 python/tskit/trees.py          | 92 ++++++++++++++++++++--------------
 2 files changed, 68 insertions(+), 40 deletions(-)

diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py
index db57b132a9..e3bd84ab56 100644
--- a/python/tests/test_genotypes.py
+++ b/python/tests/test_genotypes.py
@@ -554,7 +554,14 @@ def test_multiletter_mutations(self):
         tables = ts.tables
         tables.sites.add_row(0, "ACTG")
         tsp = tables.tree_sequence()
-        self.assertRaises(exceptions.LibraryError, list, tsp.haplotypes())
+        self.assertRaises(TypeError, list, tsp.haplotypes())
+
+    def test_nonascii_mutations(self):
+        ts = msprime.simulate(10, random_seed=2)
+        tables = ts.tables
+        tables.sites.add_row(0, chr(169))  # Copyright symbol
+        tsp = tables.tree_sequence()
+        self.assertRaises(TypeError, list, tsp.haplotypes())
 
     def test_recurrent_mutations_over_samples(self):
         ts = msprime.simulate(10, random_seed=2)
@@ -591,12 +598,15 @@ def test_back_mutations(self):
             ts = tsutil.insert_branch_mutations(base_ts, mutations_per_branch=j)
             self.verify_tree_sequence(ts)
 
-    def test_fails_missing_data(self):
+    def test_missing_data(self):
         tables = tskit.TableCollection(1.0)
         tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
         tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
         tables.sites.add_row(0.5, "A")
         ts = tables.tree_sequence()
-        self.assertRaises(exceptions.LibraryError, list, ts.haplotypes())
+        self.assertRaises(ValueError, list, ts.haplotypes(missing_data_character="A"))
+        for c in ("-", ".", "a"):
+            h = list(ts.haplotypes(missing_data_character=c))
+            self.assertEqual(h, [c, c])
         h = list(ts.haplotypes(impute_missing_data=True))
         self.assertEqual(h, ["A", "A"])
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index a31016ed8d..78170c43b9 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -2827,33 +2827,35 @@ def trees(
             sample_lists=sample_lists)
         return TreeIterator(tree)
 
-    def haplotypes(self, impute_missing_data=False):
-        """
-        Returns an iterator over the haplotypes resulting from the trees
-        and mutations in this tree sequence as a string.
-        The iterator returns a total of :math:`n` strings, each of which
-        contains :math:`s` characters (:math:`n` is the sample size
-        returned by :attr:`tskit.TreeSequence.num_samples` and
-        :math:`s` is the number of sites returned by
-        :attr:`tskit.TreeSequence.num_sites`). The first
-        string returned is the haplotype for sample `0`, and so on.
-        For a given haplotype ``h``, the value of ``h[j]`` is the observed
-        allelic state at site ``j``.
+    def haplotypes(self, impute_missing_data=False, missing_data_character="-"):
+        """
+        Returns an iterator over the strings of haplotypes that result from
+        the trees and mutations in this tree sequence. Each haplotype string
+        is guaranteed to be of the same length. A tree sequence with
+        :math:`n` samples and :math:`s` sites will return a total of :math:`n`
+        strings of :math:`s` alleles concatenated together, where an allele
+        consists of a single ascii character (tree sequences that include alleles
+        containing multi-letter or non-ascii characters will raise an error).
+        The first string returned is the haplotype for sample ``0``, and so on.
+
+        The alleles at each site must be represented by single byte characters,
+        (i.e. variants must be single nucleotide polymorphisms, or SNPs), hence
+        the strings returned will all be of length :math:`s`, and for a haplotype
+        ``h``, the value of ``h[j]`` will be the observed allelic state
+        at site ``j``.
+
+        If ``impute_missing_data`` is False,
+        :ref:`missing data<sec_data_model_missing_data>` will be represented
+        in the string by the ``missing_data_character``. If
+        instead it is set to True, missing data will be imputed such that all
+        isolated samples are assigned the ancestral state (unless they have
+        mutations directly above them, in which case they will take the most recent
+        derived  mutational state for that node). This was the default
+        behaviour in versions prior to 0.2.0.
 
         See also the :meth:`.variants` iterator for site-centric access
         to sample genotypes.
 
-        This method is only supported for single-letter alleles.
-
-        As :ref:`missing data<sec_data_model_missing_data>` cannot be
-        represented directly in the haplotypes array, an error will be raised
-        if missing data is present. However, if ``impute_missing_data`` set
-        to True, missing data will be imputed such that all isolated samples
-        are assigned the ancestral state (unless they have mutations directly
-        above them, in which case they will take the most recent derived
-        mutational state for that node). This was the default behaviour in
-        versions prior to 0.2.0.
-
         .. warning::
             For large datasets, this method can consume a **very large** amount of
             memory! To output all the sample data, it is more efficient to iterate
@@ -2867,25 +2869,41 @@ def haplotypes(self, impute_missing_data=False):
         :param bool impute_missing_data: If True, the allele assigned to any
             isolated samples is the ancestral state; that is, we impute
             missing data as the ancestral state. Default: False.
+        :param str missing_data_character: A single ascii character that will
+            be used to represent missing data.
+            If any normal allele contains this character, an error is raised.
+            Default: '-'.
         :rtype: collections.abc.Iterable
-        :raises: LibraryError if called on a tree sequence containing
-            multiletter alleles.
-        :raises: LibraryError if missing data is present and impute_missing_data
-            is False
+        :raises: TypeError if the ``missing_data_character`` or any of the alleles
+            at a site or the are not a single ascii character.
+        :raises: ValueError
+            if the ``missing_data_character`` exists in one of the alleles
         """
         H = np.empty((self.num_samples, self.num_sites), dtype=np.int8)
+        missing_int8 = ord(missing_data_character.encode('ascii'))
         for var in self.variants(impute_missing_data=impute_missing_data):
-            if var.has_missing_data:
-                raise _tskit.LibraryError(
-                    "Generated haplotypes contain missing data, which cannot be "
-                    "represented. Either generate variants (which support missing "
-                    "data) or use the impute missing data option.")
-            alleles = np.empty(len(var.alleles), dtype=np.int8)
+            alleles = np.full(len(var.alleles), missing_int8, dtype=np.int8)
             for i, allele in enumerate(var.alleles):
-                if len(allele) > 1:
-                    raise _tskit.LibraryError(
-                        "Cannot produce haplotypes from multi-letter alleles")
-                alleles[i] = ord(allele.encode('ascii'))
+                if allele is not None:
+                    try:
+                        ascii_allele = allele.encode('ascii')
+                    except UnicodeEncodeError:
+                        raise TypeError(
+                            "Non-ascii character in allele at site {}"
+                            .format(var.site.id))
+                    if len(ascii_allele) == 0:
+                        pass  # A deletion: acceptable to emit missing data char as-is
+                    if len(ascii_allele) > 1:
+                        raise TypeError(
+                            "Multi-letter allele detected at site {}"
+                            .format(var.site.id))
+                    allele_int8 = ord(ascii_allele)
+                    if allele_int8 == missing_int8:
+                        raise ValueError(
+                            "The missing data character '{}' clashes with an "
+                            "existing allele at site {}"
+                            .format(missing_data_character, var.site.id))
+                    alleles[i] = allele_int8
             H[:, var.site.id] = alleles[var.genotypes]
         for h in H:
             yield h.tostring().decode('ascii')

From 12e2407a866bfd04f9c9f2e102fd217e57a6382c Mon Sep 17 00:00:00 2001
From: Yan Wong <yan@yanwong.me>
Date: Mon, 25 Nov 2019 20:29:02 +0000
Subject: [PATCH 2/2] Add ability to output multi-letter alleles

---
 python/tests/test_genotypes.py |  18 +++++-
 python/tskit/trees.py          | 108 +++++++++++++++++++++++----------
 2 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py
index e3bd84ab56..cf70b92cc9 100644
--- a/python/tests/test_genotypes.py
+++ b/python/tests/test_genotypes.py
@@ -25,6 +25,7 @@
 import unittest
 import itertools
 import random
+import warnings
 
 import numpy as np
 import msprime
@@ -552,9 +553,22 @@ def test_acgt_mutations(self):
     def test_multiletter_mutations(self):
         ts = msprime.simulate(10, random_seed=2)
         tables = ts.tables
-        tables.sites.add_row(0, "ACTG")
+        site0 = tables.sites.add_row(0.0, "AT")
+        site1 = tables.sites.add_row(0.1, "T")  # This should raise a warning if mut > 1
+        site2 = tables.sites.add_row(0.2, "A")  # This should raise a warning if mut > 1
+        tables.mutations.add_row(site0, 0, "")
+        tables.mutations.add_row(site1, 1, "GGT")
+        tables.mutations.add_row(site2, 2, "ATCG")
         tsp = tables.tree_sequence()
-        self.assertRaises(TypeError, list, tsp.haplotypes())
+        with warnings.catch_warnings(record=True) as w:
+            hap = list(tsp.haplotypes())
+            self.assertEqual(len(w), 2)
+            for warning in w:
+                self.assertTrue(issubclass(warning.category, UserWarning))
+        self.assertTrue(hap[0] == "--" + "---" + "----")
+        self.assertTrue(hap[1] == "AT" + "GGT" + "----")
+        self.assertTrue(hap[2] == "AT" + "---" + "ATCG")
+        self.assertTrue(all([h == "AT" + "---" + "----" for h in hap[3:]]))
 
     def test_nonascii_mutations(self):
         ts = msprime.simulate(10, random_seed=2)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index 78170c43b9..b7d13ff11c 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -2834,19 +2834,33 @@ def haplotypes(self, impute_missing_data=False, missing_data_character="-"):
         is guaranteed to be of the same length. A tree sequence with
         :math:`n` samples and :math:`s` sites will return a total of :math:`n`
         strings of :math:`s` alleles concatenated together, where an allele
-        consists of a single ascii character (tree sequences that include alleles
-        containing multi-letter or non-ascii characters will raise an error).
+        can consist of zero or more ascii characters (tree sequences that
+        include alleles containing non-ascii characters will raise an error).
         The first string returned is the haplotype for sample ``0``, and so on.
 
-        The alleles at each site must be represented by single byte characters,
-        (i.e. variants must be single nucleotide polymorphisms, or SNPs), hence
-        the strings returned will all be of length :math:`s`, and for a haplotype
-        ``h``, the value of ``h[j]`` will be the observed allelic state
-        at site ``j``.
+        Each allele is represented by a number of characters given by the maximum
+        length (in bytes) of any of the alleles at that site. In the simplest
+        instance, the alleles at each site are represented by single byte characters
+        such as when all variants are single nucleotide polymorphisms (SNPs). In
+        this case the strings returned will all be of length :math:`s`, and for a
+        haplotype ``h``, the value of ``h[j]`` is the observed allelic state
+        at site ``j``. If instead, one site contains, say, an indel of
+        length 10 (e.g. if the ancestral state is the empty string and a mutation
+        exists at that site to a derived site of "ATATATATAT"), then the returned
+        strings will each be of length :math:`s` + 9.
+
+        Zero-length alleles (i.e. deletions) are represented in the string by
+        the ``missing_data_character`` repeated :math:`c` times, where :math:`c`
+        is the maximum allele length (in bytes) at that site. Any alleles of
+        length greater than 0 but less than :math:`c`, which could therefore
+        be aligned at different positions in the string, are *not emitted*, but
+        are instead replaced with a string consisting of the
+        ``missing_data_character``, as if they were a deletion, and a warning is
+        given.
 
         If ``impute_missing_data`` is False,
-        :ref:`missing data<sec_data_model_missing_data>` will be represented
-        in the string by the ``missing_data_character``. If
+        :ref:`missing data<sec_data_model_missing_data>` will be also represented
+        in the string by :math:`c` repeats of the ``missing_data_character``. If
         instead it is set to True, missing data will be imputed such that all
         isolated samples are assigned the ancestral state (unless they have
         mutations directly above them, in which case they will take the most recent
@@ -2870,41 +2884,73 @@ def haplotypes(self, impute_missing_data=False, missing_data_character="-"):
             isolated samples is the ancestral state; that is, we impute
             missing data as the ancestral state. Default: False.
         :param str missing_data_character: A single ascii character that will
-            be used to represent missing data.
+            be used to represent missing data, deletions, and other alleles that
+            are shorter in length than the maximum allele length at each site.
             If any normal allele contains this character, an error is raised.
             Default: '-'.
         :rtype: collections.abc.Iterable
-        :raises: TypeError if the ``missing_data_character`` or any of the alleles
-            at a site or the are not a single ascii character.
+        :raises: TypeError if the missing_data_character is not a single ascii
+            character, or if any allele contains non-ascii characters
         :raises: ValueError
             if the ``missing_data_character`` exists in one of the alleles
         """
-        H = np.empty((self.num_samples, self.num_sites), dtype=np.int8)
-        missing_int8 = ord(missing_data_character.encode('ascii'))
-        for var in self.variants(impute_missing_data=impute_missing_data):
-            alleles = np.full(len(var.alleles), missing_int8, dtype=np.int8)
+        def sitewise_allele_max_bytes(ts):
+            """
+            Return the maximum length in bytes of all the alleles at each site.
+
+            This is equivalent to, but much faster then, the following code
+            ``np.array([max([len(s.encode('utf-8')) for s in v.alleles])
+                for v in ts.variants()])``
+
+            NB: this requires mutations to be sorted by site ID, which is a necessary
+            requirement of a tree sequence, hence this needs to be a ts function
+            not a tables function.
+            """
+            site_mutations_max_bytes = np.zeros((ts.num_sites), dtype=np.uint32)
+            if ts.num_mutations > 0:
+                mutations_bytes = np.diff(ts.tables.mutations.derived_state_offset[:])
+                site_increment = np.append(1, np.diff(ts.tables.mutations.site[:]))
+                site_id = np.cumsum(site_increment[site_increment > 0]) - 1
+                site_breakpoints = np.where(site_increment > 0)[0]
+                site_mutations_max_bytes[site_id] = np.maximum.reduceat(
+                    mutations_bytes, site_breakpoints)
+            ancestral_state_max_bytes = np.diff(ts.tables.sites.ancestral_state_offset)
+            return np.maximum(site_mutations_max_bytes, ancestral_state_max_bytes)
+
+        max_allele_bytes = sitewise_allele_max_bytes(self)
+        output_length_bytes = np.sum(max_allele_bytes)
+        H = np.empty((self.num_samples, output_length_bytes), dtype=np.int8)
+        missing_ascii = missing_data_character.encode('ascii')
+        missing_int8 = ord(missing_ascii)
+        curr_position = 0
+        for max_bytes, var in zip(
+                max_allele_bytes,
+                self.variants(impute_missing_data=impute_missing_data)):
+            # Fill by missing data character by default
+            alleles = np.full((len(var.alleles), max_bytes), missing_int8, dtype=np.int8)
             for i, allele in enumerate(var.alleles):
-                if allele is not None:
+                if allele is not None and len(allele) > 0:
                     try:
                         ascii_allele = allele.encode('ascii')
                     except UnicodeEncodeError:
                         raise TypeError(
                             "Non-ascii character in allele at site {}"
                             .format(var.site.id))
-                    if len(ascii_allele) == 0:
-                        pass  # A deletion: acceptable to emit missing data char as-is
-                    if len(ascii_allele) > 1:
-                        raise TypeError(
-                            "Multi-letter allele detected at site {}"
-                            .format(var.site.id))
-                    allele_int8 = ord(ascii_allele)
-                    if allele_int8 == missing_int8:
-                        raise ValueError(
-                            "The missing data character '{}' clashes with an "
-                            "existing allele at site {}"
-                            .format(missing_data_character, var.site.id))
-                    alleles[i] = allele_int8
-            H[:, var.site.id] = alleles[var.genotypes]
+                    assert len(ascii_allele) <= max_bytes
+                    if len(ascii_allele) < max_bytes:
+                        warnings.warn(
+                            "Site {} (position {}) has alleles of different lengths. "
+                            "Treating any shorter than {} bytes as missing."
+                            .format(var.site.id, var.site.position, max_bytes))
+                    else:
+                        if missing_ascii in ascii_allele:
+                            raise ValueError(
+                                "The missing data character '{}' clashes with an "
+                                "existing allele at site {}"
+                                .format(missing_data_character, var.site.id))
+                        alleles[i, :] = np.frombuffer(ascii_allele, dtype=np.int8)
+            H[:, curr_position:(curr_position + max_bytes)] = alleles[var.genotypes, :]
+            curr_position += max_bytes
         for h in H:
             yield h.tostring().decode('ascii')