Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions python/tests/test_genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import unittest
import itertools
import random
import warnings

import numpy as np
import msprime
Expand Down Expand Up @@ -552,9 +553,29 @@ def test_acgt_mutations(self):
def test_multiletter_mutations(self):
ts = msprime.simulate(10, random_seed=2)
tables = ts.tables
tables.sites.add_row(0, "ACTG")
site0 = tables.sites.add_row(0.0, "AT")
site1 = tables.sites.add_row(0.1, "T") # This should raise a warning if mut > 1
site2 = tables.sites.add_row(0.2, "A") # This should raise a warning if mut > 1
tables.mutations.add_row(site0, 0, "")
tables.mutations.add_row(site1, 1, "GGT")
tables.mutations.add_row(site2, 2, "ATCG")
tsp = tables.tree_sequence()
self.assertRaises(exceptions.LibraryError, list, tsp.haplotypes())
with warnings.catch_warnings(record=True) as w:
hap = list(tsp.haplotypes())
self.assertEqual(len(w), 2)
for warning in w:
self.assertTrue(issubclass(warning.category, UserWarning))
self.assertTrue(hap[0] == "--" + "---" + "----")
self.assertTrue(hap[1] == "AT" + "GGT" + "----")
self.assertTrue(hap[2] == "AT" + "---" + "ATCG")
self.assertTrue(all([h == "AT" + "---" + "----" for h in hap[3:]]))

def test_nonascii_mutations(self):
ts = msprime.simulate(10, random_seed=2)
tables = ts.tables
tables.sites.add_row(0, chr(169)) # Copyright symbol
tsp = tables.tree_sequence()
self.assertRaises(TypeError, list, tsp.haplotypes())

def test_recurrent_mutations_over_samples(self):
ts = msprime.simulate(10, random_seed=2)
Expand Down Expand Up @@ -591,12 +612,15 @@ def test_back_mutations(self):
ts = tsutil.insert_branch_mutations(base_ts, mutations_per_branch=j)
self.verify_tree_sequence(ts)

def test_fails_missing_data(self):
def test_missing_data(self):
tables = tskit.TableCollection(1.0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0)
tables.sites.add_row(0.5, "A")
ts = tables.tree_sequence()
self.assertRaises(exceptions.LibraryError, list, ts.haplotypes())
self.assertRaises(ValueError, list, ts.haplotypes(missing_data_character="A"))
for c in ("-", ".", "a"):
h = list(ts.haplotypes(missing_data_character=c))
self.assertEqual(h, [c, c])
h = list(ts.haplotypes(impute_missing_data=True))
self.assertEqual(h, ["A", "A"])
146 changes: 105 additions & 41 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -2827,33 +2827,49 @@ def trees(
sample_lists=sample_lists)
return TreeIterator(tree)

def haplotypes(self, impute_missing_data=False):
"""
Returns an iterator over the haplotypes resulting from the trees
and mutations in this tree sequence as a string.
The iterator returns a total of :math:`n` strings, each of which
contains :math:`s` characters (:math:`n` is the sample size
returned by :attr:`tskit.TreeSequence.num_samples` and
:math:`s` is the number of sites returned by
:attr:`tskit.TreeSequence.num_sites`). The first
string returned is the haplotype for sample `0`, and so on.
For a given haplotype ``h``, the value of ``h[j]`` is the observed
allelic state at site ``j``.
def haplotypes(self, impute_missing_data=False, missing_data_character="-"):
"""
Returns an iterator over the strings of haplotypes that result from
the trees and mutations in this tree sequence. Each haplotype string
is guaranteed to be of the same length. A tree sequence with
:math:`n` samples and :math:`s` sites will return a total of :math:`n`
strings of :math:`s` alleles concatenated together, where an allele
can consist of zero or more ascii characters (tree sequences that
include alleles containing non-ascii characters will raise an error).
The first string returned is the haplotype for sample ``0``, and so on.

Each allele is represented by a number of characters given by the maximum
length (in bytes) of any of the alleles at that site. In the simplest
instance, the alleles at each site are represented by single byte characters
such as when all variants are single nucleotide polymorphisms (SNPs). In
this case the strings returned will all be of length :math:`s`, and for a
haplotype ``h``, the value of ``h[j]`` is the observed allelic state
at site ``j``. If instead, one site contains, say, an indel of
length 10 (e.g. if the ancestral state is the empty string and a mutation
exists at that site to a derived site of "ATATATATAT"), then the returned
strings will each be of length :math:`s` + 9.

Zero-length alleles (i.e. deletions) are represented in the string by
the ``missing_data_character`` repeated :math:`c` times, where :math:`c`
is the maximum allele length (in bytes) at that site. Any alleles of
length greater than 0 but less than :math:`c`, which could therefore
be aligned at different positions in the string, are *not emitted*, but
are instead replaced with a string consisting of the
``missing_data_character``, as if they were a deletion, and a warning is
given.

If ``impute_missing_data`` is False,
:ref:`missing data<sec_data_model_missing_data>` will be also represented
in the string by :math:`c` repeats of the ``missing_data_character``. If
instead it is set to True, missing data will be imputed such that all
isolated samples are assigned the ancestral state (unless they have
mutations directly above them, in which case they will take the most recent
derived mutational state for that node). This was the default
behaviour in versions prior to 0.2.0.

See also the :meth:`.variants` iterator for site-centric access
to sample genotypes.

This method is only supported for single-letter alleles.

As :ref:`missing data<sec_data_model_missing_data>` cannot be
represented directly in the haplotypes array, an error will be raised
if missing data is present. However, if ``impute_missing_data`` set
to True, missing data will be imputed such that all isolated samples
are assigned the ancestral state (unless they have mutations directly
above them, in which case they will take the most recent derived
mutational state for that node). This was the default behaviour in
versions prior to 0.2.0.

.. warning::
For large datasets, this method can consume a **very large** amount of
memory! To output all the sample data, it is more efficient to iterate
Expand All @@ -2867,26 +2883,74 @@ def haplotypes(self, impute_missing_data=False):
:param bool impute_missing_data: If True, the allele assigned to any
isolated samples is the ancestral state; that is, we impute
missing data as the ancestral state. Default: False.
:param str missing_data_character: A single ascii character that will
be used to represent missing data, deletions, and other alleles that
are shorter in length than the maximum allele length at each site.
If any normal allele contains this character, an error is raised.
Default: '-'.
:rtype: collections.abc.Iterable
:raises: LibraryError if called on a tree sequence containing
multiletter alleles.
:raises: LibraryError if missing data is present and impute_missing_data
is False
"""
H = np.empty((self.num_samples, self.num_sites), dtype=np.int8)
for var in self.variants(impute_missing_data=impute_missing_data):
if var.has_missing_data:
raise _tskit.LibraryError(
"Generated haplotypes contain missing data, which cannot be "
"represented. Either generate variants (which support missing "
"data) or use the impute missing data option.")
alleles = np.empty(len(var.alleles), dtype=np.int8)
:raises: TypeError if the missing_data_character is not a single ascii
character, or if any allele contains non-ascii characters
:raises: ValueError
if the ``missing_data_character`` exists in one of the alleles
"""
def sitewise_allele_max_bytes(ts):
"""
Return the maximum length in bytes of all the alleles at each site.

This is equivalent to, but much faster then, the following code
``np.array([max([len(s.encode('utf-8')) for s in v.alleles])
for v in ts.variants()])``

NB: this requires mutations to be sorted by site ID, which is a necessary
requirement of a tree sequence, hence this needs to be a ts function
not a tables function.
"""
site_mutations_max_bytes = np.zeros((ts.num_sites), dtype=np.uint32)
if ts.num_mutations > 0:
mutations_bytes = np.diff(ts.tables.mutations.derived_state_offset[:])
site_increment = np.append(1, np.diff(ts.tables.mutations.site[:]))
site_id = np.cumsum(site_increment[site_increment > 0]) - 1
site_breakpoints = np.where(site_increment > 0)[0]
site_mutations_max_bytes[site_id] = np.maximum.reduceat(
mutations_bytes, site_breakpoints)
ancestral_state_max_bytes = np.diff(ts.tables.sites.ancestral_state_offset)
return np.maximum(site_mutations_max_bytes, ancestral_state_max_bytes)

max_allele_bytes = sitewise_allele_max_bytes(self)
output_length_bytes = np.sum(max_allele_bytes)
H = np.empty((self.num_samples, output_length_bytes), dtype=np.int8)
missing_ascii = missing_data_character.encode('ascii')
missing_int8 = ord(missing_ascii)
curr_position = 0
for max_bytes, var in zip(
max_allele_bytes,
self.variants(impute_missing_data=impute_missing_data)):
# Fill by missing data character by default
alleles = np.full((len(var.alleles), max_bytes), missing_int8, dtype=np.int8)
for i, allele in enumerate(var.alleles):
if len(allele) > 1:
raise _tskit.LibraryError(
"Cannot produce haplotypes from multi-letter alleles")
alleles[i] = ord(allele.encode('ascii'))
H[:, var.site.id] = alleles[var.genotypes]
if allele is not None and len(allele) > 0:
try:
ascii_allele = allele.encode('ascii')
except UnicodeEncodeError:
raise TypeError(
"Non-ascii character in allele at site {}"
.format(var.site.id))
assert len(ascii_allele) <= max_bytes
if len(ascii_allele) < max_bytes:
warnings.warn(
"Site {} (position {}) has alleles of different lengths. "
"Treating any shorter than {} bytes as missing."
.format(var.site.id, var.site.position, max_bytes))
else:
if missing_ascii in ascii_allele:
raise ValueError(
"The missing data character '{}' clashes with an "
"existing allele at site {}"
.format(missing_data_character, var.site.id))
alleles[i, :] = np.frombuffer(ascii_allele, dtype=np.int8)
H[:, curr_position:(curr_position + max_bytes)] = alleles[var.genotypes, :]
curr_position += max_bytes
for h in H:
yield h.tostring().decode('ascii')

Expand Down