Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/stats.md
Original file line number Diff line number Diff line change
Expand Up @@ -566,12 +566,12 @@ Most statistics are not affected by invariant sites,
and hence do not depend on any part of the tree that is not ancestral to any of the sample sets.
However, some statistics are different: for instance,
given a pair of samples, {meth}`TreeSequence.genetic_relatedness`
with `centre=False` (and `polarised=True`, the default for that method)
with `centre=False` and `polarised=False`
adds up the total number of alleles (or total area of branches) that is
either ancestral to both samples *or ancestral to neither*.
So, it depends on what else is in the tree sequence.
(For this reason, we don't recommend actually *using* this combination of options for genetic
relatedness.)
relatedness; the default for that method is `polarised=True`.)

In terms of the summary function {math}`f(x)`, "not affected by invariant sites" translates to
{math}`f(0) = f(n) = 0`, where {math}`n` is the vector of sample set sizes.
Expand Down
2 changes: 2 additions & 0 deletions python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ Unreleased
allowing greater flexibility in "disjoint union" situations.
(:user:`hyanwong`, :user:`petrelharp`, :issue:`3181`)

- Add ``TreeSequence.divergence_matrix``, which was previously undocumented.

**Bugfixes**

- In some tables with mutations out-of-order ``TableCollection.sort`` did not re-order
Expand Down
48 changes: 48 additions & 0 deletions python/tests/test_divmat.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,30 @@ def test_single_tree_sites_per_branch(self, m):
)
np.testing.assert_array_equal(D1, m * D2)

@pytest.mark.parametrize("m", [1, 3])
def test_single_tree_mutations_per_branch(self, m):
# 3.00┊ 6 ┊
# ┊ ┏━┻━┓ ┊
# 2.00┊ ┃ 5 ┊
# ┊ ┃ ┏━┻┓ ┊
# 1.00┊ ┃ ┃ 4 ┊
# ┊ ┃ ┃ ┏┻┓ ┊
# 0.00┊ 0 1 2 3 ┊
# 0 1
# state 2 3 4 4
ts = tskit.Tree.generate_comb(4).tree_sequence
ts = tsutil.insert_branch_mutations(ts, m, num_states=5)
D1 = check_divmat(ts, mode="site")
D2 = np.array(
[
[0.0, 1.0, 1.0, 1.0],
[1.0, 0.0, 1.0, 1.0],
[1.0, 1.0, 0.0, 0.0],
[1.0, 1.0, 0.0, 0.0],
]
)
np.testing.assert_array_equal(D1, D2)

@pytest.mark.parametrize("n", [2, 3, 5])
def test_single_tree_unique_sample_alleles(self, n):
tables = tskit.Tree.generate_balanced(n).tree_sequence.dump_tables()
Expand Down Expand Up @@ -1385,6 +1409,30 @@ def test_single_tree(self, mode):
ts = tsutil.insert_branch_sites(ts)
self.check(ts, mode)

@pytest.mark.parametrize("m", [1, 3])
def test_single_tree_mutations_per_branch(self, m):
# 3.00┊ 6 ┊
# ┊ ┏━┻━┓ ┊
# 2.00┊ ┃ 5 ┊
# ┊ ┃ ┏━┻┓ ┊
# 1.00┊ ┃ ┃ 4 ┊
# ┊ ┃ ┃ ┏┻┓ ┊
# 0.00┊ 0 1 2 3 ┊
# 0 1
# state 2 3 4 4
ts = tskit.Tree.generate_comb(4).tree_sequence
ts = tsutil.insert_branch_mutations(ts, m, num_states=5)
D1 = check_divmat(ts, mode="site")
D2 = np.array(
[
[0.0, 1.0, 1.0, 1.0],
[1.0, 0.0, 1.0, 1.0],
[1.0, 1.0, 0.0, 0.0],
[1.0, 1.0, 0.0, 0.0],
]
)
np.testing.assert_array_equal(D1, D2)

@pytest.mark.parametrize("mode", DIVMAT_MODES)
def test_single_tree_sample_sets(self, mode):
# 2.00┊ 6 ┊
Expand Down
4 changes: 2 additions & 2 deletions python/tests/tsutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def subsample_sites(ts, num_sites):
return t.tree_sequence()


def insert_branch_mutations(ts, mutations_per_branch=1):
def insert_branch_mutations(ts, mutations_per_branch=1, num_states=2):
"""
Returns a copy of the specified tree sequence with a mutation on every branch
in every tree.
Expand All @@ -102,7 +102,7 @@ def insert_branch_mutations(ts, mutations_per_branch=1):
state[u] = state[v]
parent = mutation[v]
for _ in range(mutations_per_branch):
state[u] = (state[u] + 1) % 2
state[u] = (state[u] + 1) % num_states
metadata = f"{len(tables.mutations)}".encode()
mutation[u] = tables.mutations.add_row(
site=site,
Expand Down
134 changes: 80 additions & 54 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -8698,52 +8698,6 @@ def _parse_stat_matrix_sample_sets(ids):
sizes = np.array(sizes, dtype=size_dtype)
return flat, sizes

# def divergence_matrix(self, sample_sets, windows=None, mode="site"):
# """
# Finds the mean divergence between pairs of samples from each set of
# samples and in each window. Returns a numpy array indexed by (window,
# sample_set, sample_set). Diagonal entries are corrected so that the
# value gives the mean divergence for *distinct* samples, but it is not
# checked whether the sample_sets are disjoint (so offdiagonals are not
# corrected). For this reason, if an element of `sample_sets` has only
# one element, the corresponding diagonal will be NaN.

# The mean divergence between two samples is defined to be the mean: (as
# a TreeStat) length of all edges separating them in the tree, or (as a
# SiteStat) density of segregating sites, at a uniformly chosen position
# on the genome.

# :param list sample_sets: A list of sets of IDs of samples.
# :param iterable windows: The breakpoints of the windows (including start
# and end, so has one more entry than number of windows).
# :return: A list of the upper triangle of mean TMRCA values in row-major
# order, including the diagonal.
# """
# ns = len(sample_sets)
# indexes = [(i, j) for i in range(ns) for j in range(i, ns)]
# x = self.divergence(sample_sets, indexes, windows, mode=mode)
# nw = len(windows) - 1
# A = np.ones((nw, ns, ns), dtype=float)
# for w in range(nw):
# k = 0
# for i in range(ns):
# for j in range(i, ns):
# A[w, i, j] = A[w, j, i] = x[w][k]
# k += 1
# return A
# NOTE: see older definition of divmat here, which may be useful when documenting
# this function. See https://github.com/tskit-dev/tskit/issues/2781

# NOTE for documentation of sample_sets. We *must* use samples currently because
# the normalisation for non-sample nodes is tricky. Do we normalise by the
# total span of the ts where the node is 'present' in the tree? We avoid this
# by insisting on sample nodes.

# NOTE for documentation of num_threads. Need to explain that the
# its best to think of as the number of background *worker* threads.
# default is to run without any worker threads. If you want to run
# with all the cores on the machine, use num_threads=os.cpu_count().

def divergence_matrix(
self,
sample_sets=None,
Expand All @@ -8753,6 +8707,41 @@ def divergence_matrix(
mode=None,
span_normalise=True,
):
"""
Finds the matrix of pairwise mean :meth:`.divergence` values between groups
of sample nodes. Returns a numpy array indexed by (window,
sample_set, sample_set): the [k,i,j]th value of the result gives the
mean divergence between pairs of samples from the i-th and j-th
sample sets in the k-th window. Diagonal entries are corrected so that the
value gives the mean divergence for *distinct* samples, but it is not
checked whether the sample_sets are disjoint (so offdiagonals are not
corrected). For this reason, if an element of `sample_sets` has only
one element, the corresponding diagonal will be NaN.
This is (usually) more efficient than computing many pairwise
values using the `indexes` argument to :meth:`.divergence`.

See :meth:`.divergence` for a description of what exactly is computed.

:param list sample_sets: A list of sets of IDs of samples.
:param list windows: The breakpoints of the windows (including start
and end, so has one more entry than number of windows).
:param str mode: A string giving the "type" of the statistic to be computed
(defaults to "mutation"; "site" and "node" not supported).
:return: A list of the upper triangle of mean TMRCA values in row-major
order, including the diagonal.
"""
# NOTE: see older definition of divmat here, which may be useful when documenting
# this function. See https://github.com/tskit-dev/tskit/issues/2781

# NOTE for documentation of sample_sets. We *must* use samples currently because
# the normalisation for non-sample nodes is tricky. Do we normalise by the
# total span of the ts where the node is 'present' in the tree? We avoid this
# by insisting on sample nodes.

# NOTE for documentation of num_threads. Need to explain that the
# its best to think of as the number of background *worker* threads.
# default is to run without any worker threads. If you want to run
# with all the cores on the machine, use num_threads=os.cpu_count().
windows_specified = windows is not None
windows = self.parse_windows(windows)
mode = "site" if mode is None else mode
Expand Down Expand Up @@ -8960,17 +8949,26 @@ def genetic_relatedness_matrix(
"""
Computes the full matrix of pairwise genetic relatedness values
between (and within) pairs of sets of nodes from ``sample_sets``.
*Warning:* this does not compute exactly the same thing as
Returns a numpy array indexed by (window, sample_set, sample_set):
the [k,i,j]th value of the result gives the
genetic relatedness between pairs of samples from the i-th and j-th
sample sets in the k-th window.
This is (usually) more efficient than computing many pairwise
values using the `indexes` argument to :meth:`.genetic_relatedness`.

*Warning:* in some cases, this does not compute exactly the same thing as
:meth:`.genetic_relatedness`: see below for more details.

If `mode="branch"`, then the value obtained is the same as that from
:meth:`.genetic_relatedness`, using the options `centre=True` and
`proportion=False`. The same is true if `mode="site"` and all sites have
at most one mutation.

However, if some sites have more than one mutation, the value may differ.
However, if some sites have more than one mutation, the value may differ
from that given by :meth:`.genetic_relatedness`:, although if the proportion
of such sites is small, the difference will be small.
The reason is that this function (for efficiency) computes relatedness
using :meth:`.divergence` and the following relationship.
using :meth:`.divergence_matrix` and the following relationship.
"Relatedness" measures the number of *shared* alleles (or branches),
while "divergence" measures the number of *non-shared* alleles (or branches).
Let :math:`T_i` be the total distance from sample :math:`i` up to the root;
Expand All @@ -8980,17 +8978,20 @@ def genetic_relatedness_matrix(
So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T`
(that may now be random choices),
:math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`.
Note, however, that this relationship only holds for `mode="site"`
if we can treat "number of differing alleles" as distances on the tree;
this is not necessarily the case in the presence of multiple mutations.
However, this relationship does not necessarily hold for `mode="site"`:
it does hold if we can treat "number of differing alleles" as distances
on the tree, but this is not necessarily the case in the presence of
multiple mutations.

Another caveat in the above relationship between :math:`R` and :math:`D`
Another note regarding the above relationship between :math:`R` and :math:`D`
is that :meth:`.divergence` of a sample set to itself does not include
the "self" comparisons (so as to provide an unbiased estimator of a
population quantity), while the usual definition of genetic relatedness
*does* include such comparisons (to provide, for instance, an appropriate
value for prospective results beginning with only a given set of
individuals).
individuals). So, diagonal entries in the relatedness matrix returned here
are obtained from :meth:`divergence_matrix` after first correcting
diagonals to include these "self" comparisons.

:param list sample_sets: A list of lists of Node IDs, specifying the
groups of nodes to compute the statistic with.
Expand All @@ -9004,6 +9005,31 @@ def genetic_relatedness_matrix(
If there is one pair of sample sets and windows=None, a numpy scalar is
returned.
"""
# Further notes on the relationship between relatedness (R)
# and divergence (D) in mode="site":
# The summary function for divergence is "p (1-q)",
# where p and q are the allele frequencies in the two sample sets;
# while for relatedness it is "pq". Summing across *all* alleles,
# we get that relatedness plus divergence is
# p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 .
# This implies that
# ts.divergence(..., span_normalise=False)
# + ts.genetic_relatedness(..., span_normalise=False, centre=False,
# proportion=False, polarised=False)
# == ts.num_sites
# This could be the basis for a similar relationship between R and D.
# However, that relationship holds only with polarised=False, which is not
# the default, or what this function does (for good reason).
# So, without setting polarised=False, we have that that for samples i and j,
# divergence plus relatedness is equal to (something like)
# the total number of sites at which both i and j are ancestral;
# this depends on the samples and so does not cancel out of the centred
# version. We could work through these relationships to figure out what exactly
# the difference between genetic_relatedness_matrix(mode="site") and
# genetic_relatedness(mode="site") is, in the general case of multiple
# mutations... but that would be confusing, probably not that useful,
# and the short version of all this is that "it's complicated".

D = self.divergence_matrix(
sample_sets,
windows=windows,
Expand Down