diff --git a/docs/stats.md b/docs/stats.md index 693c363e08..69b04c7575 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -566,12 +566,12 @@ Most statistics are not affected by invariant sites, and hence do not depend on any part of the tree that is not ancestral to any of the sample sets. However, some statistics are different: for instance, given a pair of samples, {meth}`TreeSequence.genetic_relatedness` -with `centre=False` (and `polarised=True`, the default for that method) +with `centre=False` and `polarised=False` adds up the total number of alleles (or total area of branches) that is either ancestral to both samples *or ancestral to neither*. So, it depends on what else is in the tree sequence. (For this reason, we don't recommend actually *using* this combination of options for genetic -relatedness.) +relatedness; the default for that method is `polarised=True`.) In terms of the summary function {math}`f(x)`, "not affected by invariant sites" translates to {math}`f(0) = f(n) = 0`, where {math}`n` is the vector of sample set sizes. diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index d54d487c48..ae342c29a1 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -76,6 +76,8 @@ Unreleased allowing greater flexibility in "disjoint union" situations. (:user:`hyanwong`, :user:`petrelharp`, :issue:`3181`) +- Add ``TreeSequence.divergence_matrix``, which was previously undocumented. + **Bugfixes** - In some tables with mutations out-of-order ``TableCollection.sort`` did not re-order diff --git a/python/tests/test_divmat.py b/python/tests/test_divmat.py index 892c48c347..07de0b3acd 100644 --- a/python/tests/test_divmat.py +++ b/python/tests/test_divmat.py @@ -490,6 +490,30 @@ def test_single_tree_sites_per_branch(self, m): ) np.testing.assert_array_equal(D1, m * D2) + @pytest.mark.parametrize("m", [1, 3]) + def test_single_tree_mutations_per_branch(self, m): + # 3.00┊ 6 ┊ + # ┊ ┏━┻━┓ ┊ + # 2.00┊ ┃ 5 ┊ + # ┊ ┃ ┏━┻┓ ┊ + # 1.00┊ ┃ ┃ 4 ┊ + # ┊ ┃ ┃ ┏┻┓ ┊ + # 0.00┊ 0 1 2 3 ┊ + # 0 1 + # state 2 3 4 4 + ts = tskit.Tree.generate_comb(4).tree_sequence + ts = tsutil.insert_branch_mutations(ts, m, num_states=5) + D1 = check_divmat(ts, mode="site") + D2 = np.array( + [ + [0.0, 1.0, 1.0, 1.0], + [1.0, 0.0, 1.0, 1.0], + [1.0, 1.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 0.0], + ] + ) + np.testing.assert_array_equal(D1, D2) + @pytest.mark.parametrize("n", [2, 3, 5]) def test_single_tree_unique_sample_alleles(self, n): tables = tskit.Tree.generate_balanced(n).tree_sequence.dump_tables() @@ -1385,6 +1409,30 @@ def test_single_tree(self, mode): ts = tsutil.insert_branch_sites(ts) self.check(ts, mode) + @pytest.mark.parametrize("m", [1, 3]) + def test_single_tree_mutations_per_branch(self, m): + # 3.00┊ 6 ┊ + # ┊ ┏━┻━┓ ┊ + # 2.00┊ ┃ 5 ┊ + # ┊ ┃ ┏━┻┓ ┊ + # 1.00┊ ┃ ┃ 4 ┊ + # ┊ ┃ ┃ ┏┻┓ ┊ + # 0.00┊ 0 1 2 3 ┊ + # 0 1 + # state 2 3 4 4 + ts = tskit.Tree.generate_comb(4).tree_sequence + ts = tsutil.insert_branch_mutations(ts, m, num_states=5) + D1 = check_divmat(ts, mode="site") + D2 = np.array( + [ + [0.0, 1.0, 1.0, 1.0], + [1.0, 0.0, 1.0, 1.0], + [1.0, 1.0, 0.0, 0.0], + [1.0, 1.0, 0.0, 0.0], + ] + ) + np.testing.assert_array_equal(D1, D2) + @pytest.mark.parametrize("mode", DIVMAT_MODES) def test_single_tree_sample_sets(self, mode): # 2.00┊ 6 ┊ diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index dc501834f6..7572c11d84 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -79,7 +79,7 @@ def subsample_sites(ts, num_sites): return t.tree_sequence() -def insert_branch_mutations(ts, mutations_per_branch=1): +def insert_branch_mutations(ts, mutations_per_branch=1, num_states=2): """ Returns a copy of the specified tree sequence with a mutation on every branch in every tree. @@ -102,7 +102,7 @@ def insert_branch_mutations(ts, mutations_per_branch=1): state[u] = state[v] parent = mutation[v] for _ in range(mutations_per_branch): - state[u] = (state[u] + 1) % 2 + state[u] = (state[u] + 1) % num_states metadata = f"{len(tables.mutations)}".encode() mutation[u] = tables.mutations.add_row( site=site, diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 9d146a6e05..3de473e008 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -8698,52 +8698,6 @@ def _parse_stat_matrix_sample_sets(ids): sizes = np.array(sizes, dtype=size_dtype) return flat, sizes - # def divergence_matrix(self, sample_sets, windows=None, mode="site"): - # """ - # Finds the mean divergence between pairs of samples from each set of - # samples and in each window. Returns a numpy array indexed by (window, - # sample_set, sample_set). Diagonal entries are corrected so that the - # value gives the mean divergence for *distinct* samples, but it is not - # checked whether the sample_sets are disjoint (so offdiagonals are not - # corrected). For this reason, if an element of `sample_sets` has only - # one element, the corresponding diagonal will be NaN. - - # The mean divergence between two samples is defined to be the mean: (as - # a TreeStat) length of all edges separating them in the tree, or (as a - # SiteStat) density of segregating sites, at a uniformly chosen position - # on the genome. - - # :param list sample_sets: A list of sets of IDs of samples. - # :param iterable windows: The breakpoints of the windows (including start - # and end, so has one more entry than number of windows). - # :return: A list of the upper triangle of mean TMRCA values in row-major - # order, including the diagonal. - # """ - # ns = len(sample_sets) - # indexes = [(i, j) for i in range(ns) for j in range(i, ns)] - # x = self.divergence(sample_sets, indexes, windows, mode=mode) - # nw = len(windows) - 1 - # A = np.ones((nw, ns, ns), dtype=float) - # for w in range(nw): - # k = 0 - # for i in range(ns): - # for j in range(i, ns): - # A[w, i, j] = A[w, j, i] = x[w][k] - # k += 1 - # return A - # NOTE: see older definition of divmat here, which may be useful when documenting - # this function. See https://github.com/tskit-dev/tskit/issues/2781 - - # NOTE for documentation of sample_sets. We *must* use samples currently because - # the normalisation for non-sample nodes is tricky. Do we normalise by the - # total span of the ts where the node is 'present' in the tree? We avoid this - # by insisting on sample nodes. - - # NOTE for documentation of num_threads. Need to explain that the - # its best to think of as the number of background *worker* threads. - # default is to run without any worker threads. If you want to run - # with all the cores on the machine, use num_threads=os.cpu_count(). - def divergence_matrix( self, sample_sets=None, @@ -8753,6 +8707,41 @@ def divergence_matrix( mode=None, span_normalise=True, ): + """ + Finds the matrix of pairwise mean :meth:`.divergence` values between groups + of sample nodes. Returns a numpy array indexed by (window, + sample_set, sample_set): the [k,i,j]th value of the result gives the + mean divergence between pairs of samples from the i-th and j-th + sample sets in the k-th window. Diagonal entries are corrected so that the + value gives the mean divergence for *distinct* samples, but it is not + checked whether the sample_sets are disjoint (so offdiagonals are not + corrected). For this reason, if an element of `sample_sets` has only + one element, the corresponding diagonal will be NaN. + This is (usually) more efficient than computing many pairwise + values using the `indexes` argument to :meth:`.divergence`. + + See :meth:`.divergence` for a description of what exactly is computed. + + :param list sample_sets: A list of sets of IDs of samples. + :param list windows: The breakpoints of the windows (including start + and end, so has one more entry than number of windows). + :param str mode: A string giving the "type" of the statistic to be computed + (defaults to "mutation"; "site" and "node" not supported). + :return: A list of the upper triangle of mean TMRCA values in row-major + order, including the diagonal. + """ + # NOTE: see older definition of divmat here, which may be useful when documenting + # this function. See https://github.com/tskit-dev/tskit/issues/2781 + + # NOTE for documentation of sample_sets. We *must* use samples currently because + # the normalisation for non-sample nodes is tricky. Do we normalise by the + # total span of the ts where the node is 'present' in the tree? We avoid this + # by insisting on sample nodes. + + # NOTE for documentation of num_threads. Need to explain that the + # its best to think of as the number of background *worker* threads. + # default is to run without any worker threads. If you want to run + # with all the cores on the machine, use num_threads=os.cpu_count(). windows_specified = windows is not None windows = self.parse_windows(windows) mode = "site" if mode is None else mode @@ -8960,7 +8949,14 @@ def genetic_relatedness_matrix( """ Computes the full matrix of pairwise genetic relatedness values between (and within) pairs of sets of nodes from ``sample_sets``. - *Warning:* this does not compute exactly the same thing as + Returns a numpy array indexed by (window, sample_set, sample_set): + the [k,i,j]th value of the result gives the + genetic relatedness between pairs of samples from the i-th and j-th + sample sets in the k-th window. + This is (usually) more efficient than computing many pairwise + values using the `indexes` argument to :meth:`.genetic_relatedness`. + + *Warning:* in some cases, this does not compute exactly the same thing as :meth:`.genetic_relatedness`: see below for more details. If `mode="branch"`, then the value obtained is the same as that from @@ -8968,9 +8964,11 @@ def genetic_relatedness_matrix( `proportion=False`. The same is true if `mode="site"` and all sites have at most one mutation. - However, if some sites have more than one mutation, the value may differ. + However, if some sites have more than one mutation, the value may differ + from that given by :meth:`.genetic_relatedness`:, although if the proportion + of such sites is small, the difference will be small. The reason is that this function (for efficiency) computes relatedness - using :meth:`.divergence` and the following relationship. + using :meth:`.divergence_matrix` and the following relationship. "Relatedness" measures the number of *shared* alleles (or branches), while "divergence" measures the number of *non-shared* alleles (or branches). Let :math:`T_i` be the total distance from sample :math:`i` up to the root; @@ -8980,17 +8978,20 @@ def genetic_relatedness_matrix( So, for any samples :math:`I`, :math:`J`, :math:`S`, :math:`T` (that may now be random choices), :math:`R_{IJ}-R_{IS}-R_{JT}+R_{ST} = (D_{IJ}-D_{IS}-D_{JT}+D_{ST})/ (-2)`. - Note, however, that this relationship only holds for `mode="site"` - if we can treat "number of differing alleles" as distances on the tree; - this is not necessarily the case in the presence of multiple mutations. + However, this relationship does not necessarily hold for `mode="site"`: + it does hold if we can treat "number of differing alleles" as distances + on the tree, but this is not necessarily the case in the presence of + multiple mutations. - Another caveat in the above relationship between :math:`R` and :math:`D` + Another note regarding the above relationship between :math:`R` and :math:`D` is that :meth:`.divergence` of a sample set to itself does not include the "self" comparisons (so as to provide an unbiased estimator of a population quantity), while the usual definition of genetic relatedness *does* include such comparisons (to provide, for instance, an appropriate value for prospective results beginning with only a given set of - individuals). + individuals). So, diagonal entries in the relatedness matrix returned here + are obtained from :meth:`divergence_matrix` after first correcting + diagonals to include these "self" comparisons. :param list sample_sets: A list of lists of Node IDs, specifying the groups of nodes to compute the statistic with. @@ -9004,6 +9005,31 @@ def genetic_relatedness_matrix( If there is one pair of sample sets and windows=None, a numpy scalar is returned. """ + # Further notes on the relationship between relatedness (R) + # and divergence (D) in mode="site": + # The summary function for divergence is "p (1-q)", + # where p and q are the allele frequencies in the two sample sets; + # while for relatedness it is "pq". Summing across *all* alleles, + # we get that relatedness plus divergence is + # p1 (1-q1) + p1 q1 + ... + pk (1-qk) + pk qk = p1 + ... + pk = 1 . + # This implies that + # ts.divergence(..., span_normalise=False) + # + ts.genetic_relatedness(..., span_normalise=False, centre=False, + # proportion=False, polarised=False) + # == ts.num_sites + # This could be the basis for a similar relationship between R and D. + # However, that relationship holds only with polarised=False, which is not + # the default, or what this function does (for good reason). + # So, without setting polarised=False, we have that that for samples i and j, + # divergence plus relatedness is equal to (something like) + # the total number of sites at which both i and j are ancestral; + # this depends on the samples and so does not cancel out of the centred + # version. We could work through these relationships to figure out what exactly + # the difference between genetic_relatedness_matrix(mode="site") and + # genetic_relatedness(mode="site") is, in the general case of multiple + # mutations... but that would be confusing, probably not that useful, + # and the short version of all this is that "it's complicated". + D = self.divergence_matrix( sample_sets, windows=windows,