From 64d3443594e4b450dd828fa97e988a095f21eea7 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 7 Nov 2025 14:55:15 +0000 Subject: [PATCH] Allow internal nodes when isolate_as_missing --- c/tests/test_genotypes.c | 96 +++++++++- c/tskit/genotypes.c | 34 +++- python/tests/test_genotypes.py | 324 +++++++++++++++++++++++++++++++++ python/tskit/genotypes.py | 43 +++-- python/tskit/trees.py | 88 ++++++--- python/tskit/util.py | 7 +- 6 files changed, 528 insertions(+), 64 deletions(-) diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index fd597439e0..df0bfeb883 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -512,10 +512,45 @@ test_single_tree_non_samples(void) tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); - /* It's an error to hand in non-samples without imputation turned on */ ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES); - tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_print_state(&vargen, _devnull); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site.id, 0); + CU_ASSERT_EQUAL(var->site.mutations_length, 1); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->genotypes[0], 1); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site.id, 1); + CU_ASSERT_EQUAL(var->site.mutations_length, 2); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + CU_ASSERT_EQUAL(var->genotypes[1], 0); + CU_ASSERT_EQUAL(var->num_alleles, 2); + CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1); + CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1); + CU_ASSERT_EQUAL(var->site.id, 2); + CU_ASSERT_EQUAL(var->site.mutations_length, 4); + + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_vargen_free(&vargen); + CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -559,6 +594,60 @@ test_single_tree_non_samples(void) tsk_treeseq_free(&ts); } +static void +test_isolated_internal_node(void) +{ + int ret = 0; + tsk_treeseq_t ts; + tsk_vargen_t vargen; + tsk_variant_t *var; + /* Two sample nodes (0,1), plus an internal non-sample node u=2 with no edges */ + const char *nodes = "1 0 -1 -1\n" + "1 0 -1 -1\n" + "0 1 -1 -1\n"; + const char *sites = "2.0 A\n" + "9.0 T\n"; + tsk_id_t samples[] = { 2 }; + + tsk_treeseq_from_text(&ts, 10, nodes, "", NULL, sites, NULL, NULL, NULL, 0); + CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_nodes(&ts), 3); + CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_samples(&ts), 2); + CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_sites(&ts), 2); + + /* Default options (isolated_as_missing=True): internal node is isolated everywhere + */ + ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_TRUE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_TRUE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_free(&vargen); + + /* Impute missing (isolated_as_missing=False): genotypes should be ancestral (0) */ + ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, TSK_ISOLATED_NOT_MISSING); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 1); + CU_ASSERT_FALSE(var->has_missing_data); + CU_ASSERT_EQUAL(var->genotypes[0], 0); + ret = tsk_vargen_next(&vargen, &var); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_vargen_free(&vargen); + + tsk_treeseq_free(&ts); +} + static void test_single_tree_errors(void) { @@ -1123,6 +1212,7 @@ main(int argc, char **argv) { "test_single_tree_char_alphabet", test_single_tree_char_alphabet }, { "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet }, { "test_single_tree_non_samples", test_single_tree_non_samples }, + { "test_isolated_internal_node", test_isolated_internal_node }, { "test_single_tree_errors", test_single_tree_errors }, { "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors }, { "test_single_tree_subsample", test_single_tree_subsample }, diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index c2385281bd..54ad1d9c3e 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles) static int variant_init_samples_and_index_map(tsk_variant_t *self, const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples, - size_t num_samples_alloc, tsk_flags_t options) + size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; - const tsk_flags_t *flags = tree_sequence->tables->nodes.flags; tsk_size_t j, num_nodes; - bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING); tsk_id_t u; num_nodes = tsk_treeseq_get_num_nodes(tree_sequence); @@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self, ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE); goto out; } - /* We can only detect missing data for samples */ - if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) { - ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES); - goto out; - } self->alt_sample_index_map[samples[j]] = (tsk_id_t) j; } out: @@ -439,6 +432,27 @@ tsk_variant_mark_missing(tsk_variant_t *self) return num_missing; } +/* Mark missing for any requested node (sample or non-sample) that is isolated + * in the current tree, i.e., has no parent and no children at this position. */ +static tsk_size_t +tsk_variant_mark_missing_any(tsk_variant_t *self) +{ + tsk_size_t num_missing = 0; + int32_t *restrict genotypes = self->genotypes; + const tsk_id_t *restrict parent = self->tree.parent; + const tsk_id_t *restrict left_child = self->tree.left_child; + tsk_size_t j; + + for (j = 0; j < self->num_samples; j++) { + tsk_id_t u = self->samples[j]; + if (parent[u] == TSK_NULL && left_child[u] == TSK_NULL) { + genotypes[j] = TSK_MISSING_DATA; + num_missing++; + } + } + return num_missing; +} + static tsk_id_t tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length) { @@ -502,6 +516,10 @@ tsk_variant_decode( update_genotypes = tsk_variant_update_genotypes_sample_list; if (by_traversal) { update_genotypes = tsk_variant_update_genotypes_traversal; + /* When decoding a user-provided list of nodes (which may include + * non-samples), mark isolated nodes as missing directly by checking + * isolation status for each requested node. */ + mark_missing = tsk_variant_mark_missing_any; } if (self->user_alleles) { diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index e8af9841a1..175b24735f 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -1286,6 +1286,52 @@ def test_non_sample_samples(self): with pytest.raises(tskit.LibraryError, match="MUST_IMPUTE_NON_SAMPLES"): list(ts.alignments(samples=[4])) + def test_variants_internal_nodes(self): + ts = self.ts() + # Root node 4: present across span, no mutations directly; always ancestral + vars4 = list(ts.variants(samples=[4])) + assert len(vars4) == 2 + assert [v.alleles[v.genotypes[0]] for v in vars4] == ["A", "T"] + assert [v.has_missing_data for v in vars4] == [False, False] + # isolated_as_missing=False should not change results here + vars4_i = list(ts.variants(samples=[4], isolated_as_missing=False)) + assert [v.alleles[v.genotypes[0]] for v in vars4_i] == ["A", "T"] + assert [v.has_missing_data for v in vars4_i] == [False, False] + # Internal node 3: mutation at site 1 gives derived state there + vars3 = list(ts.variants(samples=[3])) + assert len(vars3) == 2 + assert [v.alleles[v.genotypes[0]] for v in vars3] == ["A", "C"] + assert [v.has_missing_data for v in vars3] == [False, False] + vars3_i = list(ts.variants(samples=[3], isolated_as_missing=False)) + assert [v.alleles[v.genotypes[0]] for v in vars3_i] == ["A", "C"] + assert [v.has_missing_data for v in vars3_i] == [False, False] + + def test_genotype_matrix_internal_nodes(self): + ts = self.ts() + import numpy as np + + np.testing.assert_array_equal( + ts.genotype_matrix(samples=[4]), np.array([[0], [0]], dtype=np.int32) + ) + np.testing.assert_array_equal( + ts.genotype_matrix(samples=[4], isolated_as_missing=False), + ts.genotype_matrix(samples=[4]), + ) + np.testing.assert_array_equal( + ts.genotype_matrix(samples=[3]), np.array([[0], [1]], dtype=np.int32) + ) + np.testing.assert_array_equal( + ts.genotype_matrix(samples=[3], isolated_as_missing=False), + ts.genotype_matrix(samples=[3]), + ) + + def test_haplotypes_internal_nodes(self): + ts = self.ts() + assert list(ts.haplotypes(samples=[4])) == ["AT"] + assert list(ts.haplotypes(samples=[4], isolated_as_missing=False)) == ["AT"] + assert list(ts.haplotypes(samples=[3])) == ["AC"] + assert list(ts.haplotypes(samples=[3], isolated_as_missing=False)) == ["AC"] + def test_alignments_missing_data_char(self): A = list(self.ts().alignments(missing_data_character="x")) assert A[0] == "xxGxxxxxxT" @@ -1846,6 +1892,284 @@ def test_fasta_reference_sequence(self): assert expected == self.ts().as_fasta(reference_sequence=ref) +class TestInternalNodeMissingness: + @tests.cached_example + def ts_missing(self): + # Start from a balanced tree with span=10 + # then delete ancestry over [4,6] + # Internal node is then isolated across [4,6) + ts = tskit.Tree.generate_balanced(3, span=10).tree_sequence + tables = ts.dump_tables() + tables.delete_intervals([[4, 6]], simplify=False) + # Ensure there are sites within the deleted region + tables.sites.add_row(4, ancestral_state="A") + tables.sites.add_row(5, ancestral_state="A") + tables.sites.add_row(5.999999, ancestral_state="A") + tables.sort() + return tables.tree_sequence() + + def test_variants_internal_isolated(self): + ts = self.ts_missing() + # Choose an internal node id (3) in this balanced tree + vars3 = list(ts.variants(samples=[3], isolated_as_missing=True)) + assert len(vars3) == 3 + assert all(v.has_missing_data for v in vars3) + assert all(v.alleles[-1] is None for v in vars3) + assert all(v.genotypes[0] == tskit.MISSING_DATA for v in vars3) + # With imputation, internal isolated node maps to ancestral + vars3_i = list(ts.variants(samples=[3], isolated_as_missing=False)) + assert [v.genotypes[0] for v in vars3_i] == [0, 0, 0] + assert not any(v.has_missing_data for v in vars3_i) + + def test_genotype_matrix_internal_isolated(self): + import numpy as np + + ts = self.ts_missing() + Gm = ts.genotype_matrix(samples=[3], isolated_as_missing=True) + np.testing.assert_array_equal( + Gm.flatten(), np.array([-1, -1, -1], dtype=np.int32) + ) + Gi = ts.genotype_matrix(samples=[3], isolated_as_missing=False) + np.testing.assert_array_equal(Gi.flatten(), np.array([0, 0, 0], dtype=np.int32)) + + def test_haplotypes_internal_isolated(self): + ts = self.ts_missing() + H = list(ts.haplotypes(samples=[3], isolated_as_missing=True)) + assert H == ["NNN"] + Hq = list( + ts.haplotypes( + samples=[3], isolated_as_missing=True, missing_data_character="?" + ) + ) + assert Hq == ["???"] + + +class TestVariantTopologyCombos: + def test_all_parent_child_combinations(self): + # Build a simple tree with one site and add an isolated node. Then request + # genotypes for nodes that realise each combination of (parent, left_child) + # nullness in tsk_variant_mark_missing_any. + tables = tskit.Tree.generate_balanced(3, span=10).tree_sequence.dump_tables() + # Add an isolated node (no edges present anywhere) + iso = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + # Add a single site with ancestral allele only + tables.sites.add_row(1.0, ancestral_state="A") + ts = tables.tree_sequence() + + tree = ts.first() + # Select a root that has children (exclude isolated root(s)) + root_candidates = [r for r in tree.roots if tree.left_child(r) != tskit.NULL] + assert len(root_candidates) >= 1 + root = root_candidates[0] # parent NULL, left_child != NULL + # Choose a leaf sample present in the tree: parent != NULL, left_child NULL + leaf = next( + u + for u in ts.samples() + if tree.parent(u) != tskit.NULL and tree.left_child(u) == tskit.NULL + ) + # Choose an internal non-root node: parent != NULL, left_child != NULL + internal = next( + u + for u in tree.nodes() + if tree.parent(u) != tskit.NULL and tree.left_child(u) != tskit.NULL + ) + + v = next( + ts.variants(samples=[iso, root, leaf, internal], isolated_as_missing=True) + ) + # Only the isolated node should be marked missing; others should be ancestral (0) + np.testing.assert_array_equal( + v.genotypes, np.array([tskit.MISSING_DATA, 0, 0, 0], dtype=np.int32) + ) + assert v.has_missing_data + + +class TestInternalNode: + @tests.cached_example + def ts_single_tree(self): + return tskit.Tree.generate_balanced(3, span=10).tree_sequence + + @tests.cached_example + def ts_isolated_internal(self): + # Add a non-sample internal node u=6 (no edges), and add sites. + ts = self.ts_single_tree() + tables = ts.dump_tables() + u = tables.nodes.add_row(flags=0, time=1) + tables.sites.add_row(2, ancestral_state="A") + tables.sites.add_row(9, ancestral_state="T") + return tables.tree_sequence(), u + + def test_isolated_internal_node_all_missing(self): + ts, u = self.ts_isolated_internal() + # Both sites missing for internal node + V = list(ts.variants(samples=[u], isolated_as_missing=True)) + assert [v.genotypes[0] for v in V] == [tskit.MISSING_DATA] * len(V) + assert all(v.has_missing_data and v.alleles[-1] is None for v in V) + np.testing.assert_array_equal( + ts.genotype_matrix(samples=[u], isolated_as_missing=True).flatten(), + np.array([-1, -1], dtype=np.int32), + ) + assert list(ts.haplotypes(samples=[u], isolated_as_missing=True)) == ["NN"] + assert [ + v.genotypes[0] for v in ts.variants(samples=[u], isolated_as_missing=False) + ] == [0, 0] + + @tests.cached_example + def ts_dead_branch(self): + # Create a dead branch: two non-sample nodes x (root) -> y (leaf) over full span. + # This topology is not reachable from sample roots, but present in the + # tree arrays for the current tree. + # + # Dead branch (x to y) unattached to sample-reachable roots + # 2.00┊ x ┊ + # ┊ ┃ ┊ + # 1.00┊ y ┊ + # ┊ ┊ + # 0.00┊ 0 1 2 ┊ (main balanced tree; x/y not reachable) + # 0 10 + # Sites: pos 3 (anc=A), pos 7 (anc=C); mutation at pos 7 on x 'T' + ts = self.ts_single_tree() + tables = ts.dump_tables() + x = tables.nodes.add_row(flags=0, time=2) + y = tables.nodes.add_row(flags=0, time=1) + tables.edges.add_row(0, tables.sequence_length, parent=x, child=y) + # Sites to probe ancestral/derived states + tables.sites.add_row(3, ancestral_state="A") + s1 = tables.sites.add_row(7, ancestral_state="C") + # Mutation on x at s1 + tables.mutations.add_row(site=s1, node=x, derived_state="T") + tables.sort() + return tables.tree_sequence(), x, y + + def test_dead_branch_internal_and_leaf(self): + ts, x, y = self.ts_dead_branch() + # y is a leaf on a dead branch (no children, parent=x), + # so not isolated (parent != NULL). + Vy = list(ts.variants(samples=[y], isolated_as_missing=True)) + # y inherits ancestral at site 3 and derived at site 7 via parent x + assert [vy.alleles[vy.genotypes[0]] for vy in Vy] == ["A", "T"] + assert not any(vy.has_missing_data for vy in Vy) + # x is internal (has child), so not isolated. + # At site 7 it is mutated; site 3 ancestral. + Vx = list(ts.variants(samples=[x], isolated_as_missing=True)) + assert [vx.alleles[vx.genotypes[0]] for vx in Vx] == ["A", "T"] + assert not any(vx.has_missing_data for vx in Vx) + + @tests.cached_example + def ts_presence_switch(self): + # Construct two trees with a breakpoint at 5. Internal node a present on [0,5), + # absent on [5,10). + # + # Two trees; breakpoint at 5 + # Tree 0: [0,5) Tree 1: [5,10) + # 1.00┊ a 1.00┊ b + # 0.00┊ 0 (1 isolated) 0.00┊ 0 (1 isolated) + # 0 5 5 10 + # Sites: pos 2 (anc=A), pos 7 (anc=C) + # Mutation: pos 2 on a 'G' + tables = tskit.TableCollection(10) + s0 = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) + s1 = tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) # s1 (isolated) + a = tables.nodes.add_row(flags=0, time=1) + b = tables.nodes.add_row(flags=0, time=1) + # Left tree: a->s0 on [0,5); right tree: b->s0 on [5,10) + tables.edges.add_row(0, 5, parent=a, child=s0) + tables.edges.add_row(5, 10, parent=b, child=s0) + # s1 remains isolated across entire span + # Sites: one on each side of breakpoint + s_left = tables.sites.add_row(2, ancestral_state="A") + tables.sites.add_row(7, ancestral_state="C") + # Mutation on a at s_left so a has derived there + tables.mutations.add_row(site=s_left, node=a, derived_state="G") + tables.sort() + ts = tables.tree_sequence() + return ts, a, s1 + + def test_internal_presence_switch(self): + ts, a, _ = self.ts_presence_switch() + # Site at 2 (left): a present & derived; Site at 7 (right): a absent - isolated + V = list(ts.variants(samples=[a], isolated_as_missing=True)) + assert [v.genotypes[0] for v in V] == [1, tskit.MISSING_DATA] + assert [ + (v.alleles[v.genotypes[0]] if v.genotypes[0] != -1 else None) for v in V + ] == [ + "G", + None, + ] + # Imputed ancestral at right site + assert [ + vi.genotypes[0] + for vi in ts.variants(samples=[a], isolated_as_missing=False) + ] == [1, 0] + assert list(ts.haplotypes(samples=[a], isolated_as_missing=False)) == ["GC"] + assert list(ts.haplotypes(samples=[a], isolated_as_missing=True)) == ["GN"] + + def test_isolated_sample_mark_missing_any(self): + ts, _, s1 = self.ts_presence_switch() + V = list(ts.variants(samples=[s1], isolated_as_missing=True)) + assert len(V) == 2 + assert [v.genotypes[0] for v in V] == [tskit.MISSING_DATA] * len(V) + assert all(v.has_missing_data and v.alleles[-1] is None for v in V) + np.testing.assert_array_equal( + ts.genotype_matrix(samples=[s1], isolated_as_missing=True), + np.full((2, 1), tskit.MISSING_DATA, dtype=np.int32), + ) + + @tests.cached_example + def ts_mutation_overrides_missing(self): + # Similar to presence_switch, but add a mutation on a at the right-side site. + # Even though a is absent over [5,10), a mutation at the site (pos 7) makes + # the allele known (overrides missingness at the site). + ts, a, _ = self.ts_presence_switch() + tables = ts.dump_tables() + # Add a mutation on a at the right-side site (position 7) + # Find the site id for position 7 + pos_to_id = {s.position: s.id for s in ts.sites()} + s_right = pos_to_id[7.0] + tables.mutations.add_row(site=s_right, node=a, derived_state="T") + tables.sort() + return tables.tree_sequence(), a + + def test_mutation_overrides_missing(self): + ts, a = self.ts_mutation_overrides_missing() + # Now both sites have derived alleles for a, + # even though a is absent on the right interval. + V = list(ts.variants(samples=[a], isolated_as_missing=True)) + assert [v.alleles[v.genotypes[0]] for v in V] == ["G", "T"] + assert not any(v.has_missing_data for v in V) + + def test_mixed_samples_ordering(self): + # Combine internal and sample nodes; ensure order and mapping preserved + ts, a, _ = self.ts_presence_switch() + samples = [a] + list(ts.samples()) # [internal, sample0, sample1] + # Build expected by joining per-node results + rows = [] + for u in samples: + row = [ + v.genotypes[0] + for v in ts.variants(samples=[u], isolated_as_missing=True) + ] + rows.append(row) + # Single call with mixed samples should match stacked rows + # Build matrix via variants over mixed sample list + G_rows = [] + for v in ts.variants(samples=samples, isolated_as_missing=True): + G_rows.append(v.genotypes.tolist()) + # Compare + assert G_rows == [list(col) for col in zip(*rows)] + + def test_variants_copy_false_internal(self): + ts, a, _ = self.ts_presence_switch() + it = ts.variants(samples=[a], isolated_as_missing=True, copy=False) + v1 = next(it) + # Hold onto v1, decode next site updates same object + val1 = (v1.site.position, v1.genotypes.copy()) + v2 = next(it) + assert v1 is v2 + # v1 now reflects second site + assert v1.site.position != val1[0] + + class TestAlignmentsErrors: @tests.cached_example def simplest_ts(self): diff --git a/python/tskit/genotypes.py b/python/tskit/genotypes.py index 83d7b50163..1c553830d0 100644 --- a/python/tskit/genotypes.py +++ b/python/tskit/genotypes.py @@ -38,12 +38,13 @@ class Variant: """ A variant in a tree sequence, describing the observed genetic variation - among samples for a given site. A variant consists of (a) a tuple of - **alleles** listing the potential allelic states which samples at this site - can possess; (b) an array of **genotypes** mapping sample IDs to the observed - alleles (c) a reference to the :class:`Site` at which the Variant has been decoded - and (d) an array of **samples** giving the node id to which the each element of - the genotypes array corresponds. + among the specified nodes (by default, the sample nodes) for a given site. + A variant consists of (a) a tuple of **alleles** listing the potential + allelic states which the requested nodes at this site can possess; (b) an + array of **genotypes** mapping node IDs to the observed alleles; (c) a + reference to the :class:`Site` at which the Variant has been decoded; and + (d) an array of **samples** giving the node ID to which each element of the + genotypes array corresponds. After creation a Variant is not yet decoded, and has no genotypes. To decode a Variant, call the :meth:`decode` method. The Variant class will then @@ -72,12 +73,13 @@ class Variant: In this case, there is no indication of which allele is the ancestral state, as the ordering is determined by the user. - The ``genotypes`` represent the observed allelic states for each sample, - such that ``var.alleles[var.genotypes[j]]`` gives the string allele - for sample ID ``j``. Thus, the elements of the genotypes array are + The ``genotypes`` represent the observed allelic states for each requested + node, such that ``var.alleles[var.genotypes[j]]`` gives the string allele + for the node at index ``j`` (i.e., for ``variant.samples[j]``). Thus, the + elements of the genotypes array are indexes into the ``alleles`` list. The genotypes are provided in this way via a numpy numeric array to enable efficient calculations. To obtain a - (less efficient) array of allele strings for each sample, you can use e.g. + (less efficient) array of allele strings for each node, you can use e.g. ``np.asarray(variant.alleles)[variant.genotypes]``. When :ref:`missing data` is present at a given @@ -95,10 +97,11 @@ class Variant: :param TreeSequence tree_sequence: The tree sequence to which this variant belongs. :param array_like samples: An array of node IDs for which to generate - genotypes, or None for all sample nodes. Default: None. + genotypes, or ``None`` for all sample nodes. Non-sample nodes may also + be provided to generate genotypes for internal nodes. Default: ``None``. :param bool isolated_as_missing: If True, the genotype value assigned to - missing samples (i.e., isolated samples without mutations) is - :data:`.MISSING_DATA` (-1). If False, missing samples will be + isolated nodes without mutations (samples or non-samples) is + :data:`.MISSING_DATA` (-1). If False, such nodes will be assigned the allele index for the ancestral state. Default: True. :param tuple alleles: A tuple of strings defining the encoding of @@ -143,7 +146,7 @@ def site(self) -> trees.Site: @property def alleles(self) -> tuple[str | None, ...]: """ - A tuple of the allelic values which samples can possess at the current + A tuple of the allelic values which nodes can possess at the current site. Unless an encoding of alleles is specified when creating this variant instance, the first element of this tuple is always the site's ancestral state. @@ -162,7 +165,7 @@ def samples(self) -> np.ndarray: def genotypes(self) -> np.ndarray: """ An array of indexes into the list ``alleles``, giving the - state of each sample at the current site. + state of each requested node at the current site. """ self._check_decoded() return self._ll_variant.genotypes @@ -170,8 +173,8 @@ def genotypes(self) -> np.ndarray: @property def isolated_as_missing(self) -> bool: """ - True if isolated samples are decoded to missing data. If False, isolated - samples are decoded to the ancestral state. + True if isolated nodes are decoded to missing data. If False, isolated + nodes are decoded to the ancestral state. """ return self._ll_variant.isolated_as_missing @@ -179,7 +182,7 @@ def isolated_as_missing(self) -> bool: def has_missing_data(self) -> bool: """ True if there is missing data for any of the - samples at the current site. + requested nodes at the current site. """ alleles = self._ll_variant.alleles return len(alleles) > 0 and alleles[-1] is None @@ -187,7 +190,7 @@ def has_missing_data(self) -> bool: @property def num_missing(self) -> int: """ - The number of samples with missing data at this site. + The number of requested nodes with missing data at this site. """ return np.sum(self.genotypes == tskit.NULL) @@ -199,7 +202,7 @@ def num_alleles(self) -> int: array: firstly missing data is not counted as an allele, and secondly, the site may contain mutations to alternative allele states (which are counted in the number of alleles) without the mutation being inherited - by any of the samples. + by any of the requested nodes. """ return len(self.alleles) - self.has_missing_data diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 0b933e3b26..760f7055f6 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5326,13 +5326,13 @@ def haplotypes( Returns an iterator over the strings of haplotypes that result from the trees and mutations in this tree sequence. Each haplotype string is guaranteed to be of the same length. A tree sequence with - :math:`n` samples and with :math:`s` sites lying between ``left`` and - ``right`` will return a total of :math:`n` - strings of :math:`s` alleles concatenated together, where an allele + :math:`n` requested nodes (default: the number of sample nodes) and with + :math:`s` sites lying between ``left`` and ``right`` will return a total + of :math:`n` strings of :math:`s` alleles concatenated together, where an allele consists of a single ascii character (tree sequences that include alleles which are not a single character in length, or where the character is non-ascii, will raise an error). The first string returned is the - haplotype for the first requested sample, and so on. + haplotype for the first requested node, and so on. The alleles at each site must be represented by single byte characters, (i.e., variants must be single nucleotide polymorphisms, or SNPs), hence @@ -5341,8 +5341,8 @@ def haplotypes( haplotype ``h``, the value of ``h[j]`` will therefore be the observed allelic state at site ``j``. - If ``isolated_as_missing`` is True (the default), isolated samples without - mutations directly above them will be treated as + If ``isolated_as_missing`` is True (the default), isolated nodes without + mutations directly above them (whether samples or non-samples) will be treated as :ref:`missing data` and will be represented in the string by the ``missing_data_character``. If instead it is set to False, missing data will be assigned the ancestral state @@ -5351,8 +5351,10 @@ def haplotypes( behaviour in versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data` argument controlled this behaviour. + It is also possible to provide **non-sample** nodes via the ``samples`` + argument if you wish to output haplotypes for (e.g.) internal nodes. See also the :meth:`.variants` iterator for site-centric access - to sample genotypes. + to genotypes for the requested nodes. .. warning:: For large datasets, this method can consume a **very large** amount of @@ -5370,9 +5372,10 @@ def haplotypes( be used to represent missing data. If any normal allele contains this character, an error is raised. Default: 'N'. - :param list[int] samples: The samples for which to output haplotypes. If - ``None`` (default), return haplotypes for all the samples in the tree - sequence, in the order given by the :meth:`.samples` method. + :param list[int] samples: The node IDs for which to output haplotypes. If + ``None`` (default), return haplotypes for all the sample nodes in the tree + sequence, in the order given by the :meth:`.samples` method. Non-sample + nodes may also be provided. :param int left: Haplotype strings will start with the first site at or after this genomic position. If ``None`` (default) start at the first site. :param int right: Haplotype strings will end with the last site before this @@ -5443,9 +5446,13 @@ def variants( generated; output order of genotypes in the returned variants corresponds to the order of the samples in this list. It is also possible to provide **non-sample** nodes as an argument here, if you - wish to generate genotypes for (e.g.) internal nodes. However, - ``isolated_as_missing`` must be False in this case, as it is not - possible to detect missing data for non-sample nodes. + wish to generate genotypes for (e.g.) internal nodes. Missingness is + detected for any requested node (sample or non-sample) when + ``isolated_as_missing`` is True: if a node is isolated at a site (i.e., + has no parent and no children in the marginal tree) and has no mutation + above it at that site, its genotype will be reported as + :data:`MISSING_DATA` (-1). If ``isolated_as_missing`` is False, such + nodes are assigned the site's ancestral allele index. If isolated samples are present at a given site without mutations above them, they are interpreted by default as @@ -5535,19 +5542,23 @@ def genotype_matrix( """ Returns an :math:`m \\times n` numpy array of the genotypes in this tree sequence, where :math:`m` is the number of sites and :math:`n` - the number of samples. The genotypes are the indexes into the array - of ``alleles``, as described for the :class:`Variant` class. - - If isolated samples are present at a given site without mutations above them, - they will be interpreted as :ref:`missing data` - the genotypes array will contain a special value :data:`MISSING_DATA` - (-1) to identify these missing samples. - - Such samples are treated as missing data by default, but if - ``isolated_as_missing`` is set to to False, they will not be treated as missing, - and so assigned the ancestral state. This was the default behaviour in - versions prior to 0.2.0. Prior to 0.3.0 the `impute_missing_data` - argument controlled this behaviour. + is the number of requested nodes (default: the number of sample nodes). + The genotypes are the indexes into the array of ``alleles``, as + described for the :class:`Variant` class. + + It is possible to provide **non-sample** nodes via the ``samples`` + argument if you wish to generate genotypes for (e.g.) internal nodes. + Missingness is detected for any requested node (sample or non-sample) + when ``isolated_as_missing`` is True: if a node is isolated at a site + (i.e., has no parent and no children in the marginal tree) and has no + mutation above it at that site, its genotype will be reported as + :data:`MISSING_DATA` (-1). + + Such nodes are treated as missing data by default. If + ``isolated_as_missing`` is set to False, they will not be treated as + missing, and will instead be assigned the ancestral state. This was the + default behaviour in versions prior to 0.2.0. Prior to 0.3.0 the + ``impute_missing_data`` argument controlled this behaviour. .. warning:: This method can consume a **very large** amount of memory! If @@ -5555,10 +5566,12 @@ def genotype_matrix( access them sequentially using the :meth:`.variants` iterator. :param array_like samples: An array of node IDs for which to generate - genotypes, or None for all sample nodes. Default: None. + genotypes. If ``None`` (default), generate genotypes for all sample + nodes. Non-sample nodes may also be provided, in which case genotypes + will be generated for those nodes too. :param bool isolated_as_missing: If True, the genotype value assigned to - missing samples (i.e., isolated samples without mutations) is - :data:`.MISSING_DATA` (-1). If False, missing samples will be + isolated nodes without mutations (samples or non-samples) is + :data:`.MISSING_DATA` (-1). If False, such nodes will be assigned the allele index for the ancestral state. Default: True. :param tuple alleles: A tuple of strings describing the encoding of @@ -5689,7 +5702,8 @@ def alignments( Default: 'N'. :param list[int] samples: The samples for which to output alignments. If ``None`` (default), return alignments for all the samples in the tree - sequence, in the order given by the :meth:`.samples` method. + sequence, in the order given by the :meth:`.samples` method. Only + sample nodes are supported for alignments. :param int left: Alignments will start at this genomic position. If ``None`` (default) alignments start at 0. :param int right: Alignments will stop before this genomic position. If ``None`` @@ -5709,6 +5723,20 @@ def alignments( "N" if missing_data_character is None else missing_data_character ) + # Temporary check until alignments support missing data properly + if samples is not None: + requested = list(samples) + n = self.num_nodes + flags = self.nodes_flags + if any( + (u < 0) or (u >= n) or ((flags[u] & NODE_IS_SAMPLE) == 0) + for u in requested + ): + raise tskit.LibraryError( + "Cannot generate genotypes for non-samples when isolated nodes " + "are considered as missing. (TSK_ERR_MUST_IMPUTE_NON_SAMPLES)" + ) + L = interval.span a = np.empty(L, dtype=np.int8) if reference_sequence is None: diff --git a/python/tskit/util.py b/python/tskit/util.py index 5164800b41..3727a40191 100644 --- a/python/tskit/util.py +++ b/python/tskit/util.py @@ -771,13 +771,14 @@ def variant_html(variant): + f""" Site Id{format_number(site_id)} Site Position{format_number(site_position)} - Number of Samples{format_number(num_samples)} + Number of Nodes{format_number(num_samples)} Number of Alleles{format_number(num_alleles)} """ + "\n".join( [ - f"""Samples with Allele {'missing' if k is None - else "'" + k + "'"}""" + "Nodes with Allele " + + ("missing" if k is None else "'" + k + "'") + + "" + f"{format_number(counts[k])}" + " " + f"({format_number(freqs[k] * 100, 2)}%)"