Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 93 additions & 3 deletions c/tests/test_genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -512,10 +512,45 @@ test_single_tree_non_samples(void)

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
/* It's an error to hand in non-samples without imputation turned on */
ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_vargen_print_state(&vargen, _devnull);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 0);
CU_ASSERT_EQUAL(var->site.mutations_length, 1);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->genotypes[0], 1);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 1);
CU_ASSERT_EQUAL(var->site.mutations_length, 2);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
CU_ASSERT_EQUAL(var->genotypes[1], 0);
CU_ASSERT_EQUAL(var->num_alleles, 2);
CU_ASSERT_NSTRING_EQUAL(var->alleles[0], "0", 1);
CU_ASSERT_NSTRING_EQUAL(var->alleles[1], "1", 1);
CU_ASSERT_EQUAL(var->site.id, 2);
CU_ASSERT_EQUAL(var->site.mutations_length, 4);

ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_free(&vargen);
CU_ASSERT_EQUAL_FATAL(ret, 0);

ret = tsk_vargen_init(&vargen, &ts, samples, 2, NULL, TSK_ISOLATED_NOT_MISSING);
CU_ASSERT_EQUAL_FATAL(ret, 0);
Expand Down Expand Up @@ -559,6 +594,60 @@ test_single_tree_non_samples(void)
tsk_treeseq_free(&ts);
}

static void
test_isolated_internal_node(void)
{
int ret = 0;
tsk_treeseq_t ts;
tsk_vargen_t vargen;
tsk_variant_t *var;
/* Two sample nodes (0,1), plus an internal non-sample node u=2 with no edges */
const char *nodes = "1 0 -1 -1\n"
"1 0 -1 -1\n"
"0 1 -1 -1\n";
const char *sites = "2.0 A\n"
"9.0 T\n";
tsk_id_t samples[] = { 2 };

tsk_treeseq_from_text(&ts, 10, nodes, "", NULL, sites, NULL, NULL, NULL, 0);
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_nodes(&ts), 3);
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_samples(&ts), 2);
CU_ASSERT_EQUAL_FATAL(tsk_treeseq_get_num_sites(&ts), 2);

/* Default options (isolated_as_missing=True): internal node is isolated everywhere
*/
ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_TRUE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_TRUE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], TSK_MISSING_DATA);
ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_vargen_free(&vargen);

/* Impute missing (isolated_as_missing=False): genotypes should be ancestral (0) */
ret = tsk_vargen_init(&vargen, &ts, samples, 1, NULL, TSK_ISOLATED_NOT_MISSING);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 1);
CU_ASSERT_FALSE(var->has_missing_data);
CU_ASSERT_EQUAL(var->genotypes[0], 0);
ret = tsk_vargen_next(&vargen, &var);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_vargen_free(&vargen);

tsk_treeseq_free(&ts);
}

static void
test_single_tree_errors(void)
{
Expand Down Expand Up @@ -1123,6 +1212,7 @@ main(int argc, char **argv)
{ "test_single_tree_char_alphabet", test_single_tree_char_alphabet },
{ "test_single_tree_binary_alphabet", test_single_tree_binary_alphabet },
{ "test_single_tree_non_samples", test_single_tree_non_samples },
{ "test_isolated_internal_node", test_isolated_internal_node },
{ "test_single_tree_errors", test_single_tree_errors },
{ "test_single_tree_user_alleles_errors", test_single_tree_user_alleles_errors },
{ "test_single_tree_subsample", test_single_tree_subsample },
Expand Down
34 changes: 26 additions & 8 deletions c/tskit/genotypes.c
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,10 @@ tsk_variant_copy_alleles(tsk_variant_t *self, const char **alleles)
static int
variant_init_samples_and_index_map(tsk_variant_t *self,
const tsk_treeseq_t *tree_sequence, const tsk_id_t *samples, tsk_size_t num_samples,
size_t num_samples_alloc, tsk_flags_t options)
size_t num_samples_alloc, tsk_flags_t TSK_UNUSED(options))
{
int ret = 0;
const tsk_flags_t *flags = tree_sequence->tables->nodes.flags;
tsk_size_t j, num_nodes;
bool impute_missing = !!(options & TSK_ISOLATED_NOT_MISSING);
tsk_id_t u;

num_nodes = tsk_treeseq_get_num_nodes(tree_sequence);
Expand All @@ -120,11 +118,6 @@ variant_init_samples_and_index_map(tsk_variant_t *self,
ret = tsk_trace_error(TSK_ERR_DUPLICATE_SAMPLE);
goto out;
}
/* We can only detect missing data for samples */
if (!impute_missing && !(flags[u] & TSK_NODE_IS_SAMPLE)) {
ret = tsk_trace_error(TSK_ERR_MUST_IMPUTE_NON_SAMPLES);
goto out;
}
self->alt_sample_index_map[samples[j]] = (tsk_id_t) j;
}
out:
Expand Down Expand Up @@ -439,6 +432,27 @@ tsk_variant_mark_missing(tsk_variant_t *self)
return num_missing;
}

/* Mark missing for any requested node (sample or non-sample) that is isolated
* in the current tree, i.e., has no parent and no children at this position. */
static tsk_size_t
tsk_variant_mark_missing_any(tsk_variant_t *self)
{
tsk_size_t num_missing = 0;
int32_t *restrict genotypes = self->genotypes;
const tsk_id_t *restrict parent = self->tree.parent;
const tsk_id_t *restrict left_child = self->tree.left_child;
tsk_size_t j;

for (j = 0; j < self->num_samples; j++) {
tsk_id_t u = self->samples[j];
if (parent[u] == TSK_NULL && left_child[u] == TSK_NULL) {
genotypes[j] = TSK_MISSING_DATA;
num_missing++;
}
}
return num_missing;
}

static tsk_id_t
tsk_variant_get_allele_index(tsk_variant_t *self, const char *allele, tsk_size_t length)
{
Expand Down Expand Up @@ -502,6 +516,10 @@ tsk_variant_decode(
update_genotypes = tsk_variant_update_genotypes_sample_list;
if (by_traversal) {
update_genotypes = tsk_variant_update_genotypes_traversal;
/* When decoding a user-provided list of nodes (which may include
* non-samples), mark isolated nodes as missing directly by checking
* isolation status for each requested node. */
mark_missing = tsk_variant_mark_missing_any;
}

if (self->user_alleles) {
Expand Down
Loading