diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 2a128f24c0..99c0b1e7b5 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -1,8 +1,12 @@ -------------------- -[1.2.0] - 2022-XX-XX +[1.1.1] - 2022-XX-XX -------------------- +**Bug fixes** +- Fix segfault in tsk_variant_restricted_copy in tree sequences with large + numbers of alleles or very long alleles + (:user:`jeromekelleher`, :pr:`2437`, :issue:`2429`). -------------------- [1.1.0] - 2022-07-14 diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index 4dd5a80aa4..fd597439e0 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -958,53 +958,154 @@ test_variant_decode_errors(void) tsk_treeseq_free(&ts); } +/* Checks that the data represented by the specified pair of variants exposed + * by the public API is equal. */ +static void +assert_variants_equal(const tsk_variant_t *v1, const tsk_variant_t *v2) +{ + tsk_size_t j; + + CU_ASSERT_EQUAL(v1->num_samples, v2->num_samples); + CU_ASSERT_EQUAL(v1->num_alleles, v2->num_alleles); + for (j = 0; j < v1->num_alleles; j++) { + CU_ASSERT_EQUAL(v1->allele_lengths[j], v2->allele_lengths[j]); + CU_ASSERT_EQUAL( + 0, memcmp(v1->alleles[j], v2->alleles[j], (size_t) v1->allele_lengths[j])); + } + CU_ASSERT_EQUAL(v1->has_missing_data, v2->has_missing_data); + CU_ASSERT_EQUAL(v1->num_samples, v2->num_samples); + for (j = 0; j < v1->num_samples; j++) { + CU_ASSERT_EQUAL(v1->samples[j], v2->samples[j]); + CU_ASSERT_EQUAL(v1->genotypes[j], v2->genotypes[j]); + } + CU_ASSERT_EQUAL(v1->site.id, v2->site.id); + CU_ASSERT_EQUAL(v1->site.position, v2->site.position); + CU_ASSERT_EQUAL(v1->site.ancestral_state_length, v2->site.ancestral_state_length); + CU_ASSERT_EQUAL(0, memcmp(v1->site.ancestral_state, v2->site.ancestral_state, + (size_t) v1->site.ancestral_state_length)); + CU_ASSERT_EQUAL(v1->site.mutations_length, v2->site.mutations_length); + /* We're pointing back to the same memory for embedded pointers */ + CU_ASSERT_EQUAL(v1->site.mutations, v2->site.mutations); + CU_ASSERT_EQUAL(v1->site.metadata, v2->site.metadata); +} + static void test_variant_copy(void) { int ret = 0; tsk_size_t j; tsk_treeseq_t ts; - tsk_variant_t var; - tsk_variant_t var_copy; + tsk_variant_t var, var_copy; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_variant_decode(&var, 0, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_variant_restricted_copy(&var, &var_copy); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_variant_decode(&var_copy, 0, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); - CU_ASSERT_EQUAL( - 0, memcmp(var.tree_sequence, var.tree_sequence, sizeof(*var.tree_sequence))); - CU_ASSERT_EQUAL(0, memcmp(&var.tree, &var_copy.tree, sizeof(tsk_tree_t))); - CU_ASSERT_EQUAL(0, memcmp(&var.site, &var_copy.site, sizeof(tsk_site_t))); - CU_ASSERT_EQUAL(0, memcmp(var.genotypes, var_copy.genotypes, - sizeof(int32_t) * (size_t) var.num_samples)); - CU_ASSERT_EQUAL(var.num_alleles, var_copy.num_alleles); - CU_ASSERT_EQUAL( - 0, memcmp(var.allele_lengths, var_copy.allele_lengths, sizeof(tsk_size_t))); - for (j = 0; j < var.num_alleles; j++) { - CU_ASSERT_EQUAL(0, - memcmp(var.alleles[j], var_copy.alleles[j], (size_t) var.allele_lengths[j])); + for (j = 0; j < tsk_treeseq_get_num_sites(&ts); j++) { + ret = tsk_variant_decode(&var, (tsk_id_t) j, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_restricted_copy(&var, &var_copy); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_decode(&var_copy, 0, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); + + assert_variants_equal(&var, &var_copy); + CU_ASSERT_EQUAL( + 0, memcmp(var.tree_sequence, var.tree_sequence, sizeof(*var.tree_sequence))); + CU_ASSERT_EQUAL(0, memcmp(&var.tree, &var_copy.tree, sizeof(tsk_tree_t))); + CU_ASSERT_EQUAL(0, memcmp(&var.site, &var_copy.site, sizeof(tsk_site_t))); + CU_ASSERT_EQUAL(var_copy.traversal_stack, NULL); + CU_ASSERT_EQUAL(var_copy.sample_index_map, NULL); + CU_ASSERT_EQUAL(var_copy.alt_samples, NULL); + CU_ASSERT_EQUAL(var_copy.alt_sample_index_map, NULL); + tsk_variant_free(&var_copy); } - CU_ASSERT_EQUAL(var.has_missing_data, var_copy.has_missing_data); - CU_ASSERT_EQUAL(var.num_alleles, var_copy.num_alleles); - CU_ASSERT_EQUAL(var.num_samples, var_copy.num_samples); - CU_ASSERT_EQUAL(0, memcmp(var.samples, var_copy.samples, - sizeof(*var.samples) * (size_t) var.num_samples)); - CU_ASSERT_EQUAL(var_copy.traversal_stack, NULL); - CU_ASSERT_EQUAL(var_copy.sample_index_map, NULL); - CU_ASSERT_EQUAL(var_copy.alt_samples, NULL); - CU_ASSERT_EQUAL(var_copy.alt_sample_index_map, NULL); + tsk_variant_free(&var); + tsk_treeseq_free(&ts); +} + +static void +test_variant_copy_long_alleles(void) +{ + int ret = 0; + const char *sites = "0.0 GGGG\n" + "0.125 AAAAA\n" + "0.25 CCCCCC\n" + "0.5 AAAAAAA\n"; + const char *mutations = "0 0 TTT -1\n" + "1 1 CCCCCCC -1\n" + "2 0 GGGGGGG -1\n" + "2 1 AG -1\n" + "2 2 TTTTTTT -1\n" + "3 4 TGGGGGG -1\n" + "3 0 AAA 5\n"; + tsk_treeseq_t ts; + tsk_variant_t var, copy, copy_of_copy; + tsk_size_t j; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + sites, mutations, NULL, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_init(&var, &ts, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + for (j = 0; j < tsk_treeseq_get_num_sites(&ts); j++) { + ret = tsk_variant_decode(&var, (tsk_id_t) j, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_restricted_copy(&var, ©); + CU_ASSERT_EQUAL_FATAL(ret, 0); + assert_variants_equal(&var, ©); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_restricted_copy(©, ©_of_copy); + CU_ASSERT_EQUAL_FATAL(ret, 0); + assert_variants_equal(&var, ©_of_copy); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_variant_free(©_of_copy); + tsk_variant_free(©); + } tsk_variant_free(&var); - tsk_variant_free(&var_copy); + tsk_treeseq_free(&ts); +} + +static void +test_variant_copy_memory_management(void) +{ + int ret = 0; + tsk_size_t j; + tsk_treeseq_t ts; + tsk_variant_t *var; + tsk_variant_t copy, copy_of_copy; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL, 0); + + for (j = 0; j < tsk_treeseq_get_num_sites(&ts); j++) { + var = tsk_malloc(sizeof(*var)); + CU_ASSERT_FATAL(var != NULL); + ret = tsk_variant_init(var, &ts, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_decode(var, (tsk_id_t) j, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_variant_restricted_copy(var, ©); + CU_ASSERT_EQUAL_FATAL(ret, 0); + assert_variants_equal(var, ©); + /* Free var to make sure we're not pointing to any of the original memory. */ + tsk_variant_free(var); + free(var); + ret = tsk_variant_restricted_copy(©, ©_of_copy); + CU_ASSERT_EQUAL_FATAL(ret, 0); + assert_variants_equal(©, ©_of_copy); + ret = tsk_variant_decode(©, 0, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); + ret = tsk_variant_decode(©_of_copy, 0, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_VARIANT_CANT_DECODE_COPY); + + tsk_variant_free(©); + tsk_variant_free(©_of_copy); + } tsk_treeseq_free(&ts); } @@ -1030,6 +1131,8 @@ main(int argc, char **argv) { "test_multiple_variant_decode", test_multiple_variant_decode }, { "test_variant_decode_errors", test_variant_decode_errors }, { "test_variant_copy", test_variant_copy }, + { "test_variant_copy_long_alleles", test_variant_copy_long_alleles }, + { "test_variant_copy_memory_management", test_variant_copy_memory_management }, { NULL, NULL }, }; diff --git a/c/tskit/genotypes.c b/c/tskit/genotypes.c index 320e3bca8e..d4d1ecb08a 100644 --- a/c/tskit/genotypes.c +++ b/c/tskit/genotypes.c @@ -463,7 +463,6 @@ tsk_variant_decode( tsk_id_t allele_index; tsk_size_t j, num_missing; int no_longer_missing; - tsk_mutation_t mutation; bool impute_missing = !!(self->options & TSK_ISOLATED_NOT_MISSING); bool by_traversal = self->alt_samples != NULL; @@ -582,11 +581,11 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) tsk_size_t total_len, offset, j; /* Copy everything */ - tsk_memcpy(other, self, sizeof(tsk_variant_t)); + tsk_memcpy(other, self, sizeof(*other)); /* Tree sequence left as NULL and zero'd tree is a way of indicating this variant is * fixed and cannot be further decoded. */ other->tree_sequence = NULL; - tsk_memset(&other->tree, sizeof(tsk_tree_t), 0); + tsk_memset(&other->tree, sizeof(other->tree), 0); other->traversal_stack = NULL; other->samples = NULL; other->sample_index_map = NULL; @@ -598,9 +597,9 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) for (j = 0; j < self->num_alleles; j++) { total_len += self->allele_lengths[j]; } - other->samples = tsk_malloc(other->num_samples * sizeof(tsk_id_t)); - other->genotypes = tsk_malloc(other->num_samples * sizeof(int32_t)); - other->user_alleles_mem = tsk_malloc(total_len * sizeof(char *)); + other->samples = tsk_malloc(other->num_samples * sizeof(*other->samples)); + other->genotypes = tsk_malloc(other->num_samples * sizeof(*other->genotypes)); + other->user_alleles_mem = tsk_malloc(total_len * sizeof(*other->user_alleles_mem)); other->allele_lengths = tsk_malloc(other->num_alleles * sizeof(*other->allele_lengths)); other->alleles = tsk_malloc(other->num_alleles * sizeof(*other->alleles)); @@ -612,13 +611,14 @@ tsk_variant_restricted_copy(const tsk_variant_t *self, tsk_variant_t *other) } tsk_memcpy( other->samples, self->samples, other->num_samples * sizeof(*other->samples)); - tsk_memcpy(other->genotypes, self->genotypes, other->num_samples * sizeof(int32_t)); + tsk_memcpy(other->genotypes, self->genotypes, + other->num_samples * sizeof(*other->genotypes)); tsk_memcpy(other->allele_lengths, self->allele_lengths, other->num_alleles * sizeof(*other->allele_lengths)); offset = 0; for (j = 0; j < other->num_alleles; j++) { tsk_memcpy(other->user_alleles_mem + offset, self->alleles[j], - other->allele_lengths[j] * sizeof(char *)); + other->allele_lengths[j] * sizeof(*other->user_alleles_mem)); other->alleles[j] = other->user_alleles_mem + offset; offset += other->allele_lengths[j]; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index ebc3e5c987..fa4853be36 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -4,6 +4,10 @@ **Fixes** +- Iterating over ``ts.variants()`` could cause a segfault in tree sequences + with large numbers of alleles or very long alleles + (:user:`jeromekelleher`, :pr:`2437`, :issue:`2429`). + - Various memory leaks fixed (:user:`jeromekelleher`, :pr:`2424`, :issue:`2423`, :issue:`2427`). @@ -24,7 +28,6 @@ - Variant objects now have a ``.num_missing`` attribute and ``.counts()`` and ``.frequencies`` methods (:user:`hyanwong`, :issue:`2390` :pr:`2393`) - - Add the `Tree.num_lineages(t)` method to return the number of lineages present at time t in the tree (:user:`jeromekelleher`, :issue:`386`, :pr:`2422`) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 1c76b20770..4ec3268473 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -2502,7 +2502,6 @@ def test_copy(self, isolated_as_missing, samples, alleles): variant2 = variant.restricted_copy() assert variant.site_id == variant2.site_id assert variant.alleles == variant2.alleles - print(variant.alleles) assert np.array_equal(variant.genotypes, variant2.genotypes) assert np.array_equal(variant.samples, variant2.samples) assert variant.isolated_as_missing == variant2.isolated_as_missing