From 95614151cada8168f02084c6e434540e50c8ba87 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 8 Jul 2020 14:44:43 +0100 Subject: [PATCH 1/4] Refactor table collection integrity checks and document. Closes #649 Closes #592 --- c/tests/test_tables.c | 102 ++++++++-- c/tests/test_trees.c | 19 +- c/tskit/tables.c | 451 ++++++++++++++++++++++++------------------ c/tskit/tables.h | 67 +++++-- 4 files changed, 410 insertions(+), 229 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 6b7cb17745..aae84e2334 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019 Tskit Developers + * Copyright (c) 2019-2020 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -3233,8 +3233,7 @@ test_dump_load_unsorted_with_options(tsk_flags_t tc_options) CU_ASSERT_EQUAL_FATAL(ret, 3); /* Verify that it's unsorted */ - ret = tsk_table_collection_check_integrity( - &t1, TSK_CHECK_OFFSETS | TSK_CHECK_EDGE_ORDERING); + ret = tsk_table_collection_check_integrity(&t1, TSK_CHECK_EDGE_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME); /* Indexing should fail */ @@ -3337,8 +3336,7 @@ test_dump_fail_no_file(void) CU_ASSERT_EQUAL_FATAL(ret, 3); /* Verify that it's unsorted */ - ret = tsk_table_collection_check_integrity( - &t1, TSK_CHECK_OFFSETS | TSK_CHECK_EDGE_ORDERING); + ret = tsk_table_collection_check_integrity(&t1, TSK_CHECK_EDGE_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME); /* Make sure the file doesn't exist beforehand. */ @@ -3690,7 +3688,7 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 0, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_table_collection_check_integrity(&tables, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_EQUAL); ret = tsk_mutation_table_clear(&tables.mutations); @@ -3704,31 +3702,31 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_AFTER_CHILD); - ret = tsk_table_collection_check_integrity( - &tables, TSK_CHECK_MUTATION_ORDERING | TSK_NO_CHECK_MUTATION_PARENTS); + ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 1, TSK_NULL, 0, NULL, 0, NULL, 0); + &tables.mutations, 0, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 1, 1, 0, 0, NULL, 0, NULL, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 1, 1, 0, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE); - ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_MUTATION_PARENTS); + ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 1, 1, TSK_NULL, 0, NULL, 0, NULL, 0); + &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 1, TSK_NULL, 0, NULL, 0, NULL, 0); + &tables.mutations, 0, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -3742,8 +3740,28 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Add a chain of unknown times in between a mutation and one with a + * younger time. */ + ret = tsk_mutation_table_clear(&tables.mutations); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, 1.0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, 0, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 1); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, 1, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 2); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 2.0, NULL, 0, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 3); + ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -3751,8 +3769,6 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) &tables.mutations, 0, 0, TSK_NULL, NAN, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_mutation_table_clear(&tables.mutations); @@ -3761,8 +3777,6 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) &tables.mutations, 0, 0, TSK_NULL, INFINITY, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_mutation_table_clear(&tables.mutations); @@ -3772,7 +3786,7 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE); ret = tsk_mutation_table_clear(&tables.mutations); @@ -3784,7 +3798,7 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION); /* Note that we do not test for TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE as @@ -3863,6 +3877,52 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) tsk_table_collection_free(&tables); } + +static void +test_table_collection_check_integrity_no_populations(void) +{ + int ret; + tsk_treeseq_t ts; + tsk_table_collection_t tables; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL); + ret = tsk_treeseq_copy_tables(&ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Add in some bad population references and check that we can use + * TSK_NO_CHECK_POPULATION_REFS with TSK_CHECK_ALL */ + tables.nodes.population[0] = 10; + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); + ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_POPULATION_REFS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, + TSK_CHECK_ALL | TSK_NO_CHECK_POPULATION_REFS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.nodes.population[0] = TSK_NULL; + + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_migration_table_add_row( + &tables.migrations, 0.4, 0.5, 1, 0, 1, 1.5, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); + ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_POPULATION_REFS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, + TSK_CHECK_ALL|TSK_NO_CHECK_POPULATION_REFS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); +} + static void test_table_collection_check_integrity(void) { @@ -4069,6 +4129,8 @@ main(int argc, char **argv) { "test_column_overflow", test_column_overflow }, { "test_table_collection_check_integrity", test_table_collection_check_integrity }, + { "test_table_collection_check_integrity_no_populations", + test_table_collection_check_integrity_no_populations }, { "test_table_collection_subset", test_table_collection_subset }, { "test_table_collection_subset_errors", test_table_collection_subset_errors }, { NULL, NULL }, diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 7b5f288974..579864c6a7 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -132,6 +132,7 @@ static void verify_compute_mutation_times(tsk_treeseq_t *ts) { int ret; + size_t j; size_t size = tsk_treeseq_get_num_mutations(ts) * sizeof(tsk_id_t); tsk_id_t *time = malloc(size); tsk_table_collection_t tables; @@ -140,8 +141,10 @@ verify_compute_mutation_times(tsk_treeseq_t *ts) ret = tsk_treeseq_copy_tables(ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); memcpy(time, tables.mutations.time, size); - /* Make sure the tables are actually updated */ - memset(tables.mutations.time, 0, size); + /* Time should be set to TSK_UNKNOWN_TIME before computing */ + for (j = 0; j < size; j++) { + tables.mutations.time[j] = TSK_UNKNOWN_TIME; + } ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -3758,12 +3761,12 @@ test_single_tree_compute_mutation_parents(void) const char *sites = "0 0\n" "0.1 0\n" "0.2 0\n"; - const char *mutations = "0 0 1 -1 0\n" - "1 1 1 -1 0\n" - "2 4 1 -1 1\n" - "2 1 0 2 0\n" - "2 1 1 3 0\n" - "2 2 1 -1 0\n"; + const char *mutations = "0 0 1 -1\n" + "1 1 1 -1\n" + "2 4 1 -1\n" + "2 1 0 2 \n" + "2 1 1 3 \n" + "2 2 1 -1\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 963f896bf4..a401320bf6 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4594,7 +4594,7 @@ tsk_table_sorter_init( goto out; } if (!(options & TSK_NO_CHECK_INTEGRITY)) { - ret = tsk_table_collection_check_integrity(tables, TSK_CHECK_OFFSETS); + ret = tsk_table_collection_check_integrity(tables, 0); if (ret != 0) { goto out; } @@ -5820,9 +5820,8 @@ simplifier_init(simplifier_t *self, tsk_id_t *samples, size_t num_samples, * debateable whether we need it. If we remove, we definitely need explicit * tests to ensure we're doing sensible things with duplicate sites. * (Particularly, re TSK_REDUCE_TO_SITE_TOPOLOGY.) */ - ret = tsk_table_collection_check_integrity( - tables, TSK_CHECK_OFFSETS | TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING - | TSK_CHECK_SITE_DUPLICATES); + ret = tsk_table_collection_check_integrity(tables, + TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES); if (ret != 0) { goto out; } @@ -6467,120 +6466,24 @@ tsk_table_collection_check_offsets(tsk_table_collection_t *self) } static int -tsk_table_collection_check_edge_ordering(tsk_table_collection_t *self) +tsk_table_collection_check_node_integrity( + tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; tsk_size_t j; - tsk_id_t parent, last_parent, child, last_child; - double left, last_left; - const double *time = self->nodes.time; - bool *parent_seen = calloc(self->nodes.num_rows, sizeof(bool)); - - if (parent_seen == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - /* Just keeping compiler happy; these values don't matter. */ - last_left = 0; - last_parent = 0; - last_child = 0; - for (j = 0; j < self->edges.num_rows; j++) { - left = self->edges.left[j]; - parent = self->edges.parent[j]; - child = self->edges.child[j]; - if (parent_seen[parent]) { - ret = TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS; - goto out; - } - if (j > 0) { - /* Input data must sorted by (time[parent], parent, child, left). */ - if (time[parent] < time[last_parent]) { - ret = TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME; - goto out; - } - if (time[parent] == time[last_parent]) { - if (parent == last_parent) { - if (child < last_child) { - ret = TSK_ERR_EDGES_NOT_SORTED_CHILD; - goto out; - } - if (child == last_child) { - if (left == last_left) { - ret = TSK_ERR_DUPLICATE_EDGES; - goto out; - } else if (left < last_left) { - ret = TSK_ERR_EDGES_NOT_SORTED_LEFT; - goto out; - } - } - } else { - parent_seen[last_parent] = true; - } - } - } - last_parent = parent; - last_child = child; - last_left = left; - } -out: - tsk_safe_free(parent_seen); - return ret; -} - -/* Checks the integrity of the table collection. What gets checked depends - * on the flags values: - * 0 Check the integrity of ID & spatial references. - * TSK_CHECK_OFFSETS Check offsets for ragged columns. - * TSK_CHECK_EDGE_ORDERING Check edge ordering contraints for a tree sequence. - * TSK_CHECK_SITE_ORDERING Check that sites are in nondecreasing position order. - * TSK_CHECK_SITE_DUPLICATES Check for any duplicate site positions. - * TSK_CHECK_MUTATION_ORDERING Check mutation ordering contraints for a tree sequence. - * TSK_CHECK_MUTATION_TIME Check contraints on mutation 'time' column. - * TSK_CHECK_INDEXES Check indexes exist & reference integrity. - * TSK_CHECK_ALL All above checks. - * TSK_NO_CHECK_MUTATION_PARENTS Do not check contraints on mutation 'parent' column. - * TSK_NO_CHECK_POPULATION_REFS Do not check integrity of references to populations. - */ -int TSK_WARN_UNUSED -tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = TSK_ERR_GENERIC; - tsk_size_t j; - double node_time, left, right, position, mutation_time; - double L = self->sequence_length; - double *time = self->nodes.time; - tsk_id_t parent, child; - tsk_id_t parent_mut; - tsk_id_t population; - tsk_id_t individual; - tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; - tsk_id_t num_edges = (tsk_id_t) self->edges.num_rows; - tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows; - tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows; + double node_time; + tsk_id_t population, individual; tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows; tsk_id_t num_individuals = (tsk_id_t) self->individuals.num_rows; - bool check_site_ordering = !!(options & TSK_CHECK_SITE_ORDERING); - bool check_site_duplicates = !!(options & TSK_CHECK_SITE_DUPLICATES); - bool check_mutation_ordering = !!(options & TSK_CHECK_MUTATION_ORDERING); - bool check_mutation_parents - = (check_mutation_ordering & !(options & TSK_NO_CHECK_MUTATION_PARENTS)); - bool check_mutation_time = !!(options & TSK_CHECK_MUTATION_TIME); - bool check_populations = !(options & TSK_NO_CHECK_POPULATION_REFS); - bool check_indexes = !!(options & TSK_CHECK_INDEXES); - - if (self->sequence_length <= 0) { - ret = TSK_ERR_BAD_SEQUENCE_LENGTH; - goto out; - } + const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS); - /* Nodes */ for (j = 0; j < self->nodes.num_rows; j++) { node_time = self->nodes.time[j]; if (!isfinite(node_time)) { ret = TSK_ERR_TIME_NONFINITE; goto out; } - if (check_populations) { + if (check_population_refs) { population = self->nodes.population[j]; if (population < TSK_NULL || population >= num_populations) { ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; @@ -6593,13 +6496,42 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o goto out; } } +out: + return ret; +} - /* Edges */ - for (j = 0; j < self->edges.num_rows; j++) { - parent = self->edges.parent[j]; - child = self->edges.child[j]; - left = self->edges.left[j]; - right = self->edges.right[j]; +static int +tsk_table_collection_check_edge_integrity( + tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + tsk_size_t j; + tsk_id_t parent, last_parent, child, last_child; + double left, last_left, right; + const double *time = self->nodes.time; + const double L = self->sequence_length; + const tsk_edge_table_t edges = self->edges; + const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; + const bool check_ordering = !!(options & TSK_CHECK_EDGE_ORDERING); + bool *parent_seen = NULL; + + if (check_ordering) { + parent_seen = calloc((size_t) num_nodes, sizeof(*parent_seen)); + if (parent_seen == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + } + + /* Just keeping compiler happy; these values don't matter. */ + last_left = 0; + last_parent = 0; + last_child = 0; + for (j = 0; j < edges.num_rows; j++) { + parent = edges.parent[j]; + child = edges.child[j]; + left = edges.left[j]; + right = edges.right[j]; /* Node ID integrity */ if (parent == TSK_NULL) { ret = TSK_ERR_NULL_PARENT; @@ -6639,11 +6571,62 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o ret = TSK_ERR_BAD_NODE_TIME_ORDERING; goto out; } + + if (check_ordering) { + if (parent_seen[parent]) { + ret = TSK_ERR_EDGES_NONCONTIGUOUS_PARENTS; + goto out; + } + if (j > 0) { + /* Input data must sorted by (time[parent], parent, child, left). */ + if (time[parent] < time[last_parent]) { + ret = TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME; + goto out; + } + if (time[parent] == time[last_parent]) { + if (parent == last_parent) { + if (child < last_child) { + ret = TSK_ERR_EDGES_NOT_SORTED_CHILD; + goto out; + } + if (child == last_child) { + if (left == last_left) { + ret = TSK_ERR_DUPLICATE_EDGES; + goto out; + } else if (left < last_left) { + ret = TSK_ERR_EDGES_NOT_SORTED_LEFT; + goto out; + } + } + } else { + parent_seen[last_parent] = true; + } + } + } + last_parent = parent; + last_child = child; + last_left = left; + } } +out: + tsk_safe_free(parent_seen); + return ret; +} + +static int TSK_WARN_UNUSED +tsk_table_collection_check_site_integrity( + tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + tsk_size_t j; + double position; + const double L = self->sequence_length; + const tsk_site_table_t sites = self->sites; + const bool check_site_ordering = !!(options & TSK_CHECK_SITE_ORDERING); + const bool check_site_duplicates = !!(options & TSK_CHECK_SITE_DUPLICATES); - /* Sites */ - for (j = 0; j < self->sites.num_rows; j++) { - position = self->sites.position[j]; + for (j = 0; j < sites.num_rows; j++) { + position = sites.position[j]; /* Spatial requirements */ if (!isfinite(position)) { ret = TSK_ERR_BAD_SITE_POSITION; @@ -6654,28 +6637,48 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o goto out; } if (j > 0) { - if (check_site_duplicates && self->sites.position[j - 1] == position) { + if (check_site_duplicates && sites.position[j - 1] == position) { ret = TSK_ERR_DUPLICATE_SITE_POSITION; goto out; } - if (check_site_ordering && self->sites.position[j - 1] > position) { + if (check_site_ordering && sites.position[j - 1] > position) { ret = TSK_ERR_UNSORTED_SITES; goto out; } } } +out: + return ret; +} - /* Mutations */ - for (j = 0; j < self->mutations.num_rows; j++) { - if (self->mutations.site[j] < 0 || self->mutations.site[j] >= num_sites) { +static int TSK_WARN_UNUSED +tsk_table_collection_check_mutation_integrity( + tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + tsk_size_t j; + tsk_id_t parent_mut; + double mutation_time, parent_mutation_time; + const tsk_mutation_table_t mutations = self->mutations; + const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; + const tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows; + const tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows; + const double *node_time = self->nodes.time; + const bool check_mutation_ordering = !!(options & TSK_CHECK_MUTATION_ORDERING); + bool unknown_time; + + for (j = 0; j < mutations.num_rows; j++) { + /* Basic reference integrity */ + if (mutations.site[j] < 0 || mutations.site[j] >= num_sites) { ret = TSK_ERR_SITE_OUT_OF_BOUNDS; goto out; } - if (self->mutations.node[j] < 0 || self->mutations.node[j] >= num_nodes) { + if (mutations.node[j] < 0 || mutations.node[j] >= num_nodes) { ret = TSK_ERR_NODE_OUT_OF_BOUNDS; goto out; } - parent_mut = self->mutations.parent[j]; + /* Integrity check for mutation parent */ + parent_mut = mutations.parent[j]; if (parent_mut < TSK_NULL || parent_mut >= num_mutations) { ret = TSK_ERR_MUTATION_OUT_OF_BOUNDS; goto out; @@ -6684,69 +6687,101 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o ret = TSK_ERR_MUTATION_PARENT_EQUAL; goto out; } - if (check_mutation_parents) { - if (parent_mut != TSK_NULL) { - /* Parents must be listed before their children */ - if (parent_mut > (tsk_id_t) j) { - ret = TSK_ERR_MUTATION_PARENT_AFTER_CHILD; - goto out; - } - if (self->mutations.site[parent_mut] != self->mutations.site[j]) { - ret = TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE; - goto out; - } + /* Check if time is nonfinite */ + mutation_time = mutations.time[j]; + unknown_time = tsk_is_unknown_time(mutation_time); + if (!unknown_time) { + if (!isfinite(mutation_time)) { + ret = TSK_ERR_TIME_NONFINITE; + goto out; } } + if (check_mutation_ordering) { if (j > 0) { - if (self->mutations.site[j - 1] > self->mutations.site[j]) { + if (mutations.site[j - 1] > mutations.site[j]) { ret = TSK_ERR_UNSORTED_MUTATIONS; goto out; } } - } - if (check_mutation_time) { - mutation_time = self->mutations.time[j]; - if (!tsk_is_unknown_time(mutation_time)) { - if (!isfinite(mutation_time)) { - ret = TSK_ERR_TIME_NONFINITE; + + /* Check the mutation parents for ordering */ + /* We can only check parent properties if it is non-null */ + if (parent_mut != TSK_NULL) { + /* Parents must be listed before their children */ + if (parent_mut > (tsk_id_t) j) { + ret = TSK_ERR_MUTATION_PARENT_AFTER_CHILD; goto out; } - if (mutation_time < self->nodes.time[self->mutations.node[j]]) { - ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; + if (mutations.site[parent_mut] != mutations.site[j]) { + ret = TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE; goto out; } - if (parent_mut != TSK_NULL - && self->mutations.time[parent_mut] < mutation_time) { - ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; + } + + /* Check time value ordering */ + if (!unknown_time) { + if (mutation_time < node_time[mutations.node[j]]) { + ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; goto out; } + /* We may be on chain of mutations in which the + * intermediate parents have unknown times, but the + * mutations at either end have known times. So, we + * must skip back until either we find a NULL parent + * or a known time. */ + while (parent_mut != TSK_NULL) { + parent_mutation_time = mutations.time[parent_mut]; + if (!tsk_is_unknown_time(parent_mutation_time)) { + if (mutation_time > parent_mutation_time) { + ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; + goto out; + } + break; + } + parent_mut = mutations.parent[parent_mut]; + } } } } +out: + return ret; +} - /* Migrations */ - for (j = 0; j < self->migrations.num_rows; j++) { - if (self->migrations.node[j] < 0 || self->migrations.node[j] >= num_nodes) { +static int TSK_WARN_UNUSED +tsk_table_collection_check_migration_integrity( + tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + tsk_size_t j; + double left, right; + const double L = self->sequence_length; + const tsk_migration_table_t migrations = self->migrations; + const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; + const tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows; + const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS); + + for (j = 0; j < migrations.num_rows; j++) { + if (migrations.node[j] < 0 || migrations.node[j] >= num_nodes) { ret = TSK_ERR_NODE_OUT_OF_BOUNDS; goto out; } - if (self->migrations.source[j] < 0 - || self->migrations.source[j] >= num_populations) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; - } - if (self->migrations.dest[j] < 0 - || self->migrations.dest[j] >= num_populations) { - ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; - goto out; + if (check_population_refs) { + if (migrations.source[j] < 0 || migrations.source[j] >= num_populations) { + ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; + goto out; + } + if (migrations.dest[j] < 0 || migrations.dest[j] >= num_populations) { + ret = TSK_ERR_POPULATION_OUT_OF_BOUNDS; + goto out; + } } - if (!isfinite(self->migrations.time[j])) { + if (!isfinite(migrations.time[j])) { ret = TSK_ERR_TIME_NONFINITE; goto out; } - left = self->migrations.left[j]; - right = self->migrations.right[j]; + left = migrations.left[j]; + right = migrations.right[j]; /* Spatial requirements */ /* TODO it's a bit misleading to use the edge-specific errors here. */ if (!(isfinite(left) && isfinite(right))) { @@ -6766,40 +6801,79 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o goto out; } } +out: + return ret; +} + +static int TSK_WARN_UNUSED +tsk_table_collection_check_index_integrity( + tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + tsk_id_t j; + const tsk_id_t num_edges = (tsk_id_t) self->edges.num_rows; + const tsk_id_t *edge_insertion_order = self->indexes.edge_insertion_order; + const tsk_id_t *edge_removal_order = self->indexes.edge_removal_order; + const bool check_indexes = !!(options & TSK_CHECK_INDEXES); if (check_indexes) { if (!tsk_table_collection_has_index(self, 0)) { ret = TSK_ERR_TABLES_NOT_INDEXED; goto out; } - for (j = 0; j < self->edges.num_rows; j++) { - if (self->indexes.edge_insertion_order[j] < 0 - || self->indexes.edge_insertion_order[j] >= num_edges) { + for (j = 0; j < num_edges; j++) { + if (edge_insertion_order[j] < 0 || edge_insertion_order[j] >= num_edges) { ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; goto out; } - if (self->indexes.edge_removal_order[j] < 0 - || self->indexes.edge_removal_order[j] >= num_edges) { + if (edge_removal_order[j] < 0 || edge_removal_order[j] >= num_edges) { ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; goto out; } } } +out: + return ret; +} - ret = 0; - if (!!(options & TSK_CHECK_OFFSETS)) { - ret = tsk_table_collection_check_offsets(self); - if (ret != 0) { - goto out; - } - } - if (!!(options & TSK_CHECK_EDGE_ORDERING)) { - ret = tsk_table_collection_check_edge_ordering(self); - if (ret != 0) { - goto out; - } +int TSK_WARN_UNUSED +tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + + if (self->sequence_length <= 0) { + ret = TSK_ERR_BAD_SEQUENCE_LENGTH; + goto out; } + ret = tsk_table_collection_check_offsets(self); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_check_node_integrity(self, options); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_check_edge_integrity(self, options); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_check_site_integrity(self, options); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_check_mutation_integrity(self, options); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_check_migration_integrity(self, options); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_check_index_integrity(self, options); + if (ret != 0) { + goto out; + } out: return ret; } @@ -7681,10 +7755,10 @@ tsk_table_collection_deduplicate_sites( goto out; } /* Check everything except site duplicates (which we expect) and - * edge indexes and mutation times (which we don't use) */ + * edge indexes */ ret = tsk_table_collection_check_integrity( - self, TSK_CHECK_ALL & ~TSK_CHECK_SITE_DUPLICATES & ~TSK_CHECK_INDEXES - & ~TSK_CHECK_MUTATION_TIME); + self, TSK_CHECK_ALL & ~TSK_CHECK_SITE_DUPLICATES & ~TSK_CHECK_INDEXES); + if (ret != 0) { goto out; } @@ -7751,12 +7825,10 @@ tsk_table_collection_compute_mutation_parents( /* Using unsigned values here avoids potentially undefined behaviour */ tsk_size_t j, mutation, first_mutation; - /* Note that because we check everything here, any non-null mutation parents - * will also be checked, even though they are about to be overwritten. To - * ensure that this function always succeeds we must ensure that the - * parent field is set to -1 first. */ - ret = tsk_table_collection_check_integrity( - self, TSK_CHECK_ALL & ~TSK_CHECK_MUTATION_TIME); + /* Set the mutation parent to TSK_NULL so that we don't check the + * parent values we are about to write over. */ + memset(mutations.parent, 0xff, mutations.num_rows * sizeof(*mutations.parent)); + ret = tsk_table_collection_check_integrity(self, TSK_CHECK_ALL); if (ret != 0) { goto out; } @@ -7875,8 +7947,11 @@ tsk_table_collection_compute_mutation_times( goto out; } - ret = tsk_table_collection_check_integrity( - self, TSK_CHECK_ALL & ~TSK_CHECK_MUTATION_TIME); + /* First set the times to TSK_UNKNOWN_TIME so that check will succeed */ + for (j = 0; j < mutations.num_rows; j++) { + mutations.time[j] = TSK_UNKNOWN_TIME; + } + ret = tsk_table_collection_check_integrity(self, TSK_CHECK_ALL); if (ret != 0) { goto out; } diff --git a/c/tskit/tables.h b/c/tskit/tables.h index c04379458b..8d6b294546 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -660,19 +660,16 @@ typedef struct _tsk_table_sorter_t { #define TSK_KEEP_UNARY (1 << 4) /* Flags for check_integrity */ -#define TSK_CHECK_OFFSETS (1 << 0) -#define TSK_CHECK_EDGE_ORDERING (1 << 1) -#define TSK_CHECK_SITE_ORDERING (1 << 2) -#define TSK_CHECK_SITE_DUPLICATES (1 << 3) -#define TSK_CHECK_MUTATION_ORDERING (1 << 4) -#define TSK_CHECK_INDEXES (1 << 5) +#define TSK_CHECK_EDGE_ORDERING (1 << 0) +#define TSK_CHECK_SITE_ORDERING (1 << 1) +#define TSK_CHECK_SITE_DUPLICATES (1 << 2) +#define TSK_CHECK_MUTATION_ORDERING (1 << 3) +#define TSK_CHECK_INDEXES (1 << 4) #define TSK_CHECK_ALL \ - (TSK_CHECK_OFFSETS | TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING \ - | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING | TSK_CHECK_INDEXES \ - | TSK_CHECK_MUTATION_TIME) -#define TSK_NO_CHECK_MUTATION_PARENTS (1 << 6) -#define TSK_NO_CHECK_POPULATION_REFS (1 << 7) -#define TSK_CHECK_MUTATION_TIME (1 << 8) + (TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES \ + | TSK_CHECK_MUTATION_ORDERING | TSK_CHECK_INDEXES) +/* Leave room for more positive check flags */ +#define TSK_NO_CHECK_POPULATION_REFS (1 << 10) /* Flags for dump tables */ #define TSK_NO_BUILD_INDEXES (1 << 0) @@ -2588,7 +2585,51 @@ int tsk_table_collection_build_index(tsk_table_collection_t *self, tsk_flags_t o @brief Runs integrity checks on this table collection. @rst -TODO: document me. https://github.com/tskit-dev/tskit/issues/592 + +Checks the integrity of this table collection. The default checks (i.e., with +options = 0) guarantee the integrity of memory and references within the +table collection. All spatial values (along the genome) are checked +to see if they are finite values and within the required bounds. Time values +are checked to see if they are finite or marked as unknown. + +To check if a set of tables fulfills the requirements needed +for a valid tree sequence, use the TSK_CHECK_ALL option. Note that +if is possible for :c:func:`tsk_treeseq_init` to fail even if +TSK_CHECK_ALL integrity checks pass. This is because some properties +can only be verified when we build the trees along the genome, +and which we do not check here. + +More fine-grained checks can be achieved using bitwise combinations of the +other options. + +**Options**: + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_CHECK_EDGE_ORDERING + Check edge ordering constraints for a tree sequence. +TSK_CHECK_SITE_ORDERING + Check that sites are in nondecreasing position order. +TSK_CHECK_SITE_DUPLICATES + Check for any duplicate site positions. +TSK_CHECK_MUTATION_ORDERING + Check contraints on the ordering of mutations. Any non-null + mutation parents and known times are checked for ordering + constraints. +TSK_CHECK_INDEXES + Check that the table indexes exist, and contain valid edge + references. +TSK_CHECK_ALL + All checks needed to define a valid tree sequence that + can be determined without building the individual trees. + +It is sometimes useful to disregard some parts of the data model +when performing checks: + +TSK_NO_CHECK_POPULATION_REFS + Do not check integrity of references to populations. This + can be safely combined with TSK_CHECK_ALL. @endrst @param self A pointer to a tsk_table_collection_t object. From 47571934344114335c754f8a1f1292db43aa29ec Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 10 Jul 2020 09:36:51 +0100 Subject: [PATCH 2/4] Detect bad tree topologies at load time. --- c/CHANGELOG.rst | 5 ++++ c/tests/test_tables.c | 9 +++--- c/tests/test_trees.c | 54 +---------------------------------- c/tskit/trees.c | 33 +++++++-------------- python/CHANGELOG.rst | 7 ++++- python/tests/test_topology.py | 12 ++++---- 6 files changed, 32 insertions(+), 88 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index ed1644bd5f..39310f5c1e 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -37,6 +37,11 @@ In development. - The text dump of tables with metadata now includes the metadata schema as a header. (:user:`benjeffery`, :pr:`493`) +- Bad tree topologies are detected earlier, so that it is no longer possible + to create a tsk_treeseq_t object which contains a parent with contradictory + children on an interval. Previously an error occured when some operation + building the trees was attempted (:user:`jeromekelleher`, :pr:`709`). + **New features** - Mutations now have an optional double-precision floating-point ``time`` column. diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index aae84e2334..ba8b0770ce 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -3877,7 +3877,6 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) tsk_table_collection_free(&tables); } - static void test_table_collection_check_integrity_no_populations(void) { @@ -3899,8 +3898,8 @@ test_table_collection_check_integrity_no_populations(void) CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, - TSK_CHECK_ALL | TSK_NO_CHECK_POPULATION_REFS); + ret = tsk_table_collection_check_integrity( + &tables, TSK_CHECK_ALL | TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.nodes.population[0] = TSK_NULL; @@ -3915,8 +3914,8 @@ test_table_collection_check_integrity_no_populations(void) CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, - TSK_CHECK_ALL|TSK_NO_CHECK_POPULATION_REFS); + ret = tsk_table_collection_check_integrity( + &tables, TSK_CHECK_ALL | TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 579864c6a7..8665d81b88 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -2364,7 +2364,6 @@ test_simplest_contradictory_children(void) "0 1 2 0\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; - tsk_tree_t tree; int ret; tsk_flags_t load_flags = TSK_BUILD_INDEXES; @@ -2378,13 +2377,8 @@ test_simplest_contradictory_children(void) tables.sequence_length = 1.0; ret = tsk_treeseq_init(&ts, &tables, load_flags); - CU_ASSERT_EQUAL(ret, 0); - ret = tsk_tree_init(&tree, &ts, 0); - CU_ASSERT_EQUAL(ret, 0); - ret = tsk_tree_first(&tree); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN); - tsk_tree_free(&tree); tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } @@ -5131,51 +5125,6 @@ test_offset_trees_with_errors_kc(void) tsk_treeseq_free(&other); } -static void -test_invalid_first_tree_errors_kc(void) -{ - const char *nodes = "1 0 -1\n" - "1 0 -1\n" - "0 1 -1\n"; - const char *edges = "0 1 2 0\n" - "0 1 2 1\n"; - - const char *bad_nodes = "1 0 -1\n" - "1 1 -1\n" - "0 1 -1\n"; - const char *bad_edges = "0 1 1 0\n" - "0 1 2 0\n"; - tsk_treeseq_t ts, other; - tsk_table_collection_t tables; - double result; - int ret; - tsk_flags_t load_flags = TSK_BUILD_INDEXES; - - tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); - - ret = tsk_table_collection_init(&tables, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - - parse_nodes(bad_nodes, &tables.nodes); - CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 3); - parse_edges(bad_edges, &tables.edges); - CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 2); - tables.sequence_length = 1.0; - - ret = tsk_treeseq_init(&other, &tables, load_flags); - CU_ASSERT_EQUAL(ret, 0); - - ret = tsk_treeseq_kc_distance(&ts, &other, 0, &result); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN); - - ret = tsk_treeseq_kc_distance(&other, &ts, 0, &result); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN); - - tsk_treeseq_free(&ts); - tsk_treeseq_free(&other); - tsk_table_collection_free(&tables); -} - /*======================================================= * Miscellaneous tests. *======================================================*/ @@ -5941,7 +5890,6 @@ main(int argc, char **argv) { "test_unequal_sequence_lengths_kc", test_unequal_sequence_lengths_kc }, { "test_different_number_trees_kc", test_different_number_trees_kc }, { "test_offset_trees_with_errors_kc", test_offset_trees_with_errors_kc }, - { "test_invalid_first_tree_errors_kc", test_invalid_first_tree_errors_kc }, /* Misc */ { "test_tree_errors", test_tree_errors }, diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 1299cfaba4..bd2de4f5eb 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -228,7 +228,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) { int ret = TSK_ERR_GENERIC; size_t j, k, tree_index; - tsk_id_t site, mutation; + tsk_id_t u, site, mutation; double tree_left, tree_right; const double sequence_length = self->tables->sequence_length; const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows; @@ -269,7 +269,12 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) k++; } while (j < num_edges && edge_left[I[j]] == tree_left) { - parent[edge_child[I[j]]] = edge_parent[I[j]]; + u = edge_child[I[j]]; + if (parent[u] != TSK_NULL) { + ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; + goto out; + } + parent[u] = edge_parent[I[j]]; j++; } tree_right = sequence_length; @@ -3873,7 +3878,7 @@ tsk_tree_update_sample_lists(tsk_tree_t *self, const tsk_id_t *restrict parent, } } -static int +static void tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) { tsk_id_t *restrict parent = self->parent; @@ -3954,14 +3959,11 @@ tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) if (self->options & TSK_SAMPLE_LISTS) { tsk_tree_update_sample_lists(self, parent, left_child, right_sib, p); } - - return 0; } -static int +static void tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) { - int ret = 0; tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict left_child = self->left_child; tsk_id_t *restrict right_child = self->right_child; @@ -3977,10 +3979,6 @@ tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) #define IS_ROOT(U) (num_samples[U] >= root_threshold) - if (parent[c] != TSK_NULL) { - ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; - goto out; - } parent[c] = p; u = right_child[p]; lsib = left_sib[c]; @@ -4055,8 +4053,6 @@ tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) if (self->options & TSK_SAMPLE_LISTS) { tsk_tree_update_sample_lists(self, parent, left_child, right_sib, p); } -out: - return ret; } static int @@ -4086,19 +4082,13 @@ tsk_tree_advance(tsk_tree_t *self, int direction, const double *restrict out_bre assert(out < num_edges); k = out_order[out]; out += direction; - ret = tsk_tree_remove_edge(self, edge_parent[k], edge_child[k]); - if (ret != 0) { - goto out; - } + tsk_tree_remove_edge(self, edge_parent[k], edge_child[k]); } while (in >= 0 && in < num_edges && in_breakpoints[in_order[in]] == x) { k = in_order[in]; in += direction; - ret = tsk_tree_insert_edge(self, edge_parent[k], edge_child[k]); - if (ret != 0) { - goto out; - } + tsk_tree_insert_edge(self, edge_parent[k], edge_child[k]); } if (self->left_root != TSK_NULL) { @@ -4137,7 +4127,6 @@ tsk_tree_advance(tsk_tree_t *self, int direction, const double *restrict out_bre self->sites_length = self->tree_sequence->tree_sites_length[self->index]; } ret = 1; -out: return ret; } diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index e39946e9c6..2ba14642f3 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -1,5 +1,5 @@ -------------------- -[0.2.4] - 20XX-XX-XX +[0.3.0] - 20XX-XX-XX -------------------- In development @@ -14,6 +14,11 @@ In development instead of tskit.FileFormatError. Loading from an empty file now raises and EOFError. +- Bad tree topologies are detected earlier, so that it is no longer possible + to create a TreeSequence object which contains a parent with contradictory + children on an interval. Previously an error was thrown when some operation + building the trees was attempted. (:user:`jeromekelleher`, :pr:`709`). + **New features** - Mutations now have an optional double-precision floating-point ``time`` column. diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index e0d0b44bb8..6424fc6a94 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -4477,8 +4477,8 @@ def test_simplest_contradictory_children(self): 0.0 1.0 3 0 """ ) - ts = tskit.load_text(nodes=nodes, edges=edges, strict=False) - self.assertRaises(_tskit.LibraryError, list, ts.trees()) + with self.assertRaises(_tskit.LibraryError): + tskit.load_text(nodes=nodes, edges=edges, strict=False) def test_partial_overlap_contradictory_children(self): nodes = io.StringIO( @@ -4497,8 +4497,8 @@ def test_partial_overlap_contradictory_children(self): 0.5 1.0 3 0 """ ) - ts = tskit.load_text(nodes=nodes, edges=edges, strict=False) - self.assertRaises(_tskit.LibraryError, list, ts.trees()) + with self.assertRaises(_tskit.LibraryError): + tskit.load_text(nodes=nodes, edges=edges, strict=False) class TestSimplify(unittest.TestCase): @@ -6410,10 +6410,8 @@ def test_squash_overlapping_intervals(self): 2 0.60 1.00 1 0 """ ) - ts = tskit.load_text(nodes=nodes, edges=edges, strict=False) - with self.assertRaises(tskit.LibraryError): - self.do_squash(ts) + tskit.load_text(nodes=nodes, edges=edges, strict=False) def verify_slice_and_squash(self, ts): """ From 15305e954602d5d314dbb225db2f8868c06c7186 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Fri, 10 Jul 2020 14:22:33 +0100 Subject: [PATCH 3/4] Move tree checks into table collection. This removes the need for an extra iteration over the trees during tree sequence init. --- c/tests/test_tables.c | 20 +++--- c/tests/test_trees.c | 9 +-- c/tskit/core.c | 3 + c/tskit/core.h | 4 +- c/tskit/tables.c | 161 ++++++++++++++++++++++++++++++++++-------- c/tskit/tables.h | 34 ++++----- c/tskit/trees.c | 87 ++++------------------- 7 files changed, 185 insertions(+), 133 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index ba8b0770ce..034e190562 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -3801,9 +3801,6 @@ test_table_collection_check_integrity_with_options(tsk_flags_t tc_options) ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION); - /* Note that we do not test for TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE as - that is only checked for tree sequences not tables */ - /* migrations */ ret = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret >= 0); @@ -3885,22 +3882,23 @@ test_table_collection_check_integrity_no_populations(void) tsk_table_collection_t tables; tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, - paper_ex_mutations, paper_ex_individuals, NULL); + paper_ex_mutations, paper_ex_individuals, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); /* Add in some bad population references and check that we can use - * TSK_NO_CHECK_POPULATION_REFS with TSK_CHECK_ALL */ + * TSK_NO_CHECK_POPULATION_REFS with TSK_CHECK_TREES */ tables.nodes.population[0] = 10; ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_check_integrity( - &tables, TSK_CHECK_ALL | TSK_NO_CHECK_POPULATION_REFS); - CU_ASSERT_EQUAL_FATAL(ret, 0); + &tables, TSK_CHECK_TREES | TSK_NO_CHECK_POPULATION_REFS); + /* CHECK_TREES returns the number of trees */ + CU_ASSERT_EQUAL_FATAL(ret, 3); tables.nodes.population[0] = TSK_NULL; ret = tsk_table_collection_check_integrity(&tables, 0); @@ -3910,13 +3908,13 @@ test_table_collection_check_integrity_no_populations(void) CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); ret = tsk_table_collection_check_integrity(&tables, TSK_NO_CHECK_POPULATION_REFS); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_check_integrity( - &tables, TSK_CHECK_ALL | TSK_NO_CHECK_POPULATION_REFS); - CU_ASSERT_EQUAL_FATAL(ret, 0); + &tables, TSK_CHECK_TREES | TSK_NO_CHECK_POPULATION_REFS); + CU_ASSERT_EQUAL_FATAL(ret, 3); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 8665d81b88..ac34d93f5a 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -2139,17 +2139,18 @@ test_simplest_bad_indexes(void) CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLES_NOT_INDEXED); ret = tsk_table_collection_build_index(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); - CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); + /* TSK_CHECK_TREES returns the number of trees */ + CU_ASSERT_EQUAL_FATAL(ret, 1); for (j = 0; j < sizeof(bad_indexes) / sizeof(*bad_indexes); j++) { tables.indexes.edge_insertion_order[0] = bad_indexes[j]; - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); tables.indexes.edge_insertion_order[0] = 0; tables.indexes.edge_removal_order[0] = bad_indexes[j]; - ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_ALL); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_TREES); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EDGE_OUT_OF_BOUNDS); tables.indexes.edge_removal_order[0] = 0; } diff --git a/c/tskit/core.c b/c/tskit/core.c index 910e41f66c..3e58f7bc8a 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -325,6 +325,9 @@ tsk_strerror_internal(int err) case TSK_ERR_COLUMN_OVERFLOW: ret = "Table column too large; cannot be more than 2**32 bytes."; break; + case TSK_ERR_TREE_OVERFLOW: + ret = "Too many trees; cannot be more than 2**31."; + break; case TSK_ERR_METADATA_DISABLED: ret = "Metadata is disabled for this table, so cannot be set"; break; diff --git a/c/tskit/core.h b/c/tskit/core.h index 7c4756afa7..0e963e1225 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -257,7 +257,9 @@ not found in the file. #define TSK_ERR_TABLES_NOT_INDEXED -702 #define TSK_ERR_TABLE_OVERFLOW -703 #define TSK_ERR_COLUMN_OVERFLOW -704 -#define TSK_ERR_METADATA_DISABLED -705 +#define TSK_ERR_TREE_OVERFLOW -705 +#define TSK_ERR_METADATA_DISABLED -706 + /* Limitations */ #define TSK_ERR_ONLY_INFINITE_SITES -800 #define TSK_ERR_SIMPLIFY_MIGRATIONS_NOT_SUPPORTED -801 diff --git a/c/tskit/tables.c b/c/tskit/tables.c index a401320bf6..ff16c400fb 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6805,47 +6805,143 @@ tsk_table_collection_check_migration_integrity( return ret; } +static tsk_id_t TSK_WARN_UNUSED +tsk_table_collection_check_tree_integrity(tsk_table_collection_t *self) +{ + tsk_id_t ret = 0; + size_t j, k; + tsk_id_t u, site, mutation; + double tree_left, tree_right; + const double sequence_length = self->sequence_length; + const tsk_id_t num_sites = (tsk_id_t) self->sites.num_rows; + const tsk_id_t num_mutations = (tsk_id_t) self->mutations.num_rows; + const size_t num_edges = self->edges.num_rows; + const double *restrict site_position = self->sites.position; + const tsk_id_t *restrict mutation_site = self->mutations.site; + const tsk_id_t *restrict mutation_node = self->mutations.node; + const double *restrict mutation_time = self->mutations.time; + const double *restrict node_time = self->nodes.time; + const tsk_id_t *restrict I = self->indexes.edge_insertion_order; + const tsk_id_t *restrict O = self->indexes.edge_removal_order; + const double *restrict edge_right = self->edges.right; + const double *restrict edge_left = self->edges.left; + const tsk_id_t *restrict edge_child = self->edges.child; + const tsk_id_t *restrict edge_parent = self->edges.parent; + tsk_id_t *restrict parent = NULL; + tsk_id_t num_trees = 0; + + parent = malloc(self->nodes.num_rows * sizeof(*parent)); + if (parent == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + memset(parent, 0xff, self->nodes.num_rows * sizeof(*parent)); + + tree_left = 0; + tree_right = sequence_length; + num_trees = 0; + j = 0; + k = 0; + site = 0; + mutation = 0; + assert(I != NULL && O != NULL); + + while (j < num_edges || tree_left < sequence_length) { + while (k < num_edges && edge_right[O[k]] == tree_left) { + parent[edge_child[O[k]]] = TSK_NULL; + k++; + } + while (j < num_edges && edge_left[I[j]] == tree_left) { + u = edge_child[I[j]]; + if (parent[u] != TSK_NULL) { + ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; + goto out; + } + parent[u] = edge_parent[I[j]]; + j++; + } + tree_right = sequence_length; + if (j < num_edges) { + tree_right = TSK_MIN(tree_right, edge_left[I[j]]); + } + if (k < num_edges) { + tree_right = TSK_MIN(tree_right, edge_right[O[k]]); + } + while (site < num_sites && site_position[site] < tree_right) { + while (mutation < num_mutations && mutation_site[mutation] == site) { + if (!tsk_is_unknown_time(mutation_time[mutation]) + && parent[mutation_node[mutation]] != TSK_NULL + && node_time[parent[mutation_node[mutation]]] + <= mutation_time[mutation]) { + ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE; + goto out; + } + mutation++; + } + site++; + } + tree_left = tree_right; + /* This is technically possible; if we have 2**31 edges each defining + * a single tree, and there's a gap between each of these edges we + * would overflow this counter. */ + if (num_trees == INT32_MAX) { + ret = TSK_ERR_TREE_OVERFLOW; + goto out; + } + num_trees++; + } + ret = num_trees; +out: + /* Can't use tsk_safe_free because of restrict*/ + if (parent != NULL) { + free(parent); + } + return ret; +} + static int TSK_WARN_UNUSED -tsk_table_collection_check_index_integrity( - tsk_table_collection_t *self, tsk_flags_t options) +tsk_table_collection_check_index_integrity(tsk_table_collection_t *self) { int ret = 0; tsk_id_t j; const tsk_id_t num_edges = (tsk_id_t) self->edges.num_rows; const tsk_id_t *edge_insertion_order = self->indexes.edge_insertion_order; const tsk_id_t *edge_removal_order = self->indexes.edge_removal_order; - const bool check_indexes = !!(options & TSK_CHECK_INDEXES); - if (check_indexes) { - if (!tsk_table_collection_has_index(self, 0)) { - ret = TSK_ERR_TABLES_NOT_INDEXED; + if (!tsk_table_collection_has_index(self, 0)) { + ret = TSK_ERR_TABLES_NOT_INDEXED; + goto out; + } + for (j = 0; j < num_edges; j++) { + if (edge_insertion_order[j] < 0 || edge_insertion_order[j] >= num_edges) { + ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; goto out; } - for (j = 0; j < num_edges; j++) { - if (edge_insertion_order[j] < 0 || edge_insertion_order[j] >= num_edges) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } - if (edge_removal_order[j] < 0 || edge_removal_order[j] >= num_edges) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } + if (edge_removal_order[j] < 0 || edge_removal_order[j] >= num_edges) { + ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; + goto out; } } out: return ret; } -int TSK_WARN_UNUSED +tsk_id_t TSK_WARN_UNUSED tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; + if (options & TSK_CHECK_TREES) { + /* Checking the trees implies all the other checks */ + options |= TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING + | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING + | TSK_CHECK_INDEXES; + } + if (self->sequence_length <= 0) { ret = TSK_ERR_BAD_SEQUENCE_LENGTH; goto out; } - ret = tsk_table_collection_check_offsets(self); if (ret != 0) { goto out; @@ -6870,9 +6966,17 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o if (ret != 0) { goto out; } - ret = tsk_table_collection_check_index_integrity(self, options); - if (ret != 0) { - goto out; + if (options & TSK_CHECK_INDEXES) { + ret = tsk_table_collection_check_index_integrity(self); + if (ret != 0) { + goto out; + } + } + if (options & TSK_CHECK_TREES) { + ret = tsk_table_collection_check_tree_integrity(self); + if (ret < 0) { + goto out; + } } out: return ret; @@ -7754,10 +7858,7 @@ tsk_table_collection_deduplicate_sites( if (ret != 0) { goto out; } - /* Check everything except site duplicates (which we expect) and - * edge indexes */ - ret = tsk_table_collection_check_integrity( - self, TSK_CHECK_ALL & ~TSK_CHECK_SITE_DUPLICATES & ~TSK_CHECK_INDEXES); + ret = tsk_table_collection_check_integrity(self, TSK_CHECK_SITE_ORDERING); if (ret != 0) { goto out; @@ -7810,6 +7911,7 @@ tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; + tsk_id_t num_trees; const tsk_id_t *I, *O; const tsk_edge_table_t edges = self->edges; const tsk_node_table_t nodes = self->nodes; @@ -7828,8 +7930,9 @@ tsk_table_collection_compute_mutation_parents( /* Set the mutation parent to TSK_NULL so that we don't check the * parent values we are about to write over. */ memset(mutations.parent, 0xff, mutations.num_rows * sizeof(*mutations.parent)); - ret = tsk_table_collection_check_integrity(self, TSK_CHECK_ALL); - if (ret != 0) { + num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = num_trees; goto out; } parent = malloc(nodes.num_rows * sizeof(*parent)); @@ -7924,6 +8027,7 @@ tsk_table_collection_compute_mutation_times( tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; + tsk_id_t num_trees; const tsk_id_t *restrict I = self->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->indexes.edge_removal_order; const tsk_edge_table_t edges = self->edges; @@ -7951,8 +8055,9 @@ tsk_table_collection_compute_mutation_times( for (j = 0; j < mutations.num_rows; j++) { mutations.time[j] = TSK_UNKNOWN_TIME; } - ret = tsk_table_collection_check_integrity(self, TSK_CHECK_ALL); - if (ret != 0) { + num_trees = tsk_table_collection_check_integrity(self, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = num_trees; goto out; } parent = malloc(nodes.num_rows * sizeof(*parent)); diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 8d6b294546..560af4ad3f 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -665,9 +665,8 @@ typedef struct _tsk_table_sorter_t { #define TSK_CHECK_SITE_DUPLICATES (1 << 2) #define TSK_CHECK_MUTATION_ORDERING (1 << 3) #define TSK_CHECK_INDEXES (1 << 4) -#define TSK_CHECK_ALL \ - (TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES \ - | TSK_CHECK_MUTATION_ORDERING | TSK_CHECK_INDEXES) +#define TSK_CHECK_TREES (1 << 5) + /* Leave room for more positive check flags */ #define TSK_NO_CHECK_POPULATION_REFS (1 << 10) @@ -2587,17 +2586,17 @@ int tsk_table_collection_build_index(tsk_table_collection_t *self, tsk_flags_t o @rst Checks the integrity of this table collection. The default checks (i.e., with -options = 0) guarantee the integrity of memory and references within the +options = 0) guarantee the integrity of memory and entity references within the table collection. All spatial values (along the genome) are checked to see if they are finite values and within the required bounds. Time values are checked to see if they are finite or marked as unknown. -To check if a set of tables fulfills the requirements needed -for a valid tree sequence, use the TSK_CHECK_ALL option. Note that -if is possible for :c:func:`tsk_treeseq_init` to fail even if -TSK_CHECK_ALL integrity checks pass. This is because some properties -can only be verified when we build the trees along the genome, -and which we do not check here. +To check if a set of tables fulfills the :ref:`requirements +` needed for a valid tree sequence, use +the TSK_CHECK_TREES option. When this method is called with TSK_CHECK_TREES, +the number of trees in the tree sequence is returned. Thus, to check for errors +client code should verify that the return value is less than zero. All other +options will return zero on success and a negative value on failure. More fine-grained checks can be achieved using bitwise combinations of the other options. @@ -2620,23 +2619,26 @@ TSK_CHECK_MUTATION_ORDERING TSK_CHECK_INDEXES Check that the table indexes exist, and contain valid edge references. -TSK_CHECK_ALL - All checks needed to define a valid tree sequence that - can be determined without building the individual trees. +TSK_CHECK_TREES + All checks needed to define a valid tree sequence. Note that + this implies all of the above checks. It is sometimes useful to disregard some parts of the data model when performing checks: TSK_NO_CHECK_POPULATION_REFS Do not check integrity of references to populations. This - can be safely combined with TSK_CHECK_ALL. + can be safely combined with the other checks. @endrst @param self A pointer to a tsk_table_collection_t object. @param options Bitwise options. -@return Return 0 on success or a negative value on failure. +@return Return a negative error value on if any problems are detected + in the tree sequence. If the TSK_CHECK_TREES option is provided, + the number of trees in the tree sequence will be returned, on + success. */ -int tsk_table_collection_check_integrity( +tsk_id_t tsk_table_collection_check_integrity( tsk_table_collection_t *self, tsk_flags_t options); /** @} */ diff --git a/c/tskit/trees.c b/c/tskit/trees.c index bd2de4f5eb..a91a4718d8 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -228,90 +228,30 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) { int ret = TSK_ERR_GENERIC; size_t j, k, tree_index; - tsk_id_t u, site, mutation; + tsk_id_t site; double tree_left, tree_right; const double sequence_length = self->tables->sequence_length; const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows; - const tsk_id_t num_mutations = (tsk_id_t) self->tables->mutations.num_rows; const size_t num_edges = self->tables->edges.num_rows; const double *restrict site_position = self->tables->sites.position; - const tsk_id_t *restrict mutation_site = self->tables->mutations.site; - const tsk_id_t *restrict mutation_node = self->tables->mutations.node; - const double *restrict mutation_time = self->tables->mutations.time; - const double *restrict node_time = self->tables->nodes.time; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_right = self->tables->edges.right; const double *restrict edge_left = self->tables->edges.left; - const tsk_id_t *restrict edge_child = self->tables->edges.child; - const tsk_id_t *restrict edge_parent = self->tables->edges.parent; - tsk_id_t *parent = NULL; - - parent = malloc(self->tables->nodes.num_rows * sizeof(*parent)); - if (parent == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - memset(parent, 0xff, self->tables->nodes.num_rows * sizeof(*parent)); + /* Avoid malloc(0) */ + size_t num_trees_alloc = self->num_trees + 1; - tree_left = 0; - tree_right = sequence_length; - self->num_trees = 0; - j = 0; - k = 0; - site = 0; - mutation = 0; - assert(I != NULL && O != NULL); - /* Iterate through the trees in order to count them and check mutation times */ - while (j < num_edges || tree_left < sequence_length) { - while (k < num_edges && edge_right[O[k]] == tree_left) { - parent[edge_child[O[k]]] = TSK_NULL; - k++; - } - while (j < num_edges && edge_left[I[j]] == tree_left) { - u = edge_child[I[j]]; - if (parent[u] != TSK_NULL) { - ret = TSK_ERR_BAD_EDGES_CONTRADICTORY_CHILDREN; - goto out; - } - parent[u] = edge_parent[I[j]]; - j++; - } - tree_right = sequence_length; - if (j < num_edges) { - tree_right = TSK_MIN(tree_right, edge_left[I[j]]); - } - if (k < num_edges) { - tree_right = TSK_MIN(tree_right, edge_right[O[k]]); - } - while (site < num_sites && site_position[site] < tree_right) { - while (mutation < num_mutations && mutation_site[mutation] == site) { - if (!isnan(mutation_time[mutation]) - && parent[mutation_node[mutation]] != TSK_NULL - && node_time[parent[mutation_node[mutation]]] - <= mutation_time[mutation]) { - ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE; - goto out; - } - mutation++; - } - site++; - } - tree_left = tree_right; - self->num_trees++; - } - assert(self->num_trees > 0); - - self->tree_sites_length = malloc(self->num_trees * sizeof(tsk_size_t)); - self->tree_sites = malloc(self->num_trees * sizeof(tsk_site_t *)); - self->breakpoints = malloc((self->num_trees + 1) * sizeof(double)); + self->tree_sites_length = malloc(num_trees_alloc * sizeof(*self->tree_sites_length)); + self->tree_sites = malloc(num_trees_alloc * sizeof(*self->tree_sites)); + self->breakpoints = malloc(num_trees_alloc * sizeof(*self->breakpoints)); if (self->tree_sites == NULL || self->tree_sites_length == NULL || self->breakpoints == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - memset(self->tree_sites_length, 0, self->num_trees * sizeof(tsk_size_t)); - memset(self->tree_sites, 0, self->num_trees * sizeof(tsk_site_t *)); + memset( + self->tree_sites_length, 0, self->num_trees * sizeof(*self->tree_sites_length)); + memset(self->tree_sites, 0, self->num_trees * sizeof(*self->tree_sites)); tree_left = 0; tree_right = sequence_length; @@ -347,7 +287,6 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) self->breakpoints[tree_index] = tree_right; ret = 0; out: - tsk_safe_free(parent); return ret; } @@ -401,6 +340,7 @@ tsk_treeseq_init( tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options) { int ret = 0; + tsk_id_t num_trees; memset(self, 0, sizeof(*self)); if (tables == NULL) { @@ -429,11 +369,12 @@ tsk_treeseq_init( goto out; } } - ret = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_ALL); - if (ret != 0) { + num_trees = tsk_table_collection_check_integrity(self->tables, TSK_CHECK_TREES); + if (num_trees < 0) { + ret = (int) num_trees; goto out; } - assert(tsk_table_collection_has_index(self->tables, 0)); + self->num_trees = (tsk_size_t) num_trees; /* This is a hack to workaround the fact we're copying the tables here. * In general, we don't want the file_uuid to be copied, as this should From 2a990902f97be7286fe3dd3249edbdbd211cba3e Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 15 Jul 2020 18:19:16 +0100 Subject: [PATCH 4/4] Add version to circleCI cache key. --- .circleci/config.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a8a9ea7ae8..faacc0a819 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,7 +8,10 @@ jobs: - checkout - run: sudo chown -R circleci:circleci * - restore_cache: - key: tskit-{{ .Branch }} + # It's sometimes necessary to nuke the cache, and the simplest + # way to do it is to change the key. We can increment this + # version number when we want to do this. + key: tskit-{{ .Branch }}-v1 - run: name: Checkout submodules command: | @@ -26,7 +29,7 @@ jobs: pip uninstall tskit -y echo 'export PATH=/home/circleci/.local/bin:$PATH' >> $BASH_ENV - save_cache: - key: tskit-{{ .Branch }} + key: tskit-{{ .Branch }}-v1 paths: - "/home/circleci/.local"