From 8eba3defac5099eff5cb7c69013375e73c02ce04 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 9 Jul 2020 14:42:32 +0100 Subject: [PATCH] Close indexing loophole and document life cycle. Closes #179 --- c/tests/test_tables.c | 38 ++++++++++++++++++++++++++++++++++++++ c/tests/testlib.c | 2 +- c/tskit/tables.c | 10 +++++++--- c/tskit/tables.h | 22 +++++++++++++++++++--- docs/c-api.rst | 38 ++++++++++++++++++++++++++++++++++++++ docs/data-model.rst | 8 +++++--- 6 files changed, 108 insertions(+), 10 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 034e190562..55129e0bc4 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -2860,6 +2860,43 @@ test_simplify_metadata(void) tsk_table_collection_free(&tables); } +static void +test_edge_update_invalidates_index(void) +{ + int ret; + tsk_treeseq_t ts; + tsk_table_collection_t tables; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, + NULL, NULL, NULL, 0); + + /* Any operations on the edge table should now invalidate the index */ + ret = tsk_treeseq_copy_tables(&ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0)) + ret = tsk_edge_table_clear(&tables.edges); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); + /* Even though the actual indexes still exist */ + CU_ASSERT_FALSE(tables.indexes.edge_insertion_order == NULL); + CU_ASSERT_FALSE(tables.indexes.edge_removal_order == NULL); + CU_ASSERT_EQUAL_FATAL(tables.indexes.num_edges, tsk_treeseq_get_num_edges(&ts)); + + ret = tsk_treeseq_copy_tables(&ts, &tables, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_has_index(&tables, 0)) + ret = tsk_edge_table_add_row(&tables.edges, 0, 1, 0, 1, NULL, 0); + CU_ASSERT_TRUE(ret > 0); + CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); + /* Even though the actual indexes still exist */ + CU_ASSERT_FALSE(tables.indexes.edge_insertion_order == NULL); + CU_ASSERT_FALSE(tables.indexes.edge_removal_order == NULL); + CU_ASSERT_EQUAL_FATAL(tables.indexes.num_edges, tsk_treeseq_get_num_edges(&ts)); + + tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); +} + static void test_copy_table_collection(void) { @@ -4113,6 +4150,7 @@ main(int argc, char **argv) { "test_sort_tables_drops_indexes", test_sort_tables_drops_indexes }, { "test_sort_tables_edge_metadata", test_sort_tables_edge_metadata }, { "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata }, + { "test_edge_update_invalidates_index", test_edge_update_invalidates_index }, { "test_copy_table_collection", test_copy_table_collection }, { "test_sort_tables_errors", test_sort_tables_errors }, { "test_sorter_interface", test_sorter_interface }, diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 3d870b18cd..aa1d330ce8 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -520,7 +520,7 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod ret = tsk_treeseq_init(ts, &tables, TSK_BUILD_INDEXES); /* tsk_treeseq_print_state(ts, stdout); */ - // printf("ret = %s\n", tsk_strerror(ret)); + /* printf("ret = %s\n", tsk_strerror(ret)); */ CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); } diff --git a/c/tskit/tables.c b/c/tskit/tables.c index ff16c400fb..8e52e5ca70 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -7128,6 +7128,7 @@ tsk_table_collection_set_index(tsk_table_collection_t *self, } memcpy(self->indexes.edge_insertion_order, edge_insertion_order, index_size); memcpy(self->indexes.edge_removal_order, edge_removal_order, index_size); + self->indexes.num_edges = self->edges.num_rows; out: return ret; } @@ -7137,7 +7138,8 @@ tsk_table_collection_has_index( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { return self->indexes.edge_insertion_order != NULL - && self->indexes.edge_removal_order != NULL; + && self->indexes.edge_removal_order != NULL + && self->indexes.num_edges == self->edges.num_rows; } int @@ -7148,6 +7150,7 @@ tsk_table_collection_drop_index( tsk_safe_free(self->indexes.edge_removal_order); self->indexes.edge_insertion_order = NULL; self->indexes.edge_removal_order = NULL; + self->indexes.num_edges = 0; return 0; } @@ -7206,6 +7209,7 @@ tsk_table_collection_build_index( for (j = 0; j < self->edges.num_rows; j++) { self->indexes.edge_removal_order[j] = sort_buff[j].index; } + self->indexes.num_edges = self->edges.num_rows; ret = 0; out: tsk_safe_free(sort_buff); @@ -7410,8 +7414,8 @@ tsk_table_collection_dump_indexes(tsk_table_collection_t *self, kastore_t *store { int ret = 0; write_table_col_t write_cols[] = { - { "indexes/edge_insertion_order", NULL, self->edges.num_rows, KAS_INT32 }, - { "indexes/edge_removal_order", NULL, self->edges.num_rows, KAS_INT32 }, + { "indexes/edge_insertion_order", NULL, self->indexes.num_edges, KAS_INT32 }, + { "indexes/edge_removal_order", NULL, self->indexes.num_edges, KAS_INT32 }, }; if (tsk_table_collection_has_index(self, 0)) { diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 560af4ad3f..83cdb26d68 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -589,6 +589,7 @@ typedef struct { struct { tsk_id_t *edge_insertion_order; tsk_id_t *edge_removal_order; + tsk_size_t num_edges; } indexes; } tsk_table_collection_t; @@ -2247,7 +2248,7 @@ Writes the data from this table collection to the specified file. Usually we expect that data written to a file will be in a form that can be read directly and used to create a tree sequence; that is, we assume that by default the tables are :ref:`sorted ` -and should be :ref:`indexed `. Following these +and :ref:`indexed `. Following these assumptions, if the tables are not already indexed, we index the tables before writing to file to save the cost of building these indexes at load time. This behaviour requires that the tables are sorted. @@ -2543,6 +2544,17 @@ int tsk_table_collection_set_metadata_schema(tsk_table_collection_t *self, /** @brief Returns true if this table collection is indexed. +@rst +This method returns true if the table collection has an index +for the edge table. It guarantees that the index exists, and that +it is for the same number of edges that are in the edge table. It +does *not* guarantee that the index is valid (i.e., if the rows +in the edge have been permuted in some way since the index was built). + +See the :ref:`sec_c_api_table_indexes` section for details on the index +life-cycle. +@endrst + @param self A pointer to a tsk_table_collection_t object. @param options Bitwise options. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @@ -2556,6 +2568,8 @@ bool tsk_table_collection_has_index(tsk_table_collection_t *self, tsk_flags_t op @rst Unconditionally drop the indexes that may be present for this table collection. It is not an error to call this method on an unindexed table collection. +See the :ref:`sec_c_api_table_indexes` section for details on the index +life-cycle. @endrst @param self A pointer to a tsk_table_collection_t object. @@ -2569,8 +2583,10 @@ int tsk_table_collection_drop_index(tsk_table_collection_t *self, tsk_flags_t op @brief Builds indexes for this table collection. @rst -Builds the tree traversal :ref:`indexes ` for this table collection. -Any existing index is first dropped using :c:func:`tsk_table_collection_drop_index`. +Builds the tree traversal :ref:`indexes ` for this table +collection. Any existing index is first dropped using +:c:func:`tsk_table_collection_drop_index`. See the +:ref:`sec_c_api_table_indexes` section for details on the index life-cycle. @endrst @param self A pointer to a tsk_table_collection_t object. diff --git a/docs/c-api.rst b/docs/c-api.rst index 68383f54bf..b6d15b8518 100644 --- a/docs/c-api.rst +++ b/docs/c-api.rst @@ -315,6 +315,44 @@ Provenances :content-only: +.. _sec_c_api_table_indexes: + +************* +Table indexes +************* + +Along with the tree sequence :ref:`ordering requirements +`, the :ref:`sec_table_indexes` +allow us to take a table collection and efficiently operate +on the trees defined within it. This section defines the rules +for safely operating on table indexes and their life-cycle. + +The edge index used for tree generation consists of two arrays, +each holding ``N`` edge IDs (where ``N`` is the size of the edge +table). When the index is computed using +:c:func:`tsk_table_collection_build_index`, we store the current size +of the edge table along with the two arrays of edge IDs. The +function :c:func:`tsk_table_collection_has_index` then returns true +iff (a) both of these arrays are not NULL and (b) the stored +number of edges is the same as the current size of the edge table. + +Updating the edge table does not automatically invalidate the indexes. +Thus, if we call :c:func:`tsk_edge_table_clear` on an edge table +which has an index, this index will still exist. However, it will +not be considered a valid index by +:c:func:`tsk_table_collection_has_index` because of the size mismatch. +Similarly for functions that increase the size of the table. +Note that it is possible then to have +:c:func:`tsk_table_collection_has_index` return true, but the index +is not actually valid, if, for example, the user has manipulated the +node and edge tables to describe a different topology, which happens +to have the same number of edges. The behaviour of methods that +use the indexes will be undefined in this case. + +Thus, if you are manipulating an existing table collection that may +be indexed, it is always recommended to call +:c:func:`tsk_table_collection_drop_index` first. + .. _sec_c_api_tree_sequences: ************** diff --git a/docs/data-model.rst b/docs/data-model.rst index 04a83927a3..dbb27e8887 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -790,16 +790,18 @@ appearance), then this property is preserved, even if the `parent` properties are not specified. -.. _sec_table_indexing: +.. _sec_table_indexes: -Indexing --------- +Table indexes +------------- To efficiently iterate over the trees in a tree sequence, ``tskit`` uses indexes built on the edges. To create a tree sequence from a table collection the tables must be indexed; the :meth:`TableCollection.build_index` method can be used to create an index on a table collection if necessary. +.. todo:: Add more details on what the indexes actually are. + Removing duplicate sites ------------------------