From 09d41ef025340fed119142a20d4807cd0e3381b3 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 19 May 2020 20:32:54 +0100 Subject: [PATCH] Expose sorting API and test C++ implementation. Closes #616 --- c/CHANGELOG.rst | 3 + c/tests/test_minimal_cpp.cpp | 181 +++++++++++++++++++++++++++++++++- c/tests/test_tables.c | 96 +++++++++++++++++- c/tskit/tables.c | 185 +++++++++++++++++------------------ c/tskit/tables.h | 127 +++++++++++++++++++++++- docs/c-api.rst | 28 ++++++ 6 files changed, 521 insertions(+), 99 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 7d8bfbe3e1..e673f5924d 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -71,6 +71,9 @@ In development. variants for tree sequence and table collection load/dump (:user:`jeromekelleher`, :user:`grahamgower`, :issue:`565`, :pr:`599`). +- Add low-level sorting API and ``TSK_NO_CHECK_INTEGRITY`` flag + (:user:`jeromekelleher`, :pr:`627`, :issue:`626`). + **Deprecated** - The ``TSK_SAMPLE_COUNTS`` options is now ignored and will print out a warning diff --git a/c/tests/test_minimal_cpp.cpp b/c/tests/test_minimal_cpp.cpp index a6d8d9bbf7..fc5196cdec 100644 --- a/c/tests/test_minimal_cpp.cpp +++ b/c/tests/test_minimal_cpp.cpp @@ -1,7 +1,6 @@ -/* - * MIT License +/* * MIT License * - * Copyright (c) 2019 Tskit Developers + * Copyright (c) 2019-2020 Tskit Developers * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,6 +27,9 @@ #include #include #include +#include +#include +#include #include @@ -78,6 +80,177 @@ test_table_basics() tsk_table_collection_free(&tables); } +/* A definition of sort_edges that uses C++ std::sort and inlining of the + * comparison function to achieve significantly better performance than + * the builtin method in tskit. + */ +int +cpp_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) +{ + struct _edge { + double left, right; + tsk_id_t parent, child; + + _edge(double l, double r, tsk_id_t p, tsk_id_t c) + : left{ l }, right{ r }, parent{ p }, child{ c } + { + } + }; + tsk_edge_table_t *edges = &sorter->tables->edges; + const double *node_time = sorter->tables->nodes.time; + std::vector<_edge> sorted_edges; + size_t num_edges = edges->num_rows; + size_t j; + + /* This is the comparison function. We cannot define an + * operator < for _edge because we need to bind the node times + * so we have to use a functional method. This is a copy of the cmp + * from fwdpp. Only difference is the final time comparison + * (fwdpp table times go forwards). */ + const auto cmp = [&node_time](const _edge &lhs, const _edge &rhs) { + auto tl = node_time[lhs.parent]; + auto tr = node_time[rhs.parent]; + if (tl == tr) { + if (lhs.parent == rhs.parent) { + if (lhs.child == rhs.child) { + return lhs.left < rhs.left; + } + return lhs.child < rhs.child; + } + return lhs.parent < rhs.parent; + } + return tl < tr; + }; + + assert(start == 0); + /* Let's not bother with metadata */ + assert(edges->metadata_length == 0); + + sorted_edges.reserve(num_edges); + for (j = 0; j < num_edges; j++) { + sorted_edges.emplace_back( + edges->left[j], edges->right[j], edges->parent[j], edges->child[j]); + } + + std::sort(begin(sorted_edges), end(sorted_edges), cmp); + + for (j = 0; j < num_edges; j++) { + edges->left[j] = sorted_edges[j].left; + edges->right[j] = sorted_edges[j].right; + edges->parent[j] = sorted_edges[j].parent; + edges->child[j] = sorted_edges[j].child; + } + return 0; +} + +void +test_edge_sorting() +{ + std::cout << "test_edge_sorting" << endl; + tsk_table_collection_t tables; + tsk_id_t n = 10; + tsk_id_t j; + tsk_id_t ret = tsk_table_collection_init(&tables, 0); + assert(ret == 0); + + tables.sequence_length = 1.0; + /* Make a stick tree */ + /* Add nodes and edges */ + for (j = 0; j < n; j++) { + ret = tsk_node_table_add_row( + &tables.nodes, TSK_NODE_IS_SAMPLE, j + 1, TSK_NULL, TSK_NULL, NULL, 0); + assert(ret == j); + } + for (j = n - 1; j > 0; j--) { + tsk_edge_table_add_row(&tables.edges, 0, 1, j, j - 1, NULL, 0); + } + assert(tables.nodes.num_rows == (tsk_size_t) n); + assert(tables.edges.num_rows == (tsk_size_t) n - 1); + + /* Make sure the edges are unsorted */ + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING); + assert(ret == TSK_ERR_EDGES_NOT_SORTED_PARENT_TIME); + + /* Sort the tables */ + tsk_table_sorter_t sorter; + ret = tsk_table_sorter_init(&sorter, &tables, 0); + assert(ret == 0); + /* Set the sort_edges to our local C++ version. We could also set some + * persistent state in sorter.params if we wanted to. */ + sorter.sort_edges = cpp_sort_edges; + ret = tsk_table_sorter_run(&sorter, NULL); + assert(ret == 0); + tsk_table_sorter_free(&sorter); + + /* Make sure the edges are now sorted */ + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_EDGE_ORDERING); + assert(ret == 0); + + tsk_table_collection_free(&tables); +} + +int +sort_edges_raises_exception(tsk_table_sorter_t *sorter, tsk_size_t start) +{ + throw std::exception(); + return 0; +} + +int +sort_edges_raises_non_exception(tsk_table_sorter_t *sorter, tsk_size_t start) +{ + throw 42; + return 0; +} + +int +safe_sort_edges(tsk_table_sorter_t *sorter, tsk_size_t start) +{ + int ret = 0; + if (sorter->user_data == NULL) { + try { + ret = sort_edges_raises_exception(sorter, start); + } catch (...) { + ret = -12345; + } + } else { + try { + ret = sort_edges_raises_non_exception(sorter, start); + } catch (...) { + ret = -123456; + } + } + return ret; +} + +void +test_edge_sorting_errors() +{ + std::cout << "test_edge_sorting_errors" << endl; + tsk_table_collection_t tables; + tsk_table_sorter_t sorter; + tsk_id_t ret = tsk_table_collection_init(&tables, 0); + + assert(ret == 0); + tables.sequence_length = 1.0; + + ret = tsk_table_sorter_init(&sorter, &tables, 0); + assert(ret == 0); + sorter.sort_edges = safe_sort_edges; + ret = tsk_table_sorter_run(&sorter, NULL); + assert(ret == -12345); + + /* Use the user_data as a way to communicate with the sorter + * function. Here, we want to try out two different types + * of exception that get thrown. */ + sorter.user_data = &tables; + ret = tsk_table_sorter_run(&sorter, NULL); + assert(ret == -123456); + + tsk_table_sorter_free(&sorter); + tsk_table_collection_free(&tables); +} + int main() { @@ -85,5 +258,7 @@ main() test_strerror(); test_load_error(); test_table_basics(); + test_edge_sorting(); + test_edge_sorting_errors(); return 0; } diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 9e87dafd4a..cb743cedeb 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -2427,7 +2427,7 @@ test_simplify_empty_tables(void) CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1; - // ret = tsk_table_collection_simplify(&tables, NULL, 0, 0, NULL); + ret = tsk_table_collection_simplify(&tables, NULL, 0, 0, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 0); @@ -2569,6 +2569,99 @@ test_sort_tables_errors(void) tsk_treeseq_free(&ts); } +static void +reverse_edges(tsk_table_collection_t *tables) +{ + int ret; + tsk_edge_table_t edges; + tsk_edge_t edge; + tsk_id_t j; + + ret = tsk_edge_table_init(&edges, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + for (j = (tsk_id_t) tables->edges.num_rows - 1; j >= 0; j--) { + ret = tsk_edge_table_get_row(&tables->edges, j, &edge); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_edge_table_add_row(&edges, edge.left, edge.right, edge.parent, + edge.child, edge.metadata, edge.metadata_length); + CU_ASSERT_FATAL(ret >= 0); + } + + ret = tsk_edge_table_copy(&edges, &tables->edges, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tsk_edge_table_free(&edges); +} + +static void +test_sorter_interface(void) +{ + int ret; + tsk_treeseq_t ts; + tsk_table_collection_t tables; + tsk_table_sorter_t sorter; + + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, + NULL, NULL, NULL); + ret = tsk_treeseq_copy_tables(&ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables)); + + /* Nominal case */ + reverse_edges(&tables); + CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, &tables)); + ret = tsk_table_sorter_init(&sorter, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_sorter_run(&sorter, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables)); + CU_ASSERT_EQUAL(sorter.user_data, NULL); + tsk_table_sorter_free(&sorter); + + /* If we set the sort_edges function to NULL then we should leave the + * node table as is. */ + reverse_edges(&tables); + CU_ASSERT_FALSE(tsk_edge_table_equals(&ts.tables->edges, &tables.edges)); + ret = tsk_table_sorter_init(&sorter, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + sorter.sort_edges = NULL; + ret = tsk_table_sorter_run(&sorter, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_edge_table_equals(&ts.tables->edges, &tables.edges)); + tsk_table_sorter_free(&sorter); + + /* Reversing again should make them equal */ + reverse_edges(&tables); + CU_ASSERT_TRUE(tsk_edge_table_equals(&ts.tables->edges, &tables.edges)); + + /* Do not check integrity before sorting */ + reverse_edges(&tables); + CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, &tables)); + ret = tsk_table_sorter_init(&sorter, &tables, TSK_NO_CHECK_INTEGRITY); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_sorter_run(&sorter, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables)); + tsk_table_sorter_free(&sorter); + + /* The user_data shouldn't be touched */ + reverse_edges(&tables); + CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, &tables)); + ret = tsk_table_sorter_init(&sorter, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + sorter.user_data = (void *) &ts; + ret = tsk_table_sorter_run(&sorter, NULL); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, &tables)); + CU_ASSERT_EQUAL_FATAL(sorter.user_data, &ts); + tsk_table_sorter_free(&sorter); + + tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); +} + static void test_dump_unindexed(void) { @@ -3453,6 +3546,7 @@ main(int argc, char **argv) { "test_sort_tables_drops_indexes", test_sort_tables_drops_indexes }, { "test_copy_table_collection", test_copy_table_collection }, { "test_sort_tables_errors", test_sort_tables_errors }, + { "test_sorter_interface", test_sorter_interface }, { "test_dump_unindexed", test_dump_unindexed }, { "test_dump_load_empty", test_dump_load_empty }, { "test_dump_load_unsorted", test_dump_load_unsorted }, diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 826c279018..f77827642c 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4277,17 +4277,6 @@ typedef struct { double time; } edge_sort_t; -typedef struct { - /* Input tables. */ - tsk_node_table_t *nodes; - tsk_edge_table_t *edges; - tsk_site_table_t *sites; - tsk_mutation_table_t *mutations; - tsk_migration_table_t *migrations; - /* Mapping from input site IDs to output site IDs */ - tsk_id_t *site_id_map; -} table_sorter_t; - static int cmp_site(const void *a, const void *b) { @@ -4344,42 +4333,14 @@ cmp_edge(const void *a, const void *b) } static int -table_sorter_init(table_sorter_t *self, tsk_table_collection_t *tables, - tsk_flags_t TSK_UNUSED(options)) -{ - int ret = 0; - - memset(self, 0, sizeof(table_sorter_t)); - if (tables->migrations.num_rows != 0) { - ret = TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED; - goto out; - } - ret = tsk_table_collection_check_integrity(tables, TSK_CHECK_OFFSETS); - if (ret != 0) { - goto out; - } - self->nodes = &tables->nodes; - self->edges = &tables->edges; - self->mutations = &tables->mutations; - self->sites = &tables->sites; - self->migrations = &tables->migrations; - - self->site_id_map = malloc(self->sites->num_rows * sizeof(tsk_id_t)); - if (self->site_id_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } -out: - return ret; -} - -static int -table_sorter_sort_edges(table_sorter_t *self, tsk_size_t start) +tsk_table_sorter_sort_edges(tsk_table_sorter_t *self, tsk_size_t start) { int ret = 0; + const tsk_edge_table_t *edges = &self->tables->edges; + const double *restrict node_time = self->tables->nodes.time; edge_sort_t *e; tsk_size_t j, k; - tsk_size_t n = self->edges->num_rows - start; + tsk_size_t n = self->tables->edges.num_rows - start; edge_sort_t *sorted_edges = malloc(n * sizeof(*sorted_edges)); if (sorted_edges == NULL) { @@ -4389,21 +4350,21 @@ table_sorter_sort_edges(table_sorter_t *self, tsk_size_t start) for (j = 0; j < n; j++) { e = sorted_edges + j; k = start + j; - e->left = self->edges->left[k]; - e->right = self->edges->right[k]; - e->parent = self->edges->parent[k]; - e->child = self->edges->child[k]; - e->time = self->nodes->time[e->parent]; + e->left = edges->left[k]; + e->right = edges->right[k]; + e->parent = edges->parent[k]; + e->child = edges->child[k]; + e->time = node_time[e->parent]; } qsort(sorted_edges, n, sizeof(edge_sort_t), cmp_edge); /* Copy the edges back into the table. */ for (j = 0; j < n; j++) { e = sorted_edges + j; k = start + j; - self->edges->left[k] = e->left; - self->edges->right[k] = e->right; - self->edges->parent[k] = e->parent; - self->edges->child[k] = e->child; + edges->left[k] = e->left; + edges->right[k] = e->right; + edges->parent[k] = e->parent; + edges->child[k] = e->child; } out: tsk_safe_free(sorted_edges); @@ -4411,15 +4372,16 @@ table_sorter_sort_edges(table_sorter_t *self, tsk_size_t start) } static int -table_sorter_sort_sites(table_sorter_t *self) +tsk_table_sorter_sort_sites(tsk_table_sorter_t *self) { int ret = 0; + tsk_site_table_t *sites = &self->tables->sites; tsk_site_table_t copy; tsk_size_t j; - tsk_size_t num_sites = self->sites->num_rows; + tsk_size_t num_sites = sites->num_rows; tsk_site_t *sorted_sites = malloc(num_sites * sizeof(*sorted_sites)); - ret = tsk_site_table_copy(self->sites, ©, 0); + ret = tsk_site_table_copy(sites, ©, 0); if (ret != 0) { goto out; } @@ -4435,14 +4397,14 @@ table_sorter_sort_sites(table_sorter_t *self) } /* Sort the sites by position */ - qsort(sorted_sites, self->sites->num_rows, sizeof(*sorted_sites), cmp_site); + qsort(sorted_sites, num_sites, sizeof(*sorted_sites), cmp_site); /* Build the mapping from old site IDs to new site IDs and copy back into the table */ - tsk_site_table_clear(self->sites); + tsk_site_table_clear(sites); for (j = 0; j < num_sites; j++) { self->site_id_map[sorted_sites[j].id] = (tsk_id_t) j; - ret = tsk_site_table_add_row(self->sites, sorted_sites[j].position, + ret = tsk_site_table_add_row(sites, sorted_sites[j].position, sorted_sites[j].ancestral_state, sorted_sites[j].ancestral_state_length, sorted_sites[j].metadata, sorted_sites[j].metadata_length); if (ret < 0) { @@ -4457,17 +4419,18 @@ table_sorter_sort_sites(table_sorter_t *self) } static int -table_sorter_sort_mutations(table_sorter_t *self) +tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self) { int ret = 0; tsk_size_t j; tsk_id_t parent, mapped_parent; - tsk_size_t num_mutations = self->mutations->num_rows; + tsk_mutation_table_t *mutations = &self->tables->mutations; + tsk_size_t num_mutations = mutations->num_rows; tsk_mutation_table_t copy; tsk_mutation_t *sorted_mutations = malloc(num_mutations * sizeof(*sorted_mutations)); tsk_id_t *mutation_id_map = malloc(num_mutations * sizeof(*mutation_id_map)); - ret = tsk_mutation_table_copy(self->mutations, ©, 0); + ret = tsk_mutation_table_copy(mutations, ©, 0); if (ret != 0) { goto out; } @@ -4483,7 +4446,7 @@ table_sorter_sort_mutations(table_sorter_t *self) } sorted_mutations[j].site = self->site_id_map[sorted_mutations[j].site]; } - ret = tsk_mutation_table_clear(self->mutations); + ret = tsk_mutation_table_clear(mutations); if (ret != 0) { goto out; } @@ -4501,7 +4464,7 @@ table_sorter_sort_mutations(table_sorter_t *self) if (parent != TSK_NULL) { mapped_parent = mutation_id_map[parent]; } - ret = tsk_mutation_table_add_row(self->mutations, sorted_mutations[j].site, + ret = tsk_mutation_table_add_row(mutations, sorted_mutations[j].site, sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time, sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length, sorted_mutations[j].metadata, sorted_mutations[j].metadata_length); @@ -4518,20 +4481,43 @@ table_sorter_sort_mutations(table_sorter_t *self) return ret; } -static int -table_sorter_run(table_sorter_t *self, tsk_size_t edge_start) +int +tsk_table_sorter_run(tsk_table_sorter_t *self, tsk_bookmark_t *start) { int ret = 0; + tsk_size_t edge_start = 0; - ret = table_sorter_sort_edges(self, edge_start); + if (start != NULL) { + if (start->edges > self->tables->edges.num_rows) { + ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; + goto out; + } + edge_start = start->edges; + + /* For now, if migrations, sites or mutations are non-zero we get an error. + * This should be fixed: https://github.com/tskit-dev/tskit/issues/101 + */ + if (start->migrations != 0 || start->sites != 0 || start->mutations != 0) { + ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; + goto out; + } + } + /* The indexes will be invalidated, so drop them */ + ret = tsk_table_collection_drop_index(self->tables, 0); if (ret != 0) { goto out; } - ret = table_sorter_sort_sites(self); + if (self->sort_edges != NULL) { + ret = self->sort_edges(self, edge_start); + if (ret != 0) { + goto out; + } + } + ret = tsk_table_sorter_sort_sites(self); if (ret != 0) { goto out; } - ret = table_sorter_sort_mutations(self); + ret = tsk_table_sorter_sort_mutations(self); if (ret != 0) { goto out; } @@ -4539,10 +4525,42 @@ table_sorter_run(table_sorter_t *self, tsk_size_t edge_start) return ret; } -static void -table_sorter_free(table_sorter_t *self) +int +tsk_table_sorter_init( + tsk_table_sorter_t *self, tsk_table_collection_t *tables, tsk_flags_t options) +{ + int ret = 0; + + memset(self, 0, sizeof(tsk_table_sorter_t)); + if (tables->migrations.num_rows != 0) { + ret = TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED; + goto out; + } + if (!(options & TSK_NO_CHECK_INTEGRITY)) { + ret = tsk_table_collection_check_integrity(tables, TSK_CHECK_OFFSETS); + if (ret != 0) { + goto out; + } + } + self->tables = tables; + + self->site_id_map = malloc(self->tables->sites.num_rows * sizeof(tsk_id_t)); + if (self->site_id_map == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + /* Set the sort_edges method to the default. */ + self->sort_edges = tsk_table_sorter_sort_edges; +out: + return ret; +} + +int +tsk_table_sorter_free(tsk_table_sorter_t *self) { tsk_safe_free(self->site_id_map); + return 0; } /************************* @@ -7543,37 +7561,18 @@ tsk_table_collection_sort( tsk_table_collection_t *self, tsk_bookmark_t *start, tsk_flags_t options) { int ret = 0; - table_sorter_t sorter; - tsk_size_t edge_start = 0; + tsk_table_sorter_t sorter; - /* Must init the sorter before we check errors */ - ret = table_sorter_init(&sorter, self, options); + ret = tsk_table_sorter_init(&sorter, self, options); if (ret != 0) { goto out; } - if (start != NULL) { - if (start->edges > self->edges.num_rows) { - ret = TSK_ERR_EDGE_OUT_OF_BOUNDS; - goto out; - } - edge_start = start->edges; - - /* For now, if migrations, sites or mutations are non-zero we get an error. - * This should be fixed: https://github.com/tskit-dev/tskit/issues/101 - */ - if (start->migrations != 0 || start->sites != 0 || start->mutations != 0) { - ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; - goto out; - } - } - ret = table_sorter_run(&sorter, edge_start); + ret = tsk_table_sorter_run(&sorter, start); if (ret != 0) { goto out; } - /* The indexes are invalidated now so drop them */ - ret = tsk_table_collection_drop_index(self, 0); out: - table_sorter_free(&sorter); + tsk_table_sorter_free(&sorter); return ret; } diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 82d6bc0324..dda316238f 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -612,6 +612,20 @@ typedef struct { tsk_size_t provenances; } tsk_bookmark_t; +/** +@brief Low-level table sorting method. +*/ +typedef struct _tsk_table_sorter_t { + /** @brief The input tables that are being sorted. */ + tsk_table_collection_t *tables; + /** @brief The edge sorting function. If set to NULL, edges are not sorted. */ + int (*sort_edges)(struct _tsk_table_sorter_t *self, tsk_size_t start); + /** @brief An opaque pointer for use by client code */ + void *user_data; + /** @brief Mapping from input site IDs to output site IDs */ + tsk_id_t *site_id_map; +} tsk_table_sorter_t; + /****************************************************************************/ /* Common function options */ /****************************************************************************/ @@ -631,6 +645,9 @@ typedef struct { /** @brief Do not initialise the parameter object. */ #define TSK_NO_INIT (1u << 30) +/** @brief Do not run integrity checks before performing an operation. */ +#define TSK_NO_CHECK_INTEGRITY (1u << 30) + /**@} */ /* Flags for simplify() */ @@ -2340,6 +2357,23 @@ and ``provenance``) are ignored and can be set to arbitrary values. The table collection will always be unindexed after sort successfully completes. See the :ref:`table sorting ` section for more details. +For more control over the sorting process, see the +:ref:`sec_c_api_low_level_sorting` section. + +**Options** + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_NO_CHECK_INTEGRITY + Do not run integrity checks using + :c:func:`tsk_table_collection_check_integrity` before sorting, + potentially leading to a small reduction in execution time. This + performance optimisation should not be used unless the calling code can + guarantee reference integrity within the table collection. References + to rows not in the table or bad offsets will result in undefined + behaviour. + @endrst @param self A pointer to a tsk_individual_table_t object. @@ -2519,6 +2553,21 @@ Any existing index is first dropped using :c:func:`tsk_table_collection_drop_ind @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_build_index(tsk_table_collection_t *self, tsk_flags_t options); + +/** +@brief Runs integrity checks on this table collection. + +@rst +TODO: document me. https://github.com/tskit-dev/tskit/issues/592 +@endrst + +@param self A pointer to a tsk_table_collection_t object. +@param options Bitwise options. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_collection_check_integrity( + tsk_table_collection_t *self, tsk_flags_t options); + /** @} */ /* Undocumented methods */ @@ -2533,8 +2582,82 @@ int tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options); int tsk_table_collection_compute_mutation_times( tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)); -int tsk_table_collection_check_integrity( - tsk_table_collection_t *self, tsk_flags_t options); + +/** +@defgroup TABLE_SORTER_API_GROUP Low-level table sorter API. +@{ +*/ + +/* NOTE: We use the "struct _tsk_table_sorter_t" form here + * rather then the usual tsk_table_sorter_t alias because + * of problems with Doxygen. This was the only way I could + * get it to work - ideally, we'd use the usual typedefs + * to avoid confusing people. + */ + +/** +@brief Initialises the memory for the sorter object. + +@rst +This must be called before any operations are performed on the +table sorter and initialises all fields. The ``edge_sort`` function +is set to the default method using qsort. The ``user_data`` +field is set to NULL. +This method supports the same options as +:c:func:`tsk_table_collection_sort`. + +@endrst + +@param self A pointer to an uninitialised tsk_table_sorter_t object. +@param tables The table collection to sort. +@param options Sorting options. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_sorter_init(struct _tsk_table_sorter_t *self, + tsk_table_collection_t *tables, tsk_flags_t options); + +/** +@brief Runs the sort using the configured functions. + +@rst +Runs the sorting process: + +1. Drop the table indexes. +2. If the ``sort_edges`` function pointer is not NULL, run it. The + first parameter to the called function will be a pointer to this + table_sorter_t object. The second parameter will be the value + ``start.edges``. This specifies the offset at which sorting should + start in the edge table. This offset is guaranteed to be within the + bounds of the edge table. +3. Sort the site table, building the mapping between site IDs in the + current and sorted tables. +4. Sort the mutation table. + +If an error occurs during the execution of a user-supplied +sorting function a non-zero value must be returned. This value +will then be returned by ``tsk_table_sorter_run``. The error +return value should be chosen to avoid conflicts with tskit error +codes. + +See :c:func:`tsk_table_collection_sort` for details on the ``start`` parameter. + +@endrst + +@param self A pointer to a tsk_table_sorter_t object. +@param start The position in the tables at which sorting starts. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_sorter_run(struct _tsk_table_sorter_t *self, tsk_bookmark_t *start); + +/** +@brief Free the internal memory for the specified table sorter. + +@param self A pointer to an initialised tsk_table_sorter_t object. +@return Always returns 0. +*/ +int tsk_table_sorter_free(struct _tsk_table_sorter_t *self); + +/* @} */ int tsk_squash_edges( tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output_edges); diff --git a/docs/c-api.rst b/docs/c-api.rst index 2ba7c9ffd9..68383f54bf 100644 --- a/docs/c-api.rst +++ b/docs/c-api.rst @@ -341,6 +341,34 @@ Trees .. doxygengroup:: TREE_API_GROUP :content-only: + +.. _sec_c_api_low_level_sorting: + +***************** +Low-level sorting +***************** + +In some highly performance sensitive cases it can be useful to +have more control over the process of sorting tables. This low-level +API allows a user to provide their own edge sorting function. +This can be useful, for example, to use parallel sorting algorithms, +or to take advantage of the more efficient sorting procedures +available in C++. It is the user's responsibility to ensure that the +edge sorting requirements are fulfilled by this function. + +.. todo:: + Create an idiomatic C++11 example where we load a table collection + file from argv[1], and sort the edges using std::sort, based + on the example in tests/test_minimal_cpp.cpp. We can include + this in the examples below, and link to it here. + +.. doxygenstruct:: _tsk_table_sorter_t + :members: + +.. doxygengroup:: TABLE_SORTER_API_GROUP + :content-only: + + *********************** Miscellaneous functions ***********************