diff --git a/src/trees.rs b/src/trees.rs index ffc1c8dd5..e9eba789e 100644 --- a/src/trees.rs +++ b/src/trees.rs @@ -337,7 +337,12 @@ impl Tree { /// Return the `[left, right)` coordinates of the tree. pub fn interval(&self) -> (f64, f64) { - unsafe { ((*self.as_ptr()).left, (*self.as_ptr()).right) } + unsafe { + ( + (*self.as_ptr()).interval.left, + (*self.as_ptr()).interval.right, + ) + } } /// Return the length of the genome for which this @@ -681,7 +686,7 @@ impl<'a> RootIterator<'a> { fn new(tree: &'a Tree) -> Self { RootIterator { current_root: None, - next_root: tree.inner.left_root.into(), + next_root: unsafe { ll_bindings::tsk_tree_get_left_root(tree.as_ptr()).into() }, tree, } } @@ -905,9 +910,10 @@ impl TreeSequence { /// tskit::TreeSequenceFlags::default()).unwrap(); /// ``` pub fn new(tables: TableCollection, flags: TreeSequenceFlags) -> Result { + let mut t = tables; let mut treeseq = Self::wrap(); let rv = unsafe { - ll_bindings::tsk_treeseq_init(treeseq.as_mut_ptr(), tables.as_ptr(), flags.bits()) + ll_bindings::tsk_treeseq_init(treeseq.as_mut_ptr(), t.as_mut_ptr(), flags.bits()) }; handle_tsk_return_value!(rv, treeseq) } diff --git a/subprojects/kastore/kastore.c b/subprojects/kastore/kastore.c index 60e444dfc..ab61378ec 100644 --- a/subprojects/kastore/kastore.c +++ b/subprojects/kastore/kastore.c @@ -9,7 +9,10 @@ /* Private flag used to indicate when we have opened the file ourselves * and need to free it. */ -#define OWN_FILE (1 << 31) +/* Note: we use 1<<14 to keep this flag at the end of the flag space, + * and this is the highest bit that can be guaranteed to fit into + * an int. */ +#define OWN_FILE (1 << 14) const char * kas_strerror(int err) @@ -261,7 +264,7 @@ kastore_read_descriptors(kastore_t *self) if (size + KAS_HEADER_SIZE > self->file_size) { goto out; } - read_buffer = malloc(size); + read_buffer = (char *) malloc(size); if (read_buffer == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; @@ -386,7 +389,7 @@ kastore_read_file(kastore_t *self) assert(size > offset); size -= offset; - self->read_buffer = malloc(size); + self->read_buffer = (char *) malloc(size); if (self->read_buffer == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; @@ -480,7 +483,7 @@ kastore_read(kastore_t *self) goto out; } if (self->num_items > 0) { - self->items = calloc(self->num_items, sizeof(*self->items)); + self->items = (kaitem_t *) calloc(self->num_items, sizeof(*self->items)); if (self->items == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; @@ -559,6 +562,7 @@ kastore_open(kastore_t *self, const char *filename, const char *mode, int flags) tmp.file = NULL; if (err != 0) { ret = KAS_ERR_IO; + goto out; } } file = fopen(filename, file_mode); @@ -690,7 +694,7 @@ kastore_get(kastore_t *self, const char *key, size_t key_len, void **array, int ret = KAS_ERR_KEY_NOT_FOUND; kaitem_t search; kaitem_t *item; - search.key = malloc(key_len); + search.key = (char *) malloc(key_len); search.key_len = key_len; if (self->mode != KAS_READ) { @@ -733,7 +737,7 @@ static int KAS_WARN_UNUSED kastore_gets_type( kastore_t *self, const char *key, void **array, size_t *array_len, int type) { - int loaded_type; + int loaded_type = -1; int ret; ret = kastore_get(self, key, strlen(key), array, array_len, &loaded_type); @@ -847,7 +851,7 @@ kastore_oput(kastore_t *self, const char *key, size_t key_len, void *array, { int ret = 0; kaitem_t *new_item; - void *p; + kaitem_t *p; size_t j; if (self->mode != KAS_WRITE) { @@ -864,7 +868,7 @@ kastore_oput(kastore_t *self, const char *key, size_t key_len, void *array, } /* This isn't terribly efficient, but we're not expecting large * numbers of items. */ - p = realloc(self->items, (self->num_items + 1) * sizeof(*self->items)); + p = (kaitem_t *) realloc(self->items, (self->num_items + 1) * sizeof(*self->items)); if (p == NULL) { ret = KAS_ERR_NO_MEMORY; goto out; @@ -877,7 +881,7 @@ kastore_oput(kastore_t *self, const char *key, size_t key_len, void *array, new_item->key_len = key_len; new_item->array_len = array_len; new_item->array = array; - new_item->key = malloc(key_len); + new_item->key = (char *) malloc(key_len); if (new_item->key == NULL) { kas_safe_free(new_item->key); ret = KAS_ERR_NO_MEMORY; diff --git a/subprojects/kastore/kastore.h b/subprojects/kastore/kastore.h index cfaa9003f..e8aad7580 100644 --- a/subprojects/kastore/kastore.h +++ b/subprojects/kastore/kastore.h @@ -24,8 +24,6 @@ extern "C" { #include #include -/** @} */ - /** @defgroup ERROR_GROUP Error return values. @{ @@ -155,7 +153,7 @@ to the API or ABI are introduced, i.e., the addition of a new function. The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. */ -#define KAS_VERSION_PATCH 0 +#define KAS_VERSION_PATCH 2 /** @} */ #define KAS_HEADER_SIZE 64 diff --git a/subprojects/tskit/tskit/convert.c b/subprojects/tskit/tskit/convert.c index ed25c1ab8..c84ddebc3 100644 --- a/subprojects/tskit/tskit/convert.c +++ b/subprojects/tskit/tskit/convert.c @@ -57,12 +57,15 @@ tsk_newick_converter_run( const tsk_tree_t *tree = self->tree; tsk_id_t *stack = self->traversal_stack; const double *time = self->tree->tree_sequence->tables->nodes.time; + const tsk_flags_t *flags = self->tree->tree_sequence->tables->nodes.flags; int stack_top = 0; int label; size_t s = 0; int r; tsk_id_t u, v, w, root_parent; double branch_length; + bool ms_labels = self->options & TSK_NEWICK_LEGACY_MS_LABELS; + const char *label_format = ms_labels ? "%d" : "n%d"; if (root < 0 || root >= (tsk_id_t) self->tree->num_nodes) { ret = TSK_ERR_NODE_OUT_OF_BOUNDS; @@ -91,15 +94,20 @@ tsk_newick_converter_run( } else { u = tree->parent[v]; stack_top--; - if (tree->left_child[v] == TSK_NULL) { + label = -1; + if (ms_labels) { + if (tree->left_child[v] == TSK_NULL) { + label = (int) v + 1; + } + } else if (flags[v] & TSK_NODE_IS_SAMPLE) { + label = (int) v; + } + if (label != -1) { if (s >= buffer_size) { ret = TSK_ERR_BUFFER_OVERFLOW; goto out; } - /* We do this for ms-compatability. This should be a configurable option - * via the flags attribute */ - label = (int) v + 1; - r = snprintf(buffer + s, buffer_size - s, "%d", label); + r = snprintf(buffer + s, buffer_size - s, label_format, label); if (r < 0) { ret = TSK_ERR_IO; goto out; @@ -154,8 +162,7 @@ tsk_newick_converter_init(tsk_newick_converter_t *self, const tsk_tree_t *tree, self->options = options; self->tree = tree; self->traversal_stack - = tsk_malloc(tsk_treeseq_get_num_nodes(self->tree->tree_sequence) - * sizeof(*self->traversal_stack)); + = tsk_malloc(tsk_tree_get_size_bound(tree) * sizeof(*self->traversal_stack)); if (self->traversal_stack == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; diff --git a/subprojects/tskit/tskit/convert.h b/subprojects/tskit/tskit/convert.h index 4f74b1168..a49593a9d 100644 --- a/subprojects/tskit/tskit/convert.h +++ b/subprojects/tskit/tskit/convert.h @@ -32,6 +32,8 @@ extern "C" { #include +#define TSK_NEWICK_LEGACY_MS_LABELS (1 << 0) + int tsk_convert_newick(const tsk_tree_t *tree, tsk_id_t root, unsigned int precision, tsk_flags_t options, size_t buffer_size, char *buffer); diff --git a/subprojects/tskit/tskit/core.c b/subprojects/tskit/tskit/core.c index c71952cc2..48d003867 100644 --- a/subprojects/tskit/tskit/core.c +++ b/subprojects/tskit/tskit/core.c @@ -44,7 +44,7 @@ get_random_bytes(uint8_t *buf) { /* Based on CPython's code in bootstrap_hash.c */ int ret = TSK_ERR_GENERATE_UUID; - HCRYPTPROV hCryptProv = NULL; + HCRYPTPROV hCryptProv = (HCRYPTPROV) NULL; if (!CryptAcquireContext( &hCryptProv, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT)) { @@ -54,13 +54,13 @@ get_random_bytes(uint8_t *buf) goto out; } if (!CryptReleaseContext(hCryptProv, 0)) { - hCryptProv = NULL; + hCryptProv = (HCRYPTPROV) NULL; goto out; } - hCryptProv = NULL; + hCryptProv = (HCRYPTPROV) NULL; ret = 0; out: - if (hCryptProv != NULL) { + if (hCryptProv != (HCRYPTPROV) NULL) { CryptReleaseContext(hCryptProv, 0); } return ret; @@ -216,6 +216,9 @@ tsk_strerror_internal(int err) case TSK_ERR_GENOME_COORDS_NONFINITE: ret = "Genome coordinates must be finite numbers"; break; + case TSK_ERR_SEEK_OUT_OF_BOUNDS: + ret = "Tree seek position out of bounds"; + break; /* Edge errors */ case TSK_ERR_NULL_PARENT: @@ -308,6 +311,11 @@ tsk_strerror_internal(int err) "values for any single site."; break; + /* Migration errors */ + case TSK_ERR_UNSORTED_MIGRATIONS: + ret = "Migrations must be sorted by time."; + break; + /* Sample errors */ case TSK_ERR_DUPLICATE_SAMPLE: ret = "Duplicate sample value"; @@ -363,6 +371,9 @@ tsk_strerror_internal(int err) case TSK_ERR_CANNOT_EXTEND_FROM_SELF: ret = "Tables can only be extended using rows from a different table"; break; + case TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED: + ret = "Silent mutations not supported by this operation"; + break; /* Stats errors */ case TSK_ERR_BAD_NUM_WINDOWS: @@ -395,6 +406,10 @@ tsk_strerror_internal(int err) case TSK_ERR_UNSUPPORTED_STAT_MODE: ret = "Requested statistics mode not supported for this method."; break; + case TSK_ERR_TIME_UNCALIBRATED: + ret = "Statistics using branch lengths cannot be calculated when time_units " + "is 'uncalibrated'"; + break; /* Mutation mapping errors */ case TSK_ERR_GENOTYPES_ALL_MISSING: @@ -468,12 +483,19 @@ tsk_strerror_internal(int err) break; /* IBD errors */ - case TSK_ERR_NO_SAMPLE_PAIRS: - ret = "There are no possible sample pairs."; + case TSK_ERR_SAME_NODES_IN_PAIR: + ret = "Both nodes in the sample pair are the same"; + break; + + case TSK_ERR_IBD_PAIRS_NOT_STORED: + ret = "The sample pairs are not stored by default in ibd_segments. Please " + "add the TSK_IBD_STORE_PAIRS option flag if per-pair statistics are " + "required."; break; - case TSK_ERR_DUPLICATE_SAMPLE_PAIRS: - ret = "There are duplicate sample pairs."; + case TSK_ERR_IBD_SEGMENTS_NOT_STORED: + ret = "All segments are not stored by default in ibd_segments. Please " + "add the TSK_IBD_STORE_SEGMENTS option flag if they are required."; break; /* Simplify errors */ @@ -517,12 +539,17 @@ tsk_is_kas_error(int err) return !(err & (1 << TSK_KAS_ERR_BIT)); } +int +tsk_get_kas_error(int err) +{ + return err ^ (1 << TSK_KAS_ERR_BIT); +} + const char * tsk_strerror(int err) { if (err != 0 && tsk_is_kas_error(err)) { - err ^= (1 << TSK_KAS_ERR_BIT); - return kas_strerror(err); + return kas_strerror(tsk_get_kas_error(err)); } else { return tsk_strerror_internal(err); } @@ -710,6 +737,38 @@ tsk_is_unknown_time(double val) return nan_union.i == TSK_UNKNOWN_TIME_HEX; } +/* Work around a bug which seems to show up on various mixtures of + * compiler and libc versions, where isfinite and isnan result in + * spurious warnings about casting down to float. The original issue + * is here: + * https://github.com/tskit-dev/tskit/issues/721 + * + * The simplest approach seems to be to use the builtins where they + * are available (clang and gcc), and to use the library macro + * otherwise. There would be no disadvantage to using the builtin + * version, so there's no real harm in this approach. + */ + +bool +tsk_isnan(double val) +{ +#if defined(__GNUC__) + return __builtin_isnan(val); +#else + return isnan(val); +#endif +} + +bool +tsk_isfinite(double val) +{ +#if defined(__GNUC__) + return __builtin_isfinite(val); +#else + return isfinite(val); +#endif +} + void * tsk_malloc(tsk_size_t size) { @@ -776,3 +835,246 @@ tsk_memcmp(const void *s1, const void *s2, tsk_size_t size) { return memcmp(s1, s2, (size_t) size); } + +/* We can't initialise the stream to its real default value because + * of limitations on static initialisers. To work around this, we initialise + * it to NULL and then set the value to the required standard stream + * when called. */ + +FILE *_tsk_debug_stream = NULL; + +void +tsk_set_debug_stream(FILE *f) +{ + _tsk_debug_stream = f; +} + +FILE * +tsk_get_debug_stream(void) +{ + if (_tsk_debug_stream == NULL) { + _tsk_debug_stream = stdout; + } + return _tsk_debug_stream; +} + +/* AVL Tree implementation. This is based directly on Knuth's implementation + * in TAOCP. See the python/tests/test_avl_tree.py for more information, + * and equivalent code annotated with the original algorithm listing. + */ + +static void +tsk_avl_tree_int_print_node(tsk_avl_node_int_t *node, int depth, FILE *out) +{ + int d; + + if (node == NULL) { + return; + } + for (d = 0; d < depth; d++) { + fprintf(out, " "); + } + fprintf(out, "key=%d balance=%d\n", (int) node->key, node->balance); + tsk_avl_tree_int_print_node(node->llink, depth + 1, out); + tsk_avl_tree_int_print_node(node->rlink, depth + 1, out); +} +void +tsk_avl_tree_int_print_state(tsk_avl_tree_int_t *self, FILE *out) +{ + fprintf(out, "AVL tree: size=%d height=%d\n", (int) self->size, (int) self->height); + tsk_avl_tree_int_print_node(self->head.rlink, 0, out); +} + +int +tsk_avl_tree_int_init(tsk_avl_tree_int_t *self) +{ + memset(self, 0, sizeof(*self)); + return 0; +} + +int +tsk_avl_tree_int_free(tsk_avl_tree_int_t *TSK_UNUSED(self)) +{ + return 0; +} + +tsk_avl_node_int_t * +tsk_avl_tree_int_get_root(const tsk_avl_tree_int_t *self) +{ + return self->head.rlink; +} + +tsk_avl_node_int_t * +tsk_avl_tree_int_search(const tsk_avl_tree_int_t *self, int64_t key) +{ + tsk_avl_node_int_t *P = self->head.rlink; + + while (P != NULL) { + if (key == P->key) { + break; + } else if (key < P->key) { + P = P->llink; + } else { + P = P->rlink; + } + } + return P; +} + +static int +tsk_avl_tree_int_insert_empty(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) +{ + self->head.rlink = node; + self->size = 1; + self->height = 1; + node->llink = NULL; + node->rlink = NULL; + node->balance = 0; + return 0; +} + +#define get_link(a, P) ((a) == -1 ? (P)->llink : (P)->rlink) +#define set_link(a, P, val) \ + do { \ + if ((a) == -1) { \ + (P)->llink = val; \ + } else { \ + (P)->rlink = val; \ + } \ + } while (0); + +static int +tsk_avl_tree_int_insert_non_empty(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) +{ + const int64_t K = node->key; + tsk_avl_node_int_t *T = &self->head; + tsk_avl_node_int_t *S = T->rlink; + tsk_avl_node_int_t *P = T->rlink; + tsk_avl_node_int_t *Q, *R; + int a; + + while (true) { + if (K == P->key) { + /* TODO figure out what the most useful semantics are here. Just + * returning 1 as a non-zero value for now. */ + return 1; + } else if (K < P->key) { + Q = P->llink; + if (Q == NULL) { + Q = node; + P->llink = Q; + break; + } + } else { + Q = P->rlink; + if (Q == NULL) { + Q = node; + P->rlink = Q; + break; + } + } + if (Q->balance != 0) { + T = P; + S = Q; + } + P = Q; + } + + self->size++; + Q->llink = NULL; + Q->rlink = NULL; + Q->balance = 0; + + if (K < S->key) { + a = -1; + } else { + a = 1; + } + P = get_link(a, S); + R = P; + while (P != Q) { + if (K < P->key) { + P->balance = -1; + P = P->llink; + } else if (K > P->key) { + P->balance = 1; + P = P->rlink; + } + } + + if (S->balance == 0) { + S->balance = a; + self->height++; + } else if (S->balance == -a) { + S->balance = 0; + } else { + if (R->balance == a) { + P = R; + set_link(a, S, get_link(-a, R)); + set_link(-a, R, S); + S->balance = 0; + R->balance = 0; + } else if (R->balance == -a) { + P = get_link(-a, R); + set_link(-a, R, get_link(a, P)); + set_link(a, P, R); + set_link(a, S, get_link(-a, P)); + set_link(-a, P, S); + if (P->balance == a) { + S->balance = -a; + R->balance = 0; + } else if (P->balance == 0) { + S->balance = 0; + R->balance = 0; + } else { + S->balance = 0; + R->balance = a; + } + P->balance = 0; + } + if (S == T->rlink) { + T->rlink = P; + } else { + T->llink = P; + } + } + return 0; +} + +int +tsk_avl_tree_int_insert(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node) +{ + int ret = 0; + + if (self->size == 0) { + ret = tsk_avl_tree_int_insert_empty(self, node); + } else { + ret = tsk_avl_tree_int_insert_non_empty(self, node); + } + return ret; +} + +/* An inorder traversal of the nodes in an AVL tree (or any binary search tree) + * yields the keys in sorted order. The recursive implementation is safe here + * because this is an AVL tree and it is strictly balanced, the depth is very + * limited. Using GCC's __builtin_frame_address it looks like the size of a stack + * frame for this function is 48 bytes. Assuming a stack size of 1MiB, this + * would give us a maximum tree depth of 21845 - so, we're pretty safe. + */ +static int +ordered_nodes_traverse(tsk_avl_node_int_t *node, int index, tsk_avl_node_int_t **out) +{ + if (node == NULL) { + return index; + } + index = ordered_nodes_traverse(node->llink, index, out); + out[index] = node; + return ordered_nodes_traverse(node->rlink, index + 1, out); +} + +int +tsk_avl_tree_int_ordered_nodes(const tsk_avl_tree_int_t *self, tsk_avl_node_int_t **out) +{ + ordered_nodes_traverse(self->head.rlink, 0, out); + return 0; +} diff --git a/subprojects/tskit/tskit/core.h b/subprojects/tskit/tskit/core.h index 4fd226a65..a0f4927a3 100644 --- a/subprojects/tskit/tskit/core.h +++ b/subprojects/tskit/tskit/core.h @@ -40,28 +40,6 @@ extern "C" { #include #include -#if defined(_TSK_WORKAROUND_FALSE_CLANG_WARNING) && defined(__clang__) -/* Work around bug in clang >= 6.0, https://github.com/tskit-dev/tskit/issues/721 - * (note: fixed in clang January 2019) - * This workaround does some nasty fiddling with builtins and is only intended to - * be used within the library. To turn it on, make sure - * _TSK_WORKAROUND_FALSE_CLANG_WARNING is defined before including any tskit - * headers. - */ -#if __has_builtin(__builtin_isnan) -#undef isnan -#define isnan __builtin_isnan -#else -abort(); -#endif -#if __has_builtin(__builtin_isfinite) -#undef isfinite -#define isfinite __builtin_isfinite -#else -abort(); -#endif -#endif - #ifdef __GNUC__ #define TSK_WARN_UNUSED __attribute__((warn_unused_result)) #define TSK_UNUSED(x) TSK_UNUSED_##x __attribute__((__unused__)) @@ -109,6 +87,9 @@ __tsk_nan_f(void) } #define TSK_UNKNOWN_TIME __tsk_nan_f() +#define TSK_TIME_UNITS_UNKNOWN "unknown" +#define TSK_TIME_UNITS_UNCALIBRATED "uncalibrated" + /** @brief Tskit Object IDs. @@ -132,11 +113,11 @@ missing data. * on the thread above. */ typedef int64_t tsk_id_t; -#define TSK_MAX_ID INT64_MAX +#define TSK_MAX_ID INT64_MAX - 1 #define TSK_ID_STORAGE_TYPE KAS_INT64 #else typedef int32_t tsk_id_t; -#define TSK_MAX_ID INT32_MAX +#define TSK_MAX_ID INT32_MAX - 1 #define TSK_ID_STORAGE_TYPE KAS_INT32 #endif @@ -182,7 +163,7 @@ to the API or ABI are introduced, i.e., the addition of a new function. The library patch version. Incremented when any changes not relevant to the to the API or ABI are introduced, i.e., internal refactors of bugfixes. */ -#define TSK_VERSION_PATCH 14 +#define TSK_VERSION_PATCH 15 /** @} */ /* Node flags */ @@ -197,7 +178,7 @@ to the API or ABI are introduced, i.e., internal refactors of bugfixes. #define TSK_FILE_FORMAT_NAME "tskit.trees" #define TSK_FILE_FORMAT_NAME_LENGTH 11 #define TSK_FILE_FORMAT_VERSION_MAJOR 12 -#define TSK_FILE_FORMAT_VERSION_MINOR 5 +#define TSK_FILE_FORMAT_VERSION_MINOR 7 /** @defgroup GENERAL_ERROR_GROUP General errors. @@ -280,6 +261,7 @@ An unsupported type was provided for a column in the file. #define TSK_ERR_PROVENANCE_OUT_OF_BOUNDS -209 #define TSK_ERR_TIME_NONFINITE -210 #define TSK_ERR_GENOME_COORDS_NONFINITE -211 +#define TSK_ERR_SEEK_OUT_OF_BOUNDS -212 /* Edge errors */ #define TSK_ERR_NULL_PARENT -300 @@ -313,6 +295,9 @@ An unsupported type was provided for a column in the file. #define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE -508 #define TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN -509 +/* Migration errors */ +#define TSK_ERR_UNSORTED_MIGRATIONS -550 + /* Sample errors */ #define TSK_ERR_DUPLICATE_SAMPLE -600 #define TSK_ERR_BAD_SAMPLES -601 @@ -334,6 +319,7 @@ An unsupported type was provided for a column in the file. #define TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED -804 #define TSK_ERR_MIGRATIONS_NOT_SUPPORTED -805 #define TSK_ERR_CANNOT_EXTEND_FROM_SELF -806 +#define TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED -807 /* Stats errors */ #define TSK_ERR_BAD_NUM_WINDOWS -900 @@ -346,6 +332,8 @@ An unsupported type was provided for a column in the file. #define TSK_ERR_BAD_SAMPLE_SET_INDEX -907 #define TSK_ERR_EMPTY_SAMPLE_SET -908 #define TSK_ERR_UNSUPPORTED_STAT_MODE -909 +#define TSK_ERR_TIME_UNCALIBRATED -910 + /* Mutation mapping errors */ #define TSK_ERR_GENOTYPES_ALL_MISSING -1000 @@ -377,8 +365,9 @@ An unsupported type was provided for a column in the file. #define TSK_ERR_UNION_DIFF_HISTORIES -1401 /* IBD errors */ -#define TSK_ERR_NO_SAMPLE_PAIRS -1500 -#define TSK_ERR_DUPLICATE_SAMPLE_PAIRS -1501 +#define TSK_ERR_SAME_NODES_IN_PAIR -1500 +#define TSK_ERR_IBD_PAIRS_NOT_STORED -1501 +#define TSK_ERR_IBD_SEGMENTS_NOT_STORED -1502 /* Simplify errors */ #define TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE -1600 @@ -395,6 +384,7 @@ An unsupported type was provided for a column in the file. int tsk_set_kas_error(int err); bool tsk_is_kas_error(int err); +int tsk_get_kas_error(int err); /** @brief Return a description of the specified error. @@ -462,11 +452,57 @@ extern int tsk_blkalloc_init(tsk_blkalloc_t *self, size_t chunk_size); extern void *tsk_blkalloc_get(tsk_blkalloc_t *self, size_t size); extern void tsk_blkalloc_free(tsk_blkalloc_t *self); +typedef struct _tsk_avl_node_int_t { + int64_t key; + void *value; + struct _tsk_avl_node_int_t *llink; + struct _tsk_avl_node_int_t *rlink; + /* This can only contain -1, 0, 1. We could set it to a smaller type, + * but there's no point because of struct padding and alignment so + * it's simplest to keep it as a plain int. */ + int balance; +} tsk_avl_node_int_t; + +typedef struct { + tsk_avl_node_int_t head; + tsk_size_t size; + tsk_size_t height; +} tsk_avl_tree_int_t; + +int tsk_avl_tree_int_init(tsk_avl_tree_int_t *self); +int tsk_avl_tree_int_free(tsk_avl_tree_int_t *self); +void tsk_avl_tree_int_print_state(tsk_avl_tree_int_t *self, FILE *out); +int tsk_avl_tree_int_insert(tsk_avl_tree_int_t *self, tsk_avl_node_int_t *node); +tsk_avl_node_int_t *tsk_avl_tree_int_search(const tsk_avl_tree_int_t *self, int64_t key); +int tsk_avl_tree_int_ordered_nodes( + const tsk_avl_tree_int_t *self, tsk_avl_node_int_t **out); +tsk_avl_node_int_t *tsk_avl_tree_int_get_root(const tsk_avl_tree_int_t *self); + tsk_size_t tsk_search_sorted(const double *array, tsk_size_t size, double value); + double tsk_round(double x, unsigned int ndigits); +/** +@brief Check if a number is TSK_UNKNOWN_TIME + +@rst +Unknown time values in tskit are represented by a particular NaN value. Since NaN values +are not equal to each other by definition, a simple comparison like +``mutation.time == TSK_UNKNOWN_TIME`` will fail even if the mutation's time is +TSK_UNKNOWN_TIME. This function compares the underlying bit representation of a double +value and returns true iff it is equal to the specific NaN value TSK_UNKNOWN_TIME. +@endrst + +@param val The number to check +@return true if the number is TSK_UNKNOWN_TIME else false +*/ bool tsk_is_unknown_time(double val); +/* We define local versions of isnan and isfinite to workaround some portability + * issues. */ +bool tsk_isnan(double val); +bool tsk_isfinite(double val); + #define TSK_UUID_SIZE 36 int tsk_generate_uuid(char *dest, int flags); @@ -480,6 +516,10 @@ void *tsk_memcpy(void *dest, const void *src, tsk_size_t size); void *tsk_memmove(void *dest, const void *src, tsk_size_t size); int tsk_memcmp(const void *s1, const void *s2, tsk_size_t size); +/* Developer debug utilities. These are **not** threadsafe */ +void tsk_set_debug_stream(FILE *f); +FILE *tsk_get_debug_stream(void); + #ifdef __cplusplus } #endif diff --git a/subprojects/tskit/tskit/genotypes.c b/subprojects/tskit/tskit/genotypes.c index 509ce38b7..ecf295487 100644 --- a/subprojects/tskit/tskit/genotypes.c +++ b/subprojects/tskit/tskit/genotypes.c @@ -462,10 +462,11 @@ tsk_vargen_mark_missing_i16(tsk_vargen_t *self) const tsk_id_t *restrict left_child = self->tree.left_child; const tsk_id_t *restrict right_sib = self->tree.right_sib; const tsk_id_t *restrict sample_index_map = self->sample_index_map; + const tsk_id_t N = self->tree.virtual_root; int16_t *restrict genotypes = self->variant.genotypes.i16; tsk_id_t root, sample_index; - for (root = self->tree.left_root; root != TSK_NULL; root = right_sib[root]) { + for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { if (left_child[root] == TSK_NULL) { sample_index = sample_index_map[root]; if (sample_index != TSK_NULL) { @@ -484,10 +485,11 @@ tsk_vargen_mark_missing_i8(tsk_vargen_t *self) const tsk_id_t *restrict left_child = self->tree.left_child; const tsk_id_t *restrict right_sib = self->tree.right_sib; const tsk_id_t *restrict sample_index_map = self->sample_index_map; + const tsk_id_t N = self->tree.virtual_root; int8_t *restrict genotypes = self->variant.genotypes.i8; tsk_id_t root, sample_index; - for (root = self->tree.left_root; root != TSK_NULL; root = right_sib[root]) { + for (root = left_child[N]; root != TSK_NULL; root = right_sib[root]) { if (left_child[root] == TSK_NULL) { sample_index = sample_index_map[root]; if (sample_index != TSK_NULL) { diff --git a/subprojects/tskit/tskit/haplotype_matching.c b/subprojects/tskit/tskit/haplotype_matching.c index 29b6ae9d1..48f56e200 100644 --- a/subprojects/tskit/tskit/haplotype_matching.c +++ b/subprojects/tskit/tskit/haplotype_matching.c @@ -31,6 +31,8 @@ #include +#define MAX_PARSIMONY_WORDS 256 + const char *_zero_one_alleles[] = { "0", "1", NULL }; const char *_acgt_alleles[] = { "A", "C", "G", "T", NULL }; @@ -187,6 +189,10 @@ tsk_ls_hmm_init(tsk_ls_hmm_t *self, tsk_treeseq_t *tree_sequence, } self->num_values = 0; self->max_values = 0; + /* Keep this as a struct variable so that we can test overflow, but this + * should never be set to more than MAX_PARSIMONY_WORDS as we're doing + * a bunch of stack allocations based on this. */ + self->max_parsimony_words = MAX_PARSIMONY_WORDS; ret = 0; out: return ret; @@ -268,7 +274,7 @@ tsk_ls_hmm_remove_dead_roots(tsk_ls_hmm_t *self) tsk_id_t *restrict T_index = self->transition_index; tsk_value_transition_t *restrict T = self->transitions; const tsk_id_t *restrict right_sib = self->tree.right_sib; - const tsk_id_t left_root = self->tree.left_root; + const tsk_id_t left_root = tsk_tree_get_left_root(&self->tree); const tsk_id_t *restrict parent = self->parent; tsk_id_t root, u; tsk_size_t j; @@ -398,6 +404,7 @@ tsk_ls_hmm_update_probabilities( tsk_id_t *restrict T_index = self->transition_index; tsk_value_transition_t *restrict T = self->transitions; int8_t *restrict allelic_state = self->allelic_state; + const tsk_id_t left_root = tsk_tree_get_left_root(tree); tsk_mutation_t mut; tsk_id_t j, u, v; double x; @@ -409,7 +416,7 @@ tsk_ls_hmm_update_probabilities( if (ret < 0) { goto out; } - for (root = tree->left_root; root != TSK_NULL; root = tree->right_sib[root]) { + for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) { allelic_state[root] = (int8_t) ret; } @@ -454,7 +461,7 @@ tsk_ls_hmm_update_probabilities( } /* Unset the allelic states */ - for (root = tree->left_root; root != TSK_NULL; root = tree->right_sib[root]) { + for (root = left_root; root != TSK_NULL; root = tree->right_sib[root]) { allelic_state[root] = TSK_MISSING_DATA; } for (j = 0; j < (tsk_id_t) site->mutations_length; j++) { @@ -555,7 +562,6 @@ get_smallest_element(const uint64_t *restrict A, tsk_size_t u, tsk_size_t num_wo return j * 64 + get_smallest_set_bit(a[j]); } -#define MAX_PARSIMONY_WORDS 256 /* static variables are zero-initialised by default. */ static const uint64_t zero_block[MAX_PARSIMONY_WORDS]; @@ -714,7 +720,7 @@ tsk_ls_hmm_setup_optimal_value_sets(tsk_ls_hmm_t *self) * could in principle release back the memory as well, but it doesn't seem * worth the bother. */ self->num_optimal_value_set_words = (self->num_values / 64) + 1; - if (self->num_optimal_value_set_words > MAX_PARSIMONY_WORDS) { + if (self->num_optimal_value_set_words > self->max_parsimony_words) { ret = TSK_ERR_TOO_MANY_VALUES; goto out; } @@ -814,7 +820,11 @@ tsk_ls_hmm_redistribute_transitions(tsk_ls_hmm_t *self) old_num_transitions = self->num_transitions; self->num_transitions = 0; - for (root = self->tree.left_root; root != TSK_NULL; root = right_sib[root]) { + /* TODO refactor this to push the virtual root onto the stack rather then + * iterating over the roots. See the existing parsimony implementations + * for an example. */ + for (root = tsk_tree_get_left_root(&self->tree); root != TSK_NULL; + root = right_sib[root]) { stack[0].tree_node = root; stack[0].old_state = T_old[T_index[root]].value_index; stack[0].new_state @@ -1004,10 +1014,10 @@ tsk_ls_hmm_run(tsk_ls_hmm_t *self, int8_t *haplotype, static double tsk_ls_hmm_compute_normalisation_factor_forward(tsk_ls_hmm_t *self) { - tsk_id_t *restrict N = self->num_transition_samples; + tsk_size_t *restrict N = self->num_transition_samples; tsk_value_transition_t *restrict T = self->transitions; const tsk_id_t *restrict T_parent = self->transition_parent; - const tsk_id_t *restrict num_samples = self->tree.num_samples; + const tsk_size_t *restrict num_samples = self->tree.num_samples; const tsk_id_t num_transitions = (tsk_id_t) self->num_transitions; double normalisation_factor; tsk_id_t j; @@ -1560,14 +1570,14 @@ tsk_viterbi_matrix_traceback( if (ret != 0) { goto out; } - while (tree.left > site.position) { + while (tree.interval.left > site.position) { ret = tsk_tree_prev(&tree); if (ret < 0) { goto out; } } - tsk_bug_assert(tree.left <= site.position); - tsk_bug_assert(site.position < tree.right); + tsk_bug_assert(tree.interval.left <= site.position); + tsk_bug_assert(site.position < tree.interval.right); /* Fill in the recombination tree */ rr_record_tmp = rr_record; diff --git a/subprojects/tskit/tskit/haplotype_matching.h b/subprojects/tskit/tskit/haplotype_matching.h index a7ea5ac29..4c392a1b9 100644 --- a/subprojects/tskit/tskit/haplotype_matching.h +++ b/subprojects/tskit/tskit/haplotype_matching.h @@ -115,13 +115,14 @@ typedef struct _tsk_ls_hmm_t { double *values; tsk_size_t num_values; tsk_size_t max_values; + tsk_size_t max_parsimony_words; /* Number of machine words per node optimal value set. */ tsk_size_t num_optimal_value_set_words; uint64_t *optimal_value_sets; /* The parent transition; used during compression */ tsk_id_t *transition_parent; /* The number of samples directly subtended by a transition */ - tsk_id_t *num_transition_samples; + tsk_size_t *num_transition_samples; int8_t *allelic_state; /* Algorithms set these values before they are run */ int (*next_probability)( diff --git a/subprojects/tskit/tskit/stats.c b/subprojects/tskit/tskit/stats.c index 031519137..c4f4e572c 100644 --- a/subprojects/tskit/tskit/stats.c +++ b/subprojects/tskit/tskit/stats.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019 Tskit Developers + * Copyright (c) 2019-2021 Tskit Developers * Copyright (c) 2016-2017 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -30,67 +30,31 @@ #include -static void -tsk_ld_calc_check_state(const tsk_ld_calc_t *self) -{ - tsk_size_t u; - tsk_size_t num_nodes = tsk_treeseq_get_num_nodes(self->tree_sequence); - tsk_tree_t *tA = self->outer_tree; - tsk_tree_t *tB = self->inner_tree; - - tsk_bug_assert(tA->index == tB->index); - - /* The inner tree's mark values should all be zero. */ - for (u = 0; u < num_nodes; u++) { - tsk_bug_assert(tA->marked[u] == 0); - tsk_bug_assert(tB->marked[u] == 0); - } -} - void tsk_ld_calc_print_state(const tsk_ld_calc_t *self, FILE *out) { - fprintf(out, "tree_sequence = %p\n", (const void *) self->tree_sequence); - fprintf(out, "outer tree index = %lld\n", (long long) self->outer_tree->index); - fprintf(out, "outer tree interval = (%f, %f)\n", self->outer_tree->left, - self->outer_tree->right); - fprintf(out, "inner tree index = %lld\n", (long long) self->inner_tree->index); - fprintf(out, "inner tree interval = (%f, %f)\n", self->inner_tree->left, - self->inner_tree->right); - tsk_ld_calc_check_state(self); + fprintf(out, "tree = %p\n", (const void *) &self->tree); + fprintf(out, "max_sites = %d\n", (int) self->max_sites); + fprintf(out, "max_distance = %f\n", self->max_distance); } int TSK_WARN_UNUSED tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence) { - int ret = TSK_ERR_GENERIC; + int ret = 0; + tsk_memset(self, 0, sizeof(*self)); - tsk_memset(self, 0, sizeof(tsk_ld_calc_t)); - self->tree_sequence = tree_sequence; - self->num_sites = tsk_treeseq_get_num_sites(tree_sequence); - self->outer_tree = tsk_malloc(sizeof(tsk_tree_t)); - self->inner_tree = tsk_malloc(sizeof(tsk_tree_t)); - if (self->outer_tree == NULL || self->inner_tree == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - ret = tsk_tree_init(self->outer_tree, self->tree_sequence, TSK_SAMPLE_LISTS); + ret = tsk_tree_init(&self->tree, tree_sequence, 0); if (ret != 0) { goto out; } - ret = tsk_tree_init(self->inner_tree, self->tree_sequence, 0); - if (ret != 0) { - goto out; - } - ret = tsk_tree_first(self->outer_tree); - if (ret < 0) { - goto out; - } - ret = tsk_tree_first(self->inner_tree); - if (ret < 0) { + self->tree_sequence = tree_sequence; + self->total_samples = tsk_treeseq_get_num_samples(self->tree_sequence); + + self->sample_buffer = tsk_malloc(self->total_samples * sizeof(*self->sample_buffer)); + if (self->sample_buffer == NULL) { goto out; } - ret = 0; out: return ret; } @@ -98,378 +62,244 @@ tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence) int tsk_ld_calc_free(tsk_ld_calc_t *self) { - if (self->inner_tree != NULL) { - tsk_tree_free(self->inner_tree); - free(self->inner_tree); - } - if (self->outer_tree != NULL) { - tsk_tree_free(self->outer_tree); - free(self->outer_tree); - } + tsk_tree_free(&self->tree); + tsk_safe_free(self->sample_buffer); return 0; } -/* Position the two trees so that the specified site is within their - * interval. - */ -static int TSK_WARN_UNUSED -tsk_ld_calc_position_trees(tsk_ld_calc_t *self, tsk_id_t site_index) +static int +tsk_ld_calc_check_site(tsk_ld_calc_t *TSK_UNUSED(self), const tsk_site_t *site) { - int ret = TSK_ERR_GENERIC; - tsk_site_t mut; - double x; - tsk_tree_t *tA = self->outer_tree; - tsk_tree_t *tB = self->inner_tree; + int ret = 0; - ret = tsk_treeseq_get_site(self->tree_sequence, site_index, &mut); - if (ret != 0) { + /* These are both limitations in the current implementation, there's no + * fundamental reason why we can't support them */ + if (site->mutations_length != 1) { + ret = TSK_ERR_ONLY_INFINITE_SITES; goto out; } - x = mut.position; - tsk_bug_assert(tA->index == tB->index); - while (x >= tA->right) { - ret = tsk_tree_next(tA); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); - ret = tsk_tree_next(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); - } - while (x < tA->left) { - ret = tsk_tree_prev(tA); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); - ret = tsk_tree_prev(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); + if (site->ancestral_state_length == site->mutations[0].derived_state_length + && tsk_memcmp(site->ancestral_state, site->mutations[0].derived_state, + site->ancestral_state_length) + == 0) { + ret = TSK_ERR_SILENT_MUTATIONS_NOT_SUPPORTED; + goto out; } - ret = 0; - tsk_bug_assert(x >= tA->left && x < tB->right); - tsk_bug_assert(tA->index == tB->index); out: return ret; } -static double -tsk_ld_calc_overlap_within_tree(tsk_ld_calc_t *self, tsk_site_t sA, tsk_site_t sB) -{ - const tsk_tree_t *t = self->inner_tree; - const tsk_node_table_t *nodes = &self->tree_sequence->tables->nodes; - tsk_id_t u, v, nAB; - - tsk_bug_assert(sA.mutations_length == 1); - tsk_bug_assert(sB.mutations_length == 1); - u = sA.mutations[0].node; - v = sB.mutations[0].node; - if (nodes->time[u] > nodes->time[v]) { - v = sA.mutations[0].node; - u = sB.mutations[0].node; - } - while (u != v && u != TSK_NULL) { - u = t->parent[u]; - } - nAB = 0; - if (u == v) { - nAB = TSK_MIN( - t->num_samples[sA.mutations[0].node], t->num_samples[sB.mutations[0].node]); - } - return (double) nAB; -} - -static inline int TSK_WARN_UNUSED -tsk_ld_calc_set_tracked_samples(tsk_ld_calc_t *self, tsk_site_t sA) +static int +tsk_ld_calc_set_focal_samples(tsk_ld_calc_t *self) { int ret = 0; + tsk_id_t focal_node = self->focal_site.mutations[0].node; - tsk_bug_assert(sA.mutations_length == 1); - ret = tsk_tree_set_tracked_samples_from_sample_list( - self->inner_tree, self->outer_tree, sA.mutations[0].node); + ret = tsk_tree_track_descendant_samples(&self->tree, focal_node); + if (ret != 0) { + goto out; + } + self->focal_samples = self->tree.num_tracked_samples[focal_node]; +out: return ret; } -static int TSK_WARN_UNUSED -tsk_ld_calc_get_r2_array_forward(tsk_ld_calc_t *self, tsk_id_t source_index, - tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values) +static int +tsk_ld_calc_initialise(tsk_ld_calc_t *self, tsk_id_t a) { - int ret = TSK_ERR_GENERIC; - tsk_site_t sA, sB; - double fA, fB, fAB, D; - int tracked_samples_set = 0; - tsk_tree_t *tA, *tB; - double n = (double) tsk_treeseq_get_num_samples(self->tree_sequence); - tsk_id_t j; - double nAB; + int ret = 0; - tA = self->outer_tree; - tB = self->inner_tree; - ret = tsk_treeseq_get_site(self->tree_sequence, source_index, &sA); + ret = tsk_treeseq_get_site(self->tree_sequence, a, &self->focal_site); if (ret != 0) { goto out; } - if (sA.mutations_length > 1) { - ret = TSK_ERR_ONLY_INFINITE_SITES; + ret = tsk_ld_calc_check_site(self, &self->focal_site); + if (ret != 0) { goto out; } - fA = ((double) tA->num_samples[sA.mutations[0].node]) / n; - tsk_bug_assert(fA > 0); - tB->mark = 1; - for (j = 0; j < (tsk_id_t) max_sites; j++) { - if (source_index + j + 1 >= (tsk_id_t) self->num_sites) { - break; - } - ret = tsk_treeseq_get_site(self->tree_sequence, (source_index + j + 1), &sB); - if (ret != 0) { - goto out; - } - if (sB.mutations_length > 1) { - ret = TSK_ERR_ONLY_INFINITE_SITES; - goto out; - } - if (sB.position - sA.position > max_distance) { - break; - } - while (sB.position >= tB->right) { - ret = tsk_tree_next(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); - } - fB = ((double) tB->num_samples[sB.mutations[0].node]) / n; - tsk_bug_assert(fB > 0); - if (sB.position < tA->right) { - nAB = tsk_ld_calc_overlap_within_tree(self, sA, sB); - } else { - if (!tracked_samples_set && tB->marked[sA.mutations[0].node] == 1) { - tracked_samples_set = 1; - ret = tsk_ld_calc_set_tracked_samples(self, sA); - if (ret != 0) { - goto out; - } - } - if (tracked_samples_set) { - nAB = (double) tB->num_tracked_samples[sB.mutations[0].node]; - } else { - nAB = tsk_ld_calc_overlap_within_tree(self, sA, sB); - } - } - fAB = nAB / n; - D = fAB - fA * fB; - r2[j] = D * D / (fA * fB * (1 - fA) * (1 - fB)); + ret = tsk_tree_seek(&self->tree, self->focal_site.position, 0); + if (ret != 0) { + goto out; } - - /* Now rewind back the inner iterator and unmark all nodes that - * were set to 1 as we moved forward. */ - tB->mark = 0; - while (tB->index > tA->index) { - ret = tsk_tree_prev(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); + ret = tsk_ld_calc_set_focal_samples(self); + if (ret != 0) { + goto out; } - *num_r2_values = (tsk_size_t) j; - ret = 0; out: return ret; } -static int TSK_WARN_UNUSED -tsk_ld_calc_get_r2_array_reverse(tsk_ld_calc_t *self, tsk_id_t source_index, - tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values) +static int +tsk_ld_calc_compute_r2(tsk_ld_calc_t *self, const tsk_site_t *target_site, double *r2) { - int ret = TSK_ERR_GENERIC; - tsk_site_t sA, sB; - double fA, fB, fAB, D; - int tracked_samples_set = 0; - tsk_tree_t *tA, *tB; - double n = (double) tsk_treeseq_get_num_samples(self->tree_sequence); - double nAB; - tsk_id_t j, site_index; + const double n = (double) self->total_samples; + double f_a, f_b, f_ab, D, denom; + tsk_id_t node; + int ret = tsk_ld_calc_check_site(self, target_site); - tA = self->outer_tree; - tB = self->inner_tree; - ret = tsk_treeseq_get_site(self->tree_sequence, source_index, &sA); if (ret != 0) { goto out; } - if (sA.mutations_length > 1) { - ret = TSK_ERR_ONLY_INFINITE_SITES; - goto out; - } - fA = ((double) tA->num_samples[sA.mutations[0].node]) / n; - tsk_bug_assert(fA > 0); - tB->mark = 1; - for (j = 0; j < (tsk_id_t) max_sites; j++) { - site_index = source_index - j - 1; - if (site_index < 0) { - break; - } - ret = tsk_treeseq_get_site(self->tree_sequence, site_index, &sB); + node = target_site->mutations[0].node; + f_a = ((double) self->focal_samples) / n; + f_b = ((double) self->tree.num_samples[node]) / n; + f_ab = ((double) self->tree.num_tracked_samples[node]) / n; + D = f_ab - f_a * f_b; + denom = f_a * f_b * (1 - f_a) * (1 - f_b); + *r2 = (D * D) / denom; +out: + return ret; +} + +static int +tsk_ld_calc_compute_and_append( + tsk_ld_calc_t *self, const tsk_site_t *target_site, bool *ret_done) +{ + int ret = 0; + double r2; + double distance = fabs(self->focal_site.position - target_site->position); + bool done = true; + + if (distance <= self->max_distance && self->result_length < self->max_sites) { + ret = tsk_ld_calc_compute_r2(self, target_site, &r2); if (ret != 0) { goto out; } - if (sB.mutations_length > 1) { - ret = TSK_ERR_ONLY_INFINITE_SITES; - goto out; - } - if (sA.position - sB.position > max_distance) { - break; - } - while (sB.position < tB->left) { - ret = tsk_tree_prev(tB); - if (ret < 0) { + self->result[self->result_length] = r2; + self->result_length++; + done = false; + } + *ret_done = done; +out: + return ret; +} + +static int +tsk_ld_calc_run_forward(tsk_ld_calc_t *self) +{ + int ret = 0; + tsk_size_t j; + bool done = false; + + for (j = 0; j < self->tree.sites_length; j++) { + if (self->tree.sites[j].id > self->focal_site.id) { + ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); + if (ret != 0) { goto out; } - tsk_bug_assert(ret == 1); + if (done) { + break; + } } - fB = ((double) tB->num_samples[sB.mutations[0].node]) / n; - tsk_bug_assert(fB > 0); - if (sB.position >= tA->left) { - nAB = tsk_ld_calc_overlap_within_tree(self, sA, sB); - } else { - if (!tracked_samples_set && tB->marked[sA.mutations[0].node] == 1) { - tracked_samples_set = 1; - ret = tsk_ld_calc_set_tracked_samples(self, sA); - if (ret != 0) { - goto out; - } + } + while (((ret = tsk_tree_next(&self->tree)) == 1) && !done) { + for (j = 0; j < self->tree.sites_length; j++) { + ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); + if (ret != 0) { + goto out; } - if (tracked_samples_set) { - nAB = (double) tB->num_tracked_samples[sB.mutations[0].node]; - } else { - nAB = tsk_ld_calc_overlap_within_tree(self, sA, sB); + if (done) { + break; } } - fAB = nAB / n; - D = fAB - fA * fB; - r2[j] = D * D / (fA * fB * (1 - fA) * (1 - fB)); } - - /* Now fast forward the inner iterator and unmark all nodes that - * were set to 1 as we moved back. */ - tB->mark = 0; - while (tB->index < tA->index) { - ret = tsk_tree_next(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); + if (ret < 0) { + goto out; } - *num_r2_values = (tsk_size_t) j; ret = 0; out: return ret; } -int TSK_WARN_UNUSED -tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, - tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values) +static int +tsk_ld_calc_run_reverse(tsk_ld_calc_t *self) { - int ret = TSK_ERR_GENERIC; + int ret = 0; + tsk_id_t j; + bool done = false; - if (a < 0 || a >= (tsk_id_t) self->num_sites) { - ret = TSK_ERR_OUT_OF_BOUNDS; - goto out; + for (j = (tsk_id_t) self->tree.sites_length - 1; j >= 0; j--) { + if (self->tree.sites[j].id < self->focal_site.id) { + ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); + if (ret != 0) { + goto out; + } + if (done) { + break; + } + } } - ret = tsk_ld_calc_position_trees(self, a); - if (ret != 0) { - goto out; + while (((ret = tsk_tree_prev(&self->tree)) == 1) && !done) { + for (j = (tsk_id_t) self->tree.sites_length - 1; j >= 0; j--) { + ret = tsk_ld_calc_compute_and_append(self, &self->tree.sites[j], &done); + if (ret != 0) { + goto out; + } + if (done) { + break; + } + } } - if (direction == TSK_DIR_FORWARD) { - ret = tsk_ld_calc_get_r2_array_forward( - self, a, max_sites, max_distance, r2, num_r2_values); - } else if (direction == TSK_DIR_REVERSE) { - ret = tsk_ld_calc_get_r2_array_reverse( - self, a, max_sites, max_distance, r2, num_r2_values); - } else { - ret = TSK_ERR_BAD_PARAM_VALUE; + if (ret < 0) { + goto out; } + ret = 0; out: return ret; } -int TSK_WARN_UNUSED +int tsk_ld_calc_get_r2(tsk_ld_calc_t *self, tsk_id_t a, tsk_id_t b, double *r2) { - int ret = TSK_ERR_GENERIC; - tsk_site_t sA, sB; - double fA, fB, fAB, D; - tsk_tree_t *tA, *tB; - double n = (double) tsk_treeseq_get_num_samples(self->tree_sequence); - double nAB; - tsk_id_t tmp; + int ret = 0; + tsk_site_t target_site; - if (a < 0 || b < 0 || a >= (tsk_id_t) self->num_sites - || b >= (tsk_id_t) self->num_sites) { - ret = TSK_ERR_OUT_OF_BOUNDS; - goto out; - } - if (a > b) { - tmp = a; - a = b; - b = tmp; - } - ret = tsk_ld_calc_position_trees(self, a); + ret = tsk_ld_calc_initialise(self, a); if (ret != 0) { goto out; } - /* We can probably do a lot better than this implementation... */ - tA = self->outer_tree; - tB = self->inner_tree; - ret = tsk_treeseq_get_site(self->tree_sequence, a, &sA); + ret = tsk_treeseq_get_site(self->tree_sequence, b, &target_site); if (ret != 0) { goto out; } - ret = tsk_treeseq_get_site(self->tree_sequence, b, &sB); + ret = tsk_tree_seek(&self->tree, target_site.position, 0); if (ret != 0) { goto out; } - if (sA.mutations_length > 1 || sB.mutations_length > 1) { - ret = TSK_ERR_ONLY_INFINITE_SITES; + ret = tsk_ld_calc_compute_r2(self, &target_site, r2); + if (ret != 0) { goto out; } - tsk_bug_assert(sA.mutations_length == 1); - /* tsk_bug_assert(tA->parent[sA.mutations[0].node] != TSK_NULL); */ - fA = ((double) tA->num_samples[sA.mutations[0].node]) / n; - tsk_bug_assert(fA > 0); - ret = tsk_ld_calc_set_tracked_samples(self, sA); +out: + return ret; +} + +int +tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction, + tsk_size_t max_sites, double max_distance, double *r2, tsk_size_t *num_r2_values) +{ + int ret = tsk_ld_calc_initialise(self, a); + if (ret != 0) { goto out; } - while (sB.position >= tB->right) { - ret = tsk_tree_next(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); - } - /* tsk_bug_assert(tB->parent[sB.mutations[0].node] != TSK_NULL); */ - fB = ((double) tB->num_samples[sB.mutations[0].node]) / n; - tsk_bug_assert(fB > 0); - nAB = (double) tB->num_tracked_samples[sB.mutations[0].node]; - fAB = nAB / n; - D = fAB - fA * fB; - *r2 = D * D / (fA * fB * (1 - fA) * (1 - fB)); + self->max_sites = max_sites; + self->max_distance = max_distance; + self->result_length = 0; + self->result = r2; - /* Now rewind the inner iterator back. */ - while (tB->index > tA->index) { - ret = tsk_tree_prev(tB); - if (ret < 0) { - goto out; - } - tsk_bug_assert(ret == 1); + if (direction == TSK_DIR_FORWARD) { + ret = tsk_ld_calc_run_forward(self); + } else if (direction == TSK_DIR_REVERSE) { + ret = tsk_ld_calc_run_reverse(self); + } else { + ret = TSK_ERR_BAD_PARAM_VALUE; } - ret = 0; + if (ret != 0) { + goto out; + } + *num_r2_values = self->result_length; out: return ret; } diff --git a/subprojects/tskit/tskit/stats.h b/subprojects/tskit/tskit/stats.h index 66e2c2757..f679959cf 100644 --- a/subprojects/tskit/tskit/stats.h +++ b/subprojects/tskit/tskit/stats.h @@ -33,11 +33,16 @@ extern "C" { #include typedef struct { - tsk_tree_t *outer_tree; - tsk_tree_t *inner_tree; - tsk_size_t num_sites; - int tree_changed; const tsk_treeseq_t *tree_sequence; + tsk_site_t focal_site; + tsk_size_t total_samples; + tsk_size_t focal_samples; + double max_distance; + tsk_size_t max_sites; + tsk_tree_t tree; + tsk_id_t *sample_buffer; + double *result; + tsk_size_t result_length; } tsk_ld_calc_t; int tsk_ld_calc_init(tsk_ld_calc_t *self, const tsk_treeseq_t *tree_sequence); diff --git a/subprojects/tskit/tskit/tables.c b/subprojects/tskit/tskit/tables.c index 1d1893d7e..31b127db4 100644 --- a/subprojects/tskit/tskit/tables.c +++ b/subprojects/tskit/tskit/tables.c @@ -32,10 +32,8 @@ #include #include -#define _TSK_WORKAROUND_FALSE_CLANG_WARNING #include -#define DEFAULT_SIZE_INCREMENT 1024 #define TABLE_SEP "-----------------------------------------\n" #define TSK_COL_OPTIONAL (1 << 0) @@ -171,7 +169,8 @@ read_table_ragged_cols(kastore_t *store, tsk_size_t *num_rows, read_table_ragged_col_t *cols, tsk_flags_t TSK_UNUSED(flags)) { int ret = 0; - size_t data_len, offset_len; + size_t data_len = 0; // initial value unused, just to keep the compiler happy. + size_t offset_len; int type; read_table_ragged_col_t *col; char offset_col_name[TSK_MAX_COL_NAME_LEN]; @@ -180,8 +179,6 @@ read_table_ragged_cols(kastore_t *store, tsk_size_t *num_rows, tsk_size_t *offset_array; for (col = cols; col->name != NULL; col++) { - data_col_present = false; - offset_col_present = false; ret = kastore_containss(store, col->name); if (ret < 0) { ret = tsk_set_kas_error(ret); @@ -282,6 +279,7 @@ read_table_properties( ret = kastore_gets(store, property->name, property->array_dest, &len, &type); if (ret != 0) { ret = tsk_set_kas_error(ret); + assert(ret != 0); /* Tell static analysers that we're handling errors */ goto out; } if (type != property->type) { @@ -468,6 +466,95 @@ check_offsets( return ret; } +static int +calculate_max_rows(tsk_size_t num_rows, tsk_size_t max_rows, + tsk_size_t max_rows_increment, tsk_size_t additional_rows, + tsk_size_t *ret_new_max_rows) +{ + tsk_size_t new_max_rows; + int ret = 0; + + if (check_table_overflow(num_rows, additional_rows)) { + ret = TSK_ERR_TABLE_OVERFLOW; + goto out; + } + + if (num_rows + additional_rows <= max_rows) { + new_max_rows = max_rows; + } else { + if (max_rows_increment == 0) { + /* Doubling by default */ + new_max_rows = TSK_MIN(max_rows * 2, TSK_MAX_ID + (tsk_size_t) 1); + /* Add some constraints to prevent very small allocations */ + if (new_max_rows < 1024) { + new_max_rows = 1024; + } + /* Prevent allocating more than ~2 million additional rows unless needed*/ + if (new_max_rows - max_rows > 2097152) { + new_max_rows = max_rows + 2097152; + } + } else { + /* Use user increment value */ + if (check_table_overflow(max_rows, max_rows_increment)) { + ret = TSK_ERR_TABLE_OVERFLOW; + goto out; + } + new_max_rows = max_rows + max_rows_increment; + } + new_max_rows = TSK_MAX(new_max_rows, num_rows + additional_rows); + } + *ret_new_max_rows = new_max_rows; +out: + return ret; +} + +static int +calculate_max_length(tsk_size_t current_length, tsk_size_t max_length, + tsk_size_t max_length_increment, tsk_size_t additional_length, + tsk_size_t *ret_new_max_length) +{ + tsk_size_t new_max_length; + int ret = 0; + + if (check_offset_overflow(current_length, additional_length)) { + ret = TSK_ERR_COLUMN_OVERFLOW; + goto out; + } + + if (current_length + additional_length <= max_length) { + new_max_length = max_length; + } else { + if (max_length_increment == 0) { + /* Doubling by default */ + new_max_length = TSK_MIN(max_length * 2, TSK_MAX_SIZE); + /* Add some constraints to prevent very small allocations */ + if (new_max_length < 65536) { + new_max_length = 65536; + } + /* Prevent allocating more than 100MB additional unless needed*/ + if (new_max_length - max_length > 104857600) { + new_max_length = max_length + 104857600; + } + new_max_length = TSK_MAX(new_max_length, current_length + additional_length); + } else { + /* Use user increment value */ + if (check_offset_overflow(max_length, max_length_increment)) { + /* Here we could allocate to the maximum size. + * Instead we are erroring out as this is much easier to test. + * The cost is that (at most) the last "max_length_increment"-1 + * bytes of the possible array space can't be used. */ + ret = TSK_ERR_COLUMN_OVERFLOW; + goto out; + } + new_max_length = max_length + max_length_increment; + } + new_max_length = TSK_MAX(new_max_length, current_length + additional_length); + } + *ret_new_max_length = new_max_length; +out: + return ret; +} + static int expand_column(void **column, tsk_size_t new_max_rows, size_t element_size) { @@ -490,32 +577,26 @@ expand_ragged_column(tsk_size_t current_length, tsk_size_t additional_length, size_t element_size) { int ret = 0; - tsk_size_t increment = TSK_MAX(additional_length, max_length_increment); + tsk_size_t new_max_length; - if (check_offset_overflow(current_length, additional_length)) { - ret = TSK_ERR_COLUMN_OVERFLOW; + ret = calculate_max_length(current_length, *max_length, max_length_increment, + additional_length, &new_max_length); + if (ret != 0) { goto out; } - if ((current_length + additional_length) > *max_length) { - if (check_offset_overflow(*max_length, increment)) { - /* Here we could allocate to the maximum size. - * Instead we are erroring out as this is much easier to test. - * The cost is that (at most) the last "max_length_increment"-1 - * bytes of the possible array space can't be used. */ - ret = TSK_ERR_COLUMN_OVERFLOW; - goto out; - } - ret = expand_column(column, *max_length + increment, element_size); + if (new_max_length > *max_length) { + ret = expand_column(column, new_max_length, element_size); if (ret != 0) { goto out; } - *max_length += increment; + *max_length = new_max_length; } out: return ret; } +/* TODO rename to copy_string or replace_and_copy_string */ static int replace_string( char **str, tsk_size_t *len, const char *new_str, const tsk_size_t new_len) @@ -536,16 +617,177 @@ replace_string( return ret; } +static int +takeset_string(char **str, tsk_size_t *len, char *new_str, const tsk_size_t new_len) +{ + tsk_safe_free(*str); + *str = new_str; + *len = new_len; + return 0; +} + static int write_metadata_schema_header( FILE *out, const char *metadata_schema, tsk_size_t metadata_schema_length) { const char *fmt = "#metadata_schema#\n" "%.*s\n" - "#end#metadata_schema\n"; + "#end#metadata_schema\n" TABLE_SEP; return fprintf(out, fmt, (int) metadata_schema_length, metadata_schema); } +/************************* + * reference sequence + *************************/ + +int +tsk_reference_sequence_init( + tsk_reference_sequence_t *self, tsk_flags_t TSK_UNUSED(options)) +{ + tsk_memset(self, 0, sizeof(*self)); + return 0; +} + +int +tsk_reference_sequence_free(tsk_reference_sequence_t *self) +{ + tsk_safe_free(self->data); + tsk_safe_free(self->url); + tsk_safe_free(self->metadata); + tsk_safe_free(self->metadata_schema); + return 0; +} + +bool +tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self) +{ + return self->data_length == 0 && self->url_length == 0 && self->metadata_length == 0 + && self->metadata_schema_length == 0; +} + +bool +tsk_reference_sequence_equals(const tsk_reference_sequence_t *self, + const tsk_reference_sequence_t *other, tsk_flags_t options) +{ + int ret + = self->data_length == other->data_length + && self->url_length == other->url_length + && tsk_memcmp(self->data, other->data, self->data_length * sizeof(char)) == 0 + && tsk_memcmp(self->url, other->url, self->url_length * sizeof(char)) == 0; + + if (!(options & TSK_CMP_IGNORE_METADATA)) { + ret = ret && self->metadata_length == other->metadata_length + && self->metadata_schema_length == other->metadata_schema_length + && tsk_memcmp(self->metadata, other->metadata, + self->metadata_length * sizeof(char)) + == 0 + && tsk_memcmp(self->metadata_schema, other->metadata_schema, + self->metadata_schema_length * sizeof(char)) + == 0; + } + return ret; +} + +int +tsk_reference_sequence_copy(const tsk_reference_sequence_t *self, + tsk_reference_sequence_t *dest, tsk_flags_t options) +{ + int ret = 0; + + if (!(options & TSK_NO_INIT)) { + ret = tsk_reference_sequence_init(dest, 0); + if (ret != 0) { + goto out; + } + } + + if (tsk_reference_sequence_is_null(self)) { + /* This is a simple way to get any input into the NULL state */ + tsk_reference_sequence_free(dest); + } else { + ret = tsk_reference_sequence_set_data(dest, self->data, self->data_length); + if (ret != 0) { + goto out; + } + ret = tsk_reference_sequence_set_url(dest, self->url, self->url_length); + if (ret != 0) { + goto out; + } + ret = tsk_reference_sequence_set_metadata( + dest, self->metadata, self->metadata_length); + if (ret != 0) { + goto out; + } + ret = tsk_reference_sequence_set_metadata_schema( + dest, self->metadata_schema, self->metadata_schema_length); + if (ret != 0) { + goto out; + } + } +out: + return ret; +} + +int +tsk_reference_sequence_set_data( + tsk_reference_sequence_t *self, const char *data, tsk_size_t data_length) +{ + return replace_string(&self->data, &self->data_length, data, data_length); +} + +int +tsk_reference_sequence_set_url( + tsk_reference_sequence_t *self, const char *url, tsk_size_t url_length) +{ + return replace_string(&self->url, &self->url_length, url, url_length); +} + +int +tsk_reference_sequence_set_metadata( + tsk_reference_sequence_t *self, const char *metadata, tsk_size_t metadata_length) +{ + return replace_string( + &self->metadata, &self->metadata_length, metadata, metadata_length); +} + +int +tsk_reference_sequence_set_metadata_schema(tsk_reference_sequence_t *self, + const char *metadata_schema, tsk_size_t metadata_schema_length) +{ + return replace_string(&self->metadata_schema, &self->metadata_schema_length, + metadata_schema, metadata_schema_length); +} + +int +tsk_reference_sequence_takeset_data( + tsk_reference_sequence_t *self, char *data, tsk_size_t data_length) +{ + return takeset_string(&self->data, &self->data_length, data, data_length); +} + +int +tsk_reference_sequence_takeset_url( + tsk_reference_sequence_t *self, char *url, tsk_size_t url_length) +{ + return takeset_string(&self->url, &self->url_length, url, url_length); +} + +int +tsk_reference_sequence_takeset_metadata( + tsk_reference_sequence_t *self, char *metadata, tsk_size_t metadata_length) +{ + return takeset_string( + &self->metadata, &self->metadata_length, metadata, metadata_length); +} + +int +tsk_reference_sequence_takeset_metadata_schema(tsk_reference_sequence_t *self, + char *metadata_schema, tsk_size_t metadata_schema_length) +{ + return takeset_string(&self->metadata_schema, &self->metadata_schema_length, + metadata_schema, metadata_schema_length); +} + /************************* * individual table *************************/ @@ -555,34 +797,34 @@ tsk_individual_table_expand_main_columns( tsk_individual_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment = TSK_MAX(additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->flags, new_size, sizeof(tsk_flags_t)); + ret = expand_column((void **) &self->flags, new_max_rows, sizeof(tsk_flags_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->location_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->location_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->parents_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->parents_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -619,10 +861,7 @@ int tsk_individual_table_set_max_rows_increment( tsk_individual_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } - self->max_rows_increment = (tsk_size_t) max_rows_increment; + self->max_rows_increment = max_rows_increment; return 0; } @@ -630,9 +869,6 @@ int tsk_individual_table_set_max_metadata_length_increment( tsk_individual_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = (tsk_size_t) max_metadata_length_increment; return 0; } @@ -641,9 +877,6 @@ int tsk_individual_table_set_max_location_length_increment( tsk_individual_table_t *self, tsk_size_t max_location_length_increment) { - if (max_location_length_increment == 0) { - max_location_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_location_length_increment = (tsk_size_t) max_location_length_increment; return 0; } @@ -652,9 +885,6 @@ int tsk_individual_table_set_max_parents_length_increment( tsk_individual_table_t *self, tsk_size_t max_parents_length_increment) { - if (max_parents_length_increment == 0) { - max_parents_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_parents_length_increment = (tsk_size_t) max_parents_length_increment; return 0; } @@ -690,10 +920,10 @@ tsk_individual_table_init(tsk_individual_table_t *self, tsk_flags_t TSK_UNUSED(o goto out; } self->metadata_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_location_length_increment = DEFAULT_SIZE_INCREMENT; - self->max_parents_length_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_location_length_increment = 0; + self->max_parents_length_increment = 0; + self->max_metadata_length_increment = 0; tsk_individual_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -1065,7 +1295,7 @@ tsk_individual_table_print_state(const tsk_individual_table_t *self, FILE *out) { tsk_size_t j, k; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "tsk_individual_tbl: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -1332,36 +1562,37 @@ static int tsk_node_table_expand_main_columns(tsk_node_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment = TSK_MAX(additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } - if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->flags, new_size, sizeof(tsk_flags_t)); + + if (new_max_rows > self->max_rows) { + ret = expand_column((void **) &self->flags, new_max_rows, sizeof(tsk_flags_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->time, new_size, sizeof(double)); + ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->population, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->population, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->individual, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->individual, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -1379,9 +1610,6 @@ int tsk_node_table_set_max_rows_increment( tsk_node_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -1390,9 +1618,6 @@ int tsk_node_table_set_max_metadata_length_increment( tsk_node_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = max_metadata_length_increment; return 0; } @@ -1416,8 +1641,8 @@ tsk_node_table_init(tsk_node_table_t *self, tsk_flags_t TSK_UNUSED(options)) goto out; } self->metadata_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_metadata_length_increment = 0; tsk_node_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -1718,7 +1943,7 @@ tsk_node_table_print_state(const tsk_node_table_t *self, FILE *out) { tsk_size_t j, k; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "tsk_node_tbl: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -1932,39 +2157,38 @@ static int tsk_edge_table_expand_main_columns(tsk_edge_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment - = TSK_MAX((tsk_size_t) additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->left, new_size, sizeof(double)); + ret = expand_column((void **) &self->left, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->right, new_size, sizeof(double)); + ret = expand_column((void **) &self->right, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->parent, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->parent, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->child, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->child, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } if (tsk_edge_table_has_metadata(self)) { ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -1982,9 +2206,6 @@ int tsk_edge_table_set_max_rows_increment( tsk_edge_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -1993,9 +2214,6 @@ int tsk_edge_table_set_max_metadata_length_increment( tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = max_metadata_length_increment; return 0; } @@ -2023,8 +2241,8 @@ tsk_edge_table_init(tsk_edge_table_t *self, tsk_flags_t options) } self->metadata_offset[0] = 0; } - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_metadata_length_increment = 0; tsk_edge_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -2374,7 +2592,7 @@ tsk_edge_table_print_state(const tsk_edge_table_t *self, FILE *out) { int ret; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "edge_table: %p:\n", (const void *) self); fprintf(out, "options = 0x%X\n", self->options); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", @@ -2611,29 +2829,29 @@ static int tsk_site_table_expand_main_columns(tsk_site_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment = TSK_MAX(additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->position, new_size, sizeof(double)); + ret = expand_column((void **) &self->position, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } - ret = expand_column( - (void **) &self->ancestral_state_offset, new_size + 1, sizeof(tsk_size_t)); + ret = expand_column((void **) &self->ancestral_state_offset, new_max_rows + 1, + sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -2660,9 +2878,6 @@ int tsk_site_table_set_max_rows_increment( tsk_site_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -2671,9 +2886,6 @@ int tsk_site_table_set_max_metadata_length_increment( tsk_site_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = max_metadata_length_increment; return 0; } @@ -2682,9 +2894,6 @@ int tsk_site_table_set_max_ancestral_state_length_increment( tsk_site_table_t *self, tsk_size_t max_ancestral_state_length_increment) { - if (max_ancestral_state_length_increment == 0) { - max_ancestral_state_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_ancestral_state_length_increment = max_ancestral_state_length_increment; return 0; } @@ -2715,9 +2924,9 @@ tsk_site_table_init(tsk_site_table_t *self, tsk_flags_t TSK_UNUSED(options)) } self->ancestral_state_offset[0] = 0; self->metadata_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_ancestral_state_length_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_ancestral_state_length_increment = 0; + self->max_metadata_length_increment = 0; tsk_site_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -3064,7 +3273,7 @@ tsk_site_table_print_state(const tsk_site_table_t *self, FILE *out) { int ret; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "site_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\t(max= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -3241,42 +3450,41 @@ tsk_mutation_table_expand_main_columns( tsk_mutation_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment - = (tsk_size_t) TSK_MAX(additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->site, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->site, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->node, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->node, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->parent, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->parent, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->time, new_size, sizeof(double)); + ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->derived_state_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->derived_state_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -3304,9 +3512,6 @@ int tsk_mutation_table_set_max_rows_increment( tsk_mutation_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -3315,9 +3520,6 @@ int tsk_mutation_table_set_max_metadata_length_increment( tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = max_metadata_length_increment; return 0; } @@ -3326,9 +3528,6 @@ int tsk_mutation_table_set_max_derived_state_length_increment( tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment) { - if (max_derived_state_length_increment == 0) { - max_derived_state_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_derived_state_length_increment = max_derived_state_length_increment; return 0; } @@ -3359,9 +3558,9 @@ tsk_mutation_table_init(tsk_mutation_table_t *self, tsk_flags_t TSK_UNUSED(optio } self->derived_state_offset[0] = 0; self->metadata_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_derived_state_length_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_derived_state_length_increment = 0; + self->max_metadata_length_increment = 0; tsk_mutation_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -3742,7 +3941,7 @@ tsk_mutation_table_print_state(const tsk_mutation_table_t *self, FILE *out) { int ret; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "mutation_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -3929,46 +4128,45 @@ tsk_migration_table_expand_main_columns( tsk_migration_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment - = TSK_MAX((tsk_size_t) additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { - ret = expand_column((void **) &self->left, new_size, sizeof(double)); + ret = expand_column((void **) &self->left, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->right, new_size, sizeof(double)); + ret = expand_column((void **) &self->right, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->node, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->node, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->source, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->source, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->dest, new_size, sizeof(tsk_id_t)); + ret = expand_column((void **) &self->dest, new_max_rows, sizeof(tsk_id_t)); if (ret != 0) { goto out; } - ret = expand_column((void **) &self->time, new_size, sizeof(double)); + ret = expand_column((void **) &self->time, new_max_rows, sizeof(double)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -3987,9 +4185,6 @@ int tsk_migration_table_set_max_rows_increment( tsk_migration_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -3998,9 +4193,6 @@ int tsk_migration_table_set_max_metadata_length_increment( tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = max_metadata_length_increment; return 0; } @@ -4025,8 +4217,8 @@ tsk_migration_table_init(tsk_migration_table_t *self, tsk_flags_t TSK_UNUSED(opt goto out; } self->metadata_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_metadata_length_increment = 0; tsk_migration_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -4324,7 +4516,7 @@ tsk_migration_table_print_state(const tsk_migration_table_t *self, FILE *out) { int ret; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "migration_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -4527,20 +4719,20 @@ tsk_population_table_expand_main_columns( tsk_population_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment = TSK_MAX(additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column( - (void **) &self->metadata_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->metadata_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -4559,9 +4751,6 @@ int tsk_population_table_set_max_rows_increment( tsk_population_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -4570,9 +4759,6 @@ int tsk_population_table_set_max_metadata_length_increment( tsk_population_table_t *self, tsk_size_t max_metadata_length_increment) { - if (max_metadata_length_increment == 0) { - max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_metadata_length_increment = max_metadata_length_increment; return 0; } @@ -4596,8 +4782,8 @@ tsk_population_table_init(tsk_population_table_t *self, tsk_flags_t TSK_UNUSED(o goto out; } self->metadata_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_metadata_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_metadata_length_increment = 0; tsk_population_table_set_metadata_schema(self, NULL, 0); out: return ret; @@ -4857,7 +5043,7 @@ tsk_population_table_print_state(const tsk_population_table_t *self, FILE *out) { tsk_size_t j, k; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "population_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -5035,25 +5221,25 @@ tsk_provenance_table_expand_main_columns( tsk_provenance_table_t *self, tsk_size_t additional_rows) { int ret = 0; - tsk_size_t increment = TSK_MAX(additional_rows, self->max_rows_increment); - tsk_size_t new_size = self->max_rows + increment; + tsk_size_t new_max_rows; - if (check_table_overflow(self->max_rows, increment)) { - ret = TSK_ERR_TABLE_OVERFLOW; + ret = calculate_max_rows(self->num_rows, self->max_rows, self->max_rows_increment, + additional_rows, &new_max_rows); + if (ret != 0) { goto out; } if ((self->num_rows + additional_rows) > self->max_rows) { ret = expand_column( - (void **) &self->timestamp_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->timestamp_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } ret = expand_column( - (void **) &self->record_offset, new_size + 1, sizeof(tsk_size_t)); + (void **) &self->record_offset, new_max_rows + 1, sizeof(tsk_size_t)); if (ret != 0) { goto out; } - self->max_rows = new_size; + self->max_rows = new_max_rows; } out: return ret; @@ -5081,9 +5267,6 @@ int tsk_provenance_table_set_max_rows_increment( tsk_provenance_table_t *self, tsk_size_t max_rows_increment) { - if (max_rows_increment == 0) { - max_rows_increment = DEFAULT_SIZE_INCREMENT; - } self->max_rows_increment = max_rows_increment; return 0; } @@ -5092,9 +5275,6 @@ int tsk_provenance_table_set_max_timestamp_length_increment( tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment) { - if (max_timestamp_length_increment == 0) { - max_timestamp_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_timestamp_length_increment = max_timestamp_length_increment; return 0; } @@ -5103,9 +5283,6 @@ int tsk_provenance_table_set_max_record_length_increment( tsk_provenance_table_t *self, tsk_size_t max_record_length_increment) { - if (max_record_length_increment == 0) { - max_record_length_increment = DEFAULT_SIZE_INCREMENT; - } self->max_record_length_increment = max_record_length_increment; return 0; } @@ -5135,9 +5312,9 @@ tsk_provenance_table_init(tsk_provenance_table_t *self, tsk_flags_t TSK_UNUSED(o goto out; } self->record_offset[0] = 0; - self->max_rows_increment = DEFAULT_SIZE_INCREMENT; - self->max_timestamp_length_increment = DEFAULT_SIZE_INCREMENT; - self->max_record_length_increment = DEFAULT_SIZE_INCREMENT; + self->max_rows_increment = 0; + self->max_timestamp_length_increment = 0; + self->max_record_length_increment = 0; out: return ret; } @@ -5431,7 +5608,7 @@ tsk_provenance_table_print_state(const tsk_provenance_table_t *self, FILE *out) { tsk_size_t j, k; - fprintf(out, TABLE_SEP); + fprintf(out, "\n" TABLE_SEP); fprintf(out, "provenance_table: %p:\n", (const void *) self); fprintf(out, "num_rows = %lld\tmax= %lld\tincrement = %lld)\n", (long long) self->num_rows, (long long) self->max_rows, @@ -6127,15 +6304,16 @@ tsk_individual_table_topological_sort( return ret; } -static int -tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self) +int +tsk_table_collection_individual_topological_sort( + tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t i, ret_id; tsk_individual_table_t copy; tsk_individual_t individual; - tsk_individual_table_t *individuals = &self->tables->individuals; - tsk_node_table_t *nodes = &self->tables->nodes; + tsk_individual_table_t *individuals = &self->individuals; + tsk_node_table_t *nodes = &self->nodes; tsk_size_t num_individuals = individuals->num_rows; tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order)); tsk_id_t *new_id_map = tsk_malloc(num_individuals * sizeof(*new_id_map)); @@ -6151,6 +6329,12 @@ tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self) goto out; } + ret_id = tsk_table_collection_check_integrity(self, 0); + if (ret_id != 0) { + ret = (int) ret_id; + goto out; + } + ret = tsk_individual_table_clear(individuals); if (ret != 0) { goto out; @@ -6325,14 +6509,6 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; goto out; } - - /* Individuals also must be all sorted or skipped entirely */ - if (start->individuals == self->tables->individuals.num_rows) { - skip_individuals = true; - } else if (start->individuals != 0) { - ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; - goto out; - } } /* The indexes will be invalidated, so drop them */ ret = tsk_table_collection_drop_index(self->tables, 0); @@ -6363,7 +6539,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) goto out; } } - if (!skip_individuals) { + if (!skip_individuals && self->sort_individuals != NULL) { ret = self->sort_individuals(self); if (ret != 0) { goto out; @@ -6399,7 +6575,8 @@ tsk_table_sorter_init( /* Set the sort_edges and sort_mutations methods to the default. */ self->sort_edges = tsk_table_sorter_sort_edges; self->sort_mutations = tsk_table_sorter_sort_mutations; - self->sort_individuals = tsk_table_sorter_sort_individuals; + /* Default sort doesn't touch individuals */ + self->sort_individuals = NULL; out: return ret; } @@ -6426,6 +6603,13 @@ typedef struct _mutation_id_list_t { struct _mutation_id_list_t *next; } mutation_id_list_t; +typedef struct _tsk_segment_t { + double left; + double right; + struct _tsk_segment_t *next; + tsk_id_t node; +} tsk_segment_t; + /* segment overlap finding algorithm */ typedef struct { /* The input segments. This buffer is sorted by the algorithm and we also @@ -7115,58 +7299,375 @@ ancestor_mapper_run(ancestor_mapper_t *self) } /************************* - * IBD finder + * IBD Segments *************************/ -static tsk_segment_t *TSK_WARN_UNUSED -tsk_ibd_finder_alloc_segment( - tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) +/* This maps two positive integers 0 <= a < b < N into the set + * {0, ..., N^2}. For us to overflow an int64, N would need to + * be > sqrt(2^63), ~3 * 10^9. The maximum value for a 32bit int + * is ~2 * 10^9, so this can't happen here, however it is + * theoretically possible with 64 bit IDs. It would require + * a *very* large node table --- assuming 24 bytes per row + * it would be at least 67GiB. To make sure this eventuality + * doesn't happen, we have a tsk_bug_assert in the + * tsk_identity_segments_init. + */ +static inline int64_t +pair_to_integer(tsk_id_t a, tsk_id_t b, tsk_size_t N) { - tsk_segment_t *seg = NULL; + tsk_id_t tmp; + if (a > b) { + tmp = a; + a = b; + b = tmp; + } + return ((int64_t) a) * (int64_t) N + (int64_t) b; +} - seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); +static inline void +integer_to_pair(int64_t index, tsk_size_t N, tsk_id_t *a, tsk_id_t *b) +{ + *a = (tsk_id_t)(index / (int64_t) N); + *b = (tsk_id_t)(index % (int64_t) N); +} + +static int64_t +tsk_identity_segments_get_key( + const tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b) +{ + int64_t ret; + tsk_id_t N = (tsk_id_t) self->num_nodes; + + if (a < 0 || b < 0 || a >= N || b >= N) { + ret = TSK_ERR_NODE_OUT_OF_BOUNDS; + goto out; + } + if (a == b) { + ret = TSK_ERR_SAME_NODES_IN_PAIR; + goto out; + } + ret = pair_to_integer(a, b, self->num_nodes); +out: + return ret; +} + +static tsk_identity_segment_t *TSK_WARN_UNUSED +tsk_identity_segments_alloc_segment( + tsk_identity_segments_t *self, double left, double right, tsk_id_t node) +{ + tsk_identity_segment_t *seg = tsk_blkalloc_get(&self->heap, sizeof(*seg)); if (seg == NULL) { goto out; } + tsk_bug_assert(left < right); + tsk_bug_assert(node >= 0 && node < (tsk_id_t) self->num_nodes); + seg->next = NULL; seg->left = left; seg->right = right; seg->node = node; - out: return seg; } -static int TSK_WARN_UNUSED -tsk_ibd_finder_add_output( - tsk_ibd_finder_t *self, double left, double right, tsk_id_t node_id, int pair_num) +static tsk_avl_node_int_t * +tsk_identity_segments_alloc_new_pair(tsk_identity_segments_t *self, int64_t key) { - int ret = 0; - tsk_segment_t *tail = self->ibd_segments_tail[pair_num]; - tsk_segment_t *x; + tsk_avl_node_int_t *avl_node = tsk_blkalloc_get(&self->heap, sizeof(*avl_node)); + tsk_identity_segment_list_t *list = tsk_blkalloc_get(&self->heap, sizeof(*list)); - tsk_bug_assert(left < right); - if (tail == NULL) { - x = tsk_ibd_finder_alloc_segment(self, left, right, node_id); - if (x == NULL) { + if (avl_node == NULL || list == NULL) { + return NULL; + } + avl_node->key = key; + avl_node->value = list; + memset(list, 0, sizeof(*list)); + return avl_node; +} + +/* Deliberately not making this a part of the public interface for now, + * so we don't have to worry about the signature */ +static int +tsk_identity_segments_init( + tsk_identity_segments_t *self, tsk_size_t num_nodes, tsk_flags_t options) +{ + int ret = 0; + /* Make sure we don't overflow in the ID mapping. See the comments in pair_to_integer + * for details. */ + double max_num_nodes = sqrt(1ULL << 63); + tsk_bug_assert((double) num_nodes < max_num_nodes); + + memset(self, 0, sizeof(*self)); + self->num_nodes = num_nodes; + /* Storing segments implies storing pairs */ + if (options & TSK_IBD_STORE_SEGMENTS) { + self->store_pairs = true; + self->store_segments = true; + } else if (options & TSK_IBD_STORE_PAIRS) { + self->store_pairs = true; + } + ret = tsk_avl_tree_int_init(&self->pair_map); + if (ret != 0) { + goto out; + } + /* Allocate heap memory in 1MiB blocks */ + ret = tsk_blkalloc_init(&self->heap, 1024 * 1024); + if (ret != 0) { + goto out; + } +out: + return ret; +} + +void +tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out) +{ + tsk_avl_node_int_t **nodes = tsk_malloc(self->pair_map.size * sizeof(*nodes)); + int64_t key; + tsk_identity_segment_list_t *value; + tsk_identity_segment_t *seg; + tsk_size_t j; + tsk_id_t a, b; + + tsk_bug_assert(nodes != NULL); + + fprintf(out, "===\nIBD Result\n===\n"); + fprintf(out, "total_span = %f\n", self->total_span); + fprintf(out, "num_segments = %lld\n", (unsigned long long) self->num_segments); + fprintf(out, "store_pairs = %d\n", self->store_pairs); + fprintf(out, "store_segments = %d\n", self->store_segments); + if (self->store_pairs) { + fprintf(out, "num_keys = %d\n", (int) self->pair_map.size); + tsk_avl_tree_int_ordered_nodes(&self->pair_map, nodes); + for (j = 0; j < self->pair_map.size; j++) { + key = nodes[j]->key; + value = (tsk_identity_segment_list_t *) nodes[j]->value; + integer_to_pair(key, self->num_nodes, &a, &b); + fprintf(out, "%lld\t(%d,%d) n=%d total_span=%f\t", (long long) key, (int) a, + (int) b, (int) value->num_segments, value->total_span); + if (self->store_segments) { + for (seg = value->head; seg != NULL; seg = seg->next) { + fprintf( + out, "(%f, %f)->%d, ", seg->left, seg->right, (int) seg->node); + } + } + fprintf(out, "\n"); + } + } + fprintf(out, "Segment memory\n"); + tsk_blkalloc_print_state(&self->heap, out); + tsk_safe_free(nodes); +} + +tsk_size_t +tsk_identity_segments_get_num_segments(const tsk_identity_segments_t *self) +{ + return self->num_segments; +} + +double +tsk_identity_segments_get_total_span(const tsk_identity_segments_t *self) +{ + return self->total_span; +} + +tsk_size_t +tsk_identity_segments_get_num_pairs(const tsk_identity_segments_t *self) +{ + return self->pair_map.size; +} + +/* Use an inorder traversal on the AVL tree to get the pairs in order. + * Recursion is safe here because it's a balanced tree (see the AVL tree + * code for notes on this). + */ +static int +get_keys_traverse(tsk_avl_node_int_t *node, int index, tsk_size_t N, tsk_id_t *pairs) +{ + tsk_id_t a, b; + + if (node == NULL) { + return index; + } + index = get_keys_traverse(node->llink, index, N, pairs); + integer_to_pair(node->key, N, &a, &b); + pairs[2 * index] = a; + pairs[2 * index + 1] = b; + return get_keys_traverse(node->rlink, index + 1, N, pairs); +} + +int +tsk_identity_segments_get_keys(const tsk_identity_segments_t *self, tsk_id_t *pairs) +{ + if (!self->store_pairs) { + return TSK_ERR_IBD_PAIRS_NOT_STORED; + } + get_keys_traverse( + tsk_avl_tree_int_get_root(&self->pair_map), 0, self->num_nodes, pairs); + return 0; +} + +static int +get_items_traverse(tsk_avl_node_int_t *node, int index, tsk_size_t N, tsk_id_t *pairs, + tsk_identity_segment_list_t **lists) +{ + tsk_id_t a, b; + + if (node == NULL) { + return index; + } + index = get_items_traverse(node->llink, index, N, pairs, lists); + integer_to_pair(node->key, N, &a, &b); + pairs[2 * index] = a; + pairs[2 * index + 1] = b; + lists[index] = node->value; + return get_items_traverse(node->rlink, index + 1, N, pairs, lists); +} + +int +tsk_identity_segments_get_items(const tsk_identity_segments_t *self, tsk_id_t *pairs, + tsk_identity_segment_list_t **lists) +{ + if (!self->store_pairs) { + return TSK_ERR_IBD_PAIRS_NOT_STORED; + } + get_items_traverse( + tsk_avl_tree_int_get_root(&self->pair_map), 0, self->num_nodes, pairs, lists); + return 0; +} + +int +tsk_identity_segments_free(tsk_identity_segments_t *self) +{ + tsk_blkalloc_free(&self->heap); + tsk_avl_tree_int_free(&self->pair_map); + return 0; +} + +static int TSK_WARN_UNUSED +tsk_identity_segments_update_pair(tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, + double left, double right, tsk_id_t node) +{ + int ret = 0; + tsk_identity_segment_t *x; + tsk_identity_segment_list_t *list; + /* skip the error checking here since this an internal API */ + int64_t key = pair_to_integer(a, b, self->num_nodes); + tsk_avl_node_int_t *avl_node = tsk_avl_tree_int_search(&self->pair_map, key); + + if (avl_node == NULL) { + /* We haven't seen this pair before */ + avl_node = tsk_identity_segments_alloc_new_pair(self, key); + if (avl_node == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - self->ibd_segments_head[pair_num] = x; - self->ibd_segments_tail[pair_num] = x; - } else { - x = tsk_ibd_finder_alloc_segment(self, left, right, node_id); + ret = tsk_avl_tree_int_insert(&self->pair_map, avl_node); + tsk_bug_assert(ret == 0); + } + list = (tsk_identity_segment_list_t *) avl_node->value; + list->num_segments++; + list->total_span += right - left; + if (self->store_segments) { + x = tsk_identity_segments_alloc_segment(self, left, right, node); if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; goto out; } - tail->next = x; - self->ibd_segments_tail[pair_num] = x; + if (list->tail == NULL) { + list->head = x; + list->tail = x; + } else { + list->tail->next = x; + list->tail = x; + } } out: return ret; } +static int TSK_WARN_UNUSED +tsk_identity_segments_add_segment(tsk_identity_segments_t *self, tsk_id_t a, tsk_id_t b, + double left, double right, tsk_id_t node) +{ + int ret = 0; + + if (self->store_pairs) { + ret = tsk_identity_segments_update_pair(self, a, b, left, right, node); + if (ret != 0) { + goto out; + } + } + self->total_span += right - left; + self->num_segments++; +out: + return ret; +} + +int TSK_WARN_UNUSED +tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t sample_a, + tsk_id_t sample_b, tsk_identity_segment_list_t **ret_list) +{ + int ret = 0; + int64_t key = tsk_identity_segments_get_key(self, sample_a, sample_b); + tsk_avl_node_int_t *avl_node; + + if (key < 0) { + ret = (int) key; + goto out; + } + if (!self->store_pairs) { + ret = TSK_ERR_IBD_PAIRS_NOT_STORED; + goto out; + } + avl_node = tsk_avl_tree_int_search(&self->pair_map, key); + *ret_list = NULL; + if (avl_node != NULL) { + *ret_list = (tsk_identity_segment_list_t *) avl_node->value; + } +out: + return ret; +} + +/************************* + * IBD finder + *************************/ + +typedef struct { + tsk_identity_segments_t *result; + double min_length; + double max_time; + const tsk_table_collection_t *tables; + /* Maps nodes to their sample set IDs. Input samples map to set 0 + * in the "within" case. */ + tsk_id_t *sample_set_id; + /* True if we're finding IBD between sample sets, false otherwise. */ + bool finding_between; + tsk_segment_t **ancestor_map_head; + tsk_segment_t **ancestor_map_tail; + tsk_segment_t *segment_queue; + tsk_size_t segment_queue_size; + tsk_size_t max_segment_queue_size; + tsk_blkalloc_t segment_heap; +} tsk_ibd_finder_t; + +static tsk_segment_t *TSK_WARN_UNUSED +tsk_ibd_finder_alloc_segment( + tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) +{ + tsk_segment_t *seg = NULL; + + seg = tsk_blkalloc_get(&self->segment_heap, sizeof(*seg)); + if (seg == NULL) { + goto out; + } + seg->next = NULL; + seg->left = left; + seg->right = right; + seg->node = node; + +out: + return seg; +} static int TSK_WARN_UNUSED tsk_ibd_finder_add_ancestry(tsk_ibd_finder_t *self, tsk_id_t input_id, double left, double right, tsk_id_t output_id) @@ -7176,20 +7677,15 @@ tsk_ibd_finder_add_ancestry(tsk_ibd_finder_t *self, tsk_id_t input_id, double le tsk_segment_t *x = NULL; tsk_bug_assert(left < right); + x = tsk_ibd_finder_alloc_segment(self, left, right, output_id); + if (x == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } if (tail == NULL) { - x = tsk_ibd_finder_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } self->ancestor_map_head[input_id] = x; self->ancestor_map_tail[input_id] = x; } else { - x = tsk_ibd_finder_alloc_segment(self, left, right, output_id); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } tail->next = x; self->ancestor_map_tail[input_id] = x; } @@ -7198,211 +7694,111 @@ tsk_ibd_finder_add_ancestry(tsk_ibd_finder_t *self, tsk_id_t input_id, double le } static int -tsk_ibd_finder_init_samples(tsk_ibd_finder_t *self) +tsk_ibd_finder_init_samples_from_set( + tsk_ibd_finder_t *self, const tsk_id_t *samples, tsk_size_t num_samples) { int ret = 0; tsk_size_t j; tsk_id_t u; - /* Go through the sample pairs to define samples. */ - for (j = 0; j < 2 * self->num_pairs; j++) { - u = self->pairs[j]; + for (j = 0; j < num_samples; j++) { + u = samples[j]; if (u < 0 || u > (tsk_id_t) self->tables->nodes.num_rows) { ret = TSK_ERR_NODE_OUT_OF_BOUNDS; goto out; } - - if (!self->is_sample[u]) { - self->is_sample[u] = true; - ret = tsk_ibd_finder_add_ancestry( - self, u, 0, self->tables->sequence_length, u); - if (ret != 0) { - goto out; - } + if (self->sample_set_id[u] != TSK_NULL) { + ret = TSK_ERR_DUPLICATE_SAMPLE; + goto out; } + self->sample_set_id[u] = 0; } - out: return ret; } -static int -tsk_ibd_finder_index_samples(tsk_ibd_finder_t *self) +static void +tsk_ibd_finder_init_samples_from_nodes(tsk_ibd_finder_t *self) { - int ret = 0; - tsk_size_t i; - tsk_id_t idx; - - self->paired_nodes_index - = tsk_calloc(self->num_nodes, sizeof(*self->paired_nodes_index)); - - if (self->paired_nodes_index == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - for (i = 0; i < self->num_nodes; ++i) { - self->paired_nodes_index[i] = -1; - } - - for (i = 0; i < 2 * self->num_pairs; ++i) { - self->paired_nodes_index[self->pairs[i]] = 1; - } + tsk_id_t u; + const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; + const tsk_flags_t *restrict flags = self->tables->nodes.flags; - self->num_unique_nodes_in_pair = 0; - idx = 0; - for (i = 0; i < self->num_nodes; ++i) { - if (self->paired_nodes_index[i] == 1) { - ++self->num_unique_nodes_in_pair; - self->paired_nodes_index[i] = idx; - ++idx; + for (u = 0; u < num_nodes; u++) { + if (flags[u] & TSK_NODE_IS_SAMPLE) { + self->sample_set_id[u] = 0; } } - -out: - return ret; } -// NOTE: future travellers may want to refactor -// this to only allocate the upper triangle of the matrix. -// Doing so would save a good bit of memory, but would -// imply trickier indexing here and in -// tsk_ibd_finder_find_sample_pair_index2. static int -tsk_ibd_finder_build_pair_map(tsk_ibd_finder_t *self) +tsk_ibd_finder_add_sample_ancestry(tsk_ibd_finder_t *self) { - int ret = 0; - tsk_size_t i, index; - tsk_id_t sample0, sample1; - tsk_id_t row, col; - tsk_size_t matrix_size - = self->num_unique_nodes_in_pair * self->num_unique_nodes_in_pair; - - self->pair_map = tsk_calloc(matrix_size, sizeof(*self->pair_map)); - if (self->pair_map == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (i = 0; i < matrix_size; ++i) { - self->pair_map[i] = -1; - } - - for (i = 0; i < self->num_pairs; ++i) { - sample0 = self->pairs[2 * i]; - sample1 = self->pairs[2 * i + 1]; - - row = TSK_MIN( - self->paired_nodes_index[sample0], self->paired_nodes_index[sample1]); - col = TSK_MAX( - self->paired_nodes_index[sample0], self->paired_nodes_index[sample1]); - - tsk_bug_assert(row >= 0); - tsk_bug_assert(col >= 0); + int ret = 0; + tsk_id_t u; + const tsk_id_t num_nodes = (tsk_id_t) self->tables->nodes.num_rows; + const double L = self->tables->sequence_length; - index = ((tsk_size_t) row) * self->num_unique_nodes_in_pair + (tsk_size_t) col; - if (self->pair_map[index] != -1) { - ret = TSK_ERR_DUPLICATE_SAMPLE_PAIRS; - goto out; + for (u = 0; u < num_nodes; u++) { + if (self->sample_set_id[u] != TSK_NULL) { + ret = tsk_ibd_finder_add_ancestry(self, u, 0, L, u); + if (ret != 0) { + goto out; + } } - self->pair_map[index] = (int64_t) i; } - out: return ret; } -int TSK_WARN_UNUSED -tsk_ibd_finder_init(tsk_ibd_finder_t *self, tsk_table_collection_t *tables, - tsk_id_t *pairs, tsk_size_t num_pairs) +static int TSK_WARN_UNUSED +tsk_ibd_finder_init(tsk_ibd_finder_t *self, const tsk_table_collection_t *tables, + tsk_identity_segments_t *result, double min_length, double max_time) { int ret = 0; tsk_size_t num_nodes; tsk_memset(self, 0, sizeof(tsk_ibd_finder_t)); - self->pairs = pairs; - self->num_pairs = num_pairs; - self->sequence_length = tables->sequence_length; - self->num_nodes = tables->nodes.num_rows; - self->tables = tables; - self->max_time = DBL_MAX; - self->min_length = 0; - if (pairs == NULL || num_pairs < 1) { - ret = TSK_ERR_NO_SAMPLE_PAIRS; + if (min_length < 0) { + ret = TSK_ERR_BAD_PARAM_VALUE; + goto out; + } + if (max_time < 0) { + ret = TSK_ERR_BAD_PARAM_VALUE; goto out; } - // Allocate the heaps used for small objects. + self->tables = tables; + self->result = result; + self->max_time = max_time; + self->min_length = min_length; + ret = tsk_blkalloc_init(&self->segment_heap, 8192); if (ret != 0) { goto out; } - // Mallocing and callocing. num_nodes = tables->nodes.num_rows; self->ancestor_map_head = tsk_calloc(num_nodes, sizeof(*self->ancestor_map_head)); self->ancestor_map_tail = tsk_calloc(num_nodes, sizeof(*self->ancestor_map_tail)); - self->ibd_segments_head - = tsk_calloc(self->num_pairs, sizeof(*self->ibd_segments_head)); - self->ibd_segments_tail - = tsk_calloc(self->num_pairs, sizeof(*self->ibd_segments_tail)); - self->is_sample = tsk_calloc(num_nodes, sizeof(*self->is_sample)); + self->sample_set_id = tsk_malloc(num_nodes * sizeof(*self->sample_set_id)); self->segment_queue_size = 0; self->max_segment_queue_size = 64; self->segment_queue = tsk_malloc(self->max_segment_queue_size * sizeof(*self->segment_queue)); if (self->ancestor_map_head == NULL || self->ancestor_map_tail == NULL - || self->ibd_segments_head == NULL || self->ibd_segments_tail == NULL - || self->is_sample == NULL || self->segment_queue == NULL) { + || self->sample_set_id == NULL || self->segment_queue == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - - ret = tsk_ibd_finder_init_samples(self); - if (ret != 0) { - goto out; - } - - ret = tsk_ibd_finder_index_samples(self); - if (ret != 0) { - goto out; - } - - ret = tsk_ibd_finder_build_pair_map(self); - if (ret != 0) { - goto out; - } - + tsk_memset(self->sample_set_id, TSK_NULL, num_nodes * sizeof(*self->sample_set_id)); out: return ret; } -int TSK_WARN_UNUSED -tsk_ibd_finder_set_min_length(tsk_ibd_finder_t *self, double min_length) -{ - int ret = 0; - - if (min_length < 0) { - ret = TSK_ERR_BAD_PARAM_VALUE; - } - self->min_length = min_length; - return ret; -} - -int TSK_WARN_UNUSED -tsk_ibd_finder_set_max_time(tsk_ibd_finder_t *self, double max_time) -{ - int ret = 0; - - if (max_time < 0) { - ret = TSK_ERR_BAD_PARAM_VALUE; - } - self->max_time = max_time; - return ret; -} - static int TSK_WARN_UNUSED tsk_ibd_finder_enqueue_segment( tsk_ibd_finder_t *self, double left, double right, tsk_id_t node) @@ -7411,8 +7807,7 @@ tsk_ibd_finder_enqueue_segment( tsk_segment_t *seg; void *p; - tsk_bug_assert(left < right); - if (right - left > self->min_length) { + if ((right - left) > self->min_length) { /* Make sure we always have room for one more segment in the queue so we * can put a tail sentinel on it */ if (self->segment_queue_size == self->max_segment_queue_size - 1) { @@ -7425,153 +7820,102 @@ tsk_ibd_finder_enqueue_segment( } self->segment_queue = p; } + seg = self->segment_queue + self->segment_queue_size; + seg->left = left; + seg->right = right; + seg->node = node; + self->segment_queue_size++; } - seg = self->segment_queue + self->segment_queue_size; - seg->left = left; - seg->right = right; - seg->node = node; - self->segment_queue_size++; out: return ret; } -static int -tsk_ibd_finder_find_sample_pair_index2( - tsk_ibd_finder_t *self, tsk_id_t sample0, tsk_id_t sample1) +static bool +tsk_ibd_finder_passes_filters( + const tsk_ibd_finder_t *self, tsk_id_t a, tsk_id_t b, double left, double right) { - int ret = -1; - tsk_id_t s0, s1; - tsk_size_t row, col; - - s0 = self->paired_nodes_index[sample0]; - s1 = self->paired_nodes_index[sample1]; - - if (s0 < 0 || s1 < 0) { - goto out; + if (a == b) { + return false; + } + if ((right - left) <= self->min_length) { + return false; + } + if (self->finding_between) { + return self->sample_set_id[a] != self->sample_set_id[b]; + } else { + return true; } - - row = TSK_MIN((tsk_size_t) s0, (tsk_size_t) s1); - col = TSK_MAX((tsk_size_t) s0, (tsk_size_t) s1); - - ret = (int) self->pair_map[row * self->num_unique_nodes_in_pair + col]; -out: - return ret; } static int TSK_WARN_UNUSED -tsk_ibd_finder_calculate_ibd(tsk_ibd_finder_t *self, tsk_id_t current_parent) +tsk_ibd_finder_record_ibd(tsk_ibd_finder_t *self, tsk_id_t parent) { int ret = 0; - int j, pair_index; - tsk_segment_t *seg, *seg0, *seg1; + tsk_size_t j; + tsk_segment_t *seg0, *seg1; double left, right; - if (self->ancestor_map_head[current_parent] == NULL) { - for (j = 0; j != (int) self->segment_queue_size; j++) { - seg = &self->segment_queue[j]; - ret = tsk_ibd_finder_add_ancestry( - self, current_parent, seg->left, seg->right, seg->node); - if (ret != 0) { - goto out; - } - } - } else { - for (seg0 = self->ancestor_map_head[current_parent]; seg0 != NULL; - seg0 = seg0->next) { - for (j = 0; j != (int) self->segment_queue_size; j++) { - seg1 = &self->segment_queue[j]; - if (seg0->node == seg1->node) { - continue; - } - ret = tsk_ibd_finder_find_sample_pair_index2( - self, seg0->node, seg1->node); - if (ret < 0) { - continue; - } - pair_index = ret; - - if (seg0->left > seg1->left) { - left = seg0->left; - } else { - left = seg1->left; - } - if (seg0->right < seg1->right) { - right = seg0->right; - } else { - right = seg1->right; - } - - if (left < right) { - if (right - left > self->min_length) { - ret = tsk_ibd_finder_add_output( - self, left, right, current_parent, pair_index); - if (ret != 0) { - goto out; - } - } + for (seg0 = self->ancestor_map_head[parent]; seg0 != NULL; seg0 = seg0->next) { + for (j = 0; j < self->segment_queue_size; j++) { + seg1 = &self->segment_queue[j]; + left = TSK_MAX(seg0->left, seg1->left); + right = TSK_MIN(seg0->right, seg1->right); + if (tsk_ibd_finder_passes_filters( + self, seg0->node, seg1->node, left, right)) { + ret = tsk_identity_segments_add_segment( + self->result, seg0->node, seg1->node, left, right, parent); + if (ret != 0) { + goto out; } } } - for (j = 0; j != (int) self->segment_queue_size; j++) { - seg = &self->segment_queue[j]; - ret = tsk_ibd_finder_add_ancestry( - self, current_parent, seg->left, seg->right, seg->node); - if (ret != 0) { - goto out; - } - } } - self->segment_queue_size = 0; - out: return ret; } -int TSK_WARN_UNUSED -tsk_ibd_finder_get_ibd_segments( - tsk_ibd_finder_t *self, tsk_id_t pair_index, tsk_segment_t **ret_ibd_segments_head) +static int TSK_WARN_UNUSED +tsk_ibd_finder_add_queued_ancestry(tsk_ibd_finder_t *self, tsk_id_t parent) { int ret = 0; + tsk_size_t j; + tsk_segment_t seg; - if (((pair_index < 0) || (pair_index >= (tsk_id_t) self->num_pairs))) { - ret = TSK_ERR_NO_SAMPLE_PAIRS; - goto out; - } - if (self->ibd_segments_head[pair_index] != NULL) { - *ret_ibd_segments_head = self->ibd_segments_head[pair_index]; - } else { - ret = -1; + for (j = 0; j < self->segment_queue_size; j++) { + seg = self->segment_queue[j]; + ret = tsk_ibd_finder_add_ancestry(self, parent, seg.left, seg.right, seg.node); + if (ret != 0) { + goto out; + } } + self->segment_queue_size = 0; out: return ret; } -void +static void tsk_ibd_finder_print_state(tsk_ibd_finder_t *self, FILE *out) { tsk_size_t j; tsk_segment_t *u = NULL; fprintf(out, "--ibd-finder stats--\n"); - fprintf(out, "===\nEdge table\n==\n"); + fprintf(out, "max_time = %f\n", self->max_time); + fprintf(out, "min_length = %f\n", self->min_length); + fprintf(out, "finding_between = %d\n", self->finding_between); + fprintf(out, "===\nEdges\n===\n"); for (j = 0; j < self->tables->edges.num_rows; j++) { fprintf(out, "L:%f, R:%f, P:%lld, C:%lld\n", self->tables->edges.left[j], self->tables->edges.right[j], (long long) self->tables->edges.parent[j], (long long) self->tables->edges.child[j]); } - fprintf(out, "===\nNode table\n==\n"); + fprintf(out, "===\nNodes\n===\n"); for (j = 0; j < self->tables->nodes.num_rows; j++) { - fprintf(out, "ID:%f, Time:%f, Flag:%lld\n", (double) j, - self->tables->nodes.time[j], (long long) self->tables->nodes.flags[j]); + fprintf(out, "ID:%d, Time:%f, Flag:%lld Sample set:%d\n", (int) j, + self->tables->nodes.time[j], (long long) self->tables->nodes.flags[j], + (int) self->sample_set_id[j]); } - fprintf(out, "==\nSample pairs\n==\n"); - for (j = 0; j < 2 * self->num_pairs; j++) { - fprintf(out, "%i ", (int) self->pairs[j]); - if (j % 2 != 0) { - fprintf(out, "\n"); - } - } - fprintf(out, "===\nAncestral map\n==\n"); + fprintf(out, "===\nAncestral map\n===\n"); for (j = 0; j < self->tables->nodes.num_rows; j++) { fprintf(out, "Node %lld: ", (long long) j); for (u = self->ancestor_map_head[j]; u != NULL; u = u->next) { @@ -7579,107 +7923,109 @@ tsk_ibd_finder_print_state(tsk_ibd_finder_t *self, FILE *out) } fprintf(out, "\n"); } - fprintf(out, "===\nIBD segments\n===\n"); - for (j = 0; j < self->num_pairs; j++) { - fprintf(out, "Pair (%i, %i)\n", (int) self->pairs[2 * j], - (int) self->pairs[2 * j + 1]); - for (u = self->ibd_segments_head[j]; u != NULL; u = u->next) { - fprintf(out, "(%f,%f->%lld)", u->left, u->right, (long long) u->node); + tsk_identity_segments_print_state(self->result, out); +} + +static int TSK_WARN_UNUSED +tsk_ibd_finder_init_within( + tsk_ibd_finder_t *self, const tsk_id_t *samples, tsk_size_t num_samples) +{ + int ret; + + if (samples == NULL) { + tsk_ibd_finder_init_samples_from_nodes(self); + } else { + ret = tsk_ibd_finder_init_samples_from_set(self, samples, num_samples); + if (ret != 0) { + goto out; } - fprintf(out, "\n"); } + self->finding_between = false; + ret = tsk_ibd_finder_add_sample_ancestry(self); +out: + return ret; } -int TSK_WARN_UNUSED +static int TSK_WARN_UNUSED +tsk_ibd_finder_init_between(tsk_ibd_finder_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets) +{ + int ret = 0; + tsk_size_t j, k, index; + tsk_id_t u; + + index = 0; + for (j = 0; j < num_sample_sets; j++) { + for (k = 0; k < sample_set_sizes[j]; k++) { + u = sample_sets[index]; + if (u < 0 || u > (tsk_id_t) self->tables->nodes.num_rows) { + ret = TSK_ERR_NODE_OUT_OF_BOUNDS; + goto out; + } + if (self->sample_set_id[u] != TSK_NULL) { + ret = TSK_ERR_DUPLICATE_SAMPLE; + goto out; + } + self->sample_set_id[u] = (tsk_id_t) j; + index++; + } + } + self->finding_between = true; + ret = tsk_ibd_finder_add_sample_ancestry(self); +out: + return ret; +} + +static int TSK_WARN_UNUSED tsk_ibd_finder_run(tsk_ibd_finder_t *self) { const tsk_edge_table_t *input_edges = &self->tables->edges; + const tsk_size_t num_edges = input_edges->num_rows; int ret = 0; tsk_size_t j; - tsk_id_t u; - tsk_id_t current_parent = -1; - tsk_size_t num_edges = input_edges->num_rows; - tsk_segment_t *seg; tsk_segment_t *s; - double intvl_l, intvl_r, current_time; - bool parent_should_be_added = true; + tsk_id_t parent, child; + double left, right, intvl_l, intvl_r, time; for (j = 0; j < num_edges; j++) { - - if (current_parent >= 0 && current_parent != input_edges->parent[j]) { - parent_should_be_added = true; - } - - // Stop if the processed node's time exceeds the max time. - current_parent = input_edges->parent[j]; - current_time = self->tables->nodes.time[current_parent]; - if (current_time > self->max_time) { - goto out; + parent = input_edges->parent[j]; + left = input_edges->left[j]; + right = input_edges->right[j]; + child = input_edges->child[j]; + time = self->tables->nodes.time[parent]; + if (time > self->max_time) { + break; } - // Extract segment. - seg = tsk_ibd_finder_alloc_segment( - self, input_edges->left[j], input_edges->right[j], input_edges->child[j]); - // Create a SegmentList holding all of the sample segments descending from - // seg. - u = seg->node; - if (self->is_sample[u]) { - ret = tsk_ibd_finder_enqueue_segment(self, seg->left, seg->right, seg->node); + for (s = self->ancestor_map_head[child]; s != NULL; s = s->next) { + intvl_l = TSK_MAX(left, s->left); + intvl_r = TSK_MIN(right, s->right); + ret = tsk_ibd_finder_enqueue_segment(self, intvl_l, intvl_r, s->node); if (ret != 0) { goto out; } - } else { - for (s = self->ancestor_map_head[u]; s != NULL; s = s->next) { - if (seg->left > s->left) { - intvl_l = seg->left; - } else { - intvl_l = s->left; - } - if (seg->right < s->right) { - intvl_r = seg->right; - } else { - intvl_r = s->right; - } - // Add to the segment queue. - if (intvl_r - intvl_l > 0) { - ret = tsk_ibd_finder_enqueue_segment( - self, intvl_l, intvl_r, s->node); - if (ret != 0) { - goto out; - } - } - } - } - - // Calculate new ibd segments descending from the current parent. - if (self->segment_queue_size > 0) { - ret = tsk_ibd_finder_calculate_ibd(self, current_parent); } + ret = tsk_ibd_finder_record_ibd(self, parent); if (ret != 0) { goto out; } - - // For samples that appear in the parent column of the edge table - if (self->is_sample[current_parent] && parent_should_be_added) { - parent_should_be_added = false; + ret = tsk_ibd_finder_add_queued_ancestry(self, parent); + if (ret != 0) { + goto out; } } out: return ret; } -int TSK_WARN_UNUSED +static int tsk_ibd_finder_free(tsk_ibd_finder_t *self) { - tsk_safe_free(self->ibd_segments_head); - tsk_safe_free(self->ibd_segments_tail); tsk_blkalloc_free(&self->segment_heap); - tsk_safe_free(self->is_sample); - tsk_safe_free(self->paired_nodes_index); + tsk_safe_free(self->sample_set_id); tsk_safe_free(self->ancestor_map_head); tsk_safe_free(self->ancestor_map_tail); tsk_safe_free(self->segment_queue); - tsk_safe_free(self->pair_map); return 0; } @@ -8554,7 +8900,7 @@ simplifier_output_sites(simplifier_t *self) tsk_id_t input_mutation, mapped_parent, site_start, site_end; tsk_id_t num_input_sites = (tsk_id_t) self->input_tables.sites.num_rows; tsk_id_t num_input_mutations = (tsk_id_t) self->input_tables.mutations.num_rows; - tsk_id_t input_parent, num_output_mutations, num_output_site_mutations; + tsk_id_t num_output_mutations, num_output_site_mutations; tsk_id_t mapped_node; bool keep_site; bool filter_sites = !!(self->options & TSK_FILTER_SITES); @@ -8572,11 +8918,6 @@ simplifier_output_sites(simplifier_t *self) && self->input_tables.mutations.site[input_mutation] == site.id) { mapped_node = self->mutation_node_map[input_mutation]; if (mapped_node != TSK_NULL) { - input_parent = self->input_tables.mutations.parent[input_mutation]; - mapped_parent = TSK_NULL; - if (input_parent != TSK_NULL) { - mapped_parent = self->mutation_id_map[input_parent]; - } self->mutation_id_map[input_mutation] = num_output_mutations; num_output_mutations++; num_output_site_mutations++; @@ -8988,7 +9329,7 @@ tsk_table_collection_check_node_integrity( for (j = 0; j < self->nodes.num_rows; j++) { node_time = self->nodes.time[j]; - if (!isfinite(node_time)) { + if (!tsk_isfinite(node_time)) { ret = TSK_ERR_TIME_NONFINITE; goto out; } @@ -9059,7 +9400,7 @@ tsk_table_collection_check_edge_integrity( goto out; } /* Spatial requirements for edges */ - if (!(isfinite(left) && isfinite(right))) { + if (!(tsk_isfinite(left) && tsk_isfinite(right))) { ret = TSK_ERR_GENOME_COORDS_NONFINITE; goto out; } @@ -9137,7 +9478,7 @@ tsk_table_collection_check_site_integrity( for (j = 0; j < sites.num_rows; j++) { position = sites.position[j]; /* Spatial requirements */ - if (!isfinite(position)) { + if (!tsk_isfinite(position)) { ret = TSK_ERR_BAD_SITE_POSITION; goto out; } @@ -9176,7 +9517,8 @@ tsk_table_collection_check_mutation_integrity( const double *node_time = self->nodes.time; const bool check_mutation_ordering = !!(options & TSK_CHECK_MUTATION_ORDERING); bool unknown_time; - bool unknown_times_seen = false; + int num_known_times = 0; + int num_unknown_times = 0; for (j = 0; j < mutations.num_rows; j++) { /* Basic reference integrity */ @@ -9198,68 +9540,70 @@ tsk_table_collection_check_mutation_integrity( ret = TSK_ERR_MUTATION_PARENT_EQUAL; goto out; } - /* Check if time is nonfinite */ + /* Check that time is finite and not more recent than node time */ mutation_time = mutations.time[j]; unknown_time = tsk_is_unknown_time(mutation_time); if (!unknown_time) { - if (!isfinite(mutation_time)) { + if (!tsk_isfinite(mutation_time)) { ret = TSK_ERR_TIME_NONFINITE; goto out; } + if (mutation_time < node_time[mutations.node[j]]) { + ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; + goto out; + } } - if (check_mutation_ordering) { - /* Check site ordering and reset time check if needed*/ - if (j > 0) { - if (mutations.site[j - 1] > mutations.site[j]) { - ret = TSK_ERR_UNSORTED_MUTATIONS; - goto out; - } - if (mutations.site[j - 1] != mutations.site[j]) { - last_known_time = INFINITY; - unknown_times_seen = false; - } + /* reset checks when reaching a new site */ + if (j > 0 && mutations.site[j - 1] != mutations.site[j]) { + last_known_time = INFINITY; + num_known_times = 0; + num_unknown_times = 0; + } + + /* Check known/unknown times are not both present on a site */ + if (unknown_time) { + num_unknown_times++; + } else { + num_known_times++; + } + if ((num_unknown_times > 0) && (num_known_times > 0)) { + ret = TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN; + goto out; + } + + /* check parent site agrees */ + if (parent_mut != TSK_NULL) { + if (mutations.site[parent_mut] != mutations.site[j]) { + ret = TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE; + goto out; + } + /* If this mutation time is known, then the parent time + * must also be, or else the + * TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN check + * above will fail. */ + if (!unknown_time && mutation_time > mutations.time[parent_mut]) { + ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; + goto out; } + } - /* Check known/unknown times are not both present on a site*/ - if (unknown_time) { - unknown_times_seen = true; - } else if (unknown_times_seen) { - ret = TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN; + if (check_mutation_ordering) { + /* Check site ordering */ + if (j > 0 && mutations.site[j - 1] > mutations.site[j]) { + ret = TSK_ERR_UNSORTED_MUTATIONS; goto out; } - /* Check the mutation parents for ordering */ - /* We can only check parent properties if it is non-null */ - if (parent_mut != TSK_NULL) { - /* Parents must be listed before their children */ - if (parent_mut > (tsk_id_t) j) { - ret = TSK_ERR_MUTATION_PARENT_AFTER_CHILD; - goto out; - } - if (mutations.site[parent_mut] != mutations.site[j]) { - ret = TSK_ERR_MUTATION_PARENT_DIFFERENT_SITE; - goto out; - } + /* Check if parents are listed before their children */ + if (parent_mut != TSK_NULL && parent_mut > (tsk_id_t) j) { + ret = TSK_ERR_MUTATION_PARENT_AFTER_CHILD; + goto out; } - /* Check time value ordering */ + /* Check time ordering. We do this after the other checks above, + * so that more specific errors trigger first */ if (!unknown_time) { - if (mutation_time < node_time[mutations.node[j]]) { - ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; - goto out; - } - /* If this mutation time is known, then the parent time - * must also be as the - * TSK_ERR_MUTATION_TIME_HAS_BOTH_KNOWN_AND_UNKNOWN check - * above would otherwise fail. */ - if (parent_mut != TSK_NULL - && mutation_time > mutations.time[parent_mut]) { - ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; - goto out; - } - /* Check time ordering, we do this after the time checks above, so - that more specific errors trigger first */ if (mutation_time > last_known_time) { ret = TSK_ERR_UNSORTED_MUTATIONS; goto out; @@ -9278,12 +9622,13 @@ tsk_table_collection_check_migration_integrity( { int ret = 0; tsk_size_t j; - double left, right; + double left, right, time; const double L = self->sequence_length; const tsk_migration_table_t migrations = self->migrations; const tsk_id_t num_nodes = (tsk_id_t) self->nodes.num_rows; const tsk_id_t num_populations = (tsk_id_t) self->populations.num_rows; const bool check_population_refs = !(options & TSK_NO_CHECK_POPULATION_REFS); + const bool check_migration_ordering = !!(options & TSK_CHECK_MIGRATION_ORDERING); for (j = 0; j < migrations.num_rows; j++) { if (migrations.node[j] < 0 || migrations.node[j] >= num_nodes) { @@ -9300,15 +9645,22 @@ tsk_table_collection_check_migration_integrity( goto out; } } - if (!isfinite(migrations.time[j])) { + time = migrations.time[j]; + if (!tsk_isfinite(time)) { ret = TSK_ERR_TIME_NONFINITE; goto out; } + if (j > 0) { + if (check_migration_ordering && migrations.time[j - 1] > time) { + ret = TSK_ERR_UNSORTED_MIGRATIONS; + goto out; + } + } left = migrations.left[j]; right = migrations.right[j]; /* Spatial requirements */ /* TODO it's a bit misleading to use the edge-specific errors here. */ - if (!(isfinite(left) && isfinite(right))) { + if (!(tsk_isfinite(left) && tsk_isfinite(right))) { ret = TSK_ERR_GENOME_COORDS_NONFINITE; goto out; } @@ -9399,7 +9751,6 @@ tsk_table_collection_check_tree_integrity(const tsk_table_collection_t *self) tsk_memset(parent, 0xff, self->nodes.num_rows * sizeof(*parent)); tree_left = 0; - tree_right = sequence_length; num_trees = 0; j = 0; k = 0; @@ -9494,10 +9845,10 @@ tsk_table_collection_check_integrity( tsk_id_t ret = 0; if (options & TSK_CHECK_TREES) { - /* Checking the trees implies all the other checks */ + /* Checking the trees implies these checks */ options |= TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING - | TSK_CHECK_INDEXES | TSK_CHECK_INDIVIDUAL_ORDERING; + | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; } if (self->sequence_length <= 0) { @@ -9560,6 +9911,9 @@ tsk_table_collection_print_state(const tsk_table_collection_t *self, FILE *out) fprintf(out, "#metadata#\n"); fprintf(out, "%.*s\n", (int) self->metadata_length, self->metadata); fprintf(out, "#end#metadata\n"); + fprintf(out, "#time_units#\n"); + fprintf(out, "%.*s\n", (int) self->time_units_length, self->time_units); + fprintf(out, "#end#time_units\n"); tsk_individual_table_print_state(&self->individuals, out); tsk_node_table_print_state(&self->nodes, out); tsk_edge_table_print_state(&self->edges, out); @@ -9580,6 +9934,14 @@ tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options) if (options & TSK_NO_EDGE_METADATA) { edge_options |= TSK_NO_METADATA; } + + /* Set default time_units value */ + ret = tsk_table_collection_set_time_units( + self, TSK_TIME_UNITS_UNKNOWN, strlen(TSK_TIME_UNITS_UNKNOWN)); + if (ret != 0) { + goto out; + } + ret = tsk_node_table_init(&self->nodes, 0); if (ret != 0) { goto out; @@ -9612,6 +9974,10 @@ tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options) if (ret != 0) { goto out; } + ret = tsk_reference_sequence_init(&self->reference_sequence, 0); + if (ret != 0) { + goto out; + } out: return ret; } @@ -9627,9 +9993,11 @@ tsk_table_collection_free(tsk_table_collection_t *self) tsk_mutation_table_free(&self->mutations); tsk_population_table_free(&self->populations); tsk_provenance_table_free(&self->provenances); + tsk_reference_sequence_free(&self->reference_sequence); tsk_safe_free(self->indexes.edge_insertion_order); tsk_safe_free(self->indexes.edge_removal_order); tsk_safe_free(self->file_uuid); + tsk_safe_free(self->time_units); tsk_safe_free(self->metadata); tsk_safe_free(self->metadata_schema); return 0; @@ -9639,18 +10007,30 @@ bool tsk_table_collection_equals(const tsk_table_collection_t *self, const tsk_table_collection_t *other, tsk_flags_t options) { - bool ret - = self->sequence_length == other->sequence_length - && tsk_individual_table_equals( - &self->individuals, &other->individuals, options) - && tsk_node_table_equals(&self->nodes, &other->nodes, options) - && tsk_edge_table_equals(&self->edges, &other->edges, options) - && tsk_migration_table_equals(&self->migrations, &other->migrations, options) - && tsk_site_table_equals(&self->sites, &other->sites, options) - && tsk_mutation_table_equals(&self->mutations, &other->mutations, options) - && tsk_population_table_equals( - &self->populations, &other->populations, options); - + bool ret = self->sequence_length == other->sequence_length + && self->time_units_length == other->time_units_length + && tsk_memcmp(self->time_units, other->time_units, + self->time_units_length * sizeof(char)) + == 0; + if (!(options & TSK_CMP_IGNORE_TABLES)) { + ret = ret + && tsk_individual_table_equals( + &self->individuals, &other->individuals, options) + && tsk_node_table_equals(&self->nodes, &other->nodes, options) + && tsk_edge_table_equals(&self->edges, &other->edges, options) + && tsk_migration_table_equals( + &self->migrations, &other->migrations, options) + && tsk_site_table_equals(&self->sites, &other->sites, options) + && tsk_mutation_table_equals(&self->mutations, &other->mutations, options) + && tsk_population_table_equals( + &self->populations, &other->populations, options); + /* TSK_CMP_IGNORE_TABLES implies TSK_CMP_IGNORE_PROVENANCE */ + if (!(options & TSK_CMP_IGNORE_PROVENANCE)) { + ret = ret + && tsk_provenance_table_equals( + &self->provenances, &other->provenances, options); + } + } /* TSK_CMP_IGNORE_TS_METADATA is implied by TSK_CMP_IGNORE_METADATA */ if (options & TSK_CMP_IGNORE_METADATA) { options |= TSK_CMP_IGNORE_TS_METADATA; @@ -9666,14 +10046,23 @@ tsk_table_collection_equals(const tsk_table_collection_t *self, self->metadata_schema_length * sizeof(char)) == 0); } - if (!(options & TSK_CMP_IGNORE_PROVENANCE)) { + + if (!(options & TSK_CMP_IGNORE_REFERENCE_SEQUENCE)) { ret = ret - && tsk_provenance_table_equals( - &self->provenances, &other->provenances, options); + && tsk_reference_sequence_equals( + &self->reference_sequence, &other->reference_sequence, options); } return ret; } +int +tsk_table_collection_set_time_units( + tsk_table_collection_t *self, const char *time_units, tsk_size_t time_units_length) +{ + return replace_string( + &self->time_units, &self->time_units_length, time_units, time_units_length); +} + int tsk_table_collection_set_metadata( tsk_table_collection_t *self, const char *metadata, tsk_size_t metadata_length) @@ -9721,6 +10110,12 @@ tsk_table_collection_has_index( && self->indexes.num_edges == self->edges.num_rows; } +bool +tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self) +{ + return !tsk_reference_sequence_is_null(&self->reference_sequence); +} + int tsk_table_collection_drop_index( tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) @@ -9853,6 +10248,11 @@ tsk_table_collection_copy(const tsk_table_collection_t *self, goto out; } } + ret = tsk_table_collection_set_time_units( + dest, self->time_units, self->time_units_length); + if (ret != 0) { + goto out; + } ret = tsk_table_collection_set_metadata(dest, self->metadata, self->metadata_length); if (ret != 0) { goto out; @@ -9862,6 +10262,11 @@ tsk_table_collection_copy(const tsk_table_collection_t *self, if (ret != 0) { goto out; } + ret = tsk_reference_sequence_copy( + &self->reference_sequence, &dest->reference_sequence, options); + if (ret != 0) { + goto out; + } out: return ret; @@ -9875,10 +10280,13 @@ tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *s uint32_t *version; int8_t *format_name, *uuid; double *L; - + char *time_units = NULL; char *metadata = NULL; char *metadata_schema = NULL; - size_t metadata_length, metadata_schema_length; + size_t time_units_length, metadata_length, metadata_schema_length; + /* TODO we could simplify this function quite a bit if we use the + * read_table_properties infrastructure. We would need to add the + * ability to have non-optional columns to that though. */ ret = kastore_gets_int8(store, "format/name", &format_name, &len); if (ret != 0) { @@ -9950,6 +10358,25 @@ tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *s tsk_memcpy(self->file_uuid, uuid, TSK_UUID_SIZE); self->file_uuid[TSK_UUID_SIZE] = '\0'; + ret = kastore_containss(store, "time_units"); + if (ret < 0) { + ret = tsk_set_kas_error(ret); + goto out; + } + if (ret == 1) { + ret = kastore_gets_int8( + store, "time_units", (int8_t **) &time_units, &time_units_length); + if (ret != 0) { + ret = tsk_set_kas_error(ret); + goto out; + } + ret = tsk_table_collection_set_time_units( + self, time_units, (tsk_size_t) time_units_length); + if (ret != 0) { + goto out; + } + } + ret = kastore_containss(store, "metadata"); if (ret < 0) { ret = tsk_set_kas_error(ret); @@ -10057,13 +10484,80 @@ tsk_table_collection_load_indexes(tsk_table_collection_t *self, kastore_t *store return ret; } +static int +tsk_table_collection_load_reference_sequence( + tsk_table_collection_t *self, kastore_t *store) +{ + int ret = 0; + char *data = NULL; + char *url = NULL; + char *metadata = NULL; + char *metadata_schema = NULL; + tsk_size_t data_length = 0, url_length, metadata_length, metadata_schema_length; + + read_table_property_t properties[] = { + { "reference_sequence/data", (void **) &data, &data_length, KAS_UINT8, + TSK_COL_OPTIONAL }, + { "reference_sequence/url", (void **) &url, &url_length, KAS_UINT8, + TSK_COL_OPTIONAL }, + { "reference_sequence/metadata", (void **) &metadata, &metadata_length, + KAS_UINT8, TSK_COL_OPTIONAL }, + { "reference_sequence/metadata_schema", (void **) &metadata_schema, + &metadata_schema_length, KAS_UINT8, TSK_COL_OPTIONAL }, + { .name = NULL }, + }; + + ret = read_table_properties(store, properties, 0); + if (ret != 0) { + goto out; + } + if (data != NULL) { + ret = tsk_reference_sequence_set_data( + &self->reference_sequence, data, (tsk_size_t) data_length); + if (ret != 0) { + goto out; + } + } + if (metadata != NULL) { + ret = tsk_reference_sequence_set_metadata( + &self->reference_sequence, metadata, (tsk_size_t) metadata_length); + if (ret != 0) { + goto out; + } + } + if (metadata_schema != NULL) { + ret = tsk_reference_sequence_set_metadata_schema(&self->reference_sequence, + metadata_schema, (tsk_size_t) metadata_schema_length); + if (ret != 0) { + goto out; + } + } + if (url != NULL) { + ret = tsk_reference_sequence_set_url( + &self->reference_sequence, url, (tsk_size_t) url_length); + if (ret != 0) { + goto out; + } + } + +out: + return ret; +} + static int TSK_WARN_UNUSED -tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file) +tsk_table_collection_loadf_inited( + tsk_table_collection_t *self, FILE *file, tsk_flags_t options) { int ret = 0; kastore_t store; - ret = kastore_openf(&store, file, "r", KAS_READ_ALL); + int kas_flags = KAS_READ_ALL; + if ((options & TSK_LOAD_SKIP_TABLES) + || (options & TSK_LOAD_SKIP_REFERENCE_SEQUENCE)) { + kas_flags = 0; + } + ret = kastore_openf(&store, file, "r", kas_flags); + if (ret != 0) { if (ret == KAS_ERR_EOF) { /* KAS_ERR_EOF means that we tried to read a store from the stream @@ -10080,41 +10574,54 @@ tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file) if (ret != 0) { goto out; } - ret = tsk_node_table_load(&self->nodes, &store); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_load(&self->edges, &store); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_load(&self->sites, &store); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_load(&self->mutations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_load(&self->migrations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_load(&self->individuals, &store); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_load(&self->populations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_load(&self->provenances, &store); - if (ret != 0) { - goto out; + if (!(options & TSK_LOAD_SKIP_TABLES)) { + ret = tsk_node_table_load(&self->nodes, &store); + if (ret != 0) { + goto out; + } + ret = tsk_edge_table_load(&self->edges, &store); + if (ret != 0) { + goto out; + } + ret = tsk_site_table_load(&self->sites, &store); + if (ret != 0) { + goto out; + } + ret = tsk_mutation_table_load(&self->mutations, &store); + if (ret != 0) { + goto out; + } + ret = tsk_migration_table_load(&self->migrations, &store); + if (ret != 0) { + goto out; + } + ret = tsk_individual_table_load(&self->individuals, &store); + if (ret != 0) { + goto out; + } + ret = tsk_population_table_load(&self->populations, &store); + if (ret != 0) { + goto out; + } + ret = tsk_provenance_table_load(&self->provenances, &store); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_load_indexes(self, &store); + if (ret != 0) { + goto out; + } + } else { + ret = tsk_table_collection_build_index(self, 0); + if (ret != 0) { + goto out; + } } - ret = tsk_table_collection_load_indexes(self, &store); - if (ret != 0) { - goto out; + if (!(options & TSK_LOAD_SKIP_REFERENCE_SEQUENCE)) { + ret = tsk_table_collection_load_reference_sequence(self, &store); + if (ret != 0) { + goto out; + } } ret = kastore_close(&store); if (ret != 0) { @@ -10139,7 +10646,7 @@ tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t goto out; } } - ret = tsk_table_collection_loadf_inited(self, file); + ret = tsk_table_collection_loadf_inited(self, file, options); if (ret != 0) { goto out; } @@ -10165,7 +10672,7 @@ tsk_table_collection_load( ret = TSK_ERR_IO; goto out; } - ret = tsk_table_collection_loadf_inited(self, file); + ret = tsk_table_collection_loadf_inited(self, file, options); if (ret != 0) { goto out; } @@ -10197,6 +10704,7 @@ tsk_table_collection_write_format_data(const tsk_table_collection_t *self, { "format/version", (void *) version, 2, KAS_UINT32 }, { "sequence_length", (const void *) &self->sequence_length, 1, KAS_FLOAT64 }, { "uuid", (void *) uuid, TSK_UUID_SIZE, KAS_INT8 }, + { "time_units", (void *) self->time_units, self->time_units_length, KAS_INT8 }, { "metadata", (void *) self->metadata, self->metadata_length, KAS_INT8 }, { "metadata_schema", (void *) self->metadata_schema, self->metadata_schema_length, KAS_INT8 }, @@ -10215,6 +10723,27 @@ tsk_table_collection_write_format_data(const tsk_table_collection_t *self, return ret; } +static int TSK_WARN_UNUSED +tsk_table_collection_dump_reference_sequence(const tsk_table_collection_t *self, + kastore_t *store, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + const tsk_reference_sequence_t *ref = &self->reference_sequence; + write_table_col_t write_cols[] = { + { "reference_sequence/data", (void *) ref->data, ref->data_length, KAS_UINT8 }, + { "reference_sequence/url", (void *) ref->url, ref->url_length, KAS_UINT8 }, + { "reference_sequence/metadata", (void *) ref->metadata, ref->metadata_length, + KAS_UINT8 }, + { "reference_sequence/metadata_schema", (void *) ref->metadata_schema, + ref->metadata_schema_length, KAS_UINT8 }, + { .name = NULL }, + }; + if (tsk_table_collection_has_reference_sequence(self)) { + ret = write_table_cols(store, write_cols, 0); + } + return ret; +} + int TSK_WARN_UNUSED tsk_table_collection_dump( const tsk_table_collection_t *self, const char *filename, tsk_flags_t options) @@ -10303,6 +10832,10 @@ tsk_table_collection_dumpf( if (ret != 0) { goto out; } + ret = tsk_table_collection_dump_reference_sequence(self, &store, options); + if (ret != 0) { + goto out; + } ret = kastore_close(&store); if (ret != 0) { @@ -10366,7 +10899,7 @@ tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samp goto out; } if (!!(options & TSK_DEBUG)) { - simplifier_print_state(&simplifier, stdout); + simplifier_print_state(&simplifier, tsk_get_debug_stream()); } /* The indexes are invalidated now so drop them */ ret = tsk_table_collection_drop_index(self, 0); @@ -10405,6 +10938,72 @@ tsk_table_collection_link_ancestors(tsk_table_collection_t *self, tsk_id_t *samp return ret; } +int TSK_WARN_UNUSED +tsk_table_collection_ibd_within(const tsk_table_collection_t *self, + tsk_identity_segments_t *result, const tsk_id_t *samples, tsk_size_t num_samples, + double min_length, double max_time, tsk_flags_t options) +{ + int ret = 0; + tsk_ibd_finder_t ibd_finder; + + ret = tsk_identity_segments_init(result, self->nodes.num_rows, options); + if (ret != 0) { + goto out; + } + ret = tsk_ibd_finder_init(&ibd_finder, self, result, min_length, max_time); + if (ret != 0) { + goto out; + } + ret = tsk_ibd_finder_init_within(&ibd_finder, samples, num_samples); + if (ret != 0) { + goto out; + } + ret = tsk_ibd_finder_run(&ibd_finder); + if (ret != 0) { + goto out; + } + if (!!(options & TSK_DEBUG)) { + tsk_ibd_finder_print_state(&ibd_finder, tsk_get_debug_stream()); + } +out: + tsk_ibd_finder_free(&ibd_finder); + return ret; +} + +int TSK_WARN_UNUSED +tsk_table_collection_ibd_between(const tsk_table_collection_t *self, + tsk_identity_segments_t *result, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, double min_length, + double max_time, tsk_flags_t options) +{ + int ret = 0; + tsk_ibd_finder_t ibd_finder; + + ret = tsk_identity_segments_init(result, self->nodes.num_rows, options); + if (ret != 0) { + goto out; + } + ret = tsk_ibd_finder_init(&ibd_finder, self, result, min_length, max_time); + if (ret != 0) { + goto out; + } + ret = tsk_ibd_finder_init_between( + &ibd_finder, num_sample_sets, sample_set_sizes, sample_sets); + if (ret != 0) { + goto out; + } + ret = tsk_ibd_finder_run(&ibd_finder); + if (ret != 0) { + goto out; + } + if (!!(options & TSK_DEBUG)) { + tsk_ibd_finder_print_state(&ibd_finder, tsk_get_debug_stream()); + } +out: + tsk_ibd_finder_free(&ibd_finder); + return ret; +} + int TSK_WARN_UNUSED tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options) @@ -10898,6 +11497,7 @@ tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options) goto out; } } + out: return ret; } diff --git a/subprojects/tskit/tskit/tables.h b/subprojects/tskit/tskit/tables.h index dac430614..9a3f08b0f 100644 --- a/subprojects/tskit/tskit/tables.h +++ b/subprojects/tskit/tskit/tables.h @@ -538,6 +538,17 @@ typedef struct { tsk_size_t *record_offset; } tsk_provenance_table_t; +typedef struct { + char *data; + tsk_size_t data_length; + char *url; + tsk_size_t url_length; + char *metadata; + tsk_size_t metadata_length; + char *metadata_schema; + tsk_size_t metadata_schema_length; +} tsk_reference_sequence_t; + /** @brief A collection of tables defining the data for a tree sequence. */ @@ -545,12 +556,16 @@ typedef struct { /** @brief The sequence length defining the tree sequence's coordinate space */ double sequence_length; char *file_uuid; + /** @brief The units of the time dimension */ + char *time_units; + tsk_size_t time_units_length; /** @brief The tree-sequence metadata */ char *metadata; tsk_size_t metadata_length; /** @brief The metadata schema */ char *metadata_schema; tsk_size_t metadata_schema_length; + tsk_reference_sequence_t reference_sequence; /** @brief The individual table */ tsk_individual_table_t individuals; /** @brief The node table */ @@ -618,34 +633,35 @@ typedef struct _tsk_table_sorter_t { * TODO: document properly * */ -typedef struct _tsk_segment_t { +/* Note for tskit developers: it's perhaps a bit confusing/pointless to + * have the tsk_identity_segment_t struct as well as the internal tsk_segment_t + * struct (which is identical). However, we may want to implement either + * segment type differently in future, and since the tsk_identity_segment_t + * is part of the public API we want to allow the freedom for the different + * structures to evolve over time */ +typedef struct _tsk_identity_segment_t { double left; double right; - struct _tsk_segment_t *next; + struct _tsk_identity_segment_t *next; tsk_id_t node; -} tsk_segment_t; +} tsk_identity_segment_t; + +typedef struct { + tsk_size_t num_segments; + double total_span; + tsk_identity_segment_t *head; + tsk_identity_segment_t *tail; +} tsk_identity_segment_list_t; typedef struct { - tsk_id_t *pairs; - tsk_size_t num_pairs; tsk_size_t num_nodes; - tsk_size_t num_unique_nodes_in_pair; - int64_t *pair_map; - double sequence_length; - tsk_table_collection_t *tables; - tsk_segment_t **ibd_segments_head; - tsk_segment_t **ibd_segments_tail; - tsk_blkalloc_t segment_heap; - bool *is_sample; - tsk_id_t *paired_nodes_index; - double min_length; - double max_time; - tsk_segment_t **ancestor_map_head; - tsk_segment_t **ancestor_map_tail; - tsk_segment_t *segment_queue; - tsk_size_t segment_queue_size; - tsk_size_t max_segment_queue_size; -} tsk_ibd_finder_t; + tsk_avl_tree_int_t pair_map; + tsk_size_t num_segments; + double total_span; + tsk_blkalloc_t heap; + bool store_segments; + bool store_pairs; +} tsk_identity_segments_t; /****************************************************************************/ /* Common function options */ @@ -689,12 +705,13 @@ typedef struct { #define TSK_CHECK_SITE_ORDERING (1 << 1) #define TSK_CHECK_SITE_DUPLICATES (1 << 2) #define TSK_CHECK_MUTATION_ORDERING (1 << 3) -#define TSK_CHECK_INDEXES (1 << 4) -#define TSK_CHECK_TREES (1 << 5) -#define TSK_CHECK_INDIVIDUAL_ORDERING (1 << 6) +#define TSK_CHECK_INDIVIDUAL_ORDERING (1 << 4) +#define TSK_CHECK_MIGRATION_ORDERING (1 << 5) +#define TSK_CHECK_INDEXES (1 << 6) +#define TSK_CHECK_TREES (1 << 7) /* Leave room for more positive check flags */ -#define TSK_NO_CHECK_POPULATION_REFS (1 << 10) +#define TSK_NO_CHECK_POPULATION_REFS (1 << 12) /* Flags for load tables */ #define TSK_BUILD_INDEXES (1 << 0) @@ -707,6 +724,12 @@ typedef struct { /* Flags for table collection init */ #define TSK_NO_EDGE_METADATA (1 << 0) +/* Flags for table collection load */ +/* This shares an interface with table collection init. + TODO: review as part of #1720 */ +#define TSK_LOAD_SKIP_TABLES (1 << 1) +#define TSK_LOAD_SKIP_REFERENCE_SEQUENCE (1 << 2) + /* Flags for table init. */ #define TSK_NO_METADATA (1 << 0) @@ -719,6 +742,8 @@ typedef struct { #define TSK_CMP_IGNORE_PROVENANCE (1 << 1) #define TSK_CMP_IGNORE_METADATA (1 << 2) #define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3) +#define TSK_CMP_IGNORE_TABLES (1 << 4) +#define TSK_CMP_IGNORE_REFERENCE_SEQUENCE (1 << 5) /* Flags for table collection clear */ #define TSK_CLEAR_METADATA_SCHEMAS (1 << 0) @@ -867,7 +892,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_individual_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -969,29 +994,130 @@ on and may change arbitrarily between versions. */ void tsk_individual_table_print_state(const tsk_individual_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst +@param self A pointer to a tsk_individual_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param flags The array of tsk_flag_t flag values to be copied. +@param location The array of double location values to be copied. +@param location_offset The array of tsk_size_t location offset values to be copied. +@param parents The array of tsk_id_t parent values to be copied. +@param parents_offset The array of tsk_size_t parent offset values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_individual_table_set_columns(tsk_individual_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset, const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_individual_table_t object. +@param num_rows The number of rows to copy from the specifed arrays +@param flags The array of tsk_flag_t flag values to be copied. +@param location The array of double location values to be copied. +@param location_offset The array of tsk_size_t location offset values to be copied. +@param parents The array of tsk_id_t parent values to be copied. +@param parents_offset The array of tsk_size_t parent offset values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_individual_table_append_columns(tsk_individual_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *location, const tsk_size_t *location_offset, const tsk_id_t *parents, const tsk_size_t *parents_offset, const char *metadata, const tsk_size_t *metadata_offset); -int tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_individual_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ int tsk_individual_table_set_max_rows_increment( tsk_individual_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_individual_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ int tsk_individual_table_set_max_metadata_length_increment( tsk_individual_table_t *self, tsk_size_t max_metadata_length_increment); + +/** +@brief Controls the pre-allocation strategy for the location column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_individual_table_t object. +@param max_location_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ int tsk_individual_table_set_max_location_length_increment( tsk_individual_table_t *self, tsk_size_t max_location_length_increment); + +/** +@brief Controls the pre-allocation strategy for the parents column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_individual_table_t object. +@param max_parents_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ int tsk_individual_table_set_max_parents_length_increment( tsk_individual_table_t *self, tsk_size_t max_parents_length_increment); +/** @} */ + +/* Undocumented methods */ + +int tsk_individual_table_dump_text(const tsk_individual_table_t *self, FILE *out); /** @defgroup NODE_TABLE_API_GROUP Node table API. @{ @@ -1115,7 +1241,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_node_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -1212,20 +1338,91 @@ on and may change arbitrarily between versions. */ void tsk_node_table_print_state(const tsk_node_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst -int tsk_node_table_set_max_rows_increment( - tsk_node_table_t *self, tsk_size_t max_rows_increment); -int tsk_node_table_set_max_metadata_length_increment( - tsk_node_table_t *self, tsk_size_t max_metadata_length_increment); +@param self A pointer to a tsk_node_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param flags The array of tsk_flag_t values to be copied. +@param time The array of double time values to be copied. +@param population The array of tsk_id_t population values to be copied. +@param individual The array of tsk_id_t individual values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_node_table_set_columns(tsk_node_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *time, const tsk_id_t *population, - const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_length); + const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_node_table_t object. +@param num_rows The number of rows to copy from the specifed arrays +@param flags The array of tsk_flag_t values to be copied. +@param time The array of double time values to be copied. +@param population The array of tsk_id_t population values to be copied. +@param individual The array of tsk_id_t individual values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_node_table_append_columns(tsk_node_table_t *self, tsk_size_t num_rows, const tsk_flags_t *flags, const double *time, const tsk_id_t *population, - const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_length); + const tsk_id_t *individual, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_node_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ + +int tsk_node_table_set_max_rows_increment( + tsk_node_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_node_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_node_table_set_max_metadata_length_increment( + tsk_node_table_t *self, tsk_size_t max_metadata_length_increment); + +/** @} */ + +/* Undocumented methods */ + int tsk_node_table_dump_text(const tsk_node_table_t *self, FILE *out); /** @@ -1360,7 +1557,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_edge_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -1457,20 +1654,89 @@ on and may change arbitrarily between versions. */ void tsk_edge_table_print_state(const tsk_edge_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst -int tsk_edge_table_set_max_rows_increment( - tsk_edge_table_t *self, tsk_size_t max_rows_increment); -int tsk_edge_table_set_max_metadata_length_increment( - tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment); +@param self A pointer to a tsk_edge_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param left The array of double left values to be copied. +@param right The array of double right values to be copied. +@param parent The array of tsk_id_t parent values to be copied. +@param child The array of tsk_id_t child values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_edge_table_set_columns(tsk_edge_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *parent, - const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_length); + const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_edge_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param left The array of double left values to be copied. +@param right The array of double right values to be copied. +@param parent The array of tsk_id_t parent values to be copied. +@param child The array of tsk_id_t child values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +*/ int tsk_edge_table_append_columns(tsk_edge_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *parent, - const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_length); + const tsk_id_t *child, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_edge_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_edge_table_set_max_rows_increment( + tsk_edge_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_edge_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_edge_table_set_max_metadata_length_increment( + tsk_edge_table_t *self, tsk_size_t max_metadata_length_increment); + +/** @} */ + +/* Undocumented methods */ + int tsk_edge_table_dump_text(const tsk_edge_table_t *self, FILE *out); int tsk_edge_table_squash(tsk_edge_table_t *self); @@ -1603,7 +1869,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_migration_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -1702,22 +1968,96 @@ on and may change arbitrarily between versions. */ void tsk_migration_table_print_state(const tsk_migration_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst -int tsk_migration_table_set_max_rows_increment( - tsk_migration_table_t *self, tsk_size_t max_rows_increment); -int tsk_migration_table_set_max_metadata_length_increment( - tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment); +@param self A pointer to a tsk_migration_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param left The array of double left values to be copied. +@param right The array of double right values to be copied. +@param node The array of tsk_id_t node values to be copied. +@param source The array of tsk_id_t source values to be copied. +@param dest The array of tsk_id_t dest values to be copied. +@param time The array of double time values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_migration_table_set_columns(tsk_migration_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *node, const tsk_id_t *source, const tsk_id_t *dest, const double *time, - const char *metadata, const tsk_size_t *metadata_length); + const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_migration_table_t object. +@param num_rows The number of rows to copy from the specifed arrays +@param left The array of double left values to be copied. +@param right The array of double right values to be copied. +@param node The array of tsk_id_t node values to be copied. +@param source The array of tsk_id_t source values to be copied. +@param dest The array of tsk_id_t dest values to be copied. +@param time The array of double time values to be copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_migration_table_append_columns(tsk_migration_table_t *self, tsk_size_t num_rows, const double *left, const double *right, const tsk_id_t *node, const tsk_id_t *source, const tsk_id_t *dest, const double *time, - const char *metadata, const tsk_size_t *metadata_length); + const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_migration_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_migration_table_set_max_rows_increment( + tsk_migration_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_migration_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_migration_table_set_max_metadata_length_increment( + tsk_migration_table_t *self, tsk_size_t max_metadata_length_increment); + +/** @} */ + +/* Undocumented methods */ + int tsk_migration_table_dump_text(const tsk_migration_table_t *self, FILE *out); /** @@ -1841,7 +2181,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_site_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -1938,24 +2278,110 @@ on and may change arbitrarily between versions. */ void tsk_site_table_print_state(const tsk_site_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_site_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param position The array of double position values to be copied. +@param ancestral_state The array of char ancestral state values to be copied. +@param ancestral_state_offset The array of tsk_size_t ancestral state offset values to be + copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_site_table_set_columns(tsk_site_table_t *self, tsk_size_t num_rows, + const double *position, const char *ancestral_state, + const tsk_size_t *ancestral_state_offset, const char *metadata, + const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_site_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param position The array of double position values to be copied. +@param ancestral_state The array of char ancestral state values to be copied. +@param ancestral_state_offset The array of tsk_size_t ancestral state offset values to be + copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_site_table_append_columns(tsk_site_table_t *self, tsk_size_t num_rows, + const double *position, const char *ancestral_state, + const tsk_size_t *ancestral_state_offset, const char *metadata, + const tsk_size_t *metadata_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst +@param self A pointer to a tsk_site_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ int tsk_site_table_set_max_rows_increment( tsk_site_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_site_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ + int tsk_site_table_set_max_metadata_length_increment( tsk_site_table_t *self, tsk_size_t max_metadata_length_increment); + +/** +@brief Controls the pre-allocation strategy for the ancestral_state column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_site_table_t object. +@param max_ancestral_state_length_increment The number of bytes to pre-allocate, or zero +for the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ int tsk_site_table_set_max_ancestral_state_length_increment( tsk_site_table_t *self, tsk_size_t max_ancestral_state_length_increment); -int tsk_site_table_set_columns(tsk_site_table_t *self, tsk_size_t num_rows, - const double *position, const char *ancestral_state, - const tsk_size_t *ancestral_state_length, const char *metadata, - const tsk_size_t *metadata_length); -int tsk_site_table_append_columns(tsk_site_table_t *self, tsk_size_t num_rows, - const double *position, const char *ancestral_state, - const tsk_size_t *ancestral_state_length, const char *metadata, - const tsk_size_t *metadata_length); + +/** @} */ + +/* Undocumented methods */ + int tsk_site_table_dump_text(const tsk_site_table_t *self, FILE *out); /** @@ -2086,7 +2512,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_mutation_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -2184,26 +2610,117 @@ on and may change arbitrarily between versions. */ void tsk_mutation_table_print_state(const tsk_mutation_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst -int tsk_mutation_table_set_max_rows_increment( - tsk_mutation_table_t *self, tsk_size_t max_rows_increment); -int tsk_mutation_table_set_max_metadata_length_increment( - tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment); -int tsk_mutation_table_set_max_derived_state_length_increment( - tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment); +@param self A pointer to a tsk_mutation_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param site The array of tsk_id_t site values to be copied. +@param node The array of tsk_id_t node values to be copied. +@param parent The array of tsk_id_t parent values to be copied. +@param time The array of double time values to be copied. +@param derived_state The array of char derived_state values to be copied. +@param derived_state_offset The array of tsk_size_t derived state offset values to be +copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, const double *time, const char *derived_state, - const tsk_size_t *derived_state_length, const char *metadata, - const tsk_size_t *metadata_length); + const tsk_size_t *derived_state_offset, const char *metadata, + const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_mutation_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param site The array of tsk_id_t site values to be copied. +@param node The array of tsk_id_t node values to be copied. +@param parent The array of tsk_id_t parent values to be copied. +@param time The array of double time values to be copied. +@param derived_state The array of char derived_state values to be copied. +@param derived_state_offset The array of tsk_size_t derived state offset values to be + copied. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, const tsk_id_t *site, const tsk_id_t *node, const tsk_id_t *parent, const double *time, const char *derived_state, - const tsk_size_t *derived_state_length, const char *metadata, - const tsk_size_t *metadata_length); + const tsk_size_t *derived_state_offset, const char *metadata, + const tsk_size_t *metadata_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_mutation_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_mutation_table_set_max_rows_increment( + tsk_mutation_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_mutation_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_mutation_table_set_max_metadata_length_increment( + tsk_mutation_table_t *self, tsk_size_t max_metadata_length_increment); + +/** +@brief Controls the pre-allocation strategy for the derived_state column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_mutation_table_t object. +@param max_derived_state_length_increment The number of bytes to pre-allocate, or zero +for the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_mutation_table_set_max_derived_state_length_increment( + tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment); + +/** @} */ + +/* Undocumented methods */ + int tsk_mutation_table_dump_text(const tsk_mutation_table_t *self, FILE *out); /** @@ -2318,7 +2835,7 @@ and is not checked for compatibility with any existing schema on this table. @param other A pointer to a tsk_population_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -2418,18 +2935,80 @@ on and may change arbitrarily between versions. */ void tsk_population_table_print_state(const tsk_population_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst -int tsk_population_table_set_max_rows_increment( - tsk_population_table_t *self, tsk_size_t max_rows_increment); -int tsk_population_table_set_max_metadata_length_increment( - tsk_population_table_t *self, tsk_size_t max_metadata_length_increment); +@param self A pointer to a tsk_population_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_population_table_set_columns(tsk_population_table_t *self, tsk_size_t num_rows, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_population_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param metadata The array of char metadata values to be copied. +@param metadata_offset The array of tsk_size_t metadata offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_population_table_append_columns(tsk_population_table_t *self, tsk_size_t num_rows, const char *metadata, const tsk_size_t *metadata_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_population_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_population_table_set_max_rows_increment( + tsk_population_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the metadata column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_population_table_t object. +@param max_metadata_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_population_table_set_max_metadata_length_increment( + tsk_population_table_t *self, tsk_size_t max_metadata_length_increment); + +/** @} */ + +/* Undocumented methods */ + int tsk_population_table_dump_text(const tsk_population_table_t *self, FILE *out); /** @@ -2551,7 +3130,7 @@ the first ``num_rows`` from ``other`` to this table. @param other A pointer to a tsk_provenance_table_t object where rows are copied from. @param num_rows The number of rows from ``other`` to append to this table. @param row_indexes Array of row indexes in ``other``. If ``NULL`` is passed then the -first ``num_rows`` of ``other`` are used. + first ``num_rows`` of ``other`` are used. @param options Bitwise option flags. Currently unused; should be set to zero to ensure compatibility with later versions of tskit. @return Return 0 on success or a negative value on failure. @@ -2636,22 +3215,102 @@ on and may change arbitrarily between versions. */ void tsk_provenance_table_print_state(const tsk_provenance_table_t *self, FILE *out); -/** @} */ +/** +@brief Replace this table's data by copying from a set of column arrays -/* Undocumented methods */ +@rst +Clears the data columns of this table and then copies column data from the specified +set of arrays. The supplied arrays should all contain data on the same number of rows. +The metadata schema is not affected. +@endrst -int tsk_provenance_table_set_max_rows_increment( - tsk_provenance_table_t *self, tsk_size_t max_rows_increment); -int tsk_provenance_table_set_max_timestamp_length_increment( - tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment); -int tsk_provenance_table_set_max_record_length_increment( - tsk_provenance_table_t *self, tsk_size_t max_record_length_increment); +@param self A pointer to a tsk_provenance_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param timestamp The array of char timestamp values to be copied. +@param timestamp_offset The array of tsk_size_t timestamp offset values to be copied. +@param record The array of char record values to be copied. +@param record_offset The array of tsk_size_t record offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_provenance_table_set_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, const tsk_size_t *record_offset); + +/** +@brief Extends this table by copying from a set of column arrays + +@rst +Copies column data from the specified set of arrays to create new rows at the end of the +table. The supplied arrays should all contain data on the same number of rows. The +metadata schema is not affected. +@endrst + +@param self A pointer to a tsk_provenance_table_t object. +@param num_rows The number of rows to copy from the specifed arrays. +@param timestamp The array of char timestamp values to be copied. +@param timestamp_offset The array of tsk_size_t timestamp offset values to be copied. +@param record The array of char record values to be copied. +@param record_offset The array of tsk_size_t record offset values to be copied. +@return Return 0 on success or a negative value on failure. +*/ int tsk_provenance_table_append_columns(tsk_provenance_table_t *self, tsk_size_t num_rows, const char *timestamp, const tsk_size_t *timestamp_offset, const char *record, const tsk_size_t *record_offset); + +/** +@brief Controls the pre-allocation strategy for this table + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_provenance_table_t object. +@param max_rows_increment The number of rows to pre-allocate, or zero for the default + doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_provenance_table_set_max_rows_increment( + tsk_provenance_table_t *self, tsk_size_t max_rows_increment); + +/** +@brief Controls the pre-allocation strategy for the timestamp column + +@rst +Set a fixed pre-allocation size, or use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_provenance_table_t object. +@param max_timestamp_length_increment The number of bytes to pre-allocate, or zero for +the default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_provenance_table_set_max_timestamp_length_increment( + tsk_provenance_table_t *self, tsk_size_t max_timestamp_length_increment); + +/** +@brief Controls the pre-allocation strategy for the record column + +@rst +Set a fixed pre-allocation size, use the default doubling strategy. +See :ref:`sec_c_api_memory_allocation_strategy` for details on the default +pre-allocation strategy, +@endrst + +@param self A pointer to a tsk_provenance_table_t object. +@param max_record_length_increment The number of bytes to pre-allocate, or zero for the +default doubling strategy. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_provenance_table_set_max_record_length_increment( + tsk_provenance_table_t *self, tsk_size_t max_record_length_increment); + +/** @} */ + +/* Undocumented methods */ int tsk_provenance_table_dump_text(const tsk_provenance_table_t *self, FILE *out); /****************************************************************************/ @@ -2761,6 +3420,11 @@ TSK_CMP_IGNORE_TS_METADATA TSK_CMP_IGNORE_TIMESTAMPS Do not include the timestamp information when comparing the provenance tables. This has no effect if TSK_CMP_IGNORE_PROVENANCE is specified. +TSK_CMP_IGNORE_TABLES + Do not include any tables in the comparison, thus comparing only the + top-level information of the table collections being compared. +TSK_CMP_IGNORE_REFERENCE_SEQUENCE + Do not include the reference sequence in the comparison. @endrst @param self A pointer to a tsk_table_collection_t object. @@ -2819,6 +3483,12 @@ If the file contains multiple table collections, this function will load the first. Please see the :c:func:`tsk_table_collection_loadf` for details on how to sequentially load table collections from a stream. +If the TSK_LOAD_SKIP_TABLES option is set, only the non-table information from +the table collection will be read, leaving all tables with zero rows and no +metadata or schema. +If the TSK_LOAD_SKIP_REFERENCE_SEQUENCE option is set, the table collection is +read without loading the reference sequence. + **Options** Options can be specified by providing one or more of the following bitwise @@ -2826,6 +3496,10 @@ Options can be specified by providing one or more of the following bitwise TSK_NO_INIT Do not initialise this :c:type:`tsk_table_collection_t` before loading. +TSK_LOAD_SKIP_TABLES + Skip reading tables, and only load top-level information. +TSK_LOAD_SKIP_REFERENCE_SEQUENCE + Do not load reference sequence. **Examples** @@ -2873,6 +3547,18 @@ different error conditions. Please see the :ref:`sec_c_api_examples_file_streaming` section for an example of how to sequentially load tree sequences from a stream. +Please note that this streaming behaviour is not supported if the +TSK_LOAD_SKIP_TABLES or TSK_LOAD_SKIP_REFERENCE_SEQUENCE option is set. +If the TSK_LOAD_SKIP_TABLES option is set, only the non-table information from +the table collection will be read, leaving all tables with zero rows and no +metadata or schema. +If the TSK_LOAD_SKIP_REFERENCE_SEQUENCE option is set, the table collection is +read without loading the reference sequence. +When attempting to read from a stream with multiple table collection definitions +and either of these two options set, the requested information from the first +table collection will be read on the first call to +:c:func:`tsk_table_collection_loadf`, with subsequent calls leading to errors. + **Options** Options can be specified by providing one or more of the following bitwise @@ -2880,6 +3566,10 @@ Options can be specified by providing one or more of the following bitwise TSK_NO_INIT Do not initialise this :c:type:`tsk_table_collection_t` before loading. +TSK_LOAD_SKIP_TABLES + Skip reading tables, and only load top-level information. +TSK_LOAD_SKIP_REFERENCE_SEQUENCE + Do not load reference sequence. @endrst @@ -3013,9 +3703,8 @@ and ``provenance``) are ignored and can be set to arbitrary values. The table collection will always be unindexed after sort successfully completes. -See the :ref:`table sorting ` section for more details. -For more control over the sorting process, see the -:ref:`sec_c_api_low_level_sorting` section. +For more control over the sorting process, see the :ref:`sec_c_api_low_level_sorting` +section. **Options** @@ -3033,17 +3722,33 @@ TSK_NO_CHECK_INTEGRITY @endrst -@param self A pointer to a tsk_individual_table_t object. +@param self A pointer to a tsk_table_collection_t object. @param start The position to begin sorting in each table; all rows less than this position must fulfill the tree sequence sortedness requirements. If this is NULL, sort all rows. -@param options Sort options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. +@param options Sort options. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options); +/** +@brief Sorts the individual table in this collection. + +@rst +Sorts the individual table in place, so that parents come before children, +and the parent column is remapped as required. Node references to individuals +are also updated. +@endrst + +@param self A pointer to a tsk_table_collection_t object. +@param options Sort options. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_collection_individual_topological_sort( + tsk_table_collection_t *self, tsk_flags_t options); + /** @brief Puts the tables into canonical form. @@ -3075,7 +3780,7 @@ int tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t @rst Simplification transforms the tables to remove redundancy and canonicalise -tree sequence data. See the :ref:`simplification ` section for +tree sequence data. See the :ref:`simplification ` tutorial for more details. A mapping from the node IDs in the table before simplification to their equivalent @@ -3252,6 +3957,19 @@ int tsk_table_collection_union(tsk_table_collection_t *self, const tsk_table_collection_t *other, const tsk_id_t *other_node_mapping, tsk_flags_t options); +/** +@brief Set the time_units +@rst +Copies the time_units string to this table collection, replacing any existing. +@endrst +@param self A pointer to a tsk_table_collection_t object. +@param time_units A pointer to a char array +@param time_units_length The size of the time units string in bytes. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_collection_set_time_units( + tsk_table_collection_t *self, const char *time_units, tsk_size_t time_units_length); + /** @brief Set the metadata @rst @@ -3300,6 +4018,8 @@ life-cycle. bool tsk_table_collection_has_index( const tsk_table_collection_t *self, tsk_flags_t options); +bool tsk_table_collection_has_reference_sequence(const tsk_table_collection_t *self); + /** @brief Deletes the indexes for this table collection. @@ -3344,9 +4064,12 @@ int tsk_table_collection_set_indexes(tsk_table_collection_t *self, Checks the integrity of this table collection. The default checks (i.e., with options = 0) guarantee the integrity of memory and entity references within the -table collection. All spatial values (along the genome) are checked +table collection. All positions along the genome are checked to see if they are finite values and within the required bounds. Time values are checked to see if they are finite or marked as unknown. +Consistency of the direction of inheritance is also checked: whether +parents are more recent than children, mutations are not more recent +than their nodes or their mutation parents, etcetera. To check if a set of tables fulfills the :ref:`requirements ` needed for a valid tree sequence, use @@ -3375,6 +4098,8 @@ TSK_CHECK_MUTATION_ORDERING constraints. TSK_CHECK_INDIVIDUAL_ORDERING Check individual parents are before children, where specified. +TSK_CHECK_MIGRATION_ORDERING + Check migrations are ordered by time. TSK_CHECK_INDEXES Check that the table indexes exist, and contain valid edge references. @@ -3404,6 +4129,23 @@ tsk_id_t tsk_table_collection_check_integrity( /* Undocumented methods */ +/* Flags for ibd_segments */ +#define TSK_IBD_STORE_PAIRS (1 << 0) +#define TSK_IBD_STORE_SEGMENTS (1 << 1) + +/* TODO be systematic about where "result" should be in the params + * list, different here and in link_ancestors. */ +/* FIXME the order of num_samples and samples needs to be reversed in within. + * This should be done as part of documenting, I guess. */ +int tsk_table_collection_ibd_within(const tsk_table_collection_t *self, + tsk_identity_segments_t *result, const tsk_id_t *samples, tsk_size_t num_samples, + double min_length, double max_time, tsk_flags_t options); + +int tsk_table_collection_ibd_between(const tsk_table_collection_t *self, + tsk_identity_segments_t *result, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, double min_length, + double max_time, tsk_flags_t options); + int tsk_table_collection_link_ancestors(tsk_table_collection_t *self, tsk_id_t *samples, tsk_size_t num_samples, tsk_id_t *ancestors, tsk_size_t num_ancestors, tsk_flags_t options, tsk_edge_table_t *result); @@ -3414,6 +4156,30 @@ int tsk_table_collection_compute_mutation_parents( int tsk_table_collection_compute_mutation_times( tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)); +int tsk_reference_sequence_init(tsk_reference_sequence_t *self, tsk_flags_t options); +int tsk_reference_sequence_free(tsk_reference_sequence_t *self); +bool tsk_reference_sequence_is_null(const tsk_reference_sequence_t *self); +bool tsk_reference_sequence_equals(const tsk_reference_sequence_t *self, + const tsk_reference_sequence_t *other, tsk_flags_t options); +int tsk_reference_sequence_copy(const tsk_reference_sequence_t *self, + tsk_reference_sequence_t *dest, tsk_flags_t options); +int tsk_reference_sequence_set_data( + tsk_reference_sequence_t *self, const char *data, tsk_size_t data_length); +int tsk_reference_sequence_set_url( + tsk_reference_sequence_t *self, const char *url, tsk_size_t url_length); +int tsk_reference_sequence_set_metadata( + tsk_reference_sequence_t *self, const char *metadata, tsk_size_t metadata_length); +int tsk_reference_sequence_set_metadata_schema(tsk_reference_sequence_t *self, + const char *metadata_schema, tsk_size_t metadata_schema_length); +int tsk_reference_sequence_takeset_data( + tsk_reference_sequence_t *self, char *data, tsk_size_t data_length); +int tsk_reference_sequence_takeset_url( + tsk_reference_sequence_t *self, char *url, tsk_size_t url_length); +int tsk_reference_sequence_takeset_metadata( + tsk_reference_sequence_t *self, char *metadata, tsk_size_t metadata_length); +int tsk_reference_sequence_takeset_metadata_schema(tsk_reference_sequence_t *self, + char *metadata_schema, tsk_size_t metadata_schema_length); + /** @defgroup TABLE_SORTER_API_GROUP Low-level table sorter API. @{ @@ -3493,16 +4259,19 @@ int tsk_table_sorter_free(struct _tsk_table_sorter_t *self); int tsk_squash_edges( tsk_edge_t *edges, tsk_size_t num_edges, tsk_size_t *num_output_edges); -/* IBD finder API. This is experimental and the interface may change. */ -int tsk_ibd_finder_init(tsk_ibd_finder_t *ibd_finder, tsk_table_collection_t *tables, - tsk_id_t *pairs, tsk_size_t num_pairs); -int tsk_ibd_finder_set_min_length(tsk_ibd_finder_t *self, double min_length); -int tsk_ibd_finder_set_max_time(tsk_ibd_finder_t *self, double max_time); -int tsk_ibd_finder_free(tsk_ibd_finder_t *self); -int tsk_ibd_finder_run(tsk_ibd_finder_t *ibd_finder); -int tsk_ibd_finder_get_ibd_segments(tsk_ibd_finder_t *ibd_finder, tsk_id_t pair_index, - tsk_segment_t **ret_ibd_segments_head); -void tsk_ibd_finder_print_state(tsk_ibd_finder_t *self, FILE *out); +/* IBD segments API. This is experimental and the interface may change. */ + +tsk_size_t tsk_identity_segments_get_num_segments(const tsk_identity_segments_t *self); +double tsk_identity_segments_get_total_span(const tsk_identity_segments_t *self); +tsk_size_t tsk_identity_segments_get_num_pairs(const tsk_identity_segments_t *self); +int tsk_identity_segments_get_keys( + const tsk_identity_segments_t *result, tsk_id_t *pairs); +int tsk_identity_segments_get_items(const tsk_identity_segments_t *self, tsk_id_t *pairs, + tsk_identity_segment_list_t **lists); +int tsk_identity_segments_get(const tsk_identity_segments_t *self, tsk_id_t a, + tsk_id_t b, tsk_identity_segment_list_t **ret_list); +void tsk_identity_segments_print_state(tsk_identity_segments_t *self, FILE *out); +int tsk_identity_segments_free(tsk_identity_segments_t *self); #ifdef __cplusplus } diff --git a/subprojects/tskit/tskit/trees.c b/subprojects/tskit/tskit/trees.c index 621af66bb..90623320d 100644 --- a/subprojects/tskit/tskit/trees.c +++ b/subprojects/tskit/tskit/trees.c @@ -32,6 +32,12 @@ #include +static inline bool +is_discrete(double x) +{ + return trunc(x) == x; +} + /* ======================================================== * * tree sequence * ======================================================== */ @@ -127,6 +133,8 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self) const tsk_size_t num_mutations = self->tables->mutations.num_rows; const tsk_size_t num_sites = self->tables->sites.num_rows; const tsk_id_t *restrict mutation_site = self->tables->mutations.site; + const double *restrict site_position = self->tables->sites.position; + bool discrete_sites = true; self->site_mutations_mem = tsk_malloc(num_mutations * sizeof(*self->site_mutations_mem)); @@ -148,6 +156,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self) } k = 0; for (j = 0; j < (tsk_id_t) num_sites; j++) { + discrete_sites = discrete_sites && is_discrete(site_position[j]); self->site_mutations[j] = self->site_mutations_mem + offset; self->site_mutations_length[j] = 0; /* Go through all mutations for this site */ @@ -161,6 +170,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self) goto out; } } + self->discrete_genome = self->discrete_genome && discrete_sites; out: return ret; } @@ -243,6 +253,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) const double *restrict edge_right = self->tables->edges.right; const double *restrict edge_left = self->tables->edges.left; tsk_size_t num_trees_alloc = self->num_trees + 1; + bool discrete_breakpoints = true; self->tree_sites_length = tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length)); @@ -264,6 +275,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) j = 0; k = 0; while (j < num_edges || tree_left < sequence_length) { + discrete_breakpoints = discrete_breakpoints && is_discrete(tree_left); self->breakpoints[tree_index] = tree_left; while (k < num_edges && edge_right[O[k]] == tree_left) { k++; @@ -289,18 +301,58 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) tsk_bug_assert(site == num_sites); tsk_bug_assert(tree_index == self->num_trees); self->breakpoints[tree_index] = tree_right; + discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right); + self->discrete_genome = self->discrete_genome && discrete_breakpoints; ret = 0; out: return ret; } +static void +tsk_treeseq_init_migrations(tsk_treeseq_t *self) +{ + tsk_size_t j; + tsk_size_t num_migrations = self->tables->migrations.num_rows; + const double *restrict left = self->tables->migrations.left; + const double *restrict right = self->tables->migrations.right; + const double *restrict time = self->tables->migrations.time; + bool discrete_breakpoints = true; + bool discrete_times = true; + + for (j = 0; j < num_migrations; j++) { + discrete_breakpoints + = discrete_breakpoints && is_discrete(left[j]) && is_discrete(right[j]); + discrete_times + = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); + } + self->discrete_genome = self->discrete_genome && discrete_breakpoints; + self->discrete_time = self->discrete_time && discrete_times; +} + +static void +tsk_treeseq_init_mutations(tsk_treeseq_t *self) +{ + tsk_size_t j; + tsk_size_t num_mutations = self->tables->mutations.num_rows; + const double *restrict time = self->tables->mutations.time; + bool discrete_times = true; + + for (j = 0; j < num_mutations; j++) { + discrete_times + = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); + } + self->discrete_time = self->discrete_time && discrete_times; +} + static int tsk_treeseq_init_nodes(tsk_treeseq_t *self) { tsk_size_t j, k; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_flags_t *restrict node_flags = self->tables->nodes.flags; + const double *restrict time = self->tables->nodes.time; int ret = 0; + bool discrete_times = true; /* Determine the sample size */ self->num_samples = 0; @@ -326,6 +378,12 @@ tsk_treeseq_init_nodes(tsk_treeseq_t *self) } } tsk_bug_assert(k == self->num_samples); + + for (j = 0; j < num_nodes; j++) { + discrete_times + = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); + } + self->discrete_time = self->discrete_time && discrete_times; out: return ret; } @@ -341,7 +399,7 @@ tsk_treeseq_init_nodes(tsk_treeseq_t *self) */ int TSK_WARN_UNUSED tsk_treeseq_init( - tsk_treeseq_t *self, const tsk_table_collection_t *tables, tsk_flags_t options) + tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options) { int ret = 0; tsk_id_t num_trees; @@ -396,6 +454,8 @@ tsk_treeseq_init( tsk_memcpy(self->tables->file_uuid, tables->file_uuid, TSK_UUID_SIZE + 1); } + self->discrete_genome = true; + self->discrete_time = true; ret = tsk_treeseq_init_nodes(self); if (ret != 0) { goto out; @@ -412,6 +472,14 @@ tsk_treeseq_init( if (ret != 0) { goto out; } + tsk_treeseq_init_migrations(self); + tsk_treeseq_init_mutations(self); + + if (tsk_treeseq_get_time_units_length(self) == strlen(TSK_TIME_UNITS_UNCALIBRATED) + && !strncmp(tsk_treeseq_get_time_units(self), TSK_TIME_UNITS_UNCALIBRATED, + strlen(TSK_TIME_UNITS_UNCALIBRATED))) { + self->time_uncalibrated = true; + } out: return ret; } @@ -424,15 +492,14 @@ tsk_treeseq_copy_tables( } int TSK_WARN_UNUSED -tsk_treeseq_load( - tsk_treeseq_t *self, const char *filename, tsk_flags_t TSK_UNUSED(options)) +tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options) { int ret = 0; tsk_table_collection_t tables; /* Need to make sure that we're zero'd out in case of error */ tsk_memset(self, 0, sizeof(*self)); - ret = tsk_table_collection_load(&tables, filename, 0); + ret = tsk_table_collection_load(&tables, filename, options); if (ret != 0) { goto out; } @@ -450,14 +517,14 @@ tsk_treeseq_load( } int TSK_WARN_UNUSED -tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t TSK_UNUSED(options)) +tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options) { int ret = 0; tsk_table_collection_t tables; /* Need to make sure that we're zero'd out in case of error */ tsk_memset(self, 0, sizeof(*self)); - ret = tsk_table_collection_loadf(&tables, file, 0); + ret = tsk_table_collection_loadf(&tables, file, options); if (ret != 0) { goto out; } @@ -512,6 +579,18 @@ tsk_treeseq_get_metadata_schema_length(const tsk_treeseq_t *self) return self->tables->metadata_schema_length; } +const char * +tsk_treeseq_get_time_units(const tsk_treeseq_t *self) +{ + return self->tables->time_units; +} + +tsk_size_t +tsk_treeseq_get_time_units_length(const tsk_treeseq_t *self) +{ + return self->tables->time_units_length; +} + double tsk_treeseq_get_sequence_length(const tsk_treeseq_t *self) { @@ -613,6 +692,24 @@ tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u) return ret; } +bool +tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self) +{ + return self->discrete_genome; +} + +bool +tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self) +{ + return self->discrete_time; +} + +bool +tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self) +{ + return tsk_table_collection_has_reference_sequence(self->tables); +} + /* Stats functions */ #define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row)) @@ -1067,7 +1164,7 @@ static int tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, const double *sample_weights, tsk_size_t result_dim, general_stat_func_t *f, void *f_params, tsk_size_t num_windows, const double *windows, double *result, - tsk_flags_t TSK_UNUSED(options)) + tsk_flags_t options) { int ret = 0; tsk_id_t u, v; @@ -1092,6 +1189,11 @@ tsk_treeseq_branch_general_stat(const tsk_treeseq_t *self, tsk_size_t state_dim, double *summary = tsk_calloc(num_nodes * result_dim, sizeof(*summary)); double *running_sum = tsk_calloc(result_dim, sizeof(*running_sum)); + if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { + ret = TSK_ERR_TIME_UNCALIBRATED; + goto out; + } + if (parent == NULL || branch_length == NULL || state == NULL || running_sum == NULL || summary == NULL) { ret = TSK_ERR_NO_MEMORY; @@ -2161,6 +2263,11 @@ tsk_treeseq_branch_allele_frequency_spectrum(const tsk_treeseq_t *self, double t_left, t_right, w_right; const tsk_size_t K = num_sample_sets + 1; + if (self->time_uncalibrated && !(options & TSK_STAT_ALLOW_TIME_UNCALIBRATED)) { + ret = TSK_ERR_TIME_UNCALIBRATED; + goto out; + } + if (parent == NULL || last_update == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; @@ -3127,120 +3234,47 @@ tsk_treeseq_simplify(const tsk_treeseq_t *self, const tsk_id_t *samples, * Tree * ======================================================== */ -int TSK_WARN_UNUSED -tsk_tree_clear(tsk_tree_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t u; - const tsk_size_t N = self->num_nodes; - const tsk_size_t num_samples = self->tree_sequence->num_samples; - const bool sample_counts = !(self->options & TSK_NO_SAMPLE_COUNTS); - const bool sample_lists = !!(self->options & TSK_SAMPLE_LISTS); - - self->left = 0; - self->right = 0; - self->index = -1; - /* TODO we should profile this method to see if just doing a single loop over - * the nodes would be more efficient than multiple memsets. - */ - tsk_memset(self->parent, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->left_child, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->right_child, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->left_sib, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->right_sib, 0xff, N * sizeof(tsk_id_t)); - - if (sample_counts) { - tsk_memset(self->num_samples, 0, N * sizeof(tsk_id_t)); - tsk_memset(self->marked, 0, N * sizeof(uint8_t)); - /* We can't reset the tracked samples via tsk_memset because we don't - * know where the tracked samples are. - */ - for (j = 0; j < self->num_nodes; j++) { - if (!tsk_treeseq_is_sample(self->tree_sequence, (tsk_id_t) j)) { - self->num_tracked_samples[j] = 0; - } - } - } - if (sample_lists) { - tsk_memset(self->left_sample, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->right_sample, 0xff, N * sizeof(tsk_id_t)); - tsk_memset(self->next_sample, 0xff, num_samples * sizeof(tsk_id_t)); - } - /* Set the sample attributes */ - for (j = 0; j < num_samples; j++) { - u = self->samples[j]; - if (sample_counts) { - self->num_samples[u] = 1; - } - if (sample_lists) { - /* We are mapping to *indexes* into the list of samples here */ - self->left_sample[u] = (tsk_id_t) j; - self->right_sample[u] = (tsk_id_t) j; - } - } - self->left_root = TSK_NULL; - if (sample_counts && self->root_threshold == 1 && num_samples > 0) { - self->left_root = self->samples[0]; - for (j = 0; j < num_samples; j++) { - /* Set initial roots */ - u = self->samples[j]; - if (j < num_samples - 1) { - self->right_sib[u] = self->samples[j + 1]; - } - if (j > 0) { - self->left_sib[u] = self->samples[j - 1]; - } - } - } - return ret; -} - int TSK_WARN_UNUSED tsk_tree_init(tsk_tree_t *self, const tsk_treeseq_t *tree_sequence, tsk_flags_t options) { int ret = TSK_ERR_NO_MEMORY; - tsk_size_t num_samples; - tsk_size_t num_nodes; + tsk_size_t num_samples, num_nodes, N; tsk_memset(self, 0, sizeof(tsk_tree_t)); if (tree_sequence == NULL) { ret = TSK_ERR_BAD_PARAM_VALUE; goto out; } - if (options & TSK_SAMPLE_COUNTS) { - fprintf(stderr, "TSK_SAMPLE_COUNTS is no longer supported. " - "Sample counts are tracked by default since 0.99.3, " - "Please use TSK_NO_SAMPLE_COUNTS to turn off sample counts."); - } num_nodes = tree_sequence->tables->nodes.num_rows; num_samples = tree_sequence->num_samples; self->num_nodes = num_nodes; + self->virtual_root = (tsk_id_t) num_nodes; self->tree_sequence = tree_sequence; self->samples = tree_sequence->samples; self->options = options; self->root_threshold = 1; - self->parent = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->left_child = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->right_child = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->left_sib = tsk_malloc(num_nodes * sizeof(tsk_id_t)); - self->right_sib = tsk_malloc(num_nodes * sizeof(tsk_id_t)); + + /* Allocate space in the quintuply linked tree for the virtual root */ + N = num_nodes + 1; + self->parent = tsk_malloc(N * sizeof(*self->parent)); + self->left_child = tsk_malloc(N * sizeof(*self->left_child)); + self->right_child = tsk_malloc(N * sizeof(*self->right_child)); + self->left_sib = tsk_malloc(N * sizeof(*self->left_sib)); + self->right_sib = tsk_malloc(N * sizeof(*self->right_sib)); if (self->parent == NULL || self->left_child == NULL || self->right_child == NULL || self->left_sib == NULL || self->right_sib == NULL) { goto out; } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - self->num_samples = tsk_calloc(num_nodes, sizeof(tsk_id_t)); - self->num_tracked_samples = tsk_calloc(num_nodes, sizeof(tsk_id_t)); - self->marked = tsk_calloc(num_nodes, sizeof(uint8_t)); - if (self->num_samples == NULL || self->num_tracked_samples == NULL - || self->marked == NULL) { + self->num_samples = tsk_calloc(N, sizeof(*self->num_samples)); + self->num_tracked_samples = tsk_calloc(N, sizeof(*self->num_tracked_samples)); + if (self->num_samples == NULL || self->num_tracked_samples == NULL) { goto out; } } if (self->options & TSK_SAMPLE_LISTS) { - self->left_sample = tsk_malloc(num_nodes * sizeof(*self->left_sample)); - self->right_sample = tsk_malloc(num_nodes * sizeof(*self->right_sample)); + self->left_sample = tsk_malloc(N * sizeof(*self->left_sample)); + self->right_sample = tsk_malloc(N * sizeof(*self->right_sample)); self->next_sample = tsk_malloc(num_samples * sizeof(*self->next_sample)); if (self->left_sample == NULL || self->right_sample == NULL || self->next_sample == NULL) { @@ -3268,6 +3302,8 @@ tsk_tree_set_root_threshold(tsk_tree_t *self, tsk_size_t root_threshold) goto out; } self->root_threshold = root_threshold; + /* Reset the roots */ + ret = tsk_tree_clear(self); out: return ret; } @@ -3288,7 +3324,6 @@ tsk_tree_free(tsk_tree_t *self) tsk_safe_free(self->right_sib); tsk_safe_free(self->num_samples); tsk_safe_free(self->num_tracked_samples); - tsk_safe_free(self->marked); tsk_safe_free(self->left_sample); tsk_safe_free(self->right_sample); tsk_safe_free(self->next_sample); @@ -3316,7 +3351,8 @@ tsk_tree_reset_tracked_samples(tsk_tree_t *self) ret = TSK_ERR_UNSUPPORTED_OPERATION; goto out; } - tsk_memset(self->num_tracked_samples, 0, self->num_nodes * sizeof(tsk_id_t)); + tsk_memset(self->num_tracked_samples, 0, + (self->num_nodes + 1) * sizeof(*self->num_tracked_samples)); out: return ret; } @@ -3326,6 +3362,8 @@ tsk_tree_set_tracked_samples( tsk_tree_t *self, tsk_size_t num_tracked_samples, const tsk_id_t *tracked_samples) { int ret = TSK_ERR_GENERIC; + tsk_size_t *tree_num_tracked_samples = self->num_tracked_samples; + const tsk_id_t *parent = self->parent; tsk_size_t j; tsk_id_t u; @@ -3336,6 +3374,7 @@ tsk_tree_set_tracked_samples( if (ret != 0) { goto out; } + self->num_tracked_samples[self->virtual_root] = num_tracked_samples; for (j = 0; j < num_tracked_samples; j++) { u = tracked_samples[j]; if (u < 0 || u >= (tsk_id_t) self->num_nodes) { @@ -3352,8 +3391,8 @@ tsk_tree_set_tracked_samples( } /* Propagate this upwards */ while (u != TSK_NULL) { - self->num_tracked_samples[u] += 1; - u = self->parent[u]; + tree_num_tracked_samples[u]++; + u = parent[u]; } } out: @@ -3361,44 +3400,47 @@ tsk_tree_set_tracked_samples( } int TSK_WARN_UNUSED -tsk_tree_set_tracked_samples_from_sample_list( - tsk_tree_t *self, tsk_tree_t *other, tsk_id_t node) +tsk_tree_track_descendant_samples(tsk_tree_t *self, tsk_id_t node) { - int ret = TSK_ERR_GENERIC; - tsk_id_t u, stop, index; - const tsk_id_t *next = other->next_sample; - const tsk_id_t *samples = other->tree_sequence->samples; + int ret = 0; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + const tsk_id_t *restrict parent = self->parent; + const tsk_id_t *restrict left_child = self->left_child; + const tsk_id_t *restrict right_sib = self->right_sib; + const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; + tsk_size_t *num_tracked_samples = self->num_tracked_samples; + tsk_size_t n, j, num_nodes; + tsk_id_t u, v; - if (!tsk_tree_has_sample_lists(other)) { - ret = TSK_ERR_UNSUPPORTED_OPERATION; + if (nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_tree_postorder(self, node, nodes, &num_nodes); + if (ret != 0) { goto out; } - /* TODO This is not needed when the tree is new. We should use the - * state machine to check and only reset the tracked samples when needed. - */ ret = tsk_tree_reset_tracked_samples(self); if (ret != 0) { goto out; } - - index = other->left_sample[node]; - if (index != TSK_NULL) { - stop = other->right_sample[node]; - while (true) { - u = samples[index]; - tsk_bug_assert(self->num_tracked_samples[u] == 0); - /* Propagate this upwards */ - while (u != TSK_NULL) { - self->num_tracked_samples[u] += 1; - u = self->parent[u]; - } - if (index == stop) { - break; - } - index = next[index]; + u = 0; /* keep the compiler happy */ + for (j = 0; j < num_nodes; j++) { + u = nodes[j]; + for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { + num_tracked_samples[u] += num_tracked_samples[v]; } + num_tracked_samples[u] += flags[u] & TSK_NODE_IS_SAMPLE ? 1 : 0; + } + n = num_tracked_samples[u]; + u = parent[u]; + while (u != TSK_NULL) { + num_tracked_samples[u] = n; + u = parent[u]; } + num_tracked_samples[self->virtual_root] = n; out: + tsk_safe_free(nodes); return ret; } @@ -3406,7 +3448,7 @@ int TSK_WARN_UNUSED tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) { int ret = TSK_ERR_GENERIC; - tsk_size_t N = self->num_nodes; + tsk_size_t N = self->num_nodes + 1; if (!(options & TSK_NO_INIT)) { ret = tsk_tree_init(dest, self->tree_sequence, options); @@ -3418,9 +3460,7 @@ tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) ret = TSK_ERR_BAD_PARAM_VALUE; goto out; } - dest->left = self->left; - dest->right = self->right; - dest->left_root = self->left_root; + dest->interval = self->interval; dest->left_index = self->left_index; dest->right_index = self->right_index; dest->direction = self->direction; @@ -3428,12 +3468,13 @@ tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) dest->sites = self->sites; dest->sites_length = self->sites_length; dest->root_threshold = self->root_threshold; + dest->num_edges = self->num_edges; - tsk_memcpy(dest->parent, self->parent, N * sizeof(tsk_id_t)); - tsk_memcpy(dest->left_child, self->left_child, N * sizeof(tsk_id_t)); - tsk_memcpy(dest->right_child, self->right_child, N * sizeof(tsk_id_t)); - tsk_memcpy(dest->left_sib, self->left_sib, N * sizeof(tsk_id_t)); - tsk_memcpy(dest->right_sib, self->right_sib, N * sizeof(tsk_id_t)); + tsk_memcpy(dest->parent, self->parent, N * sizeof(*self->parent)); + tsk_memcpy(dest->left_child, self->left_child, N * sizeof(*self->left_child)); + tsk_memcpy(dest->right_child, self->right_child, N * sizeof(*self->right_child)); + tsk_memcpy(dest->left_sib, self->left_sib, N * sizeof(*self->left_sib)); + tsk_memcpy(dest->right_sib, self->right_sib, N * sizeof(*self->right_sib)); if (!(dest->options & TSK_NO_SAMPLE_COUNTS)) { if (self->options & TSK_NO_SAMPLE_COUNTS) { ret = TSK_ERR_UNSUPPORTED_OPERATION; @@ -3442,17 +3483,17 @@ tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options) tsk_memcpy(dest->num_samples, self->num_samples, N * sizeof(*self->num_samples)); tsk_memcpy(dest->num_tracked_samples, self->num_tracked_samples, N * sizeof(*self->num_tracked_samples)); - tsk_memcpy(dest->marked, self->marked, N * sizeof(*self->marked)); } if (dest->options & TSK_SAMPLE_LISTS) { if (!(self->options & TSK_SAMPLE_LISTS)) { ret = TSK_ERR_UNSUPPORTED_OPERATION; goto out; } - tsk_memcpy(dest->left_sample, self->left_sample, N * sizeof(tsk_id_t)); - tsk_memcpy(dest->right_sample, self->right_sample, N * sizeof(tsk_id_t)); + tsk_memcpy(dest->left_sample, self->left_sample, N * sizeof(*self->left_sample)); + tsk_memcpy( + dest->right_sample, self->right_sample, N * sizeof(*self->right_sample)); tsk_memcpy(dest->next_sample, self->next_sample, - self->tree_sequence->num_samples * sizeof(tsk_id_t)); + self->tree_sequence->num_samples * sizeof(*self->next_sample)); } ret = 0; out: @@ -3474,7 +3515,7 @@ static int tsk_tree_check_node(const tsk_tree_t *self, tsk_id_t u) { int ret = 0; - if (u < 0 || u >= (tsk_id_t) self->num_nodes) { + if (u < 0 || u > (tsk_id_t) self->num_nodes) { ret = TSK_ERR_NODE_OUT_OF_BOUNDS; } return ret; @@ -3513,6 +3554,13 @@ tsk_tree_get_mrca(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v, tsk_id_t *mrca goto out; } + /* Simplest to make the virtual_root a special case here to avoid + * doing the time lookup. */ + if (u == self->virtual_root || v == self->virtual_root) { + *mrca = self->virtual_root; + return 0; + } + tu = time[u]; tv = time[v]; while (u != v) { @@ -3540,34 +3588,29 @@ tsk_tree_get_num_samples_by_traversal( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples) { int ret = 0; - tsk_id_t *stack = NULL; - tsk_id_t v; + tsk_size_t num_nodes, j; tsk_size_t count = 0; - int stack_top = 0; + const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + tsk_id_t v; - stack = tsk_malloc(self->num_nodes * sizeof(*stack)); - if (stack == NULL) { + if (nodes == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } - - stack[0] = u; - while (stack_top >= 0) { - v = stack[stack_top]; - stack_top--; - if (tsk_treeseq_is_sample(self->tree_sequence, v)) { + ret = tsk_tree_preorder(self, u, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + for (j = 0; j < num_nodes; j++) { + v = nodes[j]; + if (flags[v] & TSK_NODE_IS_SAMPLE) { count++; } - v = self->left_child[v]; - while (v != TSK_NULL) { - stack_top++; - stack[stack_top] = v; - v = self->right_sib[v]; - } } *num_samples = count; out: - tsk_safe_free(stack); + tsk_safe_free(nodes); return ret; } @@ -3604,7 +3647,7 @@ tsk_tree_get_num_tracked_samples( ret = TSK_ERR_UNSUPPORTED_OPERATION; goto out; } - *num_tracked_samples = (tsk_size_t) self->num_tracked_samples[u]; + *num_tracked_samples = self->num_tracked_samples[u]; out: return ret; } @@ -3615,14 +3658,27 @@ tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u) return tsk_treeseq_is_sample(self->tree_sequence, u); } +tsk_id_t +tsk_tree_get_left_root(const tsk_tree_t *self) +{ + return self->left_child[self->virtual_root]; +} + +tsk_id_t +tsk_tree_get_right_root(const tsk_tree_t *self) +{ + return self->right_child[self->virtual_root]; +} + tsk_size_t tsk_tree_get_num_roots(const tsk_tree_t *self) { + const tsk_id_t *restrict right_sib = self->right_sib; + const tsk_id_t *restrict left_child = self->left_child; tsk_size_t num_roots = 0; - tsk_id_t u = self->left_root; + tsk_id_t u; - while (u != TSK_NULL) { - u = self->right_sib[u]; + for (u = left_child[self->virtual_root]; u != TSK_NULL; u = right_sib[u]) { num_roots++; } return num_roots; @@ -3654,12 +3710,73 @@ tsk_tree_get_time(const tsk_tree_t *self, tsk_id_t u, double *t) int ret = 0; tsk_node_t node; - ret = tsk_treeseq_get_node(self->tree_sequence, u, &node); + if (u == self->virtual_root) { + *t = INFINITY; + } else { + ret = tsk_treeseq_get_node(self->tree_sequence, u, &node); + if (ret != 0) { + goto out; + } + *t = node.time; + } +out: + return ret; +} + +static inline double +tsk_tree_get_branch_length_unsafe(const tsk_tree_t *self, tsk_id_t u) +{ + const double *times = self->tree_sequence->tables->nodes.time; + const tsk_id_t parent = self->parent[u]; + + return parent == TSK_NULL ? 0 : times[parent] - times[u]; +} + +int TSK_WARN_UNUSED +tsk_tree_get_branch_length(const tsk_tree_t *self, tsk_id_t u, double *ret_branch_length) +{ + int ret = 0; + + ret = tsk_tree_check_node(self, u); + if (ret != 0) { + goto out; + } + *ret_branch_length = tsk_tree_get_branch_length_unsafe(self, u); +out: + return ret; +} + +int +tsk_tree_get_total_branch_length(const tsk_tree_t *self, tsk_id_t node, double *ret_tbl) +{ + int ret = 0; + tsk_size_t j, num_nodes; + tsk_id_t u, v; + const tsk_id_t *restrict parent = self->parent; + const double *restrict time = self->tree_sequence->tables->nodes.time; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + double sum = 0; + + if (nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_tree_preorder(self, node, nodes, &num_nodes); if (ret != 0) { goto out; } - *t = node.time; + /* We always skip the first node because we don't return the branch length + * over the input node. */ + for (j = 1; j < num_nodes; j++) { + u = nodes[j]; + v = parent[u]; + if (v != TSK_NULL) { + sum += time[v] - time[u]; + } + } + *ret_tbl = sum; out: + tsk_safe_free(nodes); return ret; } @@ -3673,22 +3790,24 @@ tsk_tree_get_sites( } /* u must be a valid node in the tree. For internal use */ -static tsk_size_t -tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u) +static int +tsk_tree_get_depth_unsafe(const tsk_tree_t *self, tsk_id_t u) { - tsk_id_t v; - tsk_size_t depth = 0; + const tsk_id_t *restrict parent = self->parent; + int depth = 0; - for (v = self->parent[u]; v != TSK_NULL; v = self->parent[v]) { + if (u == self->virtual_root) { + return -1; + } + for (v = parent[u]; v != TSK_NULL; v = parent[v]) { depth++; } - return depth; } int TSK_WARN_UNUSED -tsk_tree_depth(const tsk_tree_t *self, tsk_id_t u, tsk_size_t *depth_ret) +tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u, int *depth_ret) { int ret = 0; @@ -3697,7 +3816,7 @@ tsk_tree_depth(const tsk_tree_t *self, tsk_id_t u, tsk_size_t *depth_ret) goto out; } - *depth_ret = tsk_tree_get_depth(self, u); + *depth_ret = tsk_tree_get_depth_unsafe(self, u); out: return ret; } @@ -3713,17 +3832,6 @@ tsk_tree_node_root(tsk_tree_t *self, tsk_id_t u) return v; } -/* TODO Should push this through the stack since this is a python method and - * it's here anyway */ -static double -tsk_tree_branch_length(const tsk_tree_t *self, tsk_id_t u) -{ - const double *times = self->tree_sequence->tables->nodes.time; - tsk_id_t parent = self->parent[u]; - - return parent == TSK_NULL ? 0 : times[parent] - times[u]; -} - static void tsk_tree_check_state(const tsk_tree_t *self) { @@ -3736,6 +3844,11 @@ tsk_tree_check_state(const tsk_tree_t *self) tsk_bug_assert(children != NULL); + /* Check the virtual root properties */ + tsk_bug_assert(self->parent[self->virtual_root] == TSK_NULL); + tsk_bug_assert(self->left_sib[self->virtual_root] == TSK_NULL); + tsk_bug_assert(self->right_sib[self->virtual_root] == TSK_NULL); + for (j = 0; j < self->tree_sequence->num_samples; j++) { u = self->samples[j]; while (self->parent[u] != TSK_NULL) { @@ -3744,12 +3857,11 @@ tsk_tree_check_state(const tsk_tree_t *self) is_root[u] = true; } if (self->tree_sequence->num_samples == 0) { - tsk_bug_assert(self->left_root == TSK_NULL); - } else { - tsk_bug_assert(self->left_sib[self->left_root] == TSK_NULL); + tsk_bug_assert(self->left_child[self->virtual_root] == TSK_NULL); } + /* Iterate over the roots and make sure they are set */ - for (u = self->left_root; u != TSK_NULL; u = self->right_sib[u]) { + for (u = tsk_tree_get_left_root(self); u != TSK_NULL; u = self->right_sib[u]) { tsk_bug_assert(is_root[u]); is_root[u] = false; } @@ -3769,8 +3881,8 @@ tsk_tree_check_state(const tsk_tree_t *self) } for (j = 0; j < self->sites_length; j++) { site = self->sites[j]; - tsk_bug_assert(self->left <= site.position); - tsk_bug_assert(site.position < self->right); + tsk_bug_assert(self->interval.left <= site.position); + tsk_bug_assert(site.position < self->interval.right); } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { @@ -3808,9 +3920,8 @@ tsk_tree_print_state(const tsk_tree_t *self, FILE *out) fprintf(out, "Tree state:\n"); fprintf(out, "options = %d\n", self->options); fprintf(out, "root_threshold = %lld\n", (long long) self->root_threshold); - fprintf(out, "left = %f\n", self->left); - fprintf(out, "right = %f\n", self->right); - fprintf(out, "left_root = %lld\n", (long long) self->left_root); + fprintf(out, "left = %f\n", self->interval.left); + fprintf(out, "right = %f\n", self->interval.right); fprintf(out, "index = %lld\n", (long long) self->index); fprintf(out, "node\tparent\tlchild\trchild\tlsib\trsib"); if (self->options & TSK_SAMPLE_LISTS) { @@ -3818,7 +3929,7 @@ tsk_tree_print_state(const tsk_tree_t *self, FILE *out) } fprintf(out, "\n"); - for (j = 0; j < self->num_nodes; j++) { + for (j = 0; j < self->num_nodes + 1; j++) { fprintf(out, "%lld\t%lld\t%lld\t%lld\t%lld\t%lld", (long long) j, (long long) self->parent[j], (long long) self->left_child[j], (long long) self->right_child[j], (long long) self->left_sib[j], @@ -3828,8 +3939,8 @@ tsk_tree_print_state(const tsk_tree_t *self, FILE *out) (long long) self->right_sample[j]); } if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - fprintf(out, "\t%lld\t%lld\t%lld", (long long) self->num_samples[j], - (long long) self->num_tracked_samples[j], (long long) self->marked[j]); + fprintf(out, "\t%lld\t%lld", (long long) self->num_samples[j], + (long long) self->num_tracked_samples[j]); } fprintf(out, "\n"); } @@ -3843,15 +3954,17 @@ tsk_tree_print_state(const tsk_tree_t *self, FILE *out) /* Methods for positioning the tree along the sequence */ -/* parent, left_child and right_sib are restrict pointers in the calling function, - * so we pass these as parameters to ensure the relationships are clear to the - * compiler. */ +/* The following methods are performance sensitive and so we use a + * lot of restrict pointers. Because we are saying that we don't have + * any aliases to these pointers, we pass around the reference to parent + * since it's used in all the functions. */ static inline void -tsk_tree_update_sample_lists(tsk_tree_t *self, const tsk_id_t *restrict parent, - const tsk_id_t *restrict left_child, const tsk_id_t *restrict right_sib, - tsk_id_t node) +tsk_tree_update_sample_lists( + tsk_tree_t *self, tsk_id_t node, const tsk_id_t *restrict parent) { tsk_id_t u, v, sample_index; + tsk_id_t *restrict left_child = self->left_child; + tsk_id_t *restrict right_sib = self->right_sib; tsk_id_t *restrict left = self->left_sample; tsk_id_t *restrict right = self->right_sample; tsk_id_t *restrict next = self->next_sample; @@ -3880,26 +3993,17 @@ tsk_tree_update_sample_lists(tsk_tree_t *self, const tsk_id_t *restrict parent, } } -static void -tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) +static inline void +tsk_tree_remove_branch( + tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t *restrict parent) { - tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict left_child = self->left_child; tsk_id_t *restrict right_child = self->right_child; tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *restrict right_sib = self->right_sib; - tsk_id_t *restrict num_samples = self->num_samples; - tsk_id_t *restrict num_tracked_samples = self->num_tracked_samples; - uint8_t *restrict marked = self->marked; - const uint8_t mark = self->mark; - const tsk_id_t root_threshold = (tsk_id_t) self->root_threshold; - tsk_id_t lsib, rsib, u, path_end, lroot, rroot; - bool path_end_was_root; - -#define IS_ROOT(U) (num_samples[U] >= root_threshold) - - lsib = left_sib[c]; - rsib = right_sib[c]; + tsk_id_t lsib = left_sib[c]; + tsk_id_t rsib = right_sib[c]; + if (lsib == TSK_NULL) { left_child[p] = rsib; } else { @@ -3913,78 +4017,20 @@ tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) parent[c] = TSK_NULL; left_sib[c] = TSK_NULL; right_sib[c] = TSK_NULL; - - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - /* keep the compiler happy */ - path_end_was_root = false; - path_end = TSK_NULL; - - u = p; - while (u != TSK_NULL) { - path_end = u; - path_end_was_root = IS_ROOT(u); - num_samples[u] -= num_samples[c]; - num_tracked_samples[u] -= num_tracked_samples[c]; - marked[u] = mark; - u = parent[u]; - } - if (path_end_was_root && !IS_ROOT(path_end)) { - /* remove path_end from the list of roots */ - lroot = left_sib[path_end]; - rroot = right_sib[path_end]; - self->left_root = TSK_NULL; - if (lroot != TSK_NULL) { - right_sib[lroot] = rroot; - self->left_root = lroot; - } - if (rroot != TSK_NULL) { - left_sib[rroot] = lroot; - self->left_root = rroot; - } - left_sib[path_end] = TSK_NULL; - right_sib[path_end] = TSK_NULL; - } - if (IS_ROOT(c)) { - if (self->left_root != TSK_NULL) { - lroot = left_sib[self->left_root]; - if (lroot != TSK_NULL) { - right_sib[lroot] = c; - } - left_sib[c] = lroot; - left_sib[self->left_root] = c; - } - right_sib[c] = self->left_root; - self->left_root = c; - } - } - - if (self->options & TSK_SAMPLE_LISTS) { - tsk_tree_update_sample_lists(self, parent, left_child, right_sib, p); - } } -static void -tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) +static inline void +tsk_tree_insert_branch( + tsk_tree_t *self, tsk_id_t p, tsk_id_t c, tsk_id_t *restrict parent) { - tsk_id_t *restrict parent = self->parent; tsk_id_t *restrict left_child = self->left_child; tsk_id_t *restrict right_child = self->right_child; tsk_id_t *restrict left_sib = self->left_sib; tsk_id_t *restrict right_sib = self->right_sib; - tsk_id_t *restrict num_samples = self->num_samples; - tsk_id_t *restrict num_tracked_samples = self->num_tracked_samples; - uint8_t *restrict marked = self->marked; - const uint8_t mark = self->mark; - const tsk_id_t root_threshold = (tsk_id_t) self->root_threshold; - tsk_id_t lsib, rsib, u, path_end, lroot; - bool path_end_was_root; - -#define IS_ROOT(U) (num_samples[U] >= root_threshold) + tsk_id_t u; parent[c] = p; u = right_child[p]; - lsib = left_sib[c]; - rsib = right_sib[c]; if (u == TSK_NULL) { left_child[p] = c; left_sib[c] = TSK_NULL; @@ -3995,65 +4041,96 @@ tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) right_sib[c] = TSK_NULL; } right_child[p] = c; +} - if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { - - /* keep compiler happy */ - path_end = TSK_NULL; - path_end_was_root = false; +static inline void +tsk_tree_insert_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent) +{ + tsk_tree_insert_branch(self, self->virtual_root, root, parent); + parent[root] = TSK_NULL; +} - u = p; +static inline void +tsk_tree_remove_root(tsk_tree_t *self, tsk_id_t root, tsk_id_t *restrict parent) +{ + tsk_tree_remove_branch(self, self->virtual_root, root, parent); +} + +static void +tsk_tree_remove_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) +{ + tsk_id_t *restrict parent = self->parent; + tsk_size_t *restrict num_samples = self->num_samples; + tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples; + const tsk_size_t root_threshold = self->root_threshold; + tsk_id_t u; + tsk_id_t path_end = TSK_NULL; + bool path_end_was_root = false; + +#define POTENTIAL_ROOT(U) (num_samples[U] >= root_threshold) + + tsk_tree_remove_branch(self, p, c, parent); + self->num_edges--; + + if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { + u = p; while (u != TSK_NULL) { path_end = u; - path_end_was_root = IS_ROOT(u); + path_end_was_root = POTENTIAL_ROOT(u); + num_samples[u] -= num_samples[c]; + num_tracked_samples[u] -= num_tracked_samples[c]; + u = parent[u]; + } + + if (path_end_was_root && !POTENTIAL_ROOT(path_end)) { + tsk_tree_remove_root(self, path_end, parent); + } + if (POTENTIAL_ROOT(c)) { + tsk_tree_insert_root(self, c, parent); + } + } + + if (self->options & TSK_SAMPLE_LISTS) { + tsk_tree_update_sample_lists(self, p, parent); + } +} + +static void +tsk_tree_insert_edge(tsk_tree_t *self, tsk_id_t p, tsk_id_t c) +{ + tsk_id_t *restrict parent = self->parent; + tsk_size_t *restrict num_samples = self->num_samples; + tsk_size_t *restrict num_tracked_samples = self->num_tracked_samples; + const tsk_size_t root_threshold = self->root_threshold; + tsk_id_t u; + tsk_id_t path_end = TSK_NULL; + bool path_end_was_root = false; + +#define POTENTIAL_ROOT(U) (num_samples[U] >= root_threshold) + + if (!(self->options & TSK_NO_SAMPLE_COUNTS)) { + u = p; + while (u != TSK_NULL) { + path_end = u; + path_end_was_root = POTENTIAL_ROOT(u); num_samples[u] += num_samples[c]; num_tracked_samples[u] += num_tracked_samples[c]; - marked[u] = mark; u = parent[u]; } - if (IS_ROOT(c)) { - if (path_end_was_root) { - /* Remove c from the root list */ - self->left_root = TSK_NULL; - if (lsib != TSK_NULL) { - right_sib[lsib] = rsib; - self->left_root = lsib; - } - if (rsib != TSK_NULL) { - left_sib[rsib] = lsib; - self->left_root = rsib; - } - } else { - /* Replace c with path_end in root list */ - if (lsib != TSK_NULL) { - right_sib[lsib] = path_end; - } - if (rsib != TSK_NULL) { - left_sib[rsib] = path_end; - } - left_sib[path_end] = lsib; - right_sib[path_end] = rsib; - self->left_root = path_end; - } - } else { - if (IS_ROOT(path_end) && !path_end_was_root) { - /* Add a path_end as new root */ - if (self->left_root != TSK_NULL) { - lroot = left_sib[self->left_root]; - if (lroot != TSK_NULL) { - right_sib[lroot] = path_end; - } - left_sib[path_end] = lroot; - left_sib[self->left_root] = path_end; - } - right_sib[path_end] = self->left_root; - self->left_root = path_end; - } + if (POTENTIAL_ROOT(c)) { + tsk_tree_remove_root(self, c, parent); + } + if (POTENTIAL_ROOT(path_end) && !path_end_was_root) { + tsk_tree_insert_root(self, path_end, parent); } } + + tsk_tree_insert_branch(self, p, c, parent); + self->num_edges++; + if (self->options & TSK_SAMPLE_LISTS) { - tsk_tree_update_sample_lists(self, parent, left_child, right_sib, p); + tsk_tree_update_sample_lists(self, p, parent); } } @@ -4076,9 +4153,9 @@ tsk_tree_advance(tsk_tree_t *self, int direction, const double *restrict out_bre double x; if (direction == TSK_DIR_FORWARD) { - x = self->right; + x = self->interval.right; } else { - x = self->left; + x = self->interval.left; } while (out >= 0 && out < num_edges && out_breakpoints[out_order[out]] == x) { tsk_bug_assert(out < num_edges); @@ -4093,35 +4170,32 @@ tsk_tree_advance(tsk_tree_t *self, int direction, const double *restrict out_bre tsk_tree_insert_edge(self, edge_parent[k], edge_child[k]); } - if (self->left_root != TSK_NULL) { - /* Ensure that left_root is the left-most root */ - while (self->left_sib[self->left_root] != TSK_NULL) { - self->left_root = self->left_sib[self->left_root]; - } - } - self->direction = direction; self->index = self->index + direction; if (direction == TSK_DIR_FORWARD) { - self->left = x; - self->right = sequence_length; + self->interval.left = x; + self->interval.right = sequence_length; if (out >= 0 && out < num_edges) { - self->right = TSK_MIN(self->right, out_breakpoints[out_order[out]]); + self->interval.right + = TSK_MIN(self->interval.right, out_breakpoints[out_order[out]]); } if (in >= 0 && in < num_edges) { - self->right = TSK_MIN(self->right, in_breakpoints[in_order[in]]); + self->interval.right + = TSK_MIN(self->interval.right, in_breakpoints[in_order[in]]); } } else { - self->right = x; - self->left = 0; + self->interval.right = x; + self->interval.left = 0; if (out >= 0 && out < num_edges) { - self->left = TSK_MAX(self->left, out_breakpoints[out_order[out]]); + self->interval.left + = TSK_MAX(self->interval.left, out_breakpoints[out_order[out]]); } if (in >= 0 && in < num_edges) { - self->left = TSK_MAX(self->left, in_breakpoints[in_order[in]]); + self->interval.left + = TSK_MAX(self->interval.left, in_breakpoints[in_order[in]]); } } - tsk_bug_assert(self->left < self->right); + tsk_bug_assert(self->interval.left < self->interval.right); *out_index = out; *in_index = in; if (tables->sites.num_rows > 0) { @@ -4138,9 +4212,9 @@ tsk_tree_first(tsk_tree_t *self) int ret = 1; tsk_table_collection_t *tables = self->tree_sequence->tables; - self->left = 0; + self->interval.left = 0; self->index = 0; - self->right = tables->sequence_length; + self->interval.right = tables->sequence_length; self->sites = self->tree_sequence->tree_sites[0]; self->sites_length = self->tree_sequence->tree_sites_length[0]; @@ -4157,7 +4231,7 @@ tsk_tree_first(tsk_tree_t *self) self->left_index = 0; self->right_index = 0; self->direction = TSK_DIR_FORWARD; - self->right = 0; + self->interval.right = 0; ret = tsk_tree_advance(self, TSK_DIR_FORWARD, tables->edges.right, tables->indexes.edge_removal_order, &self->right_index, tables->edges.left, @@ -4174,8 +4248,8 @@ tsk_tree_last(tsk_tree_t *self) const tsk_treeseq_t *ts = self->tree_sequence; const tsk_table_collection_t *tables = ts->tables; - self->left = 0; - self->right = tables->sequence_length; + self->interval.left = 0; + self->interval.right = tables->sequence_length; self->index = 0; self->sites = ts->tree_sites[0]; self->sites_length = ts->tree_sites_length[0]; @@ -4193,8 +4267,8 @@ tsk_tree_last(tsk_tree_t *self) self->left_index = (tsk_id_t) tables->edges.num_rows - 1; self->right_index = (tsk_id_t) tables->edges.num_rows - 1; self->direction = TSK_DIR_REVERSE; - self->left = tables->sequence_length; - self->right = 0; + self->interval.left = tables->sequence_length; + self->interval.right = 0; ret = tsk_tree_advance(self, TSK_DIR_REVERSE, tables->edges.left, tables->indexes.edge_insertion_order, &self->left_index, tables->edges.right, @@ -4242,6 +4316,322 @@ tsk_tree_prev(tsk_tree_t *self) return ret; } +static inline bool +tsk_tree_position_in_interval(const tsk_tree_t *self, double x) +{ + return self->interval.left <= x && x < self->interval.right; +} + +int TSK_WARN_UNUSED +tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + const double L = tsk_treeseq_get_sequence_length(self->tree_sequence); + const double t_l = self->interval.left; + const double t_r = self->interval.right; + double distance_left, distance_right; + + if (x < 0 || x >= L) { + ret = TSK_ERR_SEEK_OUT_OF_BOUNDS; + goto out; + } + + if (x < t_l) { + /* |-----|-----|========|---------| */ + /* 0 x t_l t_r L */ + distance_left = t_l - x; + distance_right = L - t_r + x; + } else { + /* |------|========|------|-------| */ + /* 0 t_l t_r x L */ + distance_right = x - t_r; + distance_left = t_l + L - x; + } + if (distance_right <= distance_left) { + while (!tsk_tree_position_in_interval(self, x)) { + ret = tsk_tree_next(self); + if (ret < 0) { + goto out; + } + } + } else { + while (!tsk_tree_position_in_interval(self, x)) { + ret = tsk_tree_prev(self); + if (ret < 0) { + goto out; + } + } + } + ret = 0; +out: + return ret; +} + +int TSK_WARN_UNUSED +tsk_tree_clear(tsk_tree_t *self) +{ + int ret = 0; + tsk_size_t j; + tsk_id_t u; + const tsk_size_t N = self->num_nodes + 1; + const tsk_size_t num_samples = self->tree_sequence->num_samples; + const bool sample_counts = !(self->options & TSK_NO_SAMPLE_COUNTS); + const bool sample_lists = !!(self->options & TSK_SAMPLE_LISTS); + const tsk_flags_t *flags = self->tree_sequence->tables->nodes.flags; + + self->interval.left = 0; + self->interval.right = 0; + self->num_edges = 0; + self->index = -1; + /* TODO we should profile this method to see if just doing a single loop over + * the nodes would be more efficient than multiple memsets. + */ + tsk_memset(self->parent, 0xff, N * sizeof(*self->parent)); + tsk_memset(self->left_child, 0xff, N * sizeof(*self->left_child)); + tsk_memset(self->right_child, 0xff, N * sizeof(*self->right_child)); + tsk_memset(self->left_sib, 0xff, N * sizeof(*self->left_sib)); + tsk_memset(self->right_sib, 0xff, N * sizeof(*self->right_sib)); + + if (sample_counts) { + tsk_memset(self->num_samples, 0, N * sizeof(*self->num_samples)); + /* We can't reset the tracked samples via memset because we don't + * know where the tracked samples are. + */ + for (j = 0; j < self->num_nodes; j++) { + if (!(flags[j] & TSK_NODE_IS_SAMPLE)) { + self->num_tracked_samples[j] = 0; + } + } + /* The total tracked_samples gets set in set_tracked_samples */ + self->num_samples[self->virtual_root] = num_samples; + } + if (sample_lists) { + tsk_memset(self->left_sample, 0xff, N * sizeof(tsk_id_t)); + tsk_memset(self->right_sample, 0xff, N * sizeof(tsk_id_t)); + tsk_memset(self->next_sample, 0xff, num_samples * sizeof(tsk_id_t)); + } + /* Set the sample attributes */ + for (j = 0; j < num_samples; j++) { + u = self->samples[j]; + if (sample_counts) { + self->num_samples[u] = 1; + } + if (sample_lists) { + /* We are mapping to *indexes* into the list of samples here */ + self->left_sample[u] = (tsk_id_t) j; + self->right_sample[u] = (tsk_id_t) j; + } + } + if (sample_counts && self->root_threshold == 1 && num_samples > 0) { + for (j = 0; j < num_samples; j++) { + /* Set initial roots */ + if (self->root_threshold == 1) { + tsk_tree_insert_root(self, self->samples[j], self->parent); + } + } + } + return ret; +} + +tsk_size_t +tsk_tree_get_size_bound(const tsk_tree_t *self) +{ + tsk_size_t bound = 0; + + if (self->tree_sequence != NULL) { + /* This is a safe upper bound which can be computed cheaply. + * We have at most n roots and each edge adds at most one new + * node to the tree. We also allow space for the virtual root, + * to simplify client code. + * + * In the common case of a binary tree with a single root, we have + * 2n - 1 nodes in total, and 2n - 2 edges. Therefore, we return + * 3n - 1, which is an over-estimate of 1/2 and we allocate + * 1.5 times as much memory as we need. + * + * Since tracking the exact number of nodes in the tree would require + * storing the number of nodes beneath every node and complicate + * the tree transition method, this seems like a good compromise + * and will result in less memory usage overall in nearly all cases. + */ + bound = 1 + self->tree_sequence->num_samples + self->num_edges; + } + return bound; +} + +/* Traversal orders */ +static tsk_id_t * +tsk_tree_alloc_node_stack(const tsk_tree_t *self) +{ + return tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(tsk_id_t)); +} + +int +tsk_tree_preorder( + const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) +{ + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + tsk_id_t *stack = tsk_tree_alloc_node_stack(self); + tsk_size_t num_nodes = 0; + tsk_id_t u, v; + int stack_top; + + if (stack == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + if (root == -1) { + stack_top = -1; + for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { + stack_top++; + stack[stack_top] = u; + } + } else { + ret = tsk_tree_check_node(self, root); + if (ret != 0) { + goto out; + } + stack_top = 0; + stack[stack_top] = root; + } + + while (stack_top >= 0) { + u = stack[stack_top]; + stack_top--; + nodes[num_nodes] = u; + num_nodes++; + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + stack_top++; + stack[stack_top] = v; + } + } + *num_nodes_ret = num_nodes; +out: + tsk_safe_free(stack); + return ret; +} + +/* We could implement this using the preorder function, but since it's + * going to be performance critical we want to avoid the overhead + * of mallocing the intermediate node list (which will be bigger than + * the number of samples). */ +int +tsk_tree_preorder_samples( + const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) +{ + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + const tsk_flags_t *restrict flags = self->tree_sequence->tables->nodes.flags; + tsk_id_t *stack = tsk_tree_alloc_node_stack(self); + tsk_size_t num_nodes = 0; + tsk_id_t u, v; + int stack_top; + + if (stack == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + /* We could push the virtual_root onto the stack directly to simplify + * the code a little, but then we'd have to check put an extra check + * when looking up the flags array (which isn't defined for virtual_root). + */ + if (root == -1 || root == self->virtual_root) { + stack_top = -1; + for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { + stack_top++; + stack[stack_top] = u; + } + } else { + ret = tsk_tree_check_node(self, root); + if (ret != 0) { + goto out; + } + stack_top = 0; + stack[stack_top] = root; + } + + while (stack_top >= 0) { + u = stack[stack_top]; + stack_top--; + if (flags[u] & TSK_NODE_IS_SAMPLE) { + nodes[num_nodes] = u; + num_nodes++; + } + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + stack_top++; + stack[stack_top] = v; + } + } + *num_nodes_ret = num_nodes; +out: + tsk_safe_free(stack); + return ret; +} + +int +tsk_tree_postorder( + const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes_ret) +{ + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + const tsk_id_t *restrict parent = self->parent; + tsk_id_t *stack = tsk_tree_alloc_node_stack(self); + tsk_size_t num_nodes = 0; + tsk_id_t u, v, postorder_parent; + int stack_top; + bool is_virtual_root = root == self->virtual_root; + + if (stack == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + if (root == -1 || is_virtual_root) { + stack_top = -1; + for (u = right_child[self->virtual_root]; u != TSK_NULL; u = left_sib[u]) { + stack_top++; + stack[stack_top] = u; + } + } else { + ret = tsk_tree_check_node(self, root); + if (ret != 0) { + goto out; + } + stack_top = 0; + stack[stack_top] = root; + } + + postorder_parent = TSK_NULL; + while (stack_top >= 0) { + u = stack[stack_top]; + if (right_child[u] != TSK_NULL && u != postorder_parent) { + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + stack_top++; + stack[stack_top] = v; + } + } else { + stack_top--; + postorder_parent = parent[u]; + nodes[num_nodes] = u; + num_nodes++; + } + } + if (is_virtual_root) { + nodes[num_nodes] = root; + num_nodes++; + } + *num_nodes_ret = num_nodes; +out: + tsk_safe_free(stack); + return ret; +} + /* Parsimony methods */ static inline uint64_t @@ -4280,6 +4670,9 @@ get_smallest_set_bit(uint64_t v) * use a general cost matrix, in which case we'll use the Sankoff algorithm. For * now this is unused. * + * We should also vectorise the function so that several sites can be processed + * at once. + * * The algorithm used here is Hartigan parsimony, "Minimum Mutation Fits to a * Given Tree", Biometrics 1973. */ @@ -4295,30 +4688,34 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, int8_t state; }; const tsk_size_t num_samples = self->tree_sequence->num_samples; - const tsk_size_t num_nodes = self->num_nodes; const tsk_id_t *restrict left_child = self->left_child; const tsk_id_t *restrict right_sib = self->right_sib; - const tsk_id_t *restrict parent = self->parent; + const tsk_size_t N = tsk_treeseq_get_num_nodes(self->tree_sequence); const tsk_flags_t *restrict node_flags = self->tree_sequence->tables->nodes.flags; - uint64_t optimal_root_set; - uint64_t *restrict optimal_set = tsk_calloc(num_nodes, sizeof(*optimal_set)); - tsk_id_t *restrict postorder_stack - = tsk_malloc(num_nodes * sizeof(*postorder_stack)); + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + /* Note: to use less memory here and to improve cache performance we should + * probably change to allocating exactly the number of nodes returned by + * a preorder traversal, and then lay the memory out in this order. So, we'd + * need a map from node ID to its index in the preorder traversal, but this + * is trivial to compute. Probably doesn't matter so much at the moment + * when we're doing a single site, but it would make a big difference if + * we were vectorising over lots of sites. */ + uint64_t *restrict optimal_set = tsk_calloc(N + 1, sizeof(*optimal_set)); struct stack_elem *restrict preorder_stack - = tsk_malloc(num_nodes * sizeof(*preorder_stack)); - tsk_id_t postorder_parent, root, u, v; + = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*preorder_stack)); + tsk_id_t u, v; /* The largest possible number of transitions is one over every sample */ tsk_state_transition_t *transitions = tsk_malloc(num_samples * sizeof(*transitions)); int8_t allele, ancestral_state; int stack_top; struct stack_elem s; - tsk_size_t j, num_transitions, max_allele_count; + tsk_size_t j, num_transitions, max_allele_count, num_nodes; tsk_size_t allele_count[HARTIGAN_MAX_ALLELES]; tsk_size_t non_missing = 0; int8_t num_alleles = 0; - if (optimal_set == NULL || preorder_stack == NULL || postorder_stack == NULL - || transitions == NULL) { + if (optimal_set == NULL || preorder_stack == NULL || transitions == NULL + || nodes == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; } @@ -4355,94 +4752,60 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, } } - for (root = self->left_root; root != TSK_NULL; root = self->right_sib[root]) { - /* Do a post order traversal */ - postorder_stack[0] = root; - stack_top = 0; - postorder_parent = TSK_NULL; - while (stack_top >= 0) { - u = postorder_stack[stack_top]; - if (left_child[u] != TSK_NULL && u != postorder_parent) { - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - stack_top++; - postorder_stack[stack_top] = v; - } - } else { - stack_top--; - postorder_parent = parent[u]; - - /* Visit u */ - tsk_memset( - allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count)); - for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { - for (allele = 0; allele < num_alleles; allele++) { - allele_count[allele] += bit_is_set(optimal_set[v], allele); - } - } - if (!(node_flags[u] & TSK_NODE_IS_SAMPLE)) { - max_allele_count = 0; - for (allele = 0; allele < num_alleles; allele++) { - max_allele_count - = TSK_MAX(max_allele_count, allele_count[allele]); - } - for (allele = 0; allele < num_alleles; allele++) { - if (allele_count[allele] == max_allele_count) { - optimal_set[u] = set_bit(optimal_set[u], allele); - } - } - } - } - } + ret = tsk_tree_postorder(self, self->virtual_root, nodes, &num_nodes); + if (ret != 0) { + goto out; } - - if (!(options & TSK_MM_FIXED_ANCESTRAL_STATE)) { - optimal_root_set = 0; - /* TODO it's annoying that this is essentially the same as the - * visit function above. It would be nice if we had an extra - * node that was the parent of all roots, then the algorithm - * would work as-is */ + for (j = 0; j < num_nodes; j++) { + u = nodes[j]; tsk_memset(allele_count, 0, ((size_t) num_alleles) * sizeof(*allele_count)); - for (root = self->left_root; root != TSK_NULL; root = right_sib[root]) { + for (v = left_child[u]; v != TSK_NULL; v = right_sib[v]) { for (allele = 0; allele < num_alleles; allele++) { - allele_count[allele] += bit_is_set(optimal_set[root], allele); + allele_count[allele] += bit_is_set(optimal_set[v], allele); } } - max_allele_count = 0; - for (allele = 0; allele < num_alleles; allele++) { - max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]); - } - for (allele = 0; allele < num_alleles; allele++) { - if (allele_count[allele] == max_allele_count) { - optimal_root_set = set_bit(optimal_root_set, allele); + /* the virtual root has no flags defined */ + if (u == (tsk_id_t) N || !(node_flags[u] & TSK_NODE_IS_SAMPLE)) { + max_allele_count = 0; + for (allele = 0; allele < num_alleles; allele++) { + max_allele_count = TSK_MAX(max_allele_count, allele_count[allele]); + } + for (allele = 0; allele < num_alleles; allele++) { + if (allele_count[allele] == max_allele_count) { + optimal_set[u] = set_bit(optimal_set[u], allele); + } } } - ancestral_state = get_smallest_set_bit(optimal_root_set); + } + if (!(options & TSK_MM_FIXED_ANCESTRAL_STATE)) { + ancestral_state = get_smallest_set_bit(optimal_set[self->virtual_root]); + } else { + optimal_set[self->virtual_root] = UINT64_MAX; } num_transitions = 0; - for (root = self->left_root; root != TSK_NULL; root = self->right_sib[root]) { - /* Do a preorder traversal */ - preorder_stack[0].node = root; - preorder_stack[0].state = ancestral_state; - preorder_stack[0].transition_parent = TSK_NULL; - stack_top = 0; - while (stack_top >= 0) { - s = preorder_stack[stack_top]; - stack_top--; - if (!bit_is_set(optimal_set[s.node], s.state)) { - s.state = get_smallest_set_bit(optimal_set[s.node]); - transitions[num_transitions].node = s.node; - transitions[num_transitions].parent = s.transition_parent; - transitions[num_transitions].state = s.state; - s.transition_parent = (tsk_id_t) num_transitions; - num_transitions++; - } - for (v = left_child[s.node]; v != TSK_NULL; v = right_sib[v]) { - stack_top++; - s.node = v; - preorder_stack[stack_top] = s; - } + /* Do a preorder traversal */ + preorder_stack[0].node = self->virtual_root; + preorder_stack[0].state = ancestral_state; + preorder_stack[0].transition_parent = TSK_NULL; + stack_top = 0; + while (stack_top >= 0) { + s = preorder_stack[stack_top]; + stack_top--; + + if (!bit_is_set(optimal_set[s.node], s.state)) { + s.state = get_smallest_set_bit(optimal_set[s.node]); + transitions[num_transitions].node = s.node; + transitions[num_transitions].parent = s.transition_parent; + transitions[num_transitions].state = s.state; + s.transition_parent = (tsk_id_t) num_transitions; + num_transitions++; + } + for (v = left_child[s.node]; v != TSK_NULL; v = right_sib[v]) { + stack_top++; + s.node = v; + preorder_stack[stack_top] = s; } } @@ -4459,8 +4822,8 @@ tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes, if (preorder_stack != NULL) { free(preorder_stack); } - if (postorder_stack != NULL) { - free(postorder_stack); + if (nodes != NULL) { + free(nodes); } return ret; } @@ -4725,7 +5088,7 @@ fill_kc_vectors(const tsk_tree_t *t, kc_vectors *kc_vecs) int ret = 0; const tsk_treeseq_t *ts = t->tree_sequence; - stack = tsk_malloc(t->num_nodes * sizeof(*stack)); + stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack)); if (stack == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; @@ -4733,7 +5096,7 @@ fill_kc_vectors(const tsk_tree_t *t, kc_vectors *kc_vecs) times = t->tree_sequence->tables->nodes.time; - for (root = t->left_root; root != TSK_NULL; root = t->right_sib[root]) { + for (root = tsk_tree_get_left_root(t); root != TSK_NULL; root = t->right_sib[root]) { stack_top = 0; stack[stack_top].node = root; stack[stack_top].depth = 0; @@ -4743,7 +5106,7 @@ fill_kc_vectors(const tsk_tree_t *t, kc_vectors *kc_vecs) stack_top--; if (tsk_tree_is_sample(t, u)) { - time = tsk_tree_branch_length(t, u); + time = tsk_tree_get_branch_length_unsafe(t, u); update_kc_vectors_single_sample(ts, kc_vecs, u, time); } @@ -4931,7 +5294,7 @@ update_kc_subtree_state( tsk_id_t *stack = NULL; int ret = 0; - stack = tsk_malloc(t->num_nodes * sizeof(*stack)); + stack = tsk_malloc(tsk_tree_get_size_bound(t) * sizeof(*stack)); if (stack == NULL) { ret = TSK_ERR_NO_MEMORY; goto out; @@ -5001,7 +5364,7 @@ update_kc_incremental(tsk_tree_t *self, kc_vectors *kc, tsk_edge_list_t *edges_o } if (tsk_tree_is_sample(self, u)) { - time = tsk_tree_branch_length(self, u); + time = tsk_tree_get_branch_length_unsafe(self, u); update_kc_vectors_single_sample(self->tree_sequence, kc, u, time); } } diff --git a/subprojects/tskit/tskit/trees.h b/subprojects/tskit/tskit/trees.h index 811195c99..896820e26 100644 --- a/subprojects/tskit/tskit/trees.h +++ b/subprojects/tskit/tskit/trees.h @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019 Tskit Developers + * Copyright (c) 2019-2021 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -36,13 +36,7 @@ extern "C" { #include -/* The TSK_SAMPLE_COUNTS was removed in version 0.99.3, where - * the default is now to always count samples except when - * TSK_NO_SAMPLE_COUNTS is specified. This macro can be undefined - * at some point in the future and the option reused for something - * else. */ // clang-format off -#define TSK_SAMPLE_COUNTS (1 << 0) #define TSK_SAMPLE_LISTS (1 << 1) #define TSK_NO_SAMPLE_COUNTS (1 << 2) @@ -51,8 +45,9 @@ extern "C" { #define TSK_STAT_NODE (1 << 2) /* Leave room for other stat types */ -#define TSK_STAT_POLARISED (1 << 10) -#define TSK_STAT_SPAN_NORMALISE (1 << 11) +#define TSK_STAT_POLARISED (1 << 10) +#define TSK_STAT_SPAN_NORMALISE (1 << 11) +#define TSK_STAT_ALLOW_TIME_UNCALIBRATED (1 << 12) /* Options for map_mutations */ #define TSK_MM_FIXED_ANCESTRAL_STATE (1 << 0) @@ -71,6 +66,12 @@ typedef struct { tsk_size_t num_trees; tsk_size_t num_samples; tsk_id_t *samples; + /* Does this tree sequence have time_units == "uncalibrated" */ + bool time_uncalibrated; + /* Are all genome coordinates discrete? */ + bool discrete_genome; + /* Are all time values discrete? */ + bool discrete_time; /* Breakpoints along the sequence, including 0 and L. */ double *breakpoints; /* If a node is a sample, map to its index in the samples list */ @@ -124,10 +125,10 @@ typedef struct { */ const tsk_treeseq_t *tree_sequence; /** - * @brief The leftmost root in the tree. Roots are siblings, and - * other roots can be found using right_sib. + @brief The ID of the "virtual root" whose children are the roots of the + tree. */ - tsk_id_t left_root; + tsk_id_t virtual_root; /** @brief The parent of node u is parent[u]. Equal to TSK_NULL if node u is a root or is not a node in the current tree. @@ -153,31 +154,34 @@ typedef struct { if node u has no siblings to its right. */ tsk_id_t *right_sib; - + /** + @brief The total number of edges defining the topology of this tree. + This is equal to the number of tree sequence edges that intersect with + the tree's genomic interval. + */ + tsk_size_t num_edges; + /* FIXME Undocumenting this for now to resolve some sphinx/doxygen issues */ + /* + @brief Left and right coordinates of the genomic interval that this + tree covers. The left coordinate is inclusive and the right coordinate + exclusive. + */ + struct { + double left; + double right; + } interval; tsk_size_t num_nodes; tsk_flags_t options; tsk_size_t root_threshold; const tsk_id_t *samples; - /* TODO before documenting this should be change to interval. */ - /* Left and right physical coordinates of the tree */ - double left; - double right; tsk_id_t index; /* These are involved in the optional sample tracking; num_samples counts * all samples below a give node, and num_tracked_samples counts those * from a specific subset. By default sample counts are tracked and roots * maintained. If TSK_NO_SAMPLE_COUNTS is specified, then neither sample * counts or roots are available. */ - /* TODO should these be tsk_size_t values? */ - tsk_id_t *num_samples; - tsk_id_t *num_tracked_samples; - /* TODO the only place this feature seems to be used is in the ld_calculator. - * when this is being replaced we should come up with a better way of doing - * whatever this is being used for. */ - /* All nodes that are marked during a particular transition are marked - * with a given value. */ - uint8_t *marked; - uint8_t mark; + tsk_size_t *num_samples; + tsk_size_t *num_tracked_samples; /* These are for the optional sample list tracking. */ tsk_id_t *left_sample; tsk_id_t *right_sample; @@ -191,8 +195,7 @@ typedef struct { tsk_id_t right_index; } tsk_tree_t; -/* Diff iterator. TODO Not sure if we want to keep this, as it's not used - * very much in the C code. */ +/* Diff iterator. */ typedef struct _tsk_edge_list_node_t { tsk_edge_t edge; struct _tsk_edge_list_node_t *next; @@ -225,7 +228,7 @@ typedef struct { @{ */ int tsk_treeseq_init( - tsk_treeseq_t *self, const tsk_table_collection_t *tables, tsk_flags_t options); + tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); int tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options); int tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options); @@ -240,6 +243,8 @@ void tsk_treeseq_print_state(const tsk_treeseq_t *self, FILE *out); /** @} */ +bool tsk_treeseq_has_reference_sequence(const tsk_treeseq_t *self); + tsk_size_t tsk_treeseq_get_num_nodes(const tsk_treeseq_t *self); tsk_size_t tsk_treeseq_get_num_edges(const tsk_treeseq_t *self); tsk_size_t tsk_treeseq_get_num_migrations(const tsk_treeseq_t *self); @@ -254,12 +259,16 @@ const char *tsk_treeseq_get_metadata(const tsk_treeseq_t *self); tsk_size_t tsk_treeseq_get_metadata_length(const tsk_treeseq_t *self); const char *tsk_treeseq_get_metadata_schema(const tsk_treeseq_t *self); tsk_size_t tsk_treeseq_get_metadata_schema_length(const tsk_treeseq_t *self); +const char *tsk_treeseq_get_time_units(const tsk_treeseq_t *self); +tsk_size_t tsk_treeseq_get_time_units_length(const tsk_treeseq_t *self); const char *tsk_treeseq_get_file_uuid(const tsk_treeseq_t *self); double tsk_treeseq_get_sequence_length(const tsk_treeseq_t *self); const double *tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self); const tsk_id_t *tsk_treeseq_get_samples(const tsk_treeseq_t *self); const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self); bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u); +bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self); +bool tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self); int tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node); int tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge); @@ -401,8 +410,59 @@ int tsk_tree_last(tsk_tree_t *self); int tsk_tree_next(tsk_tree_t *self); int tsk_tree_prev(tsk_tree_t *self); int tsk_tree_clear(tsk_tree_t *self); +int tsk_tree_seek(tsk_tree_t *self, double position, tsk_flags_t options); void tsk_tree_print_state(const tsk_tree_t *self, FILE *out); + +/** +@brief Return an upper bound on the number of nodes reachable + from the roots of this tree. + +@rst +This function provides an upper bound on the number of nodes that +can be reached in tree traversals, and is intended to be used +for memory allocation purposes. If ``num_nodes`` is the number +of nodes visited in a tree traversal from the virtual root +(e.g., ``tsk_tree_preorder(tree, tree->virtual_root, nodes, +&num_nodes)``), the bound ``N`` returned here is guaranteed to +be greater than or equal to ``num_nodes``. + +.. warning:: The precise value returned is not defined and should + not be depended on, as it may change from version-to-version. + +@endrst + +@param self A pointer to a tsk_tree_t object. +@return An upper bound on the number nodes reachable from the roots + of this tree, or zero if this tree has not been initialised. +*/ +tsk_size_t tsk_tree_get_size_bound(const tsk_tree_t *self); + +/** +@brief Returns the sum of the lengths of all branches reachable from + the specified node, or from all roots if node=TSK_NULL. + +@rst +Return the total branch length in a particular subtree or of the +entire tree. If the specified node is TSK_NULL (or the virtual +root) the sum of the lengths of all branches reachable from roots +is returned. Branch length is defined as difference between the time +of a node and its parent. The branch length of a root is zero. + +Note that if the specified node is internal its branch length is +*not* included, so that, e.g., the total branch length of a +leaf node is zero. +@endrst + +@param self A pointer to a tsk_tree_t object. +@param node The tree node to compute branch length or TSK_NULL to return the + total branch length of the tree. +@param ret_tbl A double pointer to store the returned total branch length. +@return 0 on success or a negative value on failure. +*/ +int tsk_tree_get_total_branch_length( + const tsk_tree_t *self, tsk_id_t node, double *ret_tbl); + /** @} */ int tsk_tree_set_root_threshold(tsk_tree_t *self, tsk_size_t root_threshold); @@ -415,14 +475,20 @@ bool tsk_tree_equals(const tsk_tree_t *self, const tsk_tree_t *other); bool tsk_tree_is_descendant(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v); bool tsk_tree_is_sample(const tsk_tree_t *self, tsk_id_t u); +tsk_id_t tsk_tree_get_left_root(const tsk_tree_t *self); +tsk_id_t tsk_tree_get_right_root(const tsk_tree_t *self); + int tsk_tree_copy(const tsk_tree_t *self, tsk_tree_t *dest, tsk_flags_t options); + +int tsk_tree_track_descendant_samples(tsk_tree_t *self, tsk_id_t node); int tsk_tree_set_tracked_samples( tsk_tree_t *self, tsk_size_t num_tracked_samples, const tsk_id_t *tracked_samples); -int tsk_tree_set_tracked_samples_from_sample_list( - tsk_tree_t *self, tsk_tree_t *other, tsk_id_t node); -int tsk_tree_get_parent(const tsk_tree_t *self, tsk_id_t u, tsk_id_t *parent); +int tsk_tree_get_branch_length( + const tsk_tree_t *self, tsk_id_t u, double *branch_length); int tsk_tree_get_time(const tsk_tree_t *self, tsk_id_t u, double *t); +int tsk_tree_get_parent(const tsk_tree_t *self, tsk_id_t u, tsk_id_t *parent); +int tsk_tree_get_depth(const tsk_tree_t *self, tsk_id_t u, int *depth); int tsk_tree_get_mrca(const tsk_tree_t *self, tsk_id_t u, tsk_id_t v, tsk_id_t *mrca); int tsk_tree_get_num_samples( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_samples); @@ -430,7 +496,13 @@ int tsk_tree_get_num_tracked_samples( const tsk_tree_t *self, tsk_id_t u, tsk_size_t *num_tracked_samples); int tsk_tree_get_sites( const tsk_tree_t *self, const tsk_site_t **sites, tsk_size_t *sites_length); -int tsk_tree_depth(const tsk_tree_t *self, tsk_id_t u, tsk_size_t *depth); + +int tsk_tree_preorder( + const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); +int tsk_tree_postorder( + const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); +int tsk_tree_preorder_samples( + const tsk_tree_t *self, tsk_id_t root, tsk_id_t *nodes, tsk_size_t *num_nodes); typedef struct { tsk_id_t node;