From 96f690eb1f37f2eda0b719525d48757498ee78c1 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 16 Jun 2022 12:45:45 +0100 Subject: [PATCH 1/2] C implementation of Colless index --- c/tests/test_trees.c | 42 +++++++++++++++++++++++++++++++++-- c/tskit/core.c | 8 +++++++ c/tskit/core.h | 27 ++++++++++++++-------- c/tskit/trees.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ c/tskit/trees.h | 1 + 5 files changed, 120 insertions(+), 11 deletions(-) diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 492c4dc7fc..392c29bff5 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -6555,7 +6555,7 @@ test_single_tree_balance(void) int ret; tsk_treeseq_t ts; tsk_tree_t t; - tsk_size_t sackin; + tsk_size_t sackin, colless; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -6567,6 +6567,8 @@ test_single_tree_balance(void) /* Balanced binary tree with 4 leaves */ CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 8); + CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, &colless), 0); + CU_ASSERT_EQUAL(colless, 0); tsk_treeseq_free(&ts); tsk_tree_free(&t); @@ -6602,6 +6604,38 @@ test_multiroot_balance(void) CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 7); + CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, NULL), TSK_ERR_UNDEFINED_MULTIROOT); + + tsk_treeseq_free(&ts); + tsk_tree_free(&t); +} + +static void +test_nonbinary_balance(void) +{ + int ret; + const char *nodes = "1 0 0\n" + "1 0 0\n" + "1 0 0\n" + "1 0 0\n" + "0 1 0"; + const char *edges = "0 1 4 0,1,2,3\n"; + tsk_treeseq_t ts; + tsk_tree_t t; + tsk_size_t sackin, colless; + + tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); + ret = tsk_tree_init(&t, &ts, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_tree_first(&t); + CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK); + + /* Star tree with 4 leaves */ + CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); + CU_ASSERT_EQUAL(sackin, 4); + CU_ASSERT_EQUAL_FATAL( + tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_NONBINARY); + tsk_treeseq_free(&ts); tsk_tree_free(&t); } @@ -6613,7 +6647,7 @@ test_empty_tree_balance(void) tsk_table_collection_t tables; tsk_treeseq_t ts; tsk_tree_t t; - tsk_size_t sackin; + tsk_size_t sackin, colless; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -6627,6 +6661,9 @@ test_empty_tree_balance(void) CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 0); + /* Technically wrong here because we have 0 roots, but not worth worrying about */ + CU_ASSERT_EQUAL_FATAL( + tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_MULTIROOT); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); @@ -7862,6 +7899,7 @@ main(int argc, char **argv) /* Tree balance/imbalance index tests */ { "test_single_tree_balance", test_single_tree_balance }, { "test_multiroot_balance", test_multiroot_balance }, + { "test_nonbinary_balance", test_nonbinary_balance }, { "test_empty_tree_balance", test_empty_tree_balance }, /* Misc */ diff --git a/c/tskit/core.c b/c/tskit/core.c index 8689c8f267..e18c0d1c10 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -413,6 +413,14 @@ tsk_strerror_internal(int err) ret = "A tree sequence can't take ownership of tables with " "TSK_NO_EDGE_METADATA. (TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA)"; break; + case TSK_ERR_UNDEFINED_NONBINARY: + ret = "Operation undefined for nonbinary trees. " + "(TSK_ERR_UNDEFINED_NONBINARY)"; + break; + case TSK_ERR_UNDEFINED_MULTIROOT: + ret = "Operation undefined for trees with multiple roots. " + "(TSK_ERR_UNDEFINED_MULTIROOT)"; + break; /* Stats errors */ case TSK_ERR_BAD_NUM_WINDOWS: diff --git a/c/tskit/core.h b/c/tskit/core.h index 6dfe21d570..aaf880effd 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -172,12 +172,12 @@ Used in node flags to indicate that a node is a sample node. */ #define TSK_NODE_IS_SAMPLE 1u -/** +/** Null value used for cases such as absent id references. */ #define TSK_NULL ((tsk_id_t) -1) -/** +/** Value used for missing data in genotype arrays. */ #define TSK_MISSING_DATA (-1) @@ -206,7 +206,7 @@ whose representation differs from the NAN generated by computations such as divi /* Place the common options at the top of the space; this way we can start options for individual functions at the bottom without worrying about -clashing with the common options +clashing with the common options */ /** Turn on debugging output. Not supported by all functions. */ @@ -224,7 +224,7 @@ behaviour. */ #define TSK_NO_CHECK_INTEGRITY (1u << 29) -/** +/** Instead of taking a copy of input objects, the function should take ownership of them and manage their lifecycle. The caller specifying this flag should no longer modify or free the object or objects passed. See individual functions @@ -599,13 +599,22 @@ A tree sequence cannot take ownership of a table collection where TSK_NO_EDGE_METADATA. */ #define TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA -809 +/** +Operation is undefined for nonbinary trees +*/ +#define TSK_ERR_UNDEFINED_NONBINARY -810 +/** +Operation is undefined for trees with multiple roots. +*/ +#define TSK_ERR_UNDEFINED_MULTIROOT -811 + /** @} */ /** @defgroup STATS_ERROR_GROUP Stats errors. @{ */ -/** +/** Zero windows were specified, at least one window must be specified. */ #define TSK_ERR_BAD_NUM_WINDOWS -900 @@ -657,17 +666,17 @@ were ``uncalibrated``. @defgroup MAPPING_ERROR_GROUP Mutation mapping errors. @{ */ -/** +/** Only missing genotypes were specified, at least one non-missing is required. */ #define TSK_ERR_GENOTYPES_ALL_MISSING -1000 -/** +/** A genotype value was greater than the maximum allowed (64) or less than TSK_MISSING_DATA (-1). */ #define TSK_ERR_BAD_GENOTYPE -1001 -/** +/** A ancestral genotype value was greater than the maximum allowed (64) or less than 0. */ @@ -764,7 +773,7 @@ specified table collection. /** The shared portions of the specified tree sequences are not equal. Note that this may be the case if the table collections were not -fully sorted before union was called. +fully sorted before union was called. */ #define TSK_ERR_UNION_DIFF_HISTORIES -1401 /** @} */ diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 1442a33df6..e5565024ff 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4915,6 +4915,59 @@ tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result) return ret; } +int +tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) +{ + + int ret = 0; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + tsk_id_t *num_leaves = tsk_calloc(self->num_nodes, sizeof(*num_leaves)); + tsk_size_t j, num_nodes, total; + tsk_id_t num_children, u, v; + + if (nodes == NULL || num_leaves == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + if (tsk_tree_get_num_roots(self) != 1) { + ret = TSK_ERR_UNDEFINED_MULTIROOT; + goto out; + } + ret = tsk_tree_postorder(self, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + + total = 0; + for (j = 0; j < num_nodes; j++) { + u = nodes[j]; + /* Cheaper to compute this on the fly than to access the num_children array. + * since we're already iterating over the children. */ + num_children = 0; + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + num_children++; + num_leaves[u] += num_leaves[v]; + } + if (num_children == 0) { + num_leaves[u] = 1; + } else if (num_children == 2) { + v = right_child[u]; + total += (tsk_size_t) abs(num_leaves[v] - num_leaves[left_sib[v]]); + } else { + ret = TSK_ERR_UNDEFINED_NONBINARY; + goto out; + } + } + *result = total; +out: + tsk_safe_free(nodes); + tsk_safe_free(num_leaves); + + return ret; +} + /* Parsimony methods */ static inline uint64_t diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 4065b40ba8..cbc4f7ca98 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -1689,6 +1689,7 @@ int tsk_tree_kc_distance( /* Don't document these balance metrics for now so it doesn't get in the way of * C API 1.0, but should be straightforward to document based on Python docs. */ int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result); +int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result); /* Things to consider removing: */ From 3099d09792f88075aad3d1c2d48553bc37692b50 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Thu, 16 Jun 2022 12:54:19 +0100 Subject: [PATCH 2/2] Python-C glue for colless_index Closes #2254 --- c/tskit/core.c | 2 +- c/tskit/trees.c | 4 +--- python/_tskitmodule.c | 27 ++++++++++++++++++++++++++- python/tests/test_balance_metrics.py | 14 +++++++------- python/tskit/trees.py | 24 +++--------------------- 5 files changed, 38 insertions(+), 33 deletions(-) diff --git a/c/tskit/core.c b/c/tskit/core.c index e18c0d1c10..007b87e5d6 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -418,7 +418,7 @@ tsk_strerror_internal(int err) "(TSK_ERR_UNDEFINED_NONBINARY)"; break; case TSK_ERR_UNDEFINED_MULTIROOT: - ret = "Operation undefined for trees with multiple roots. " + ret = "Operation undefined for trees that are not singly-rooted. " "(TSK_ERR_UNDEFINED_MULTIROOT)"; break; diff --git a/c/tskit/trees.c b/c/tskit/trees.c index e5565024ff..1e7f51ef8d 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4918,7 +4918,6 @@ tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result) int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) { - int ret = 0; const tsk_id_t *restrict right_child = self->right_child; const tsk_id_t *restrict left_sib = self->left_sib; @@ -4954,7 +4953,7 @@ tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) num_leaves[u] = 1; } else if (num_children == 2) { v = right_child[u]; - total += (tsk_size_t) abs(num_leaves[v] - num_leaves[left_sib[v]]); + total += (tsk_size_t) llabs(num_leaves[v] - num_leaves[left_sib[v]]); } else { ret = TSK_ERR_UNDEFINED_NONBINARY; goto out; @@ -4964,7 +4963,6 @@ tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) out: tsk_safe_free(nodes); tsk_safe_free(num_leaves); - return ret; } diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index ec7e4eb658..b40f9421c7 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -10792,6 +10792,27 @@ Tree_get_sackin_index(Tree *self) return ret; } +static PyObject * +Tree_get_colless_index(Tree *self) +{ + PyObject *ret = NULL; + int err; + tsk_size_t result; + + if (Tree_check_state(self) != 0) { + goto out; + } + + err = tsk_tree_colless_index(self->tree, &result); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue("K", (unsigned long long) result); +out: + return ret; +} + static PyObject * Tree_get_root_threshold(Tree *self) { @@ -11206,7 +11227,11 @@ static PyMethodDef Tree_methods[] = { { .ml_name = "get_sackin_index", .ml_meth = (PyCFunction) Tree_get_sackin_index, .ml_flags = METH_NOARGS, - .ml_doc = "Returns the root threshold for this tree." }, + .ml_doc = "Returns the Sackin index for this tree." }, + { .ml_name = "get_colless_index", + .ml_meth = (PyCFunction) Tree_get_colless_index, + .ml_flags = METH_NOARGS, + .ml_doc = "Returns the Colless index for this tree." }, { NULL } /* Sentinel */ }; diff --git a/python/tests/test_balance_metrics.py b/python/tests/test_balance_metrics.py index d9e125e1c0..3e943c4017 100644 --- a/python/tests/test_balance_metrics.py +++ b/python/tests/test_balance_metrics.py @@ -76,7 +76,7 @@ def test_colless(self, ts): tree.num_children(u) == 2 for u in tree.nodes() if tree.is_internal(u) ) if tree.num_roots != 1 or not is_binary: - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError): tree.colless_index() with pytest.raises(ValueError): colless_index_definition(tree) @@ -146,7 +146,7 @@ def test_sackin(self): assert self.tree().sackin_index() == 18 def test_colless(self): - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError, match="UNDEFINED_NONBINARY"): self.tree().colless_index() def test_b1(self): @@ -166,7 +166,7 @@ def test_sackin(self): assert self.tree().sackin_index() == 10 def test_colless(self): - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError, match="UNDEFINED_NONBINARY"): self.tree().colless_index() def test_b1(self): @@ -221,7 +221,7 @@ def test_sackin(self): assert self.tree().sackin_index() == 20 def test_colless(self): - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"): self.tree().colless_index() def test_b1(self): @@ -238,7 +238,7 @@ def test_sackin(self): assert self.tree().sackin_index() == 0 def test_colless(self): - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"): self.tree().colless_index() def test_b1(self): @@ -256,7 +256,7 @@ def test_sackin(self): assert self.tree().sackin_index() == 0 def test_colless(self): - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"): self.tree().colless_index() def test_b1(self): @@ -275,7 +275,7 @@ def test_sackin(self): assert self.tree().sackin_index() == 0 def test_colless(self): - with pytest.raises(ValueError): + with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"): self.tree().colless_index() def test_b1(self): diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 202d0acb5a..aa848e3760 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2808,9 +2808,9 @@ def colless_index(self): """ Returns the Colless imbalance index for this tree. This is defined as the sum of all differences between number of - leaves under right sub-node and left sub-node for each node. + leaves subtended by the left and right child of each node. The Colless index is undefined for non-binary trees and trees - with multiple roots. This method will raise a ValueError if the + with multiple roots. This method will raise a LibraryError if the tree is not singly-rooted and binary. .. seealso:: See `Shao and Sokal (1990) @@ -2819,25 +2819,7 @@ def colless_index(self): :return: The Colless imbalance index. :rtype: int """ - # TODO implement in C - if self.num_roots != 1: - raise ValueError("Colless index not defined for multiroot trees") - num_leaves = np.zeros(self.tree_sequence.num_nodes, dtype=np.int32) - total = 0 - for u in self.nodes(order="postorder"): - num_children = 0 - for v in self.children(u): - num_leaves[u] += num_leaves[v] - num_children += 1 - if num_children == 0: - num_leaves[u] = 1 - elif num_children != 2: - raise ValueError("Colless index not defined for nonbinary trees") - else: - total += abs( - num_leaves[self.right_child(u)] - num_leaves[self.left_child(u)] - ) - return total + return self._ll_tree.get_colless_index() def sackin_index(self): """