diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index fc679abb8e..e2bf440194 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -6556,6 +6556,7 @@ test_single_tree_balance(void) tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin, colless; + double b1; tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -6569,6 +6570,8 @@ test_single_tree_balance(void) CU_ASSERT_EQUAL(sackin, 8); CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, &colless), 0); CU_ASSERT_EQUAL(colless, 0); + CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); + CU_ASSERT_DOUBLE_EQUAL(b1, 2, 1e-8); tsk_treeseq_free(&ts); tsk_tree_free(&t); @@ -6581,6 +6584,7 @@ test_multiroot_balance(void) tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin; + double b1; tsk_treeseq_from_text(&ts, 10, multiroot_ex_nodes, multiroot_ex_edges, NULL, NULL, NULL, NULL, NULL, 0); @@ -6603,8 +6607,9 @@ test_multiroot_balance(void) CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0); CU_ASSERT_EQUAL(sackin, 7); - CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, NULL), TSK_ERR_UNDEFINED_MULTIROOT); + CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); + CU_ASSERT_DOUBLE_EQUAL(b1, 1.0, 1e-8); tsk_treeseq_free(&ts); tsk_tree_free(&t); @@ -6623,6 +6628,7 @@ test_nonbinary_balance(void) tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin, colless; + double b1; tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0); ret = tsk_tree_init(&t, &ts, 0); @@ -6635,6 +6641,8 @@ test_nonbinary_balance(void) CU_ASSERT_EQUAL(sackin, 4); CU_ASSERT_EQUAL_FATAL( tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_NONBINARY); + CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); + CU_ASSERT_DOUBLE_EQUAL_FATAL(b1, 0, 1e-8); tsk_treeseq_free(&ts); tsk_tree_free(&t); @@ -6648,6 +6656,7 @@ test_empty_tree_balance(void) tsk_treeseq_t ts; tsk_tree_t t; tsk_size_t sackin, colless; + double b1; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -6664,6 +6673,8 @@ test_empty_tree_balance(void) /* Technically wrong here because we have 0 roots, but not worth worrying about */ CU_ASSERT_EQUAL_FATAL( tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_MULTIROOT); + CU_ASSERT_EQUAL_FATAL(tsk_tree_b1_index(&t, &b1), 0); + CU_ASSERT_EQUAL(b1, 0); tsk_table_collection_free(&tables); tsk_treeseq_free(&ts); diff --git a/c/tskit/trees.c b/c/tskit/trees.c index a4ad97d02f..68f6ed262d 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -4961,6 +4961,46 @@ tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result) return ret; } +int +tsk_tree_b1_index(const tsk_tree_t *self, double *result) +{ + int ret = 0; + const tsk_id_t *restrict parent = self->parent; + const tsk_id_t *restrict right_child = self->right_child; + const tsk_id_t *restrict left_sib = self->left_sib; + tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes)); + tsk_size_t *max_path_length = tsk_calloc(self->num_nodes, sizeof(*max_path_length)); + tsk_size_t j, num_nodes, mpl; + double total = 0.0; + tsk_id_t u, v; + + if (nodes == NULL || max_path_length == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + ret = tsk_tree_postorder(self, nodes, &num_nodes); + if (ret != 0) { + goto out; + } + + for (j = 0; j < num_nodes; j++) { + u = nodes[j]; + if (parent[u] != TSK_NULL && right_child[u] != TSK_NULL) { + mpl = 0; + for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) { + mpl = TSK_MAX(mpl, max_path_length[v]); + } + max_path_length[u] = mpl + 1; + total += 1 / (double) max_path_length[u]; + } + } + *result = total; +out: + tsk_safe_free(nodes); + tsk_safe_free(max_path_length); + return ret; +} + /* Parsimony methods */ static inline uint64_t diff --git a/c/tskit/trees.h b/c/tskit/trees.h index dcc07d491c..e0e1c8d714 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -1688,6 +1688,7 @@ int tsk_tree_kc_distance( * C API 1.0, but should be straightforward to document based on Python docs. */ int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result); int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result); +int tsk_tree_b1_index(const tsk_tree_t *self, double *result); /* Things to consider removing: */ diff --git a/docs/python-api.md b/docs/python-api.md index 73eab4d096..79a44c05cc 100644 --- a/docs/python-api.md +++ b/docs/python-api.md @@ -546,6 +546,16 @@ Functions and static methods Tree.kc_distance ``` +(sec_python_api_trees_balance)= + +#### Balance/imbalance indices + +```{eval-rst} +.. autosummary:: + Tree.colless_index + Tree.sackin_index + Tree.b1_index +``` (sec_python_api_trees_sites_mutations)= diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 004f516ac9..00a4aaf916 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -47,10 +47,22 @@ - Add support for missing data to ``write_vcf``, and add the ``isolated_as_missing`` argument. (:user:`jeromekelleher`, :pr:`2329`, :issue:`447`). -- Add ``Tree.num_children_array`` and ``Tree.num_children``. Returns the counts of - the number of child nodes for each or a single node in the tree respectively. +- Add ``Tree.num_children_array`` and ``Tree.num_children``. Returns the counts of + the number of child nodes for each or a single node in the tree respectively. (:user:`GertjanBisschop`, :issue:`2318`, :issue:`2319`, :pr:`2332`) +- Add ``Tree.path_length``. + (:user:`jeremyguez`, :issue:`2249`, :pr:`2259`). + +- Add B1 tree balance index. + (:user:`jeremyguez`, :user:`jeromekelleher`, :issue:`2251`, :pr:`2281`, :pr:`2346`). + +- Add Sackin tree imbalance index. + (:user:`jeremyguez`, :user:`jeromekelleher`, :pr:`2246`, :pr:`2258`). + +- Add Colless tree imbalance index. + (:user:`jeremyguez`, :user:`jeromekelleher`, :issue:`2250`, :pr:`2266`, :pr:`2344`). + **Breaking Changes** - The JSON metadata codec now interprets the empty string as an empty object. This means diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index a70c2ba73d..d46aaecfa1 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -10802,6 +10802,27 @@ Tree_get_colless_index(Tree *self) return ret; } +static PyObject * +Tree_get_b1_index(Tree *self) +{ + PyObject *ret = NULL; + int err; + double result; + + if (Tree_check_state(self) != 0) { + goto out; + } + + err = tsk_tree_b1_index(self->tree, &result); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue("d", result); +out: + return ret; +} + static PyObject * Tree_get_root_threshold(Tree *self) { @@ -11221,6 +11242,10 @@ static PyMethodDef Tree_methods[] = { .ml_meth = (PyCFunction) Tree_get_colless_index, .ml_flags = METH_NOARGS, .ml_doc = "Returns the Colless index for this tree." }, + { .ml_name = "get_b1_index", + .ml_meth = (PyCFunction) Tree_get_b1_index, + .ml_flags = METH_NOARGS, + .ml_doc = "Returns the B1 index for this tree." }, { NULL } /* Sentinel */ }; diff --git a/python/tskit/trees.py b/python/tskit/trees.py index e8600d1db8..77571c9a1c 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2783,9 +2783,10 @@ def path_length(self, u, v): def b1_index(self): """ - Returns the B1 balance index for this tree. - This is defined as the inverse of the sum of all longest paths - to leaves for each node besides roots. + Returns the + `B1 balance index `_ + for this tree. This is defined as the inverse of the sum of all + longest paths to leaves for each node besides roots. .. seealso:: See `Shao and Sokal (1990) `_ for details. @@ -2793,25 +2794,17 @@ def b1_index(self): :return: The B1 balance index. :rtype: float """ - # TODO implement in C - max_path_length = np.zeros(self.tree_sequence.num_nodes, dtype=int) - total = 0.0 - for u in self.postorder(): - if self.parent(u) != tskit.NULL and self.is_internal(u): - max_path_length[u] = 1 + max( - max_path_length[v] for v in self.children(u) - ) - total += 1 / max_path_length[u] - return total + return self._ll_tree.get_b1_index() def colless_index(self): """ - Returns the Colless imbalance index for this tree. - This is defined as the sum of all differences between number of - leaves subtended by the left and right child of each node. - The Colless index is undefined for non-binary trees and trees - with multiple roots. This method will raise a LibraryError if the - tree is not singly-rooted and binary. + Returns the + `Colless imbalance index `_ + for this tree. This is defined as the sum of all differences between + number of leaves subtended by the left and right child of each node. + The Colless index is undefined for non-binary trees and trees with + multiple roots. This method will raise a LibraryError if the tree is + not singly-rooted and binary. .. seealso:: See `Shao and Sokal (1990) `_ for details. @@ -2823,9 +2816,11 @@ def colless_index(self): def sackin_index(self): """ - Returns the Sackin imbalance index for this tree. This is defined - as the sum of the depths of all leaves in the tree. - Equivalent to ``sum(tree.depth(u) for u in tree.leaves())`` + Returns the + `Sackin imbalance index `_ + for this tree. This is defined as the sum of the depths of all leaves + in the tree. Equivalent to ``sum(tree.depth(u) for u in + tree.leaves())`` .. seealso:: See `Shao and Sokal (1990) `_ for details.