Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 40 additions & 2 deletions c/tests/test_trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -6555,7 +6555,7 @@ test_single_tree_balance(void)
int ret;
tsk_treeseq_t ts;
tsk_tree_t t;
tsk_size_t sackin;
tsk_size_t sackin, colless;

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
NULL, NULL, NULL, 0);
Expand All @@ -6567,6 +6567,8 @@ test_single_tree_balance(void)
/* Balanced binary tree with 4 leaves */
CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0);
CU_ASSERT_EQUAL(sackin, 8);
CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, &colless), 0);
CU_ASSERT_EQUAL(colless, 0);

tsk_treeseq_free(&ts);
tsk_tree_free(&t);
Expand Down Expand Up @@ -6602,6 +6604,38 @@ test_multiroot_balance(void)
CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0);
CU_ASSERT_EQUAL(sackin, 7);

CU_ASSERT_EQUAL_FATAL(tsk_tree_colless_index(&t, NULL), TSK_ERR_UNDEFINED_MULTIROOT);

tsk_treeseq_free(&ts);
tsk_tree_free(&t);
}

static void
test_nonbinary_balance(void)
{
int ret;
const char *nodes = "1 0 0\n"
"1 0 0\n"
"1 0 0\n"
"1 0 0\n"
"0 1 0";
const char *edges = "0 1 4 0,1,2,3\n";
tsk_treeseq_t ts;
tsk_tree_t t;
tsk_size_t sackin, colless;

tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
ret = tsk_tree_init(&t, &ts, 0);
CU_ASSERT_EQUAL(ret, 0);
ret = tsk_tree_first(&t);
CU_ASSERT_EQUAL_FATAL(ret, TSK_TREE_OK);

/* Star tree with 4 leaves */
CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0);
CU_ASSERT_EQUAL(sackin, 4);
CU_ASSERT_EQUAL_FATAL(
tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_NONBINARY);

tsk_treeseq_free(&ts);
tsk_tree_free(&t);
}
Expand All @@ -6613,7 +6647,7 @@ test_empty_tree_balance(void)
tsk_table_collection_t tables;
tsk_treeseq_t ts;
tsk_tree_t t;
tsk_size_t sackin;
tsk_size_t sackin, colless;

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
Expand All @@ -6627,6 +6661,9 @@ test_empty_tree_balance(void)

CU_ASSERT_EQUAL_FATAL(tsk_tree_sackin_index(&t, &sackin), 0);
CU_ASSERT_EQUAL(sackin, 0);
/* Technically wrong here because we have 0 roots, but not worth worrying about */
CU_ASSERT_EQUAL_FATAL(
tsk_tree_colless_index(&t, &colless), TSK_ERR_UNDEFINED_MULTIROOT);

tsk_table_collection_free(&tables);
tsk_treeseq_free(&ts);
Expand Down Expand Up @@ -7862,6 +7899,7 @@ main(int argc, char **argv)
/* Tree balance/imbalance index tests */
{ "test_single_tree_balance", test_single_tree_balance },
{ "test_multiroot_balance", test_multiroot_balance },
{ "test_nonbinary_balance", test_nonbinary_balance },
{ "test_empty_tree_balance", test_empty_tree_balance },

/* Misc */
Expand Down
8 changes: 8 additions & 0 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,14 @@ tsk_strerror_internal(int err)
ret = "A tree sequence can't take ownership of tables with "
"TSK_NO_EDGE_METADATA. (TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA)";
break;
case TSK_ERR_UNDEFINED_NONBINARY:
ret = "Operation undefined for nonbinary trees. "
"(TSK_ERR_UNDEFINED_NONBINARY)";
break;
case TSK_ERR_UNDEFINED_MULTIROOT:
ret = "Operation undefined for trees that are not singly-rooted. "
"(TSK_ERR_UNDEFINED_MULTIROOT)";
break;

/* Stats errors */
case TSK_ERR_BAD_NUM_WINDOWS:
Expand Down
27 changes: 18 additions & 9 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -172,12 +172,12 @@ Used in node flags to indicate that a node is a sample node.
*/
#define TSK_NODE_IS_SAMPLE 1u

/**
/**
Null value used for cases such as absent id references.
*/
#define TSK_NULL ((tsk_id_t) -1)

/**
/**
Value used for missing data in genotype arrays.
*/
#define TSK_MISSING_DATA (-1)
Expand Down Expand Up @@ -206,7 +206,7 @@ whose representation differs from the NAN generated by computations such as divi

/* Place the common options at the top of the space; this way we can start
options for individual functions at the bottom without worrying about
clashing with the common options
clashing with the common options
*/

/** Turn on debugging output. Not supported by all functions. */
Expand All @@ -224,7 +224,7 @@ behaviour.
*/
#define TSK_NO_CHECK_INTEGRITY (1u << 29)

/**
/**
Instead of taking a copy of input objects, the function should take ownership
of them and manage their lifecycle. The caller specifying this flag should no
longer modify or free the object or objects passed. See individual functions
Expand Down Expand Up @@ -599,13 +599,22 @@ A tree sequence cannot take ownership of a table collection where
TSK_NO_EDGE_METADATA.
*/
#define TSK_ERR_CANT_TAKE_OWNERSHIP_NO_EDGE_METADATA -809
/**
Operation is undefined for nonbinary trees
*/
#define TSK_ERR_UNDEFINED_NONBINARY -810
/**
Operation is undefined for trees with multiple roots.
*/
#define TSK_ERR_UNDEFINED_MULTIROOT -811

/** @} */

/**
@defgroup STATS_ERROR_GROUP Stats errors.
@{
*/
/**
/**
Zero windows were specified, at least one window must be specified.
*/
#define TSK_ERR_BAD_NUM_WINDOWS -900
Expand Down Expand Up @@ -657,17 +666,17 @@ were ``uncalibrated``.
@defgroup MAPPING_ERROR_GROUP Mutation mapping errors.
@{
*/
/**
/**
Only missing genotypes were specified, at least one non-missing is
required.
*/
#define TSK_ERR_GENOTYPES_ALL_MISSING -1000
/**
/**
A genotype value was greater than the maximum allowed (64) or less
than TSK_MISSING_DATA (-1).
*/
#define TSK_ERR_BAD_GENOTYPE -1001
/**
/**
A ancestral genotype value was greater than the maximum allowed (64) or less
than 0.
*/
Expand Down Expand Up @@ -764,7 +773,7 @@ specified table collection.
/**
The shared portions of the specified tree sequences are not equal.
Note that this may be the case if the table collections were not
fully sorted before union was called.
fully sorted before union was called.
*/
#define TSK_ERR_UNION_DIFF_HISTORIES -1401
/** @} */
Expand Down
51 changes: 51 additions & 0 deletions c/tskit/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -4915,6 +4915,57 @@ tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result)
return ret;
}

int
tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result)
{
int ret = 0;
const tsk_id_t *restrict right_child = self->right_child;
const tsk_id_t *restrict left_sib = self->left_sib;
tsk_id_t *nodes = tsk_malloc(tsk_tree_get_size_bound(self) * sizeof(*nodes));
tsk_id_t *num_leaves = tsk_calloc(self->num_nodes, sizeof(*num_leaves));
tsk_size_t j, num_nodes, total;
tsk_id_t num_children, u, v;

if (nodes == NULL || num_leaves == NULL) {
ret = TSK_ERR_NO_MEMORY;
goto out;
}
if (tsk_tree_get_num_roots(self) != 1) {
ret = TSK_ERR_UNDEFINED_MULTIROOT;
goto out;
}
ret = tsk_tree_postorder(self, nodes, &num_nodes);
if (ret != 0) {
goto out;
}

total = 0;
for (j = 0; j < num_nodes; j++) {
u = nodes[j];
/* Cheaper to compute this on the fly than to access the num_children array.
* since we're already iterating over the children. */
num_children = 0;
for (v = right_child[u]; v != TSK_NULL; v = left_sib[v]) {
num_children++;
num_leaves[u] += num_leaves[v];
}
if (num_children == 0) {
num_leaves[u] = 1;
} else if (num_children == 2) {
v = right_child[u];
total += (tsk_size_t) llabs(num_leaves[v] - num_leaves[left_sib[v]]);
} else {
ret = TSK_ERR_UNDEFINED_NONBINARY;
goto out;
}
}
*result = total;
out:
tsk_safe_free(nodes);
tsk_safe_free(num_leaves);
return ret;
}

/* Parsimony methods */

static inline uint64_t
Expand Down
1 change: 1 addition & 0 deletions c/tskit/trees.h
Original file line number Diff line number Diff line change
Expand Up @@ -1689,6 +1689,7 @@ int tsk_tree_kc_distance(
/* Don't document these balance metrics for now so it doesn't get in the way of
* C API 1.0, but should be straightforward to document based on Python docs. */
int tsk_tree_sackin_index(const tsk_tree_t *self, tsk_size_t *result);
int tsk_tree_colless_index(const tsk_tree_t *self, tsk_size_t *result);

/* Things to consider removing: */

Expand Down
27 changes: 26 additions & 1 deletion python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -10792,6 +10792,27 @@ Tree_get_sackin_index(Tree *self)
return ret;
}

static PyObject *
Tree_get_colless_index(Tree *self)
{
PyObject *ret = NULL;
int err;
tsk_size_t result;

if (Tree_check_state(self) != 0) {
goto out;
}

err = tsk_tree_colless_index(self->tree, &result);
if (err != 0) {
handle_library_error(err);
goto out;
}
ret = Py_BuildValue("K", (unsigned long long) result);
out:
return ret;
}

static PyObject *
Tree_get_root_threshold(Tree *self)
{
Expand Down Expand Up @@ -11206,7 +11227,11 @@ static PyMethodDef Tree_methods[] = {
{ .ml_name = "get_sackin_index",
.ml_meth = (PyCFunction) Tree_get_sackin_index,
.ml_flags = METH_NOARGS,
.ml_doc = "Returns the root threshold for this tree." },
.ml_doc = "Returns the Sackin index for this tree." },
{ .ml_name = "get_colless_index",
.ml_meth = (PyCFunction) Tree_get_colless_index,
.ml_flags = METH_NOARGS,
.ml_doc = "Returns the Colless index for this tree." },
{ NULL } /* Sentinel */
};

Expand Down
14 changes: 7 additions & 7 deletions python/tests/test_balance_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_colless(self, ts):
tree.num_children(u) == 2 for u in tree.nodes() if tree.is_internal(u)
)
if tree.num_roots != 1 or not is_binary:
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError):
tree.colless_index()
with pytest.raises(ValueError):
colless_index_definition(tree)
Expand Down Expand Up @@ -146,7 +146,7 @@ def test_sackin(self):
assert self.tree().sackin_index() == 18

def test_colless(self):
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError, match="UNDEFINED_NONBINARY"):
self.tree().colless_index()

def test_b1(self):
Expand All @@ -166,7 +166,7 @@ def test_sackin(self):
assert self.tree().sackin_index() == 10

def test_colless(self):
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError, match="UNDEFINED_NONBINARY"):
self.tree().colless_index()

def test_b1(self):
Expand Down Expand Up @@ -221,7 +221,7 @@ def test_sackin(self):
assert self.tree().sackin_index() == 20

def test_colless(self):
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"):
self.tree().colless_index()

def test_b1(self):
Expand All @@ -238,7 +238,7 @@ def test_sackin(self):
assert self.tree().sackin_index() == 0

def test_colless(self):
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"):
self.tree().colless_index()

def test_b1(self):
Expand All @@ -256,7 +256,7 @@ def test_sackin(self):
assert self.tree().sackin_index() == 0

def test_colless(self):
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"):
self.tree().colless_index()

def test_b1(self):
Expand All @@ -275,7 +275,7 @@ def test_sackin(self):
assert self.tree().sackin_index() == 0

def test_colless(self):
with pytest.raises(ValueError):
with pytest.raises(tskit.LibraryError, match="UNDEFINED_MULTIROOT"):
self.tree().colless_index()

def test_b1(self):
Expand Down
24 changes: 3 additions & 21 deletions python/tskit/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -2808,9 +2808,9 @@ def colless_index(self):
"""
Returns the Colless imbalance index for this tree.
This is defined as the sum of all differences between number of
leaves under right sub-node and left sub-node for each node.
leaves subtended by the left and right child of each node.
The Colless index is undefined for non-binary trees and trees
with multiple roots. This method will raise a ValueError if the
with multiple roots. This method will raise a LibraryError if the
tree is not singly-rooted and binary.

.. seealso:: See `Shao and Sokal (1990)
Expand All @@ -2819,25 +2819,7 @@ def colless_index(self):
:return: The Colless imbalance index.
:rtype: int
"""
# TODO implement in C
if self.num_roots != 1:
raise ValueError("Colless index not defined for multiroot trees")
num_leaves = np.zeros(self.tree_sequence.num_nodes, dtype=np.int32)
total = 0
for u in self.nodes(order="postorder"):
num_children = 0
for v in self.children(u):
num_leaves[u] += num_leaves[v]
num_children += 1
if num_children == 0:
num_leaves[u] = 1
elif num_children != 2:
raise ValueError("Colless index not defined for nonbinary trees")
else:
total += abs(
num_leaves[self.right_child(u)] - num_leaves[self.left_child(u)]
)
return total
return self._ll_tree.get_colless_index()

def sackin_index(self):
"""
Expand Down