Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
143 changes: 143 additions & 0 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -3212,6 +3212,147 @@ test_table_collection_check_integrity(void)
tsk_table_collection_free(&tables);
}

static void
test_table_collection_subset(void)
{
int ret;
tsk_table_collection_t tables;
tsk_table_collection_t tables_copy;
int k;
tsk_id_t nodes[4];

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1;
ret = tsk_table_collection_init(&tables_copy, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// does not error on empty tables
ret = tsk_table_collection_subset(&tables, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// four nodes from two diploids; the first is from pop 0
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 0, TSK_NULL, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 1, 1, TSK_NULL, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);

// empty nodes should get empty tables
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_subset(&tables_copy, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0);
CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0);
CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0);

// the identity transformation
for (k = 0; k < 4; k++) {
nodes[k] = k;
}
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy));

// reverse twice should get back to the start
for (k = 0; k < 4; k++) {
nodes[k] = 3 - k;
}
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy));

tsk_table_collection_free(&tables_copy);
tsk_table_collection_free(&tables);
}

static void
test_table_collection_subset_errors(void)
{
int ret;
tsk_table_collection_t tables;
tsk_table_collection_t tables_copy;
tsk_id_t nodes[4] = { 0, 1, 2, 3 };

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_init(&tables_copy, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// four nodes from two diploids; the first is from pop 0
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);

/* Migrations are not supported */
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_migration_table_add_row(&tables_copy.migrations, 0, 1, 0, 0, 0, 0, NULL, 0);
CU_ASSERT_EQUAL_FATAL(tables_copy.migrations.num_rows, 1);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED);

// test out of bounds nodes
nodes[0] = -1;
ret = tsk_table_collection_subset(&tables, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
nodes[0] = 6;
ret = tsk_table_collection_subset(&tables, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

tsk_table_collection_free(&tables);
tsk_table_collection_free(&tables_copy);
}

int
main(int argc, char **argv)
{
Expand Down Expand Up @@ -3257,6 +3398,8 @@ main(int argc, char **argv)
{ "test_column_overflow", test_column_overflow },
{ "test_table_collection_check_integrity",
test_table_collection_check_integrity },
{ "test_table_collection_subset", test_table_collection_subset },
{ "test_table_collection__subset_errors", test_table_collection_subset_errors },
{ NULL, NULL },
};

Expand Down
3 changes: 3 additions & 0 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,9 @@ tsk_strerror_internal(int err)
case TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED:
ret = "Migrations not currently supported by sort";
break;
case TSK_ERR_MIGRATIONS_NOT_SUPPORTED:
ret = "Migrations not currently supported by this operation";
break;
case TSK_ERR_SORT_OFFSET_NOT_SUPPORTED:
ret = "Specifying position for mutation, sites or migrations is not "
"supported";
Expand Down
1 change: 1 addition & 0 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ not found in the file.
#define TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED -802
#define TSK_ERR_SORT_OFFSET_NOT_SUPPORTED -803
#define TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED -804
#define TSK_ERR_MIGRATIONS_NOT_SUPPORTED -805

/* Stats errors */
#define TSK_ERR_BAD_NUM_WINDOWS -900
Expand Down
158 changes: 158 additions & 0 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -7799,6 +7799,164 @@ tsk_table_collection_clear(tsk_table_collection_t *self)
return tsk_table_collection_truncate(self, &start);
}

int TSK_WARN_UNUSED
tsk_table_collection_subset(
tsk_table_collection_t *self, tsk_id_t *nodes, tsk_size_t num_nodes)
{
int ret = 0;
tsk_id_t k, i, new_ind, new_pop, new_parent, new_child, new_node;
tsk_node_t node;
tsk_individual_t ind;
tsk_population_t pop;
tsk_edge_t edge;
tsk_mutation_t mut;
tsk_site_t site;
tsk_id_t *node_map = NULL;
tsk_id_t *individual_map = NULL;
tsk_id_t *population_map = NULL;
tsk_id_t *site_map = NULL;
tsk_id_t *mutation_map = NULL;
tsk_table_collection_t tables;

ret = tsk_table_collection_copy(self, &tables, 0);
if (ret != 0) {
goto out;
}
ret = tsk_table_collection_clear(self);
if (ret != 0) {
goto out;
}

node_map = malloc(tables.nodes.num_rows * sizeof(*node_map));
individual_map = malloc(tables.individuals.num_rows * sizeof(*individual_map));
population_map = malloc(tables.populations.num_rows * sizeof(*population_map));
site_map = malloc(tables.sites.num_rows * sizeof(*site_map));
mutation_map = malloc(tables.mutations.num_rows * sizeof(*mutation_map));
if (node_map == NULL || individual_map == NULL || population_map == NULL
|| site_map == NULL || mutation_map == NULL) {
ret = TSK_ERR_NO_MEMORY;
goto out;
}
memset(node_map, 0xff, tables.nodes.num_rows * sizeof(*node_map));
memset(individual_map, 0xff, tables.individuals.num_rows * sizeof(*individual_map));
memset(population_map, 0xff, tables.populations.num_rows * sizeof(*population_map));
memset(site_map, 0xff, tables.sites.num_rows * sizeof(*site_map));
memset(mutation_map, 0xff, tables.mutations.num_rows * sizeof(*mutation_map));

// nodes, individuals, populations
for (k = 0; k < (tsk_id_t) num_nodes; k++) {
ret = tsk_node_table_get_row(&tables.nodes, nodes[k], &node);
if (ret < 0) {
goto out;
}
new_ind = TSK_NULL;
if (node.individual != TSK_NULL) {
if (individual_map[node.individual] == TSK_NULL) {
tsk_individual_table_get_row(&tables.individuals, node.individual, &ind);
ret = tsk_individual_table_add_row(&self->individuals, ind.flags,
ind.location, ind.location_length, ind.metadata,
ind.metadata_length);
if (ret < 0) {
goto out;
}
individual_map[node.individual] = ret;
}
new_ind = individual_map[node.individual];
}
new_pop = TSK_NULL;
if (node.population != TSK_NULL) {
if (population_map[node.population] == TSK_NULL) {
tsk_population_table_get_row(&tables.populations, node.population, &pop);
ret = tsk_population_table_add_row(
&self->populations, pop.metadata, pop.metadata_length);
if (ret < 0) {
goto out;
}
population_map[node.population] = ret;
}
new_pop = population_map[node.population];
}
ret = tsk_node_table_add_row(&self->nodes, node.flags, node.time, new_pop,
new_ind, node.metadata, node.metadata_length);
if (ret < 0) {
goto out;
}
node_map[node.id] = ret;
}

// edges
for (k = 0; k < (tsk_id_t) tables.edges.num_rows; k++) {
tsk_edge_table_get_row(&tables.edges, k, &edge);
new_parent = node_map[edge.parent];
new_child = node_map[edge.child];
if ((new_parent != TSK_NULL) && (new_child != TSK_NULL)) {
ret = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, new_parent,
new_child, edge.metadata, edge.metadata_length);
if (ret < 0) {
goto out;
}
}
}

// mutations and sites
i = 0;
for (k = 0; k < (tsk_id_t) tables.sites.num_rows; k++) {
tsk_site_table_get_row(&tables.sites, k, &site);
while ((i < (tsk_id_t) tables.mutations.num_rows)
&& (tables.mutations.site[i] == site.id)) {
tsk_mutation_table_get_row(&tables.mutations, i, &mut);
new_node = node_map[mut.node];
if (new_node != TSK_NULL) {
if (site_map[site.id] == TSK_NULL) {
ret = tsk_site_table_add_row(&self->sites, site.position,
site.ancestral_state, site.ancestral_state_length, site.metadata,
site.metadata_length);
if (ret < 0) {
goto out;
}
site_map[site.id] = ret;
}
new_parent = TSK_NULL;
if (mut.parent != TSK_NULL) {
new_parent = mutation_map[mut.parent];
}
ret = tsk_mutation_table_add_row(&self->mutations, site_map[site.id],
new_node, new_parent, mut.derived_state, mut.derived_state_length,
mut.metadata, mut.metadata_length);
if (ret < 0) {
goto out;
}
mutation_map[mut.id] = ret;
}
i++;
}
}

/* TODO: Subset of the Migrations Table. The way to do this properly is not
* well-defined, mostly because migrations might contain events from/to
* populations that have not been kept in after the subset. */
if (tables.migrations.num_rows != 0) {
ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED;
goto out;
}

// provenance (new record is added in python)
ret = tsk_provenance_table_copy(
&tables.provenances, &self->provenances, TSK_NO_INIT);
if (ret < 0) {
goto out;
}

out:
tsk_safe_free(node_map);
tsk_safe_free(individual_map);
tsk_safe_free(population_map);
tsk_safe_free(site_map);
tsk_safe_free(mutation_map);
tsk_table_collection_free(&tables);
return ret;
}

static int
cmp_edge_cl(const void *a, const void *b)
{
Expand Down
37 changes: 37 additions & 0 deletions c/tskit/tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -2409,6 +2409,43 @@ completes.
int tsk_table_collection_simplify(tsk_table_collection_t *self, tsk_id_t *samples,
tsk_size_t num_samples, tsk_flags_t options, tsk_id_t *node_map);

/**
@brief Subsets and reorders a table collection according to an array of nodes.

@rst
Reduces the table collection to contain only the entries referring to
the provided list of nodes, with nodes reordered according to the order
they appear in the ``nodes`` argument. Specifically, this subsets and reorders
each of the tables as follows:

1. Nodes: if in the list of nodes, and in the order provided.
2. Individuals and Populations: if referred to by a retained node,
and in the order first seen when traversing the list of retained nodes.
3. Edges: if both parent and child are retained nodes.
4. Mutations: if the mutation's node is a retained node.
5. Sites: if any mutations remain at the site after removing mutations.
6. Migrations: if the migration's node is a retained node.

Retained edges, mutations, sites, and migrations appear in the same
order as in the original tables.

If ``nodes`` is the entire list of nodes in the tables, then the
resulting tables will be identical to the original tables, but with
nodes (and individuals and populations) reordered.

.. note:: Migrations are currently not supported by susbset, and an error will
be raised if we attempt call subset on a table collection with greater
than zero migrations.
@endrst

@param self A pointer to a tsk_table_collection_t object.
@param nodes An array of num_nodes valid node IDs.
@param num_nodes The number of node IDs in the input nodes array.
@return Return 0 on success or a negative value on failure.
*/
int tsk_table_collection_subset(
tsk_table_collection_t *self, tsk_id_t *nodes, tsk_size_t num_nodes);

/**
@brief Set the metadata
@rst
Expand Down
Loading