From 5708a042d012a6ea8b4f096262cf30d6037c4f61 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 5 Jun 2020 00:55:23 +0100 Subject: [PATCH 1/2] Add mutation time --- c/CHANGELOG.rst | 16 ++ c/tests/test_file_format.c | 1 + c/tests/test_genotypes.c | 2 +- c/tests/test_tables.c | 128 ++++++++---- c/tests/test_trees.c | 194 ++++++++++++++++-- c/tests/testlib.c | 17 +- c/tskit/core.c | 12 ++ c/tskit/core.h | 7 +- c/tskit/tables.c | 192 +++++++++++++++--- c/tskit/tables.h | 23 ++- c/tskit/trees.c | 35 +++- docs/data-model.rst | 31 ++- python/CHANGELOG.rst | 11 + python/_tskitmodule.c | 68 ++++++- python/setup.py | 9 +- python/tests/__init__.py | 105 +++++++++- python/tests/simplify.py | 1 + python/tests/test_cli.py | 46 +++++ python/tests/test_dict_encoding.py | 4 +- python/tests/test_drawing.py | 12 +- python/tests/test_file_format.py | 21 +- python/tests/test_genotypes.py | 32 ++- python/tests/test_highlevel.py | 35 +++- python/tests/test_lowlevel.py | 14 +- python/tests/test_metadata.py | 4 +- python/tests/test_parsimony.py | 50 +++-- python/tests/test_stats.py | 2 +- python/tests/test_tables.py | 129 +++++++++--- python/tests/test_topology.py | 313 ++++++++++++++++++++++------- python/tests/test_tree_stats.py | 78 +++---- python/tests/test_vcf.py | 4 +- python/tests/test_wright_fisher.py | 6 +- python/tests/tsutil.py | 97 ++++++++- python/tskit/formats.py | 57 +++++- python/tskit/tables.py | 50 ++++- python/tskit/trees.py | 29 ++- 36 files changed, 1505 insertions(+), 330 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 760e231dbf..d4d82fb5a8 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -6,6 +6,16 @@ In development. **Breaking changes** +- Mutations now have a double-precision floating-point ``time`` column. As this is a + mandatory column the file format version has been bumped to 13.0. Pre-existing + tree sequences can be upgraded using the ``tskit upgrade`` command-line utility, + which will assign times to mutations by spreading them evenly along the edges on + which they occur (using ``tsk_table_collection_compute_mutation_times``. For a tree + sequence to be considered valid it must meet new criteria for mutation times, see + :ref:`sec_mutation_requirements`. Mutation methods such as ``add_row`` now have an + additional argument. + (:user:`benjeffery`, :pr:`672`) + - Change genotypes from unsigned to signed to accommodate missing data (see :issue:`144` for discussion). This only affects users of the ``tsk_vargen_t`` class. Genotypes are now stored as int8_t and int16_t @@ -34,6 +44,12 @@ In development. **New features** +- Add ``time`` column to mutations, along with new function + ``tsk_table_collection_compute_mutation_times`` and new flag to + ``tsk_table_collection_check_integrity``:``TSK_CHECK_MUTATION_TIME``. + (:user:`benjeffery`, :pr:`672`) + + - Add ``metadata`` and ``metadata_schema`` fields to table collection, with accessors on tree sequence. These store arbitrary bytes and are optional in the file format. (:user: `benjeffery`, :pr:`641`) diff --git a/c/tests/test_file_format.c b/c/tests/test_file_format.c index 5216cb1c50..e7d9bb6f59 100644 --- a/c/tests/test_file_format.c +++ b/c/tests/test_file_format.c @@ -583,6 +583,7 @@ test_missing_required_columns(void) "mutations/node", "mutations/parent", "mutations/site", + "mutations/time", "nodes/flags", "nodes/individual", "nodes/population", diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index c2e963a943..d1bafcf00b 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -650,7 +650,7 @@ test_single_tree_many_alleles(void) for (j = 0; j < (tsk_id_t) num_alleles; j++) { /* When j = 0 we get a parent of -1, which is the NULL_NODE */ ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 0, j - 1, alleles, (tsk_size_t) j, NULL, 0); + &tables.mutations, 0, 0, j - 1, 0, alleles, (tsk_size_t) j, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 7b087d4d09..9c19894b2f 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -1051,6 +1051,7 @@ test_mutation_table(void) tsk_id_t *node; tsk_id_t *parent; tsk_id_t *site; + double *time; char *derived_state, *metadata; char c[max_len + 1]; tsk_size_t *derived_state_offset, *metadata_offset; @@ -1071,11 +1072,12 @@ test_mutation_table(void) len = 0; for (j = 0; j < (tsk_id_t) num_rows; j++) { k = TSK_MIN((tsk_size_t) j + 1, max_len); - ret = tsk_mutation_table_add_row(&table, j, j, j, c, k, c, k); + ret = tsk_mutation_table_add_row(&table, j, j, j, j, c, k, c, k); CU_ASSERT_EQUAL_FATAL(ret, j); CU_ASSERT_EQUAL(table.site[j], j); CU_ASSERT_EQUAL(table.node[j], j); CU_ASSERT_EQUAL(table.parent[j], j); + CU_ASSERT_EQUAL(table.time[j], j); CU_ASSERT_EQUAL(table.derived_state_offset[j], len); CU_ASSERT_EQUAL(table.metadata_offset[j], len); CU_ASSERT_EQUAL(table.num_rows, (tsk_size_t) j + 1); @@ -1091,6 +1093,7 @@ test_mutation_table(void) CU_ASSERT_EQUAL(mutation.site, j); CU_ASSERT_EQUAL(mutation.node, j); CU_ASSERT_EQUAL(mutation.parent, j); + CU_ASSERT_EQUAL(mutation.time, j); CU_ASSERT_EQUAL(mutation.metadata_length, k); CU_ASSERT_NSTRING_EQUAL(mutation.metadata, c, k); CU_ASSERT_EQUAL(mutation.derived_state_length, k); @@ -1108,6 +1111,8 @@ test_mutation_table(void) CU_ASSERT_FATAL(node != NULL); parent = malloc(num_rows * sizeof(tsk_id_t)); CU_ASSERT_FATAL(parent != NULL); + time = malloc(num_rows * sizeof(double)); + CU_ASSERT_FATAL(time != NULL); derived_state = malloc(num_rows * sizeof(char)); CU_ASSERT_FATAL(derived_state != NULL); derived_state_offset = malloc((num_rows + 1) * sizeof(tsk_size_t)); @@ -1121,6 +1126,7 @@ test_mutation_table(void) node[j] = j; site[j] = j + 1; parent[j] = j + 2; + time[j] = j + 3; derived_state[j] = 'Y'; derived_state_offset[j] = (tsk_size_t) j; metadata[j] = 'M'; @@ -1129,12 +1135,13 @@ test_mutation_table(void) derived_state_offset[num_rows] = num_rows; metadata_offset[num_rows] = num_rows; - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); + CU_ASSERT_EQUAL(memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); @@ -1143,7 +1150,7 @@ test_mutation_table(void) CU_ASSERT_EQUAL(table.metadata_length, num_rows); /* Append another num_rows */ - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); @@ -1153,6 +1160,8 @@ test_mutation_table(void) CU_ASSERT_EQUAL(memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( memcmp(table.parent + num_rows, parent, num_rows * sizeof(tsk_id_t)), 0); + CU_ASSERT_EQUAL(memcmp(table.time, time, num_rows * sizeof(double)), 0); + CU_ASSERT_EQUAL(memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( @@ -1169,6 +1178,7 @@ test_mutation_table(void) CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); + CU_ASSERT_EQUAL(memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(memcmp(table.metadata, metadata, num_rows * sizeof(char)), 0); @@ -1182,12 +1192,13 @@ test_mutation_table(void) /* Check all this again, except with parent == NULL and metadata == NULL. */ memset(parent, 0xff, num_rows * sizeof(tsk_id_t)); memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, NULL, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, NULL, time, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(memcmp(table.node, node, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL(memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); + CU_ASSERT_EQUAL(memcmp(table.time, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL(memcmp(table.derived_state_offset, derived_state_offset, @@ -1201,7 +1212,7 @@ test_mutation_table(void) CU_ASSERT_EQUAL(table.metadata_length, 0); /* Append another num_rows */ - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, NULL, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, NULL, time, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); @@ -1211,6 +1222,8 @@ test_mutation_table(void) CU_ASSERT_EQUAL(memcmp(table.parent, parent, num_rows * sizeof(tsk_id_t)), 0); CU_ASSERT_EQUAL( memcmp(table.parent + num_rows, parent, num_rows * sizeof(tsk_id_t)), 0); + CU_ASSERT_EQUAL(memcmp(table.time, time, num_rows * sizeof(double)), 0); + CU_ASSERT_EQUAL(memcmp(table.time + num_rows, time, num_rows * sizeof(double)), 0); CU_ASSERT_EQUAL( memcmp(table.derived_state, derived_state, num_rows * sizeof(char)), 0); CU_ASSERT_EQUAL( @@ -1221,53 +1234,59 @@ test_mutation_table(void) CU_ASSERT_EQUAL(table.metadata_length, 0); /* Inputs except parent, metadata and metadata_offset cannot be NULL*/ - ret = tsk_mutation_table_set_columns(&table, num_rows, NULL, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, NULL, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, NULL, - derived_state_offset, metadata, metadata_offset); + derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, + NULL, derived_state_offset, metadata, metadata_offset); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Inputs except parent, metadata and metadata_offset cannot be NULL*/ - ret = tsk_mutation_table_append_columns(&table, num_rows, NULL, node, parent, + ret = tsk_mutation_table_append_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_append_columns(&table, num_rows, site, NULL, parent, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, NULL, - derived_state_offset, metadata, metadata_offset); + derived_state, derived_state_offset, metadata, metadata_offset); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, + NULL, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, NULL, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); /* Test for bad offsets */ derived_state_offset[0] = 1; - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); derived_state_offset[0] = 0; derived_state_offset[num_rows] = 0; - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_OFFSET); @@ -1305,6 +1324,7 @@ test_mutation_table(void) free(site); free(node); free(parent); + free(time); free(derived_state); free(derived_state_offset); free(metadata); @@ -2825,7 +2845,7 @@ test_table_overflow(void) tables.mutations.max_rows = max_rows; tables.mutations.num_rows = max_rows; - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, 0, 0, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, 0, 0, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TABLE_OVERFLOW); tables.provenances.max_rows = max_rows; @@ -2876,11 +2896,13 @@ test_column_overflow(void) ret = tsk_site_table_add_row(&tables.sites, 0, NULL, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_COLUMN_OVERFLOW); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, zeros, 1, zeros, 1); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, 0, zeros, 1, zeros, 1); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, NULL, 0, NULL, too_big); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, 0, 0, NULL, 0, NULL, too_big); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_COLUMN_OVERFLOW); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, NULL, too_big, NULL, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, 0, 0, NULL, too_big, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_COLUMN_OVERFLOW); ret = tsk_provenance_table_add_row(&tables.provenances, zeros, 1, zeros, 1); @@ -3060,7 +3082,7 @@ test_table_collection_check_integrity(void) /* mutations */ ret = tsk_mutation_table_add_row( - &tables.mutations, 2, 0, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 2, 0, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); @@ -3068,31 +3090,31 @@ test_table_collection_check_integrity(void) ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 2, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 0, 2, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 2, NULL, 0, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 2, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_OUT_OF_BOUNDS); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 0, NULL, 0, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 0, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_EQUAL); ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 1, NULL, 0, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 1, 1, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 1, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 0, 1, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -3105,9 +3127,9 @@ test_table_collection_check_integrity(void) ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 1, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 0, 1, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 1, 1, 0, NULL, 0, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 1, 1, 0, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -3119,10 +3141,10 @@ test_table_collection_check_integrity(void) ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 1, 1, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 1, 1, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 1, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 0, 1, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -3131,20 +3153,46 @@ test_table_collection_check_integrity(void) ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); - - ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, INFINITY, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + printf("%s", tsk_strerror(ret)); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); + + ret = tsk_mutation_table_clear(&tables.mutations); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 1, 1, TSK_NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE); - /* migrations */ - ret = tsk_migration_table_add_row( - &tables.migrations, 0.0, 0.5, 2, 0, 1, 1.5, NULL, 0); + ret = tsk_mutation_table_clear(&tables.mutations); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 1, 1, TSK_NULL, 1, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 1, 1, 0, 2, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_table_collection_check_integrity(&tables, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION); + + /* Note that we do not test for TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE as + that is only checked for tree sequences not tables */ /* migrations */ + ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_migration_table_clear(&tables.migrations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_migration_table_add_row( diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index e7d04fab3b..9c59ad2644 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -128,6 +128,29 @@ verify_compute_mutation_parents(tsk_treeseq_t *ts) tsk_table_collection_free(&tables); } +static void +verify_compute_mutation_times(tsk_treeseq_t *ts) +{ + int ret; + size_t size = tsk_treeseq_get_num_mutations(ts) * sizeof(tsk_id_t); + tsk_id_t *time = malloc(size); + tsk_table_collection_t tables; + + CU_ASSERT_FATAL(time != NULL); + ret = tsk_treeseq_copy_tables(ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + memcpy(time, tables.mutations.time, size); + /* Make sure the tables are actually updated */ + memset(tables.mutations.time, 0, size); + + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL_FATAL(memcmp(time, tables.mutations.time, size), 0); + + free(time); + tsk_table_collection_free(&tables); +} + static void verify_individual_nodes(tsk_treeseq_t *ts) { @@ -3140,12 +3163,12 @@ test_single_tree_bad_mutations(void) const char *sites = "0 0\n" "0.1 0\n" "0.2 0\n"; - const char *mutations = "0 0 1 -1\n" - "1 1 1 -1\n" - "2 4 1 -1\n" - "2 1 0 2\n" - "2 1 1 3\n" - "2 2 1 -1\n"; + const char *mutations = "0 0 1 -1 0\n" + "1 1 1 -1 0\n" + "2 4 1 -1 1\n" + "2 1 0 2 0\n" + "2 1 1 3 0\n" + "2 2 1 -1 0\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; tsk_flags_t load_flags = TSK_BUILD_INDEXES; @@ -3282,6 +3305,27 @@ test_single_tree_bad_mutations(void) tsk_treeseq_free(&ts); tables.mutations.parent[2] = TSK_NULL; + /* time < node time */ + tables.mutations.time[2] = 0; + ret = tsk_treeseq_init(&ts, &tables, load_flags); + CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE); + tsk_treeseq_free(&ts); + tables.mutations.time[2] = 1; + + /* time > parent mutation */ + tables.mutations.time[4] = 0.5; + ret = tsk_treeseq_init(&ts, &tables, load_flags); + CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION); + tsk_treeseq_free(&ts); + tables.mutations.time[4] = 0; + + /* time > parent node */ + tables.mutations.time[0] = 1.5; + ret = tsk_treeseq_init(&ts, &tables, load_flags); + CU_ASSERT_EQUAL(ret, TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE); + tsk_treeseq_free(&ts); + tables.mutations.time[0] = 0; + /* Check to make sure we've maintained legal mutations */ ret = tsk_treeseq_init(&ts, &tables, load_flags); CU_ASSERT_EQUAL(ret, 0); @@ -3714,12 +3758,12 @@ test_single_tree_compute_mutation_parents(void) const char *sites = "0 0\n" "0.1 0\n" "0.2 0\n"; - const char *mutations = "0 0 1 -1\n" - "1 1 1 -1\n" - "2 4 1 -1\n" - "2 1 0 2\n" - "2 1 1 3\n" - "2 2 1 -1\n"; + const char *mutations = "0 0 1 -1 0\n" + "1 1 1 -1 0\n" + "2 4 1 -1 1\n" + "2 1 0 2 0\n" + "2 1 1 3 0\n" + "2 2 1 -1 0\n"; tsk_treeseq_t ts; tsk_table_collection_t tables; @@ -3749,10 +3793,6 @@ test_single_tree_compute_mutation_parents(void) /* Compute the mutation parents */ verify_compute_mutation_parents(&ts); - /* Verify consistency of individuals */ - verify_individual_nodes(&ts); - tsk_treeseq_free(&ts); - /* Bad site reference */ tables.mutations.site[0] = -1; ret = tsk_table_collection_compute_mutation_parents(&tables, 0); @@ -3819,6 +3859,124 @@ test_single_tree_compute_mutation_parents(void) tsk_table_collection_free(&tables); } +static void +test_single_tree_compute_mutation_times(void) +{ + int ret = 0; + const char *sites = "0 0\n" + "0.1 0\n" + "0.2 0\n" + "0.3 0\n"; + const char *mutations = "0 0 1 -1 3\n" + "1 1 1 -1 3\n" + "2 4 1 -1 8\n" + "2 1 0 2 4\n" + "2 1 1 3 2\n" + "2 2 1 -1 4\n" + "3 6 1 -1 10\n"; + /* 6 */ + /* 6 */ + /* / \ */ + /* / \ */ + /* 2 \ */ + /* / 5 */ + /* 4 / \ */ + /* 0 1,3,4 5 \ */ + /* 0 1 2 3 */ + + tsk_treeseq_t ts; + tsk_table_collection_t tables; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + tables.sequence_length = 1; + parse_nodes(single_tree_ex_nodes, &tables.nodes); + CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 7); + tables.nodes.time[4] = 6; + tables.nodes.time[5] = 8; + tables.nodes.time[6] = 10; + parse_edges(single_tree_ex_edges, &tables.edges); + CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 6); + parse_sites(sites, &tables.sites); + parse_mutations(mutations, &tables.mutations); + CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 4); + CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 7); + tables.sequence_length = 1.0; + + ret = tsk_table_collection_build_index(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Check to make sure we have legal mutations */ + ret = tsk_treeseq_init(&ts, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 4); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 7); + + /* Compute the mutation times */ + verify_compute_mutation_times(&ts); + + /* Verify consistency of individuals */ + verify_individual_nodes(&ts); + tsk_treeseq_free(&ts); + + /* Bad random param */ + ret = tsk_table_collection_compute_mutation_times(&tables, (double *) 1, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); + + /* Bad site reference */ + tables.mutations.site[0] = -1; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); + tables.mutations.site[0] = 0; + + /* Bad site reference */ + tables.mutations.site[0] = -1; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_SITE_OUT_OF_BOUNDS); + tables.mutations.site[0] = 0; + + /* mutation sites out of order */ + tables.mutations.site[0] = 2; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_MUTATIONS); + tables.mutations.site[0] = 0; + + /* sites out of order */ + tables.sites.position[0] = 0.11; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_SITES); + tables.sites.position[0] = 0; + + /* Bad node reference */ + tables.mutations.node[0] = -1; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); + tables.mutations.node[0] = 0; + + /* Bad node reference */ + tables.mutations.node[0] = (tsk_id_t) tables.nodes.num_rows; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); + tables.mutations.node[0] = 0; + + /* Mutations not ordered by site */ + tables.mutations.site[2] = 0; + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_MUTATIONS); + tables.mutations.site[2] = 2; + + ret = tsk_treeseq_init(&ts, &tables, 0); + CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_sites(&ts), 4); + CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 7); + tsk_treeseq_free(&ts); + + tsk_treeseq_free(&ts); + tsk_table_collection_free(&tables); +} + static void test_single_tree_is_descendant(void) { @@ -5350,7 +5508,7 @@ test_deduplicate_sites_errors(void) CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_site_table_add_row(&tables.sites, 2, "TT", 2, "MM", 2); CU_ASSERT_EQUAL_FATAL(ret, 1); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, -1, "T", 1, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, -1, 0, "T", 1, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_node_table_add_row(&tables.nodes, 0, 0, TSK_NULL, TSK_NULL, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -5732,6 +5890,8 @@ main(int argc, char **argv) test_single_tree_simplify_null_samples }, { "test_single_tree_compute_mutation_parents", test_single_tree_compute_mutation_parents }, + { "test_single_tree_compute_mutation_times", + test_single_tree_compute_mutation_times }, { "test_single_tree_is_descendant", test_single_tree_is_descendant }, { "test_single_tree_map_mutations", test_single_tree_map_mutations }, { "test_single_tree_map_mutations_internal_samples", diff --git a/c/tests/testlib.c b/c/tests/testlib.c index ea73c57e03..71edd30fb2 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -369,6 +369,7 @@ parse_mutations(const char *text, tsk_mutation_table_t *mutation_table) const char *whitespace = " \t"; char *p; tsk_id_t node, site, parent; + double time; char derived_state[MAX_LINE]; c = 0; @@ -399,7 +400,12 @@ parse_mutations(const char *text, tsk_mutation_table_t *mutation_table) if (p != NULL) { parent = atoi(p); } - ret = tsk_mutation_table_add_row(mutation_table, site, node, parent, + time = 0; + p = strtok(NULL, whitespace); + if (p != NULL) { + time = atof(p); + } + ret = tsk_mutation_table_add_row(mutation_table, site, node, parent, time, derived_state, strlen(derived_state), NULL, 0); CU_ASSERT_FATAL(ret >= 0); } @@ -512,7 +518,11 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod } } - ret = tsk_treeseq_init(ts, &tables, TSK_BUILD_INDEXES); + ret = tsk_table_collection_build_index(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_treeseq_init(ts, &tables, 0); /* tsk_treeseq_print_state(ts, stdout); */ // printf("ret = %s\n", tsk_strerror(ret)); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -606,7 +616,8 @@ caterpillar_tree(tsk_size_t n, tsk_size_t num_sites, tsk_size_t num_mutations) m = k % num_metadatas; state = (state + 1) % 2; ret = tsk_mutation_table_add_row(&tables.mutations, j, u, TSK_NULL, - states[state], strlen(states[state]), metadata[m], strlen(metadata[m])); + tables.nodes.time[u], states[state], strlen(states[state]), metadata[m], + strlen(metadata[m])); CU_ASSERT_FATAL(ret >= 0); u--; } diff --git a/c/tskit/core.c b/c/tskit/core.c index ae6f12b930..29b1a27d23 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -287,6 +287,18 @@ tsk_strerror_internal(int err) case TSK_ERR_UNSORTED_MUTATIONS: ret = "Mutations must be provided in non-decreasing site order"; break; + case TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE: + ret = "Mutations must have a time equal to or greater than that of their " + "node"; + break; + case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION: + ret = "Mutations must have a time equal to or less than that of their " + "parent mutation"; + break; + case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE: + ret = "Mutations must have a time less than the parent node of " + "the edge they are on"; + break; /* Sample errors */ case TSK_ERR_DUPLICATE_SAMPLE: diff --git a/c/tskit/core.h b/c/tskit/core.h index 9925c89b91..1f0e244d01 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -101,8 +101,8 @@ to the API or ABI are introduced, i.e., internal refactors of bugfixes. #define TSK_FILE_FORMAT_NAME "tskit.trees" #define TSK_FILE_FORMAT_NAME_LENGTH 11 -#define TSK_FILE_FORMAT_VERSION_MAJOR 12 -#define TSK_FILE_FORMAT_VERSION_MINOR 2 +#define TSK_FILE_FORMAT_VERSION_MAJOR 13 +#define TSK_FILE_FORMAT_VERSION_MINOR 0 /** @defgroup GENERAL_ERROR_GROUP General errors. @@ -207,6 +207,9 @@ not found in the file. #define TSK_ERR_INCONSISTENT_MUTATIONS -503 #define TSK_ERR_UNSORTED_MUTATIONS -505 #define TSK_ERR_NON_SINGLE_CHAR_MUTATION -506 +#define TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE -507 +#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION -508 +#define TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE -509 /* Sample errors */ #define TSK_ERR_DUPLICATE_SAMPLE -600 diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 5f7babfe29..fce14f870b 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -2332,6 +2332,10 @@ tsk_mutation_table_expand_main_columns( if (ret != 0) { goto out; } + ret = expand_column((void **) &self->time, new_size, sizeof(double)); + if (ret != 0) { + goto out; + } ret = expand_column( (void **) &self->derived_state_offset, new_size + 1, sizeof(tsk_size_t)); if (ret != 0) { @@ -2465,8 +2469,8 @@ tsk_mutation_table_init(tsk_mutation_table_t *self, tsk_flags_t TSK_UNUSED(optio tsk_id_t tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, tsk_id_t node, - tsk_id_t parent, const char *derived_state, tsk_size_t derived_state_length, - const char *metadata, tsk_size_t metadata_length) + tsk_id_t parent, double time, const char *derived_state, + tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length) { tsk_size_t derived_state_offset, metadata_offset; int ret; @@ -2478,6 +2482,7 @@ tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, tsk_id_t n self->site[self->num_rows] = site; self->node[self->num_rows] = node; self->parent[self->num_rows] = parent; + self->time[self->num_rows] = time; derived_state_offset = self->derived_state_length; assert(self->derived_state_offset[self->num_rows] == derived_state_offset); @@ -2508,13 +2513,14 @@ tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, tsk_id_t n int tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, const char *derived_state, - tsk_size_t *derived_state_offset, const char *metadata, tsk_size_t *metadata_offset) + tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, + const char *derived_state, tsk_size_t *derived_state_offset, const char *metadata, + tsk_size_t *metadata_offset) { int ret = 0; tsk_size_t j, derived_state_length, metadata_length; - if (site == NULL || node == NULL || derived_state == NULL + if (site == NULL || node == NULL || time == NULL || derived_state == NULL || derived_state_offset == NULL) { ret = TSK_ERR_BAD_PARAM_VALUE; goto out; @@ -2536,6 +2542,7 @@ tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_row } else { memcpy(self->parent + self->num_rows, parent, num_rows * sizeof(tsk_id_t)); } + memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); /* Metadata column */ if (metadata == NULL) { @@ -2599,8 +2606,8 @@ tsk_mutation_table_copy( } } ret = tsk_mutation_table_set_columns(dest, self->num_rows, self->site, self->node, - self->parent, self->derived_state, self->derived_state_offset, self->metadata, - self->metadata_offset); + self->parent, self->time, self->derived_state, self->derived_state_offset, + self->metadata, self->metadata_offset); if (ret != 0) { goto out; } @@ -2612,8 +2619,9 @@ tsk_mutation_table_copy( int tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, const char *derived_state, - tsk_size_t *derived_state_offset, const char *metadata, tsk_size_t *metadata_offset) + tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, + const char *derived_state, tsk_size_t *derived_state_offset, const char *metadata, + tsk_size_t *metadata_offset) { int ret = 0; @@ -2621,7 +2629,7 @@ tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, if (ret != 0) { goto out; } - ret = tsk_mutation_table_append_columns(self, num_rows, site, node, parent, + ret = tsk_mutation_table_append_columns(self, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); out: return ret; @@ -2639,6 +2647,7 @@ tsk_mutation_table_equals(tsk_mutation_table_t *self, tsk_mutation_table_t *othe && memcmp(self->node, other->node, self->num_rows * sizeof(tsk_id_t)) == 0 && memcmp(self->parent, other->parent, self->num_rows * sizeof(tsk_id_t)) == 0 + && memcmp(self->time, other->time, self->num_rows * sizeof(double)) == 0 && memcmp(self->derived_state_offset, other->derived_state_offset, (self->num_rows + 1) * sizeof(tsk_size_t)) == 0 @@ -2689,6 +2698,7 @@ tsk_mutation_table_free(tsk_mutation_table_t *self) tsk_safe_free(self->node); tsk_safe_free(self->site); tsk_safe_free(self->parent); + tsk_safe_free(self->time); tsk_safe_free(self->derived_state); tsk_safe_free(self->derived_state_offset); tsk_safe_free(self->metadata); @@ -2735,6 +2745,7 @@ tsk_mutation_table_get_row( row->site = self->site[index]; row->node = self->node[index]; row->parent = self->parent[index]; + row->time = self->time[index]; row->derived_state_length = self->derived_state_offset[index + 1] - self->derived_state_offset[index]; row->derived_state = self->derived_state + self->derived_state_offset[index]; @@ -2772,7 +2783,7 @@ tsk_mutation_table_dump_text(tsk_mutation_table_t *self, FILE *out) if (err < 0) { goto out; } - err = fprintf(out, "id\tsite\tnode\tparent\tderived_state\tmetadata\n"); + err = fprintf(out, "id\tsite\tnode\tparent\ttime\tderived_state\tmetadata\n"); if (err < 0) { goto out; } @@ -2780,8 +2791,8 @@ tsk_mutation_table_dump_text(tsk_mutation_table_t *self, FILE *out) derived_state_len = self->derived_state_offset[j + 1] - self->derived_state_offset[j]; metadata_len = self->metadata_offset[j + 1] - self->metadata_offset[j]; - err = fprintf(out, "%d\t%d\t%d\t%d\t%.*s\t%.*s\n", (int) j, self->site[j], - self->node[j], self->parent[j], derived_state_len, + err = fprintf(out, "%d\t%d\t%d\t%d\t%f\t%.*s\t%.*s\n", (int) j, self->site[j], + self->node[j], self->parent[j], self->time[j], derived_state_len, self->derived_state + self->derived_state_offset[j], metadata_len, self->metadata + self->metadata_offset[j]); if (err < 0) { @@ -2800,6 +2811,7 @@ tsk_mutation_table_dump(tsk_mutation_table_t *self, kastore_t *store) { "mutations/site", (void *) self->site, self->num_rows, KAS_INT32 }, { "mutations/node", (void *) self->node, self->num_rows, KAS_INT32 }, { "mutations/parent", (void *) self->parent, self->num_rows, KAS_INT32 }, + { "mutations/time", (void *) self->time, self->num_rows, KAS_FLOAT64 }, { "mutations/derived_state", (void *) self->derived_state, self->derived_state_length, KAS_UINT8 }, { "mutations/derived_state_offset", (void *) self->derived_state_offset, @@ -2822,6 +2834,7 @@ tsk_mutation_table_load(tsk_mutation_table_t *self, kastore_t *store) tsk_id_t *node = NULL; tsk_id_t *site = NULL; tsk_id_t *parent = NULL; + double *time = NULL; char *derived_state = NULL; tsk_size_t *derived_state_offset = NULL; char *metadata = NULL; @@ -2833,6 +2846,7 @@ tsk_mutation_table_load(tsk_mutation_table_t *self, kastore_t *store) { "mutations/site", (void **) &site, &num_rows, 0, KAS_INT32, 0 }, { "mutations/node", (void **) &node, &num_rows, 0, KAS_INT32, 0 }, { "mutations/parent", (void **) &parent, &num_rows, 0, KAS_INT32, 0 }, + { "mutations/time", (void **) &time, &num_rows, 0, KAS_FLOAT64, 0 }, { "mutations/derived_state", (void **) &derived_state, &derived_state_length, 0, KAS_UINT8, 0 }, { "mutations/derived_state_offset", (void **) &derived_state_offset, &num_rows, @@ -2856,7 +2870,7 @@ tsk_mutation_table_load(tsk_mutation_table_t *self, kastore_t *store) ret = TSK_ERR_BAD_OFFSET; goto out; } - ret = tsk_mutation_table_set_columns(self, num_rows, site, node, parent, + ret = tsk_mutation_table_set_columns(self, num_rows, site, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); if (ret != 0) { goto out; @@ -4479,9 +4493,9 @@ table_sorter_sort_mutations(table_sorter_t *self) mapped_parent = mutation_id_map[parent]; } ret = tsk_mutation_table_add_row(self->mutations, sorted_mutations[j].site, - sorted_mutations[j].node, mapped_parent, sorted_mutations[j].derived_state, - sorted_mutations[j].derived_state_length, sorted_mutations[j].metadata, - sorted_mutations[j].metadata_length); + sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time, + sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length, + sorted_mutations[j].metadata, sorted_mutations[j].metadata_length); if (ret < 0) { goto out; } @@ -6092,7 +6106,7 @@ simplifier_output_sites(simplifier_t *self) } ret = tsk_mutation_table_add_row(&self->tables->mutations, (tsk_id_t) self->tables->sites.num_rows, mapped_node, - mapped_parent, mutation.derived_state, + mapped_parent, mutation.time, mutation.derived_state, mutation.derived_state_length, mutation.metadata, mutation.metadata_length); if (ret < 0) { @@ -6437,6 +6451,7 @@ tsk_table_collection_check_edge_ordering(tsk_table_collection_t *self) * TSK_CHECK_SITE_ORDERING Check that sites are in nondecreasing position order. * TSK_CHECK_SITE_DUPLICATES Check for any duplicate site positions. * TSK_CHECK_MUTATION_ORDERING Check mutation ordering contraints for a tree sequence. + * TSK_CHECK_MUTATION_TIME Check contraints on mutation 'time' column. * TSK_CHECK_INDEXES Check indexes exist & reference integrity. * TSK_CHECK_ALL All above checks. * TSK_NO_CHECK_MUTATION_PARENTS Do not check contraints on mutation 'parent' column. @@ -6447,7 +6462,7 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o { int ret = TSK_ERR_GENERIC; tsk_size_t j; - double node_time, left, right, position; + double node_time, left, right, position, mutation_time; double L = self->sequence_length; double *time = self->nodes.time; tsk_id_t parent, child; @@ -6465,6 +6480,7 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o bool check_mutation_ordering = !!(options & TSK_CHECK_MUTATION_ORDERING); bool check_mutation_parents = (check_mutation_ordering & !(options & TSK_NO_CHECK_MUTATION_PARENTS)); + bool check_mutation_time = !!(options & TSK_CHECK_MUTATION_TIME); bool check_populations = !(options & TSK_NO_CHECK_POPULATION_REFS); bool check_indexes = !!(options & TSK_CHECK_INDEXES); @@ -6605,6 +6621,22 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o } } } + if (check_mutation_time) { + mutation_time = self->mutations.time[j]; + if (!isfinite(mutation_time)) { + ret = TSK_ERR_TIME_NONFINITE; + goto out; + } + if (mutation_time < self->nodes.time[self->mutations.node[j]]) { + ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; + goto out; + } + if (parent_mut != TSK_NULL + && self->mutations.time[parent_mut] < mutation_time) { + ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; + goto out; + } + } } /* Migrations */ @@ -6681,6 +6713,7 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o goto out; } } + out: return ret; } @@ -7560,9 +7593,10 @@ tsk_table_collection_deduplicate_sites( goto out; } /* Check everything except site duplicates (which we expect) and - * edge indexes (which we don't use) */ + * edge indexes and mutation times (which we don't use) */ ret = tsk_table_collection_check_integrity( - self, TSK_CHECK_ALL & ~TSK_CHECK_SITE_DUPLICATES & ~TSK_CHECK_INDEXES); + self, TSK_CHECK_ALL & ~TSK_CHECK_SITE_DUPLICATES & ~TSK_CHECK_INDEXES + & ~TSK_CHECK_MUTATION_TIME); if (ret != 0) { goto out; } @@ -7627,13 +7661,14 @@ tsk_table_collection_compute_mutation_parents( double left, right; tsk_id_t site; /* Using unsigned values here avoids potentially undefined behaviour */ - uint32_t j, mutation, first_mutation; + tsk_size_t j, mutation, first_mutation; /* Note that because we check everything here, any non-null mutation parents * will also be checked, even though they are about to be overwritten. To - * ensure that his function always succeeds we must ensure that the + * ensure that this function always succeeds we must ensure that the * parent field is set to -1 first. */ - ret = tsk_table_collection_check_integrity(self, TSK_CHECK_ALL); + ret = tsk_table_collection_check_integrity( + self, TSK_CHECK_ALL & ~TSK_CHECK_MUTATION_TIME); if (ret != 0) { goto out; } @@ -7724,6 +7759,115 @@ tsk_table_collection_compute_mutation_parents( return ret; } +int TSK_WARN_UNUSED +tsk_table_collection_compute_mutation_times( + tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + const tsk_id_t *restrict I = self->indexes.edge_insertion_order; + const tsk_id_t *restrict O = self->indexes.edge_removal_order; + const tsk_edge_table_t edges = self->edges; + const tsk_node_table_t nodes = self->nodes; + const tsk_site_table_t sites = self->sites; + const tsk_mutation_table_t mutations = self->mutations; + const tsk_id_t M = (tsk_id_t) edges.num_rows; + tsk_id_t tj, tk; + tsk_id_t *parent = NULL; + tsk_size_t *numerator = NULL; + tsk_size_t *denominator = NULL; + tsk_id_t u; + double left, right, parent_time; + tsk_id_t site; + /* Using unsigned values here avoids potentially undefined behaviour */ + tsk_size_t j, mutation, first_mutation; + + /* The random param is for future usage */ + if (random != NULL) { + ret = TSK_ERR_BAD_PARAM_VALUE; + goto out; + } + + ret = tsk_table_collection_check_integrity( + self, TSK_CHECK_ALL & ~TSK_CHECK_MUTATION_TIME); + if (ret != 0) { + goto out; + } + parent = malloc(nodes.num_rows * sizeof(*parent)); + numerator = malloc(nodes.num_rows * sizeof(*numerator)); + denominator = malloc(nodes.num_rows * sizeof(*denominator)); + if (parent == NULL || numerator == NULL || denominator == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + memset(parent, 0xff, nodes.num_rows * sizeof(*parent)); + memset(numerator, 0, nodes.num_rows * sizeof(*numerator)); + memset(denominator, 0, nodes.num_rows * sizeof(*denominator)); + + tj = 0; + tk = 0; + site = 0; + mutation = 0; + left = 0; + while (tj < M || left < self->sequence_length) { + while (tk < M && edges.right[O[tk]] == left) { + parent[edges.child[O[tk]]] = TSK_NULL; + tk++; + } + while (tj < M && edges.left[I[tj]] == left) { + parent[edges.child[I[tj]]] = edges.parent[I[tj]]; + tj++; + } + right = self->sequence_length; + if (tj < M) { + right = TSK_MIN(right, edges.left[I[tj]]); + } + if (tk < M) { + right = TSK_MIN(right, edges.right[O[tk]]); + } + + /* Tree is now ready. We look at each site on this tree in turn */ + while (site < (tsk_id_t) sites.num_rows && sites.position[site] < right) { + first_mutation = mutation; + /* Count how many mutations each edge has to get our + denominator */ + while (mutation < mutations.num_rows && mutations.site[mutation] == site) { + denominator[mutations.node[mutation]]++; + mutation++; + } + /* Go over the mutations again assigning times. As the sorting requirements + guarantee that parents are before children, we assign oldest first */ + for (j = first_mutation; j < mutation; j++) { + u = mutations.node[j]; + numerator[u]++; + if (parent[u] == TSK_NULL) { + /* This mutation is above a root */ + mutations.time[j] = nodes.time[u]; + } else { + parent_time = nodes.time[parent[u]]; + mutations.time[j] = parent_time + - (parent_time - nodes.time[u]) * numerator[u] + / (denominator[u] + 1); + } + } + /* Reset the book-keeping for the next site */ + for (j = first_mutation; j < mutation; j++) { + u = mutations.node[j]; + numerator[u] = 0; + denominator[u] = 0; + } + site++; + } + /* Move on to the next tree */ + left = right; + } + +out: + tsk_safe_free(parent); + tsk_safe_free(numerator); + tsk_safe_free(denominator); + return ret; +} + int tsk_table_collection_record_num_rows( tsk_table_collection_t *self, tsk_bookmark_t *position) diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 6fc1f3e5cf..82d6bc0324 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -170,6 +170,8 @@ typedef struct { tsk_id_t node; /** @brief Parent mutation ID. */ tsk_id_t parent; + /** @brief Mutation time. */ + double time; /** @brief Derived state. */ const char *derived_state; /** @brief Size of the derived state in bytes. */ @@ -482,6 +484,8 @@ typedef struct { tsk_id_t *site; /** @brief The parent column. */ tsk_id_t *parent; + /** @brief The time column. */ + double *time; /** @brief The derived_state column. */ char *derived_state; /** @brief The derived_state_offset column. */ @@ -645,9 +649,11 @@ typedef struct { #define TSK_CHECK_INDEXES (1 << 5) #define TSK_CHECK_ALL \ (TSK_CHECK_OFFSETS | TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING \ - | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING | TSK_CHECK_INDEXES) + | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING | TSK_CHECK_INDEXES \ + | TSK_CHECK_MUTATION_TIME) #define TSK_NO_CHECK_MUTATION_PARENTS (1 << 6) #define TSK_NO_CHECK_POPULATION_REFS (1 << 7) +#define TSK_CHECK_MUTATION_TIME (1 << 8) /* Flags for dump tables */ #define TSK_NO_BUILD_INDEXES (1 << 0) @@ -1546,6 +1552,7 @@ for details of the columns in this table. @param site The site ID for the new mutation. @param node The ID of the node this mutation occurs over. @param parent The ID of the parent mutation. +@param time The time of the mutation. @param derived_state The derived_state for the new mutation. @param derived_state_length The length of the derived_state in bytes. @param metadata The metadata to be associated with the new mutation. This @@ -1555,7 +1562,7 @@ for details of the columns in this table. or a negative value on failure. */ tsk_id_t tsk_mutation_table_add_row(tsk_mutation_table_t *self, tsk_id_t site, - tsk_id_t node, tsk_id_t parent, const char *derived_state, + tsk_id_t node, tsk_id_t parent, double time, const char *derived_state, tsk_size_t derived_state_length, const char *metadata, tsk_size_t metadata_length); /** @@ -1665,11 +1672,13 @@ int tsk_mutation_table_set_max_metadata_length_increment( int tsk_mutation_table_set_max_derived_state_length_increment( tsk_mutation_table_t *self, tsk_size_t max_derived_state_length_increment); int tsk_mutation_table_set_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, const char *derived_state, - tsk_size_t *derived_state_length, const char *metadata, tsk_size_t *metadata_length); + tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, + const char *derived_state, tsk_size_t *derived_state_length, const char *metadata, + tsk_size_t *metadata_length); int tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_rows, - tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, const char *derived_state, - tsk_size_t *derived_state_length, const char *metadata, tsk_size_t *metadata_length); + tsk_id_t *site, tsk_id_t *node, tsk_id_t *parent, double *time, + const char *derived_state, tsk_size_t *derived_state_length, const char *metadata, + tsk_size_t *metadata_length); bool tsk_mutation_table_equals(tsk_mutation_table_t *self, tsk_mutation_table_t *other); int tsk_mutation_table_clear(tsk_mutation_table_t *self); int tsk_mutation_table_truncate(tsk_mutation_table_t *self, tsk_size_t num_rows); @@ -2522,6 +2531,8 @@ int tsk_table_collection_deduplicate_sites( tsk_table_collection_t *tables, tsk_flags_t options); int tsk_table_collection_compute_mutation_parents( tsk_table_collection_t *self, tsk_flags_t options); +int tsk_table_collection_compute_mutation_times( + tsk_table_collection_t *self, double *random, tsk_flags_t TSK_UNUSED(options)); int tsk_table_collection_check_integrity( tsk_table_collection_t *self, tsk_flags_t options); diff --git a/c/tskit/trees.c b/c/tskit/trees.c index f7109bac1b..648beee153 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -228,28 +228,48 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) { int ret = TSK_ERR_GENERIC; size_t j, k, tree_index; - tsk_id_t site; + tsk_id_t site, mutation; double tree_left, tree_right; const double sequence_length = self->tables->sequence_length; const tsk_id_t num_sites = (tsk_id_t) self->tables->sites.num_rows; + const tsk_id_t num_mutations = (tsk_id_t) self->tables->mutations.num_rows; const size_t num_edges = self->tables->edges.num_rows; const double *restrict site_position = self->tables->sites.position; + const tsk_id_t *restrict mutation_site = self->tables->mutations.site; + const tsk_id_t *restrict mutation_node = self->tables->mutations.node; + const double *restrict mutation_time = self->tables->mutations.time; + const double *restrict node_time = self->tables->nodes.time; const tsk_id_t *restrict I = self->tables->indexes.edge_insertion_order; const tsk_id_t *restrict O = self->tables->indexes.edge_removal_order; const double *restrict edge_right = self->tables->edges.right; const double *restrict edge_left = self->tables->edges.left; + const tsk_id_t *restrict edge_child = self->tables->edges.child; + const tsk_id_t *restrict edge_parent = self->tables->edges.parent; + tsk_id_t *parent = NULL; + + parent = malloc(self->tables->nodes.num_rows * sizeof(*parent)); + if (parent == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + memset(parent, 0xff, self->tables->nodes.num_rows * sizeof(*parent)); tree_left = 0; tree_right = sequence_length; self->num_trees = 0; j = 0; k = 0; + site = 0; + mutation = 0; assert(I != NULL && O != NULL); + /* Iterate through the trees in order to count them and check mutation times */ while (j < num_edges || tree_left < sequence_length) { while (k < num_edges && edge_right[O[k]] == tree_left) { + parent[edge_child[O[k]]] = TSK_NULL; k++; } while (j < num_edges && edge_left[I[j]] == tree_left) { + parent[edge_child[I[j]]] = edge_parent[I[j]]; j++; } tree_right = sequence_length; @@ -259,6 +279,18 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) if (k < num_edges) { tree_right = TSK_MIN(tree_right, edge_right[O[k]]); } + while (site < num_sites && site_position[site] < tree_right) { + while (mutation < num_mutations && mutation_site[mutation] == site) { + if (parent[mutation_node[mutation]] != TSK_NULL + && node_time[parent[mutation_node[mutation]]] + <= mutation_time[mutation]) { + ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE; + goto out; + } + mutation++; + } + site++; + } tree_left = tree_right; self->num_trees++; } @@ -309,6 +341,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) self->breakpoints[tree_index] = tree_right; ret = 0; out: + tsk_safe_free(parent); return ret; } diff --git a/docs/data-model.rst b/docs/data-model.rst index 159fc8124e..1cf32e436c 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -374,6 +374,7 @@ Column Type Description site int32 The ID of the site the mutation occurs at. node int32 The node this mutation occurs at. parent int32 The ID of the parent mutation. +time double Time at which the mutation occurred. derived_state char The allelic state resulting from the mutation. metadata binary Mutation :ref:`sec_metadata_definition`. ================ ============== =========== @@ -384,6 +385,9 @@ The ``site`` column is an integer value defining the ID of the The ``node`` column is an integer value defining the ID of the first :ref:`node ` in the tree below this mutation. +The ``time`` column is a double precision floating point value recording how long ago +the mutation happened. + The ``derived_state`` column specifies the allelic state resulting from the mutation, thus defining the state that the ``node`` and any descendant nodes in the subtree inherit unless further mutations occur. The column stores text @@ -633,6 +637,9 @@ requirements for a valid set of mutations are: - ``site`` must refer to a valid site ID; - ``node`` must refer to a valid node ID; +- ``time`` must be a finite value which is greater or equal to the mutation ``node``'s + ``time``, less than the ``node`` above the mutation's ``time`` and equal to or less + than the ``time`` of the ``parent`` mutation if this mutation has one. - ``parent`` must either be the null ID (-1) or a valid mutation ID within the current table @@ -815,6 +822,17 @@ of this fact to compute the ``parent`` column of a mutation table, if all other information is valid. +Computing mutation times +------------------------ + +In the case where the method generating a tree sequence does not generate mutation +times, valid times can be provided by :meth:`TableCollection.compute_mutation_parents`. +If all other information is valid this method will assign times to the mutations by +placing them at evenly spaced intervals along their edge (for instance, a single +mutation on an edge between a node at time 1.0 and a node at time 4.0 would be given +time 2.5; while two mutations on that edge would be given times 2.0 and 3.0). + + Recording tables in forwards time --------------------------------- @@ -1131,18 +1149,19 @@ Mutation text format ==================== The mutation text format must contain the columns ``site``, -``node`` and ``derived_state``. The ``parent`` and ``metadata`` columns +``node`` and ``derived_state``. The ``time``, ``parent`` and ``metadata`` columns may also be optionally present (but ``parent`` must be specified if -more than one mutation occurs at the same site). See the +more than one mutation occurs at the same site). If ``time`` is absent ``0`` will be +used to fill the column. See the :ref:`mutation table definitions ` for details on these columns. mutations:: - site node derived_state parent - 0 0 A -1 - 1 0 T -1 - 1 1 A 1 + site node derived_state time parent + 0 0 A 0 -1 + 1 0 T 0.5 -1 + 1 1 A 1 1 .. _sec_migration_text_format: diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 8611ce6d8b..2bfc5ba161 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,6 +6,14 @@ In development **Breaking changes** +- Mutations now have a double-precision floating-point ``time`` column. As this is a + mandatory column the file format version has been bumped to ``13.0``. + Pre-existing tree sequences can be upgraded using the ``tskit upgrade`` command-line + utility, which will assign times to mutations by spreading them evenly along the + edges on which they occur (using ``TableCollection.compute_mutation_times``). For + a tree sequence to be considered valid it must meet new criteria for mutation + times, see :ref:`sec_mutation_requirements`. + - The default display order for tree visualisations has been changed to ``minlex`` (see below) to stabilise the node ordering and to make trees more readily comparable. The old behaviour is still available with ``order="tree"``. @@ -16,6 +24,9 @@ In development **New features** +- Add time column to mutations, along with + ``TableCollection.compute_mutation_times``. (:user:`benjeffery`, :pr:`672`) + - Add support for trees with internal samples for the Kendall-Colijn tree distance metric. (:user:`daniel-goldstein`, :pr:`610`) diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 2e77fd653c..e201c924b9 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -287,7 +287,7 @@ make_mutation(tsk_mutation_t *mutation) if (metadata == NULL) { goto out; } - ret = Py_BuildValue("iis#iO", mutation->site, mutation->node, mutation->derived_state, + ret = Py_BuildValue("iids#iO", mutation->site, mutation->node, mutation->time, mutation->derived_state, (Py_ssize_t) mutation->derived_state_length, mutation->parent, metadata); out: @@ -1534,6 +1534,9 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea PyArrayObject *derived_state_offset_array = NULL; PyObject *node_input = NULL; PyArrayObject *node_array = NULL; + PyObject *time_input = NULL; + PyArrayObject *time_array = NULL; + double *time_data; PyObject *parent_input = NULL; PyArrayObject *parent_array = NULL; tsk_id_t *parent_data; @@ -1560,6 +1563,10 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea if (parent_input == NULL) { goto out; } + time_input = get_table_dict_value(dict, "time", true); + if (time_input == NULL) { + goto out; + } derived_state_input = get_table_dict_value(dict, "derived_state", true); if (derived_state_input == NULL) { goto out; @@ -1600,6 +1607,14 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea if (node_array == NULL) { goto out; } + time_data = NULL; + if (time_input != Py_None) { + time_array = table_read_column_array(time_input, NPY_FLOAT64, &num_rows, true); + if (time_array == NULL) { + goto out; + } + time_data = PyArray_DATA(time_array); + } parent_data = NULL; if (parent_input != Py_None) { @@ -1654,7 +1669,7 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea } err = tsk_mutation_table_append_columns(table, num_rows, PyArray_DATA(site_array), PyArray_DATA(node_array), - parent_data, PyArray_DATA(derived_state_array), + parent_data, time_data, PyArray_DATA(derived_state_array), PyArray_DATA(derived_state_offset_array), metadata_data, metadata_offset_data); if (err != 0) { @@ -1670,6 +1685,7 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea Py_XDECREF(metadata_offset_array); Py_XDECREF(node_array); Py_XDECREF(parent_array); + Py_XDECREF(time_array); return ret; } @@ -2105,6 +2121,8 @@ write_table_arrays(tsk_table_collection_t *tables, PyObject *dict) (void *) tables->mutations.site, tables->mutations.num_rows, NPY_INT32}, {"node", (void *) tables->mutations.node, tables->mutations.num_rows, NPY_INT32}, + {"time", + (void *) tables->mutations.time, tables->mutations.num_rows, NPY_FLOAT64}, {"parent", (void *) tables->mutations.parent, tables->mutations.num_rows, NPY_INT32}, {"derived_state", @@ -4876,15 +4894,16 @@ MutationTable_add_row(MutationTable *self, PyObject *args, PyObject *kwds) int site; int node; int parent = TSK_NULL; + double time = TSK_NULL; char *derived_state; Py_ssize_t derived_state_length; PyObject *py_metadata = Py_None; char *metadata = NULL; Py_ssize_t metadata_length = 0; - static char *kwlist[] = {"site", "node", "derived_state", "parent", "metadata", NULL}; + static char *kwlist[] = {"site", "node", "time", "derived_state", "parent", "metadata", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "iis#|iO", kwlist, - &site, &node, &derived_state, &derived_state_length, &parent, + if (!PyArg_ParseTupleAndKeywords(args, kwds, "iids#|iO", kwlist, + &site, &node, &time, &derived_state, &derived_state_length, &parent, &py_metadata)) { goto out; } @@ -4897,7 +4916,7 @@ MutationTable_add_row(MutationTable *self, PyObject *args, PyObject *kwds) } } err = tsk_mutation_table_add_row(self->table, (tsk_id_t) site, - (tsk_id_t) node, (tsk_id_t) parent, + (tsk_id_t) node, (tsk_id_t) parent, time, derived_state, derived_state_length, metadata, metadata_length); if (err < 0) { @@ -5114,6 +5133,21 @@ MutationTable_get_parent(MutationTable *self, void *closure) return ret; } +static PyObject * +MutationTable_get_time(MutationTable *self, void *closure) +{ + PyObject *ret = NULL; + + if (MutationTable_check_state(self) != 0) { + goto out; + } + ret = table_get_column_array(self->table->num_rows, self->table->time, + NPY_FLOAT64, sizeof(double)); +out: + return ret; +} + + static PyObject * MutationTable_get_derived_state(MutationTable *self, void *closure) { @@ -5227,6 +5261,7 @@ static PyGetSetDef MutationTable_getsetters[] = { {"site", (getter) MutationTable_get_site, NULL, "The site array"}, {"node", (getter) MutationTable_get_node, NULL, "The node array"}, {"parent", (getter) MutationTable_get_parent, NULL, "The parent array"}, + {"time", (getter) MutationTable_get_time, NULL, "The time array"}, {"derived_state", (getter) MutationTable_get_derived_state, NULL, "The derived_state array"}, {"derived_state_offset", (getter) MutationTable_get_derived_state_offset, NULL, @@ -6579,6 +6614,23 @@ TableCollection_compute_mutation_parents(TableCollection *self) return ret; } +static PyObject * +TableCollection_compute_mutation_times(TableCollection *self) +{ + int err; + PyObject *ret = NULL; + + err = tsk_table_collection_compute_mutation_times(self->tables, NULL, 0); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue(""); +out: + return ret; +} + + static PyObject * TableCollection_deduplicate_sites(TableCollection *self) { @@ -6687,7 +6739,9 @@ static PyMethodDef TableCollection_methods[] = { {"equals", (PyCFunction) TableCollection_equals, METH_VARARGS, "Returns True if the parameter table collection is equal to this one." }, {"compute_mutation_parents", (PyCFunction) TableCollection_compute_mutation_parents, - METH_NOARGS, "Computes the mutation parents for a the tables." }, + METH_NOARGS, "Computes the mutation parents for the tables." }, + {"compute_mutation_times", (PyCFunction) TableCollection_compute_mutation_times, + METH_NOARGS, "Computes the mutation times for the tables." }, {"deduplicate_sites", (PyCFunction) TableCollection_deduplicate_sites, METH_NOARGS, "Removes sites with duplicate positions." }, {"build_index", (PyCFunction) TableCollection_build_index, diff --git a/python/setup.py b/python/setup.py index 8a98f9accc..213c72a5ff 100644 --- a/python/setup.py +++ b/python/setup.py @@ -95,7 +95,14 @@ def finalize_options(self): packages=["tskit"], include_package_data=True, ext_modules=[_tskit_module], - install_requires=[numpy_ver, "h5py", "svgwrite", "jsonschema", "attrs>=19.1.0"], + install_requires=[ + "attrs>=19.1.0", + "h5py", + "jsonschema", + "kastore", + numpy_ver, + "svgwrite", + ], entry_points={"console_scripts": ["tskit=tskit.cli:tskit_main"]}, project_urls={ "Bug Reports": "https://github.com/tskit-dev/tskit/issues", diff --git a/python/tests/__init__.py b/python/tests/__init__.py index 4d1ab0a554..1eae7f5fd0 100644 --- a/python/tests/__init__.py +++ b/python/tests/__init__.py @@ -20,11 +20,113 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import base64 +import json +import sys + +import msprime +import numpy as np import tskit from . import tsutil from .simplify import * # NOQA +if msprime.__version__ <= "0.7.4": + # As mutation time is introduced into tskit but not msprime we need to patch up + # msprime to return times, this should be removed once msprime 1.0 is released. + def get_tree_sequence(self, mutation_generator=None, provenance_record=None): + if mutation_generator is not None: + mutation_generator.generate(self.ll_tables) + tables_dict = self.ll_tables.asdict() + tables_dict["mutations"]["time"] = np.full( + tables_dict["mutations"]["node"].shape, -1, np.float64 + ) + tables = tskit.TableCollection.fromdict(tables_dict) + if provenance_record is not None: + tables.provenances.add_row(provenance_record) + if self.from_ts is None: + # Add the populations with metadata + assert len(tables.populations) == len(self.population_configurations) + tables.populations.clear() + for pop_config in self.population_configurations: + tables.populations.add_row(metadata=pop_config.encoded_metadata) + tables.build_index() + tables.compute_mutation_times() + return tables.tree_sequence() + + def mutate( + tree_sequence, + rate=None, + random_seed=None, + model=None, + keep=False, + start_time=None, + end_time=None, + ): + try: + tables = tree_sequence.tables + except AttributeError: + raise ValueError("First argument must be a TreeSequence instance.") + if random_seed is None: + random_seed = msprime.simulations._get_random_seed() + random_seed = int(random_seed) + + rng = msprime.mutations._msprime.RandomGenerator(random_seed) + if model is None: + model = msprime.InfiniteSites() + try: + alphabet = model.alphabet + except AttributeError: + raise TypeError("model must be an InfiniteSites instance") + if rate is None: + rate = 0 + rate = float(rate) + keep = bool(keep) + + parameters = { + "command": "mutate", + "rate": rate, + "random_seed": random_seed, + "keep": keep, + } + + if start_time is None: + start_time = -sys.float_info.max + else: + start_time = float(start_time) + parameters["start_time"] = start_time + + if end_time is None: + end_time = sys.float_info.max + else: + end_time = float(end_time) + parameters["end_time"] = end_time + provenance_dict = msprime.provenance.get_provenance_dict(parameters) + + if start_time > end_time: + raise ValueError("start_time must be <= end_time") + + mutation_generator = msprime.mutations._msprime.MutationGenerator( + rng, rate, alphabet=alphabet, start_time=start_time, end_time=end_time + ) + lwt = msprime.mutations._msprime.LightweightTableCollection() + lwt.fromdict(tables.asdict()) + mutation_generator.generate(lwt, keep=keep) + + tables_dict = lwt.asdict() + tables_dict["mutations"]["time"] = np.full( + tables_dict["mutations"]["node"].shape, -1, np.float64 + ) + tables = tskit.TableCollection.fromdict(tables_dict) + tables.provenances.add_row(json.dumps(provenance_dict)) + + tables.build_index() + tables.compute_mutation_times() + return tables.tree_sequence() + + msprime.simulations.Simulator.get_tree_sequence = get_tree_sequence + msprime.mutations.mutate = mutate + msprime.mutate = mutate + # TODO remove this code and refactor elsewhere. @@ -232,11 +334,12 @@ def __init__(self, tree_sequence, breakpoints=None): ll_ts = self._tree_sequence._ll_tree_sequence def make_mutation(id_): - site, node, derived_state, parent, metadata = ll_ts.get_mutation(id_) + site, node, time, derived_state, parent, metadata = ll_ts.get_mutation(id_) return tskit.Mutation( id_=id_, site=site, node=node, + time=time, derived_state=derived_state, parent=parent, encoded_metadata=metadata, diff --git a/python/tests/simplify.py b/python/tests/simplify.py index 071d048db9..bfd9c387d5 100644 --- a/python/tests/simplify.py +++ b/python/tests/simplify.py @@ -353,6 +353,7 @@ def finalise_sites(self): self.tables.mutations.add_row( site=len(self.tables.sites), node=self.mutation_node_map[mut.id], + time=mut.time, parent=mapped_parent, derived_state=mut.derived_state, metadata=mut.metadata, diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 13ea65ef99..1e45cc4ded 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -31,7 +31,9 @@ from unittest import mock import h5py +import kastore import msprime +import numpy import tskit import tskit.cli as cli @@ -645,3 +647,47 @@ def test_duplicate_positions_error(self): ["upgrade", self.legacy_file_name, self.current_file_name], ) self.assertEqual(mocked_exit.call_count, 1) + + def test_12_to_13(self): + ts = msprime.simulate(10, mutation_rate=10) + tc = ts.tables + tc.metadata_schema = tskit.MetadataSchema( + { + "codec": "struct", + "type": "object", + "properties": {"top-level": {"type": "string", "binaryFormat": "50p"}}, + } + ) + tc.metadata = {"top-level": "top-level-metadata"} + for table in [ + "individuals", + "nodes", + "edges", + "migrations", + "sites", + "mutations", + "populations", + ]: + t = getattr(tc, table) + t.packset_metadata([f"{table}-{i}".encode() for i in range(t.num_rows)]) + t.metadata_schema = tskit.MetadataSchema( + { + "codec": "struct", + "type": "object", + "properties": {table: {"type": "string", "binaryFormat": "50p"}}, + } + ) + ts = tc.tree_sequence() + ts.dump(self.legacy_file_name) + with kastore.load(self.legacy_file_name) as store: + all_data = dict(store) + del all_data["mutations/time"] + all_data["format/version"] = numpy.asarray([12, 0], dtype=numpy.uint32) + kastore.dump(all_data, self.legacy_file_name) + stdout, stderr = capture_output( + cli.tskit_main, ["upgrade", self.legacy_file_name, self.current_file_name], + ) + ts2 = tskit.load(self.current_file_name) + self.assertEqual(stdout, "") + self.assertEqual(stderr, "") + self.assertEqual(ts.tables, ts2.tables) diff --git a/python/tests/test_dict_encoding.py b/python/tests/test_dict_encoding.py index e1c29d7cfa..2ec9ad147e 100644 --- a/python/tests/test_dict_encoding.py +++ b/python/tests/test_dict_encoding.py @@ -81,6 +81,7 @@ def get_example_tables(): mut_id = tables.mutations.add_row( site=mutation.site, node=mutation.node, + time=0, parent=-1, derived_state="C" * mutation.id, metadata=b"x" * mutation.id, @@ -89,6 +90,7 @@ def get_example_tables(): tables.mutations.add_row( site=mutation.site, node=mutation.node, + time=0, parent=mut_id, derived_state="G" * mutation.id, metadata=b"y" * mutation.id, @@ -536,7 +538,7 @@ def test_mutations(self): self.verify_required_columns( tables, "mutations", - ["site", "node", "derived_state", "derived_state_offset"], + ["site", "node", "time", "derived_state", "derived_state_offset"], ) self.verify_offset_pair(tables, len(tables.mutations), "mutations", "metadata") self.verify_metadata_schema(tables, "mutations") diff --git a/python/tests/test_drawing.py b/python/tests/test_drawing.py index 71aa0e4f3f..fed07c1293 100644 --- a/python/tests/test_drawing.py +++ b/python/tests/test_drawing.py @@ -74,8 +74,8 @@ def get_zero_edge_tree(self): tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row(site=0, node=0, derived_state="1") - tables.mutations.add_row(site=0, node=1, derived_state="1") + tables.mutations.add_row(site=0, node=0, time=0, derived_state="1") + tables.mutations.add_row(site=0, node=1, time=0, derived_state="1") return tables.tree_sequence().first() def get_zero_roots_tree(self): @@ -117,7 +117,9 @@ def get_mutations_over_roots_tree(self): for node in range(ts.num_nodes): site_id = tables.sites.add_row(x, ancestral_state="0") x += delta - tables.mutations.add_row(site_id, node=node, derived_state="1") + tables.mutations.add_row( + site_id, node=node, time=ts.node(node).time, derived_state="1" + ) ts = tables.tree_sequence() tree = ts.first() assert any(tree.parent(mut.node) == tskit.NULL for mut in tree.mutations()) @@ -193,8 +195,8 @@ def get_simple_ts(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 4 T -1 + site node time derived_state parent + 0 4 0.05 T -1 """ ) return tskit.load_text( diff --git a/python/tests/test_file_format.py b/python/tests/test_file_format.py index ee0c397003..b0fd722b69 100644 --- a/python/tests/test_file_format.py +++ b/python/tests/test_file_format.py @@ -39,8 +39,8 @@ import tskit.exceptions as exceptions -CURRENT_FILE_MAJOR = 12 -CURRENT_FILE_MINOR = 2 +CURRENT_FILE_MAJOR = 13 +CURRENT_FILE_MINOR = 0 test_data_dir = os.path.join(os.path.dirname(__file__), "data") @@ -69,8 +69,10 @@ def general_mutation_example(): tables = ts.dump_tables() tables.sites.add_row(position=0, ancestral_state="A", metadata=b"{}") tables.sites.add_row(position=1, ancestral_state="C", metadata=b"{'id':1}") - tables.mutations.add_row(site=0, node=0, derived_state="T") - tables.mutations.add_row(site=1, node=0, derived_state="G") + tables.mutations.add_row(site=0, node=0, time=0, derived_state="T") + tables.mutations.add_row(site=1, node=0, time=0, derived_state="G") + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -166,7 +168,11 @@ def mutation_metadata_example(): tables = ts.dump_tables() tables.sites.add_row(0, ancestral_state="a") for j in range(10): - tables.mutations.add_row(site=0, node=j, derived_state="t", metadata=b"1234") + tables.mutations.add_row( + site=0, node=j, time=-1, derived_state="t", metadata=b"1234" + ) + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -456,8 +462,6 @@ def test_unsupported_version(self): self.assertRaises(ValueError, tskit.dump_legacy, ts, self.temp_file, version=4) # Cannot read current files. ts.dump(self.temp_file) - # Catch Exception here because h5py throws different exceptions on py2 and py3 - self.assertRaises(Exception, tskit.load_legacy, self.temp_file) def test_no_version_number(self): root = h5py.File(self.temp_file, "w") @@ -515,6 +519,7 @@ def verify_keys(self, ts): "mutations/node", "mutations/parent", "mutations/site", + "mutations/time", "nodes/flags", "nodes/individual", "nodes/metadata", @@ -820,8 +825,6 @@ class TestFileFormatErrors(TestFileFormat): Tests for errors in the HDF5 format. """ - current_major_version = 12 - def verify_missing_fields(self, ts): ts.dump(self.temp_file) with kastore.load(self.temp_file) as store: diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 584cc7610a..6dfcdc2703 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -248,7 +248,7 @@ def test_many_alleles(self): for u in ts.samples(): if u != 0: self.assertEqual(var.alleles[var.genotypes[u]], alleles[0]) - tables.mutations.add_row(0, 0, allele, parent=parent) + tables.mutations.add_row(0, 0, 0, allele, parent=parent) parent += 1 num_alleles += 1 @@ -284,7 +284,7 @@ def test_many_alleles_missing_data(self): for u in samples[:-1]: if u != 0: self.assertEqual(var.alleles[var.genotypes[u]], alleles[0]) - tables.mutations.add_row(0, 0, allele, parent=parent) + tables.mutations.add_row(0, 0, 0, allele, parent=parent) parent += 1 num_alleles += 1 @@ -320,7 +320,7 @@ def test_recurrent_mutations_over_samples(self): position=j * ts.sequence_length / num_sites, ancestral_state="0" ) for u in range(ts.sample_size): - tables.mutations.add_row(site=j, node=u, derived_state="1") + tables.mutations.add_row(site=j, node=u, time=0, derived_state="1") ts = tables.tree_sequence() variants = list(ts.variants()) self.assertEqual(len(variants), num_sites) @@ -341,8 +341,12 @@ def test_recurrent_mutations_errors(self): tables.sites.clear() tables.mutations.clear() site = tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row(site=site, node=u, derived_state="1") - tables.mutations.add_row(site=site, node=sample, derived_state="1") + tables.mutations.add_row( + site=site, node=u, time=ts.node(u).time, derived_state="1" + ) + tables.mutations.add_row( + site=site, node=sample, time=0, derived_state="1" + ) ts_new = tables.tree_sequence() self.assertRaises(exceptions.LibraryError, list, ts_new.variants()) @@ -471,7 +475,7 @@ def test_mutation_over_isolated_sample_missing(self): tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) tables.sites.add_row(0.5, "A") - tables.mutations.add_row(0, 0, "T") + tables.mutations.add_row(0, 0, 0, "T") ts = tables.tree_sequence() variants = list(ts.variants()) self.assertEqual(len(variants), 1) @@ -568,6 +572,7 @@ def test_acgt_mutations(self): mutations.set_columns( site=mutations.site, node=mutations.node, + time=mutations.time, derived_state=np.zeros(ts.num_sites, dtype=np.int8) + ord("T"), derived_state_offset=np.arange(ts.num_sites + 1, dtype=np.uint32), ) @@ -605,7 +610,7 @@ def test_recurrent_mutations_over_samples(self): position=j * ts.sequence_length / num_sites, ancestral_state="0" ) for u in range(ts.sample_size): - tables.mutations.add_row(site=j, node=u, derived_state="1") + tables.mutations.add_row(site=j, node=u, time=0, derived_state="1") ts_new = tables.tree_sequence() ones = "1" * num_sites for h in ts_new.haplotypes(): @@ -619,8 +624,15 @@ def test_recurrent_mutations_errors(self): tables.sites.clear() tables.mutations.clear() site = tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row(site=site, node=u, derived_state="1") - tables.mutations.add_row(site=site, node=tree.root, derived_state="1") + tables.mutations.add_row( + site=site, node=u, time=ts.node(u).time, derived_state="1" + ) + tables.mutations.add_row( + site=site, + node=tree.root, + time=ts.node(tree.root).time, + derived_state="1", + ) ts_new = tables.tree_sequence() self.assertRaises(exceptions.LibraryError, list, ts_new.haplotypes()) ts_new.haplotypes() @@ -768,7 +780,7 @@ def test_missing_data(self): tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.sites.add_row(0.5, "0") - tables.mutations.add_row(0, 0, "1") + tables.mutations.add_row(0, 0, 0, "1") ts = tables.tree_sequence() for impute in [True, False]: diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 9f0d1059df..2cd1a2eb12 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -64,8 +64,11 @@ def insert_uniform_mutations(tables, num_mutations, nodes): site=j, derived_state="1", node=nodes[j % len(nodes)], + time=-1, metadata=json.dumps({"index": j}).encode(), ) + tables.build_index() + tables.compute_mutation_times() def get_table_collection_copy(tables, sequence_length): @@ -743,6 +746,16 @@ def test_compute_mutation_parent(self): parent = ts.tables.mutations.parent self.assertTrue(np.array_equal(parent, before)) + def test_compute_mutation_time(self): + for ts in get_example_tree_sequences(): + tables = ts.dump_tables() + before = tables.mutations.time[:] + tables.compute_mutation_times() + time = tables.mutations.time + self.assertTrue(np.array_equal(time, before)) + # Check we have valid times + tables.tree_sequence() + def verify_tracked_samples(self, ts): # Should be empty list by default. for tree in ts.trees(): @@ -1094,6 +1107,7 @@ def test_simplify_bugs(self): sites=sites, mutations=mutations, strict=False, + compute_mutation_times=True, ) samples = list(ts.samples()) self.verify_simplify_equality(ts, samples) @@ -1656,16 +1670,17 @@ def convert(v): self.assertEqual(len(output_mutations) - 1, ts.num_mutations) self.assertEqual( list(output_mutations[0].split()), - ["site", "node", "derived_state", "parent", "metadata"], + ["site", "node", "time", "derived_state", "parent", "metadata"], ) mutations = [mut for site in ts.sites() for mut in site.mutations] for mutation, line in zip(mutations, output_mutations[1:]): splits = line.split("\t") self.assertEqual(str(mutation.site), splits[0]) self.assertEqual(str(mutation.node), splits[1]) - self.assertEqual(str(mutation.derived_state), splits[2]) - self.assertEqual(str(mutation.parent), splits[3]) - self.assertEqual(tests.base64_encode(mutation.metadata), splits[4]) + self.assertEqual(str(mutation.time), splits[2]) + self.assertEqual(str(mutation.derived_state), splits[3]) + self.assertEqual(str(mutation.parent), splits[4]) + self.assertEqual(tests.base64_encode(mutation.metadata), splits[5]) def test_output_format(self): for ts in get_example_tree_sequences(): @@ -1728,6 +1743,17 @@ def verify_approximate_equality(self, ts1, ts2): self.assertEqual(s1.mutations, s2.mutations) self.assertEqual(ts1.num_sites, checked) + checked = 0 + for s1, s2 in zip(ts1.mutations(), ts2.mutations()): + checked += 1 + self.assertEqual(s1.site, s2.site) + self.assertEqual(s1.node, s2.node) + self.assertAlmostEqual(s1.time, s2.time) + self.assertEqual(s1.derived_state, s2.derived_state) + self.assertEqual(s1.parent, s2.parent) + self.assertEqual(s1.metadata, s2.metadata) + self.assertEqual(ts1.num_mutations, checked) + # Check the trees check = 0 for t1, t2 in zip(ts1.trees(), ts2.trees()): @@ -2734,6 +2760,7 @@ def get_instances(self, n): id_=j, site=j, node=j, + time=j, derived_state="A" * j, parent=j, encoded_metadata=b"x" * j, diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 7b8fd18791..7fa0e55f41 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1643,9 +1643,14 @@ def test_sites(self): self.assertEqual(index, j) self.assertEqual(metadata, b"") for mut_id in mutations: - site, node, derived_state, parent, metadata = ts.get_mutation( - mut_id - ) + ( + site, + node, + time, + derived_state, + parent, + metadata, + ) = ts.get_mutation(mut_id) self.assertEqual(site, index) self.assertEqual(mutation_id, mut_id) self.assertNotEqual(st.get_parent(node), _tskit.NULL) @@ -1932,6 +1937,7 @@ def f(mutations): position = [] node = [] site = [] + time = [] ancestral_state = [] ancestral_state_offset = [0] derived_state = [] @@ -1939,6 +1945,7 @@ def f(mutations): for j, (p, n) in enumerate(mutations): site.append(j) position.append(p) + time.append(-1) ancestral_state.append("0") ancestral_state_offset.append(ancestral_state_offset[-1] + 1) derived_state.append("1") @@ -1957,6 +1964,7 @@ def f(mutations): dict( site=site, node=node, + time=time, derived_state=derived_state, derived_state_offset=derived_state_offset, parent=None, diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 3bb5116c15..0d0cec4871 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -156,7 +156,9 @@ def test_mutations(self): pickled = pickle.dumps(metadata) tables.nodes.add_row(time=0) tables.sites.add_row(position=0.1, ancestral_state="A") - tables.mutations.add_row(site=0, node=0, derived_state="T", metadata=pickled) + tables.mutations.add_row( + site=0, node=0, time=0, derived_state="T", metadata=pickled + ) ts = tables.tree_sequence() mutation = ts.site(0).mutations[0] self.assertEqual(mutation.site, 0) diff --git a/python/tests/test_parsimony.py b/python/tests/test_parsimony.py index b085748564..7afb25127f 100644 --- a/python/tests/test_parsimony.py +++ b/python/tests/test_parsimony.py @@ -154,7 +154,10 @@ def fitch_map_mutations(tree, genotypes, alleles): state[root] = np.where(A[root] == 1)[0][0] mutations.append( tskit.Mutation( - node=root, parent=tskit.NULL, derived_state=alleles[state[root]] + node=root, + time=tree.tree_sequence.node(root).time, + parent=tskit.NULL, + derived_state=alleles[state[root]], ) ) parent = len(mutations) - 1 @@ -168,6 +171,7 @@ def fitch_map_mutations(tree, genotypes, alleles): mutations.append( tskit.Mutation( node=v, + time=tree.tree_sequence.node(v).time, parent=parent_mutation, derived_state=alleles[state[v]], ) @@ -484,6 +488,7 @@ def verify(self, ts): tables.mutations.add_row( site_id, node=mutation.node, + time=mutation.time, parent=parent, derived_state=mutation.derived_state, ) @@ -511,6 +516,7 @@ def verify(self, ts): tables2.mutations.add_row( site_id, node=mutation.node, + time=mutation.time, derived_state=mutation.derived_state, ) tables2.sort() @@ -616,6 +622,7 @@ def verify(self, ts): tables.mutations.add_row( site_id, node=m.node, + time=m.time, parent=parent, derived_state=m.derived_state, ) @@ -746,7 +753,7 @@ def test_mutation_over_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=0, parent=-1, derived_state="1") + transitions[0], tskit.Mutation(node=0, time=0, parent=-1, derived_state="1") ) def test_mutation_over_5(self): @@ -755,7 +762,8 @@ def test_mutation_over_5(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=5, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=5, time=0.14567111023387, parent=-1, derived_state="1"), ) def test_mutation_over_7(self): @@ -764,7 +772,8 @@ def test_mutation_over_7(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=7, time=0.43508024345063, parent=-1, derived_state="1"), ) def test_mutation_over_7_0(self): @@ -773,10 +782,11 @@ def test_mutation_over_7_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 2) self.assertEqual( - transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=7, time=0.43508024345063, parent=-1, derived_state="1"), ) self.assertEqual( - transitions[1], tskit.Mutation(node=0, parent=0, derived_state="2") + transitions[1], tskit.Mutation(node=0, time=0, parent=0, derived_state="2") ) def test_mutation_over_7_0_alleles(self): @@ -788,10 +798,14 @@ def test_mutation_over_7_0_alleles(self): self.assertEqual(ancestral_state, "ANC") self.assertEqual(len(transitions), 2) self.assertEqual( - transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="ONE") + transitions[0], + tskit.Mutation( + node=7, time=0.43508024345063, parent=-1, derived_state="ONE" + ), ) self.assertEqual( - transitions[1], tskit.Mutation(node=0, parent=0, derived_state="TWO") + transitions[1], + tskit.Mutation(node=0, time=0, parent=0, derived_state="TWO"), ) def test_mutation_over_7_missing_data_0(self): @@ -800,7 +814,8 @@ def test_mutation_over_7_missing_data_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=7, time=0.43508024345063, parent=-1, derived_state="1"), ) def test_mutation_over_leaf_sibling_missing(self): @@ -814,7 +829,8 @@ def test_mutation_over_leaf_sibling_missing(self): # data had the ancestral state. However, the number of *state changes* # is the same, which is what the algorithm is minimising. self.assertEqual( - transitions[0], tskit.Mutation(node=6, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=6, time=0.21385545626353, parent=-1, derived_state="1"), ) # Reverse is the same @@ -823,7 +839,8 @@ def test_mutation_over_leaf_sibling_missing(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=6, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=6, time=0.21385545626353, parent=-1, derived_state="1"), ) def test_mutation_over_6_missing_data_0(self): @@ -832,7 +849,8 @@ def test_mutation_over_6_missing_data_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=6, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=6, time=0.21385545626353, parent=-1, derived_state="1"), ) def test_mutation_over_0_missing_data_4(self): @@ -841,7 +859,7 @@ def test_mutation_over_0_missing_data_4(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=0, parent=-1, derived_state="1") + transitions[0], tskit.Mutation(node=0, time=0, parent=-1, derived_state="1") ) def test_multi_mutation_missing_data(self): @@ -850,10 +868,11 @@ def test_multi_mutation_missing_data(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 2) self.assertEqual( - transitions[0], tskit.Mutation(node=5, parent=-1, derived_state="1") + transitions[0], + tskit.Mutation(node=5, time=0.14567111023387, parent=-1, derived_state="1"), ) self.assertEqual( - transitions[1], tskit.Mutation(node=1, parent=0, derived_state="2") + transitions[1], tskit.Mutation(node=1, time=0, parent=0, derived_state="2") ) @@ -890,6 +909,7 @@ def verify(self, ts, k): tables.mutations.add_row( j, node=mutation.node, + time=mutation.time, parent=parent, derived_state=mutation.derived_state, ) diff --git a/python/tests/test_stats.py b/python/tests/test_stats.py index 3048fc997e..a0b3201c8a 100644 --- a/python/tests/test_stats.py +++ b/python/tests/test_stats.py @@ -176,7 +176,7 @@ def test_tree_sequence_regular_mutations(self): for j in range(self.num_test_sites): site_id = len(t.sites) t.sites.add_row(position=j, ancestral_state="0") - t.mutations.add_row(site=site_id, derived_state="1", node=j) + t.mutations.add_row(site=site_id, time=0, derived_state="1", node=j) ts = t.tree_sequence() self.verify_matrix(ts) self.verify_max_distance(ts) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index fb6280f822..c9ef7c0ea3 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1033,12 +1033,17 @@ def test_packset_ancestral_state(self): class TestMutationTable(unittest.TestCase, CommonTestsMixin, MetadataTestsMixin): - columns = [Int32Column("site"), Int32Column("node"), Int32Column("parent")] + columns = [ + Int32Column("site"), + Int32Column("node"), + DoubleColumn("time"), + Int32Column("parent"), + ] ragged_list_columns = [ (CharColumn("derived_state"), UInt32Column("derived_state_offset")), (CharColumn("metadata"), UInt32Column("metadata_offset")), ] - equal_len_columns = [["site", "node"]] + equal_len_columns = [["site", "node", "time"]] string_colnames = ["derived_state"] binary_colnames = ["metadata"] input_parameters = [("max_rows_increment", 1024)] @@ -1046,31 +1051,38 @@ class TestMutationTable(unittest.TestCase, CommonTestsMixin, MetadataTestsMixin) def test_simple_example(self): t = tskit.MutationTable() - t.add_row(site=0, node=1, derived_state="2", parent=3, metadata=b"4") - t.add_row(1, 2, "3", 4, b"\xf0") + t.add_row(site=0, node=1, time=2, derived_state="3", parent=4, metadata=b"5") + t.add_row(1, 2, 3, "4", 5, b"\xf0") s = str(t) self.assertGreater(len(s), 0) self.assertEqual(len(t), 2) - self.assertEqual(attr.astuple(t[0]), (0, 1, "2", 3, b"4")) - self.assertEqual(attr.astuple(t[1]), (1, 2, "3", 4, b"\xf0")) + self.assertEqual(attr.astuple(t[0]), (0, 1, 2, "3", 4, b"5")) + self.assertEqual(attr.astuple(t[1]), (1, 2, 3, "4", 5, b"\xf0")) self.assertEqual(t[0].site, 0) self.assertEqual(t[0].node, 1) - self.assertEqual(t[0].derived_state, "2") - self.assertEqual(t[0].parent, 3) - self.assertEqual(t[0].metadata, b"4") + self.assertEqual(t[0].time, 2) + self.assertEqual(t[0].derived_state, "3") + self.assertEqual(t[0].parent, 4) + self.assertEqual(t[0].metadata, b"5") self.assertEqual(t[0], t[-2]) self.assertEqual(t[1], t[-1]) self.assertRaises(IndexError, t.__getitem__, -3) def test_add_row_bad_data(self): t = tskit.MutationTable() - t.add_row(0, 0, "A") + t.add_row(0, 0, 0, "A", 0) + with self.assertRaises(TypeError): + t.add_row("0", 0, 0, "A", 0) + with self.assertRaises(TypeError): + t.add_row(0, "0", 0, "A", 0) + with self.assertRaises(TypeError): + t.add_row(0, 0, "0", "A", 0) with self.assertRaises(TypeError): - t.add_row("0", 0, "A") + t.add_row(0, 0, 0, "A", 0, parent=None) with self.assertRaises(TypeError): - t.add_row(0, 0, "A", parent=None) + t.add_row(0, 0, 0, "A", 0, metadata=[0]) with self.assertRaises(TypeError): - t.add_row(0, 0, "A", metadata=[0]) + t.add_row(0, 0, 0, "A", "0") def test_packset_derived_state(self): for num_rows in [0, 10, 100]: @@ -1254,6 +1266,7 @@ def verify_randomise_tables(self, ts): tables.mutations.add_row( site=site_id_map[m.site], node=m.node, + time=m.time, derived_state=m.derived_state, parent=m.parent, metadata=m.metadata, @@ -1463,11 +1476,11 @@ def test_sort_mutations_stability(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 1 0 1 -1 - 1 1 1 -1 - 0 1 1 -1 - 0 0 1 -1 + site node time derived_state parent + 1 0 0 1 -1 + 1 1 0 1 -1 + 0 1 0 1 -1 + 0 0 0 1 -1 """ ) ts = tskit.load_text( @@ -1508,13 +1521,13 @@ def test_sort_mutations_remap_parent_id(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 1 0 1 -1 - 1 0 0 0 - 1 0 1 1 - 0 0 1 -1 - 0 0 0 3 - 0 0 1 4 + site node time derived_state parent + 1 0 0.5 1 -1 + 1 0 0.25 0 0 + 1 0 0 1 1 + 0 0 0.5 1 -1 + 0 0 0.125 0 3 + 0 0 0 1 4 """ ) ts = tskit.load_text( @@ -1533,6 +1546,7 @@ def test_sort_mutations_remap_parent_id(self): self.assertEqual(len(mutations), 6) self.assertEqual(list(mutations.site), [0, 0, 0, 1, 1, 1]) self.assertEqual(list(mutations.node), [0, 0, 0, 0, 0, 0]) + self.assertEqual(list(mutations.time), [0.5, 0.125, 0.0, 0.5, 0.25, 0.0]) self.assertEqual(list(mutations.parent), [-1, 0, 1, -1, 3, 4]) def test_sort_mutations_bad_parent_id(self): @@ -1555,8 +1569,8 @@ def test_sort_mutations_bad_parent_id(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 1 0 1 -2 + site node time derived_state parent + 1 0 0 1 -2 """ ) self.assertRaises( @@ -1587,6 +1601,50 @@ def test_round_trip(self): ) +class TestMutationTimeErrors(unittest.TestCase): + def test_younger_than_node_below(self): + ts = msprime.simulate(5, mutation_rate=1, random_seed=42) + tables = ts.dump_tables() + tables.mutations.time = np.zeros(len(tables.mutations.time), dtype=np.float64) + with self.assertRaisesRegex( + _tskit.LibraryError, + "Mutations must have a time equal to or greater than that of their node", + ): + tables.tree_sequence() + + def test_older_than_node_above(self): + ts = msprime.simulate(5, mutation_rate=1, random_seed=42) + tables = ts.dump_tables() + tables.mutations.time = ( + np.ones(len(tables.mutations.time), dtype=np.float64) * 42 + ) + with self.assertRaisesRegex( + _tskit.LibraryError, + "Mutations must have a time less than the parent" + " node of the edge they are on", + ): + tables.tree_sequence() + + def test_older_than_parent(self): + ts = msprime.simulate( + 10, random_seed=42, mutation_rate=0.0, recombination_rate=1.0 + ) + ts = tsutil.jukes_cantor( + ts, num_sites=10, mu=1, multiple_per_node=False, seed=42 + ) + tables = ts.dump_tables() + self.assertNotEqual(sum(tables.mutations.parent != -1), 0) + as_dict = tables.mutations.asdict() + as_dict["time"][tables.mutations.parent != -1] = 64 + tables.mutations.set_columns(**as_dict) + with self.assertRaisesRegex( + _tskit.LibraryError, + "Mutations must have a time equal to or less than that" + " of their parent mutation", + ): + tables.tree_sequence() + + class TestNanDoubleValues(unittest.TestCase): """ In some tables we need to guard against NaN/infinite values in the input. @@ -1639,6 +1697,15 @@ def test_node_times(self): tables.nodes.time = bad_times self.assertRaises(_tskit.LibraryError, tables.tree_sequence) + def test_mutation_times(self): + ts = msprime.simulate(5, mutation_rate=1, random_seed=42) + tables = ts.dump_tables() + bad_times = tables.mutations.time.copy() + bad_times[-1] = np.inf + tables.mutations.time = bad_times + with self.assertRaisesRegex(_tskit.LibraryError, "Times must be finite"): + tables.tree_sequence() + def test_individual(self): ts = msprime.simulate(12, mutation_rate=1, random_seed=42) ts = tsutil.insert_random_ploidy_individuals(ts, seed=42) @@ -1779,6 +1846,7 @@ def test_bad_mutation_nodes(self): mutations.set_columns( site=mutations.site, node=node, + time=mutations.time, derived_state=mutations.derived_state, derived_state_offset=mutations.derived_state_offset, ) @@ -1795,6 +1863,7 @@ def test_bad_mutation_sites(self): mutations.set_columns( site=site, node=mutations.node, + time=mutations.time, derived_state=mutations.derived_state, derived_state_offset=mutations.derived_state_offset, ) @@ -2372,7 +2441,10 @@ def test_multichar_ancestral_state(self): tables.sites.add_row(position=site.position, ancestral_state="0") for mutation in site.mutations: tables.mutations.add_row( - site=site_id, node=mutation.node, derived_state="T" * site.id + site=site_id, + node=mutation.node, + time=mutation.time, + derived_state="T" * site.id, ) tables.deduplicate_sites() new_ts = tables.tree_sequence() @@ -2395,6 +2467,7 @@ def test_multichar_metadata(self): tables.mutations.add_row( site=site_id, node=mutation.node, + time=mutation.time, derived_state="1", metadata=b"T" * site.id, ) diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index 8ab49cbd7a..d671b3cba1 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -97,7 +97,7 @@ def simple_keep_intervals(tables, intervals, simplify=True, record_provenance=Tr ) for m in site.mutations: tables.mutations.add_row( - site_id, m.node, m.derived_state, tskit.NULL, m.metadata + site_id, m.node, m.time, m.derived_state, tskit.NULL, m.metadata ) tables.build_index() tables.compute_mutation_parents() @@ -1756,7 +1756,7 @@ def test_one_node_zero_samples_sites(self): tables = tskit.TableCollection(sequence_length=1) tables.nodes.add_row(time=0, flags=0) tables.sites.add_row(position=0.5, ancestral_state="0") - tables.mutations.add_row(site=0, derived_state="1", node=0) + tables.mutations.add_row(node=0, site=0, time=0, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.sequence_length, 1) self.assertEqual(ts.num_trees, 1) @@ -1812,7 +1812,7 @@ def test_one_node_one_sample_sites(self): tables = tskit.TableCollection(sequence_length=1) tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) tables.sites.add_row(position=0.5, ancestral_state="0") - tables.mutations.add_row(site=0, derived_state="1", node=0) + tables.mutations.add_row(site=0, time=0, derived_state="1", node=0) ts = tables.tree_sequence() self.assertEqual(ts.sequence_length, 1) self.assertEqual(ts.num_trees, 1) @@ -2230,6 +2230,7 @@ def test_simple_case(self): sites=io.StringIO(sites), mutations=io.StringIO(mutations), strict=False, + compute_mutation_times=True, ) self.assertEqual(ts.sample_size, 2) @@ -2397,11 +2398,11 @@ def test_simple_case(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 2 1 - 1 3 1 - 2 4 1 - 3 1 1 + site node time derived_state + 0 2 0 1 + 1 3 0 1 + 2 4 0 1 + 3 1 1 1 """ ) ts = tskit.load_text( @@ -2974,9 +2975,9 @@ def test_single_binary_tree_simple_mutations(self): 1 0.2 0 """ mutations_before = """\ - site node derived_state - 0 6 1 - 1 2 1 + site node time derived_state + 0 6 0 1 + 1 2 0 1 """ # We sample 0 and 2 and 6, so we get @@ -2998,8 +2999,8 @@ def test_single_binary_tree_simple_mutations(self): 0 0.1 0 """ mutations_after = """\ - site node derived_state - 0 2 1 + site node time derived_state + 0 2 0 1 """ self.verify_simplify( samples=[0, 1, 6], @@ -3158,11 +3159,11 @@ def test_simple_case(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 0 1 - 1 1 1 - 2 3 1 - 3 4 1 + site node time derived_state + 0 0 0 1 + 1 1 0 1 + 2 3 0 1 + 3 4 0 1 """ ) ts = tskit.load_text( @@ -3244,9 +3245,9 @@ def test_simplest_degenerate_case(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 0 1 - 1 1 1 + site node time derived_state + 0 0 0 1 + 1 1 0 1 """ ) ts = tskit.load_text( @@ -3306,11 +3307,11 @@ def test_simplest_non_degenerate_case(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 0 1 - 1 1 1 - 2 2 1 - 3 3 1 + site node time derived_state + 0 0 0 1 + 1 1 0 1 + 2 2 0 1 + 3 3 0 1 """ ) ts = tskit.load_text( @@ -3380,12 +3381,12 @@ def test_two_reducible_trees(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 0 1 - 1 1 1 - 2 2 1 - 3 3 1 - 4 8 1 + site node time derived_state + 0 0 0 1 + 1 1 0 1 + 2 2 0 1 + 3 3 0 1 + 4 8 0 1 """ ) ts = tskit.load_text( @@ -3505,13 +3506,13 @@ def test_mutations_over_roots(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 0 1 - 1 1 1 - 2 3 1 - 3 4 1 - 4 2 1 - 5 5 1 + site node time derived_state + 0 0 0 1 + 1 1 0 1 + 2 3 1 1 + 3 4 2 1 + 4 2 0 1 + 5 5 2 1 """ ) ts = tskit.load_text( @@ -3889,17 +3890,17 @@ def test_many_single_offspring(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 7 1 -1 - 0 10 0 0 - 0 2 1 1 - 1 0 1 -1 - 1 10 1 -1 - 2 8 1 -1 - 2 9 0 5 - 2 10 0 5 - 2 2 1 7 - 3 8 1 -1 + site node derived_state time parent + 0 7 1 3 -1 + 0 10 0 1 0 + 0 2 1 0 1 + 1 0 1 0 -1 + 1 10 1 1 -1 + 2 8 1 2 -1 + 2 9 0 1 5 + 2 10 0 1 5 + 2 2 1 0 7 + 3 8 1 2 -1 """ ) ts = tskit.load_text(nodes, edges, sites, mutations, strict=False) @@ -4816,10 +4817,10 @@ def test_small_tree_mutations(self): tables.sites.add_row(position=0.5, ancestral_state="0") tables.sites.add_row(position=0.75, ancestral_state="0") tables.sites.add_row(position=0.8, ancestral_state="0") - tables.mutations.add_row(site=0, node=0, derived_state="1") - tables.mutations.add_row(site=1, node=2, derived_state="1") - tables.mutations.add_row(site=2, node=7, derived_state="1") - tables.mutations.add_row(site=3, node=0, derived_state="1") + tables.mutations.add_row(site=0, node=0, time=0, derived_state="1") + tables.mutations.add_row(site=1, node=2, time=0, derived_state="1") + tables.mutations.add_row(site=2, node=7, time=0.5, derived_state="1") + tables.mutations.add_row(site=3, node=0, time=0, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 4) self.assertEqual(ts.num_mutations, 4) @@ -4859,9 +4860,9 @@ def test_small_tree_fixed_sites(self): tables.sites.add_row(position=0.25, ancestral_state="0") tables.sites.add_row(position=0.5, ancestral_state="0") tables.sites.add_row(position=0.75, ancestral_state="0") - tables.mutations.add_row(site=0, node=2, derived_state="1") - tables.mutations.add_row(site=1, node=3, derived_state="1") - tables.mutations.add_row(site=2, node=6, derived_state="1") + tables.mutations.add_row(site=0, node=2, time=0, derived_state="1") + tables.mutations.add_row(site=1, node=3, time=0, derived_state="1") + tables.mutations.add_row(site=2, node=6, time=0.22, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 3) self.assertEqual(ts.num_mutations, 3) @@ -4879,7 +4880,7 @@ def test_small_tree_mutations_over_root(self): ) tables = ts.dump_tables() tables.sites.add_row(position=0.25, ancestral_state="0") - tables.mutations.add_row(site=0, node=8, derived_state="1") + tables.mutations.add_row(site=0, node=8, time=1.65, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 1) self.assertEqual(ts.num_mutations, 1) @@ -4899,8 +4900,8 @@ def test_small_tree_recurrent_mutations(self): tables = ts.dump_tables() # Add recurrent mutation on the root branches tables.sites.add_row(position=0.25, ancestral_state="0") - tables.mutations.add_row(site=0, node=6, derived_state="1") - tables.mutations.add_row(site=0, node=7, derived_state="1") + tables.mutations.add_row(site=0, node=6, time=0.22, derived_state="1") + tables.mutations.add_row(site=0, node=7, time=0.44, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 1) self.assertEqual(ts.num_mutations, 2) @@ -4920,9 +4921,9 @@ def test_small_tree_back_mutations(self): tables = ts.dump_tables() # Add a chain of mutations tables.sites.add_row(position=0.25, ancestral_state="0") - tables.mutations.add_row(site=0, node=7, derived_state="1") - tables.mutations.add_row(site=0, node=5, derived_state="0") - tables.mutations.add_row(site=0, node=1, derived_state="1") + tables.mutations.add_row(site=0, node=7, time=0.44, derived_state="1") + tables.mutations.add_row(site=0, node=5, time=0.15, derived_state="0") + tables.mutations.add_row(site=0, node=1, time=0, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 1) self.assertEqual(ts.num_mutations, 3) @@ -5110,9 +5111,9 @@ def test_many_mutations_over_single_sample_ancestral_state(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 0 1 -1 - 0 0 0 0 + site node time derived_state parent + 0 0 0 1 -1 + 0 0 0 0 0 """ ) ts = tskit.load_text( @@ -5150,10 +5151,10 @@ def test_many_mutations_over_single_sample_derived_state(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 0 1 -1 - 0 0 0 0 - 0 0 1 1 + site node time derived_state parent + 0 0 0 1 -1 + 0 0 0 0 0 + 0 0 0 1 1 """ ) ts = tskit.load_text( @@ -5797,7 +5798,12 @@ def test_example(self): """ ) ts = tskit.load_text( - nodes=nodes, edges=edges, sites=sites, mutations=mutations, strict=False + nodes=nodes, + edges=edges, + sites=sites, + mutations=mutations, + strict=False, + compute_mutation_times=True, ) self.verify_parents(ts) @@ -5858,6 +5864,151 @@ def test_many_multiroot_trees_recurrent_mutations(self): self.verify_branch_mutations(ts, mutations_per_branch) +class TestMutationTime(unittest.TestCase): + """ + Tests that mutation time is correctly specified, and that we correctly + recompute it with compute_mutation_times. + """ + + seed = 42 + + def verify_times(self, ts): + tables = ts.tables + # Clear out the existing mutations as they come from msprime + tables.mutations.time = np.full( + tables.mutations.time.shape, -1, dtype=np.float64 + ) + self.assertTrue(np.all(tables.mutations.time == -1)) + # Compute times with C method and dumb python method + tables.compute_mutation_times() + python_time = tsutil.compute_mutation_times(ts) + self.assertTrue( + np.allclose(python_time, tables.mutations.time, rtol=1e-15, atol=1e-15) + ) + + def test_example(self): + nodes = io.StringIO( + """\ + id is_sample time + 0 0 2.0 + 1 0 1.0 + 2 0 1.0 + 3 1 0 + 4 1 0 + """ + ) + edges = io.StringIO( + """\ + left right parent child + 0.0 0.5 2 3 + 0.0 0.8 2 4 + 0.5 1.0 1 3 + 0.0 1.0 0 1 + 0.0 1.0 0 2 + 0.8 1.0 0 4 + """ + ) + sites = io.StringIO( + """\ + position ancestral_state + 0.1 0 + 0.5 0 + 0.9 0 + """ + ) + mutations = io.StringIO( + """\ + site node time derived_state parent + 0 1 1.5 1 -1 + 0 2 1.5 1 -1 + 0 3 0.5 2 1 + 1 0 2.0 1 -1 + 1 1 1.5 1 3 + 1 3 0.5 2 4 + 1 2 1.5 1 3 + 1 4 0.5 2 6 + 2 0 2.0 1 -1 + 2 1 1.5 1 8 + 2 2 1.5 1 8 + 2 4 1.0 1 8 + """ + ) + ts = tskit.load_text( + nodes=nodes, edges=edges, sites=sites, mutations=mutations, strict=False, + ) + # ts.dump_text(mutations=sys.stdout) + # self.assertFalse(True) + tables = ts.tables + python_time = tsutil.compute_mutation_times(ts) + self.assertTrue( + np.allclose(python_time, tables.mutations.time, rtol=1e-15, atol=1e-15) + ) + tables.mutations.time = np.full( + tables.mutations.time.shape, -1, dtype=np.float64 + ) + self.assertTrue(np.all(tables.mutations.time == -1)) + tables.compute_mutation_times() + self.assertTrue( + np.allclose(python_time, tables.mutations.time, rtol=1e-15, atol=1e-15) + ) + + def test_single_muts(self): + ts = msprime.simulate( + 10, random_seed=self.seed, mutation_rate=3.0, recombination_rate=1.0 + ) + self.verify_times(ts) + + def test_with_jukes_cantor(self): + ts = msprime.simulate( + 10, random_seed=self.seed, mutation_rate=0.0, recombination_rate=1.0 + ) + # make *lots* of recurrent mutations + mut_ts = tsutil.jukes_cantor( + ts, num_sites=10, mu=1, multiple_per_node=False, seed=self.seed + ) + self.verify_times(mut_ts) + + def test_with_jukes_cantor_multiple_per_node(self): + ts = msprime.simulate( + 10, random_seed=self.seed, mutation_rate=0.0, recombination_rate=1.0 + ) + # make *lots* of recurrent mutations + mut_ts = tsutil.jukes_cantor( + ts, num_sites=10, mu=1, multiple_per_node=True, seed=self.seed + ) + self.verify_times(mut_ts) + + def verify_branch_mutations(self, ts, mutations_per_branch): + ts = tsutil.insert_branch_mutations(ts, mutations_per_branch) + self.assertGreater(ts.num_mutations, 1) + self.verify_times(ts) + + def test_single_tree_one_mutation_per_branch(self): + ts = msprime.simulate(6, random_seed=10) + self.verify_branch_mutations(ts, 1) + + def test_single_tree_two_mutations_per_branch(self): + ts = msprime.simulate(10, random_seed=9) + self.verify_branch_mutations(ts, 2) + + def test_single_tree_three_mutations_per_branch(self): + ts = msprime.simulate(8, random_seed=9) + self.verify_branch_mutations(ts, 3) + + def test_single_multiroot_tree_recurrent_mutations(self): + ts = msprime.simulate(6, random_seed=10) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + for mutations_per_branch in [1, 2, 3]: + self.verify_branch_mutations(ts, mutations_per_branch) + + def test_many_multiroot_trees_recurrent_mutations(self): + ts = msprime.simulate(7, recombination_rate=1, random_seed=10) + self.assertGreater(ts.num_trees, 3) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + for mutations_per_branch in [1, 2, 3]: + self.verify_branch_mutations(ts, mutations_per_branch) + + class TestSimpleTreeAlgorithm(unittest.TestCase): """ Tests for the direct implementation of Algorithm T in tsutil.py. @@ -6669,9 +6820,13 @@ def ts_with_4_sites(self): ts = msprime.simulate(8, random_seed=3) tables = ts.dump_tables() tables.sites.set_columns(np.arange(0, 1, 0.25), *tskit.pack_strings(["G"] * 4)) - tables.mutations.add_row(site=1, node=ts.first().parent(0), derived_state="C") - tables.mutations.add_row(site=1, node=0, derived_state="T", parent=0) - tables.mutations.add_row(site=2, node=1, derived_state="A") + tables.mutations.add_row( + site=1, node=ts.first().parent(0), time=-1, derived_state="C" + ) + tables.mutations.add_row(site=1, node=0, time=-1, derived_state="T", parent=0) + tables.mutations.add_row(site=2, node=1, time=-1, derived_state="A") + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() def test_remove_by_index(self): @@ -6718,7 +6873,7 @@ def verify_removal(self, ts, remove_sites): tables = ts.dump_tables() tables.delete_sites(remove_sites) - # Make sure we've computed the mutation parents properly. + # Make sure we've computed the mutation parents and times properly. mutation_parent = tables.mutations.parent tables.compute_mutation_parents() self.assertTrue(np.array_equal(mutation_parent, tables.mutations.parent)) @@ -6734,6 +6889,7 @@ def verify_removal(self, ts, remove_sites): self.assertEqual(len(s1.mutations), len(s2.mutations)) for m1, m2 in zip(s1.mutations, s2.mutations): self.assertEqual(m1.node, m2.node) + self.assertEqual(m1.time, m2.time) self.assertEqual(m1.derived_state, m2.derived_state) self.assertEqual(m1.metadata, m2.metadata) @@ -6755,7 +6911,9 @@ def test_simple_mixed_length_states(self): tables = ts.dump_tables() for j in range(10): tables.sites.add_row(j, "X" * j) - tables.mutations.add_row(site=j, node=j, derived_state="X" * (j + 1)) + tables.mutations.add_row( + site=j, node=j, time=0, derived_state="X" * (j + 1) + ) ts = tables.tree_sequence() self.verify_removal(ts, [9]) @@ -7109,10 +7267,11 @@ def add_mutations(self, ts, position, ancestral_state, derived_states, nodes): tables = ts.dump_tables() site = tables.sites.add_row(position, ancestral_state) for state, node in zip(derived_states, nodes): - tables.mutations.add_row(site, node, state) + tables.mutations.add_row(site, node, -1, state) tables.sort() tables.build_index() tables.compute_mutation_parents() + tables.compute_mutation_times() return tables.tree_sequence() def verify_sites(self, source_tree, trimmed_tree, position_offset): diff --git a/python/tests/test_tree_stats.py b/python/tests/test_tree_stats.py index 27f66da41c..7f77226cb6 100644 --- a/python/tests/test_tree_stats.py +++ b/python/tests/test_tree_stats.py @@ -704,8 +704,8 @@ def test_ghost_allele(self): tables.nodes.add_row(flags=1, time=0) tables.nodes.add_row(flags=1, time=0) # The ghost mutation that's never seen in the genotypes - tables.mutations.add_row(site=0, node=0, derived_state="T") - tables.mutations.add_row(site=0, node=0, derived_state="G", parent=0) + tables.mutations.add_row(site=0, node=0, time=0, derived_state="T") + tables.mutations.add_row(site=0, node=0, time=0, derived_state="G", parent=0) ts = tables.tree_sequence() self.verify(ts) @@ -720,9 +720,9 @@ def test_ghost_allele_all_ancestral(self): tables.edges.add_row(0, 1, 2, 0) tables.edges.add_row(0, 1, 2, 1) tables.sites.add_row(position=0.5, ancestral_state="A") - tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=0, node=0, time=0, derived_state="T") # Mutate back to the ancestral state so that all genotypes are zero - tables.mutations.add_row(site=0, node=0, derived_state="A", parent=0) + tables.mutations.add_row(site=0, node=0, time=0, derived_state="A", parent=0) ts = tables.tree_sequence() self.verify(ts) @@ -750,8 +750,10 @@ def test_non_sample_ancestry(self): # leaf so that samples are fixed at zero. tables.sites.add_row(position=0.25, ancestral_state="0") tables.sites.add_row(position=0.5, ancestral_state="0") - tables.mutations.add_row(site=0, node=4, derived_state="1") - tables.mutations.add_row(site=1, node=6, derived_state="1") + tables.mutations.add_row(site=0, node=4, time=-1, derived_state="1") + tables.mutations.add_row(site=1, node=6, time=-1, derived_state="1") + tables.build_index() + tables.compute_mutation_times() ts = tables.tree_sequence() self.verify(ts) @@ -4893,17 +4895,17 @@ def test_case_1(self): ) mutations = io.StringIO( """\ - site node derived_state - 0 4 1 - 1 0 1 - 2 2 1 - 3 0 1 - 4 1 1 - 5 1 1 - 6 2 1 - 7 0 1 - 8 1 1 - 9 2 1 + site node time derived_state + 0 4 0.5 1 + 1 0 0 1 + 2 2 0 1 + 3 0 0 1 + 4 1 0 1 + 5 1 0 1 + 6 2 0 1 + 7 0 0 1 + 8 1 0 1 + 9 2 0 1 """ ) ts = tskit.load_text( @@ -5101,9 +5103,9 @@ def test_case_odds_and_ends(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 0 1 -1 - 0 1 2 -1 + site node time derived_state parent + 0 0 0 1 -1 + 0 1 0 2 -1 """ ) ts = tskit.load_text( @@ -5283,16 +5285,16 @@ def test_case_recurrent_muts(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 0 1 -1 - 0 0 2 0 - 0 0 0 1 - 0 1 2 -1 - 1 1 1 -1 - 1 2 1 -1 - 2 4 1 -1 - 2 1 2 6 - 2 2 3 6 + site node time derived_state parent + 0 0 0 1 -1 + 0 0 0 2 0 + 0 0 0 0 1 + 0 1 0 2 -1 + 1 1 0 1 -1 + 1 2 0 1 -1 + 2 4 0.5 1 -1 + 2 1 0 2 6 + 2 2 0 3 6 """ ) ts = tskit.load_text( @@ -5404,14 +5406,14 @@ def test_case_2(self): ) mutations = io.StringIO( """\ - site node derived_state parent - 0 0 1 -1 - 0 10 1 -1 - 0 0 0 0 - 1 8 1 -1 - 1 2 1 -1 - 2 8 1 -1 - 2 9 0 5 + site node time derived_state parent + 0 0 0 1 -1 + 0 10 1 1 -1 + 0 0 0 0 0 + 1 8 2 1 -1 + 1 2 0 1 -1 + 2 8 2 1 -1 + 2 9 1 0 5 """ ) ts = tskit.load_text( diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index 7872c55533..4a31b60ab4 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -514,11 +514,11 @@ def test_many_alleles(self): tables.sites.add_row(0.5, "0") # 9 alleles should be fine for j in range(8): - tables.mutations.add_row(0, node=j, derived_state=str(j + 1)) + tables.mutations.add_row(0, node=j, time=0, derived_state=str(j + 1)) ts = tables.tree_sequence() ts.write_vcf(io.StringIO()) for j in range(9, 15): - tables.mutations.add_row(0, node=j, derived_state=str(j)) + tables.mutations.add_row(0, node=j, time=0, derived_state=str(j)) ts = tables.tree_sequence() with self.assertRaises(ValueError): ts.write_vcf(io.StringIO()) diff --git a/python/tests/test_wright_fisher.py b/python/tests/test_wright_fisher.py index 540c01909d..0844c12c0a 100644 --- a/python/tests/test_wright_fisher.py +++ b/python/tests/test_wright_fisher.py @@ -371,7 +371,7 @@ def verify_simplify(self, ts, new_ts, samples, node_map): """ Check that trees in `ts` match `new_ts` using the specified node_map. Modified from `verify_simplify_topology`. Also check that the `parent` - column in the MutationTable is correct. + and `time` column in the MutationTable is correct. """ # check trees agree at these points locs = [random.random() for _ in range(20)] @@ -405,6 +405,10 @@ def verify_simplify(self, ts, new_ts, samples, node_map): self.assertEqual(node_map[mrca1], mrca2) mut_parent = tsutil.compute_mutation_parent(ts=ts) self.assertArrayEqual(mut_parent, ts.tables.mutations.parent) + mut_time = tsutil.compute_mutation_times(ts=ts) + self.assertTrue( + np.allclose(mut_time, ts.tables.mutations.time, rtol=1e-15, atol=1e-15) + ) def verify_haplotypes(self, ts, samples): """ diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index aeb5b9f743..08448147a2 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -23,6 +23,7 @@ """ A collection of utilities to edit and construct tree sequences. """ +import collections import json import random import string @@ -74,6 +75,7 @@ def subsample_sites(ts, num_sites): site=site_id, derived_state=mutation.derived_state, node=mutation.node, + time=mutation.time, parent=mutation.parent, ) add_provenance(t.provenances, "subsample_sites") @@ -124,11 +126,14 @@ def insert_branch_mutations(ts, mutations_per_branch=1): mutation[u] = tables.mutations.add_row( site=site, node=u, + time=-1, derived_state=str(state[u]), parent=parent, ) parent = mutation[u] add_provenance(tables.provenances, "insert_branch_mutations") + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -147,9 +152,11 @@ def insert_branch_sites(ts): for u in tree.nodes(): if tree.parent(u) != tskit.NULL: site = tables.sites.add_row(position=x, ancestral_state="0") - tables.mutations.add_row(site=site, node=u, derived_state="1") + tables.mutations.add_row(site=site, node=u, time=-1, derived_state="1") x += delta add_provenance(tables.provenances, "insert_branch_sites") + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -174,8 +181,12 @@ def insert_multichar_mutations(ts, seed=1, max_len=10): derived_state = ancestral_state while ancestral_state == derived_state: derived_state = rng.choice(letters) * rng.randint(0, max_len) - tables.mutations.add_row(site=site, node=u, derived_state=derived_state) + tables.mutations.add_row( + site=site, node=u, time=tables.nodes[u].time, derived_state=derived_state + ) add_provenance(tables.provenances, "insert_multichar_mutations") + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -260,6 +271,7 @@ def permute_nodes(ts, node_map): for mutation in site.mutations: tables.mutations.add_row( site=site.id, + time=mutation.time, derived_state=mutation.derived_state, node=node_map[mutation.node], metadata=mutation.metadata, @@ -288,12 +300,23 @@ def insert_redundant_breakpoints(ts): def single_childify(ts): """ Builds a new equivalent tree sequence which contains an extra node in the - middle of all exising branches. + middle of all existing branches. """ tables = ts.dump_tables() + mutations_above_node = collections.defaultdict(list) + for mut in tables.mutations: + mutations_above_node[mut.node].append(mut) + + mutations_on_edge = collections.defaultdict(list) + for edge_idx, edge in enumerate(tables.edges): + for mut in mutations_above_node[edge.child]: + if edge.left <= tables.sites[mut.site].position < edge.right: + mutations_on_edge[edge_idx].append(mut) + time = tables.nodes.time[:] tables.edges.reset() + tables.mutations.reset() for edge in ts.edges(): # Insert a new node in between the parent and child. t = time[edge.child] + (time[edge.parent] - time[edge.child]) / 2 @@ -304,6 +327,20 @@ def single_childify(ts): tables.edges.add_row( left=edge.left, right=edge.right, parent=edge.parent, child=u ) + for mut in mutations_on_edge[edge.id]: + if mut.time < t: + tables.mutations.add_row( + mut.site, + mut.node, + mut.time, + mut.derived_state, + mut.parent, + mut.metadata, + ) + else: + tables.mutations.add_row( + mut.site, u, mut.time, mut.derived_state, mut.parent, mut.metadata + ) tables.sort() add_provenance(tables.provenances, "insert_redundant_breakpoints") return tables.tree_sequence() @@ -351,6 +388,7 @@ def add_random_metadata(ts, seed=1, max_length=10): mutations.set_columns( site=mutations.site, node=mutations.node, + time=mutations.time, parent=mutations.parent, derived_state=mutations.derived_state, derived_state_offset=mutations.derived_state_offset, @@ -424,13 +462,13 @@ def generate_site_mutations( while x < branch_length: new_state = random.choice([s for s in states if s != state]) if multiple_per_node and (state != new_state): - mutation_table.add_row(site, u, new_state, parent) + mutation_table.add_row(site, u, -1, new_state, parent) parent = mutation_table.num_rows - 1 state = new_state x += random.expovariate(mu) else: if (not multiple_per_node) and (state != new_state): - mutation_table.add_row(site, u, new_state, parent) + mutation_table.add_row(site, u, -1, new_state, parent) parent = mutation_table.num_rows - 1 state = new_state stack.extend(reversed([(v, state, parent) for v in tree.children(u)])) @@ -462,6 +500,8 @@ def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None): multiple_per_node=multiple_per_node, ) add_provenance(tables.provenances, "jukes_cantor") + tables.build_index() + tables.compute_mutation_times() new_ts = tables.tree_sequence() return new_ts @@ -490,12 +530,15 @@ def caterpillar_tree(n, num_sites=0, num_mutations=1): state = 0 for _ in range(num_mutations): state = (state + 1) % 2 - tables.mutations.add_row(site=j, derived_state=str(state), node=node) + tables.mutations.add_row( + site=j, time=-1, derived_state=str(state), node=node + ) node -= 1 tables.sort() tables.build_index() tables.compute_mutation_parents() + tables.compute_mutation_times() return tables.tree_sequence() @@ -609,6 +652,48 @@ def py_subset(tables, nodes, record_provenance=True): mutation_map[i] = new_mut +def compute_mutation_times(ts): + """ + Compute the `time` column of a MutationTable in a TableCollection. + Finds the set of mutations on an edge that share a site and spreads + the times evenly over the edge. + + :param TreeSequence ts: The tree sequence to compute for. Need not + have a valid mutation time column. + """ + tables = ts.dump_tables() + mutations = tables.mutations + + mutations_above_node = collections.defaultdict(list) + for mut_idx, mut in enumerate(mutations): + mutations_above_node[mut.node].append((mut_idx, mut)) + + mutations_at_site_on_edge = collections.defaultdict(list) + for edge_idx, edge in enumerate(tables.edges): + for mut_idx, mut in mutations_above_node[edge.child]: + if edge.left <= tables.sites[mut.site].position < edge.right: + mutations_at_site_on_edge[(mut.site, edge_idx)].append(mut_idx) + + edges = tables.edges + nodes = tables.nodes + times = np.full(len(mutations), -1, dtype=np.float64) + for (_, edge_idx), edge_mutations in mutations_at_site_on_edge.items(): + start_time = nodes[edges[edge_idx].child].time + end_time = nodes[edges[edge_idx].parent].time + duration = end_time - start_time + for i, mut_idx in enumerate(edge_mutations): + times[mut_idx] = end_time - ( + duration * ((i + 1) / (len(edge_mutations) + 1)) + ) + + # Mutations not on a edge (i.e. above a root) get given their node's time + for i in range(len(mutations)): + if times[i] == -1: + times[i] = nodes[mutations[i].node].time + + return times + + def algorithm_T(ts): """ Simple implementation of algorithm T from the PLOS paper, taking into diff --git a/python/tskit/formats.py b/python/tskit/formats.py index 12e31ddfb8..6d22a794b5 100644 --- a/python/tskit/formats.py +++ b/python/tskit/formats.py @@ -24,11 +24,14 @@ Module responsible for converting tree sequence files from older formats. """ +import collections import datetime import json import logging import h5py +import kastore +import numpy import numpy as np import tskit @@ -97,6 +100,7 @@ def _convert_hdf5_mutations( ) mutations.set_columns( node=node, + time=np.full(num_mutations, -1, dtype=np.float64), site=np.arange(num_mutations, dtype=np.int32), derived_state=ord("1") * np.ones(num_mutations, dtype=np.int8), derived_state_offset=np.arange(num_mutations + 1, dtype=np.uint32), @@ -168,6 +172,8 @@ def _load_legacy_hdf5_v2(root, remove_duplicate_positions): ) tables.provenances.add_row(_get_upgrade_provenance(root)) tables.sort() + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -222,6 +228,8 @@ def _load_legacy_hdf5_v3(root, remove_duplicate_positions): tables.provenances.add_row(timestamp=old_timestamp, record=record) tables.provenances.add_row(_get_upgrade_provenance(root)) tables.sort() + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() @@ -239,16 +247,44 @@ def load_legacy(filename, remove_duplicate_positions=False): 3: _load_legacy_hdf5_v3, 10: _load_legacy_hdf5_v10, } - root = h5py.File(filename, "r") - if "format_version" not in root.attrs: - raise ValueError("HDF5 file not in msprime format") - format_version = root.attrs["format_version"] - if format_version[0] not in loaders: - raise ValueError(f"Version {format_version} not supported for loading") try: - ts = loaders[format_version[0]](root, remove_duplicate_positions) - finally: - root.close() + root = h5py.File(filename, "r") + if "format_version" not in root.attrs: + raise ValueError("HDF5 file not in msprime format") + format_version = root.attrs["format_version"] + if format_version[0] not in loaders: + raise ValueError(f"Version {format_version} not supported for loading") + try: + ts = loaders[format_version[0]](root, remove_duplicate_positions) + finally: + root.close() + except OSError as e: + # An exception like this means we have a kastore + if "file signature not found" not in str(e): + raise e + else: + with kastore.load(filename) as store: + data = dict(store) + # v12 and below have no mutation time + if "mutations/time" not in data: + data["mutations/time"] = np.full( + data["mutations/node"].shape, -1, dtype=np.float64 + ) + table_dict = collections.defaultdict(collections.defaultdict) + for key, value in data.items(): + if "/" in key: + table, col = key.split("/") + # kastore type for string columns doesn't match what `fromdict wants` + if value.dtype == np.uint8: + table_dict[table][col] = value.astype(np.int8) + else: + table_dict[table][col] = value + else: + table_dict[key] = value + tables = tskit.TableCollection.fromdict(table_dict) + tables.build_index() + tables.compute_mutation_times() + ts = tables.tree_sequence() return ts @@ -541,6 +577,7 @@ def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False): tables.mutations.set_columns( site=mutations_group["site"], node=mutations_group["node"], + time=np.full(mutations_group["node"].shape, -1, dtype=np.float64), parent=mutations_group["parent"], derived_state=mutations_group["derived_state"], derived_state_offset=mutations_group["derived_state_offset"], @@ -562,6 +599,8 @@ def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False): ) tables.provenances.add_row(_get_upgrade_provenance(root)) _set_populations(tables) + tables.build_index() + tables.compute_mutation_times() return tables.tree_sequence() diff --git a/python/tskit/tables.py b/python/tskit/tables.py index ece9d88478..f922937176 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -105,6 +105,7 @@ class SiteTableRow: class MutationTableRow: site: int node: int + time: float derived_state: str parent: int metadata: bytes @@ -1351,6 +1352,8 @@ class MutationTable(BaseTable, MetadataMixin): :vartype site: numpy.ndarray, dtype=np.int32 :ivar node: The array of node IDs. :vartype node: numpy.ndarray, dtype=np.int32 + :ivar time: The array of time values. + :vartype time: numpy.ndarray, dtype=np.float64 :ivar derived_state: The flattened array of derived state strings. See :ref:`sec_tables_api_text_columns` for more details. :vartype derived_state: numpy.ndarray, dtype=np.int8 @@ -1372,6 +1375,7 @@ class MutationTable(BaseTable, MetadataMixin): column_names = [ "site", "node", + "time", "derived_state", "derived_state_offset", "parent", @@ -1388,22 +1392,23 @@ def _text_header_and_rows(self): site = self.site node = self.node parent = self.parent + time = self.time derived_state = util.unpack_strings( self.derived_state, self.derived_state_offset ) metadata = util.unpack_bytes(self.metadata, self.metadata_offset) - headers = ("id", "site", "node", "derived_state", "parent", "metadata") + headers = ("id", "site", "node", "time", "derived_state", "parent", "metadata") rows = [] for j in range(self.num_rows): md = base64.b64encode(metadata[j]).decode("utf8") rows.append( - "{}\t{}\t{}\t{}\t{}\t{}".format( - j, site[j], node[j], derived_state[j], parent[j], md + "{}\t{}\t{}\t{}\t{}\t{}\t{}".format( + j, site[j], node[j], time[j], derived_state[j], parent[j], md ).split("\t") ) return headers, rows - def add_row(self, site, node, derived_state, parent=-1, metadata=None): + def add_row(self, site, node, time, derived_state, parent=-1, metadata=None): """ Adds a new row to this :class:`MutationTable` and returns the ID of the corresponding mutation. Metadata, if specified, will be validated and encoded @@ -1412,6 +1417,7 @@ def add_row(self, site, node, derived_state, parent=-1, metadata=None): :param int site: The ID of the site that this mutation occurs at. :param int node: The ID of the first node inheriting this mutation. + :param float time: The occurence time for the new mutation. :param str derived_state: The state of the site at this mutation's node. :param int parent: The ID of the parent mutation. If not specified, defaults to :attr:`NULL`. @@ -1420,12 +1426,13 @@ def add_row(self, site, node, derived_state, parent=-1, metadata=None): :rtype: int """ metadata = self.metadata_schema.validate_and_encode_row(metadata) - return self.ll_table.add_row(site, node, derived_state, parent, metadata) + return self.ll_table.add_row(site, node, time, derived_state, parent, metadata) def set_columns( self, site=None, node=None, + time=None, derived_state=None, derived_state_offset=None, parent=None, @@ -1437,9 +1444,9 @@ def set_columns( Sets the values for each column in this :class:`MutationTable` using the values in the specified arrays. Overwrites any data currently stored in the table. - The ``site``, ``node``, ``derived_state`` and ``derived_state_offset`` + The ``site``, ``node``, ``time``, ``derived_state`` and ``derived_state_offset`` parameters are mandatory, and must be 1D numpy arrays. The - ``site`` and ``node`` (also ``parent``, if supplied) arrays + ``site``, ``node`` and ``time`` (also ``parent``, if supplied) arrays must be of equal length, and determine the number of rows in the table. The ``derived_state`` and ``derived_state_offset`` parameters must be supplied together, and meet the requirements for @@ -1455,6 +1462,8 @@ def set_columns( :type site: numpy.ndarray, dtype=np.int32 :param node: The ID of the node each mutation is associated with. :type node: numpy.ndarray, dtype=np.int32 + :param time: The time values for each mutation. Required. + :type time: numpy.ndarray, dtype=np.float64 :param derived_state: The flattened derived_state array. Required. :type derived_state: numpy.ndarray, dtype=np.int8 :param derived_state_offset: The offsets into the ``derived_state`` array. @@ -1480,6 +1489,7 @@ def set_columns( site=site, node=node, parent=parent, + time=time, derived_state=derived_state, derived_state_offset=derived_state_offset, metadata=metadata, @@ -1492,6 +1502,7 @@ def append_columns( self, site, node, + time, derived_state, derived_state_offset, parent=None, @@ -1502,9 +1513,9 @@ def append_columns( Appends the specified arrays to the end of the columns of this :class:`MutationTable`. This allows many new rows to be added at once. - The ``site``, ``node``, ``derived_state`` and ``derived_state_offset`` + The ``site``, ``node``, ``time``, ``derived_state`` and ``derived_state_offset`` parameters are mandatory, and must be 1D numpy arrays. The - ``site`` and ``node`` (also ``parent``, if supplied) arrays + ``site``, ``node`` and ``time`` (also ``parent``, if supplied) arrays must be of equal length, and determine the number of additional rows to add to the table. The ``derived_state`` and ``derived_state_offset`` parameters must @@ -1521,6 +1532,8 @@ def append_columns( :type site: numpy.ndarray, dtype=np.int32 :param node: The ID of the node each mutation is associated with. :type node: numpy.ndarray, dtype=np.int32 + :param time: The time values for each mutation. Required. + :type time: numpy.ndarray, dtype=np.float64 :param derived_state: The flattened derived_state array. Required. :type derived_state: numpy.ndarray, dtype=np.int8 :param derived_state_offset: The offsets into the ``derived_state`` array. @@ -1538,6 +1551,7 @@ def append_columns( dict( site=site, node=node, + time=time, parent=parent, derived_state=derived_state, derived_state_offset=derived_state_offset, @@ -2282,6 +2296,23 @@ def compute_mutation_parents(self): self.ll_tables.compute_mutation_parents() # TODO add provenance + def compute_mutation_times(self): + """ + Modifies the tables in place, computing valid values for the ``time`` column of + the mutation table. For this to work, the node and edge tables must be + valid, and the site and mutation tables must be sorted and indexed(see + :meth:`TableCollection.sort` and :meth:`TableCollection.build_index`). + + For a single mutation on an edge at a site, the ``time`` assigned to a mutation + by this method is the mid-point between the times of the nodes above and below + the mutation. In the case where there is more than one mutation on an edge for + a site, the times are evenly spread along the edge. For mutations that are + above a root node, the time of the root node is assigned. + + """ + self.ll_tables.compute_mutation_times() + # TODO add provenance + def deduplicate_sites(self): """ Modifies the tables in place, removing entries in the site table with @@ -2340,6 +2371,7 @@ def delete_sites(self, site_ids, record_provenance=True): self.mutations.set_columns( site=site_map[self.mutations.site[keep_mutations]], node=self.mutations.node[keep_mutations], + time=self.mutations.time[keep_mutations], derived_state=new_ds, derived_state_offset=new_ds_offset, parent=mutation_map[self.mutations.parent[keep_mutations]], diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 9de0153f0a..8b35c6ea47 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -358,6 +358,7 @@ def __init__( id_=NULL, site=NULL, node=NULL, + time=NULL, derived_state=None, parent=NULL, encoded_metadata=b"", @@ -366,6 +367,7 @@ def __init__( self.id = id_ self.site = site self.node = node + self.time = time self.derived_state = derived_state self.parent = parent self._encoded_metadata = encoded_metadata @@ -2240,7 +2242,12 @@ def map_mutations(self, genotypes, alleles): # Translate back into string alleles ancestral_state = alleles[ancestral_state] mutations = [ - Mutation(node=node, derived_state=alleles[derived_state], parent=parent) + Mutation( + node=node, + time=self.tree_sequence.node(node).time, + derived_state=alleles[derived_state], + parent=parent, + ) for node, parent, derived_state in transitions ] return ancestral_state, mutations @@ -2518,7 +2525,8 @@ def parse_mutations( instance. See the :ref:`mutation text format ` section for the details of the required format and the :ref:`mutation table definition ` section for the - required properties of the contents. + required properties of the contents. Note that if the ``time`` column is missing it's + entries are filled with 0. See :func:`tskit.load_text` for a detailed explanation of the ``strict`` parameter. @@ -2540,6 +2548,10 @@ def parse_mutations( header = source.readline().strip("\n").split(sep) site_index = header.index("site") node_index = header.index("node") + try: + time_index = header.index("time") + except ValueError: + time_index = None derived_state_index = header.index("derived_state") parent_index = None parent = NULL @@ -2557,6 +2569,7 @@ def parse_mutations( if len(tokens) >= 3: site = int(tokens[site_index]) node = int(tokens[node_index]) + time = float(tokens[time_index]) if time_index is not None else 0 derived_state = tokens[derived_state_index] if parent_index is not None: parent = int(tokens[parent_index]) @@ -2568,6 +2581,7 @@ def parse_mutations( table.add_row( site=site, node=node, + time=time, derived_state=derived_state, parent=parent, metadata=metadata, @@ -2628,6 +2642,7 @@ def load_text( strict=True, encoding="utf8", base64_metadata=True, + compute_mutation_times=False, ): """ Parses the tree sequence data from the specified file-like objects, and @@ -2686,6 +2701,8 @@ def load_text( :param str encoding: Encoding used for text representation. :param bool base64_metadata: If True, metadata is encoded using Base64 encoding; otherwise, as plain text. + :param bool compute_mutation_times: If True, mutation times are replaced with those + from :meth:`TableCollection.compute_mutation_times` :return: The tree sequence object containing the information stored in the specified file paths. :rtype: :class:`tskit.TreeSequence` @@ -2749,6 +2766,9 @@ def load_text( table=tc.populations, ) tc.sort() + if compute_mutation_times: + tc.build_index() + tc.compute_mutation_times() return tc.tree_sequence() @@ -3044,6 +3064,7 @@ def dump_text( print( "site", "node", + "time", "derived_state", "parent", "metadata", @@ -3058,12 +3079,14 @@ def dump_text( row = ( "{site}\t" "{node}\t" + "{time}\t" "{derived_state}\t" "{parent}\t" "{metadata}" ).format( site=mutation.site, node=mutation.node, + time=mutation.time, derived_state=mutation.derived_state, parent=mutation.parent, metadata=metadata, @@ -3903,6 +3926,7 @@ def mutation(self, id_): ( site, node, + time, derived_state, parent, metadata, @@ -3911,6 +3935,7 @@ def mutation(self, id_): id_=id_, site=site, node=node, + time=time, derived_state=derived_state, parent=parent, encoded_metadata=metadata, From bbef3350d605727ecd4daf11bd046a3b44f664c8 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 1 Jul 2020 12:36:56 +0100 Subject: [PATCH 2/2] Make mutation time an optional column by defaulting to a specific NAN value. --- c/CHANGELOG.rst | 18 ++-- c/tests/test_core.c | 12 +++ c/tests/test_file_format.c | 1 - c/tests/test_genotypes.c | 4 +- c/tests/test_tables.c | 47 +++++--- c/tests/test_trees.c | 2 +- c/tests/testlib.c | 8 +- c/tskit/core.c | 24 +++-- c/tskit/core.h | 25 ++++- c/tskit/tables.c | 45 +++++--- c/tskit/trees.c | 3 +- docs/data-model.rst | 12 ++- python/CHANGELOG.rst | 16 ++- python/_tskitmodule.c | 21 ++-- python/setup.py | 9 +- python/tests/__init__.py | 104 +----------------- python/tests/test_cli.py | 46 -------- python/tests/test_dict_encoding.py | 13 ++- python/tests/test_drawing.py | 12 +-- python/tests/test_file_format.py | 55 +++++++--- python/tests/test_genotypes.py | 32 ++---- python/tests/test_highlevel.py | 63 +++++++++-- python/tests/test_lowlevel.py | 10 +- python/tests/test_metadata.py | 4 +- python/tests/test_parsimony.py | 46 +++----- python/tests/test_stats.py | 2 +- python/tests/test_tables.py | 77 ++++++------- python/tests/test_topology.py | 168 +++++++++++++---------------- python/tests/test_tree_stats.py | 78 +++++++------- python/tests/test_util.py | 13 +++ python/tests/test_vcf.py | 4 +- python/tests/test_wright_fisher.py | 4 - python/tests/tsutil.py | 26 ++--- python/tskit/__init__.py | 6 ++ python/tskit/formats.py | 57 ++-------- python/tskit/tables.py | 33 ++++-- python/tskit/trees.py | 52 +++++---- python/tskit/util.py | 12 +++ 38 files changed, 561 insertions(+), 603 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index d4d82fb5a8..7d8bfbe3e1 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -6,14 +6,8 @@ In development. **Breaking changes** -- Mutations now have a double-precision floating-point ``time`` column. As this is a - mandatory column the file format version has been bumped to 13.0. Pre-existing - tree sequences can be upgraded using the ``tskit upgrade`` command-line utility, - which will assign times to mutations by spreading them evenly along the edges on - which they occur (using ``tsk_table_collection_compute_mutation_times``. For a tree - sequence to be considered valid it must meet new criteria for mutation times, see - :ref:`sec_mutation_requirements`. Mutation methods such as ``add_row`` now have an - additional argument. +- ``tsk_mutation_table_add_row`` has an extra ``time`` argument. If the time + is unknown ``TSK_UNKNOWN_TIME`` should be passed. (:user:`benjeffery`, :pr:`672`) - Change genotypes from unsigned to signed to accommodate missing data @@ -44,12 +38,14 @@ In development. **New features** -- Add ``time`` column to mutations, along with new function - ``tsk_table_collection_compute_mutation_times`` and new flag to +- Mutations now have an optional double-precision floating-point ``time`` column. + If not specified, this defaults to a particular NaN value (``TSK_UNKNOWN_TIME``) + indicating that the time is unknown. For a tree sequence to be considered valid + it must meet new criteria for mutation times, see :ref:`sec_mutation_requirements`. + Add ``tsk_table_collection_compute_mutation_times`` and new flag to ``tsk_table_collection_check_integrity``:``TSK_CHECK_MUTATION_TIME``. (:user:`benjeffery`, :pr:`672`) - - Add ``metadata`` and ``metadata_schema`` fields to table collection, with accessors on tree sequence. These store arbitrary bytes and are optional in the file format. (:user: `benjeffery`, :pr:`641`) diff --git a/c/tests/test_core.c b/c/tests/test_core.c index ed509e803c..491dc20e44 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -334,6 +334,17 @@ test_blkalloc(void) tsk_blkalloc_free(&alloc); } +static void +test_unknown_time(void) +{ + CU_ASSERT_TRUE(isnan(TSK_UNKNOWN_TIME)); + CU_ASSERT_TRUE(tsk_is_unknown_time(TSK_UNKNOWN_TIME)); + CU_ASSERT_FALSE(tsk_is_unknown_time(NAN)); + CU_ASSERT_FALSE(tsk_is_unknown_time(0)); + CU_ASSERT_FALSE(tsk_is_unknown_time(INFINITY)); + CU_ASSERT_FALSE(tsk_is_unknown_time(1)); +} + int main(int argc, char **argv) { @@ -343,6 +354,7 @@ main(int argc, char **argv) { "test_generate_uuid", test_generate_uuid }, { "test_double_round", test_double_round }, { "test_blkalloc", test_blkalloc }, + { "test_unknown_time", test_unknown_time }, { NULL, NULL }, }; diff --git a/c/tests/test_file_format.c b/c/tests/test_file_format.c index e7d9bb6f59..5216cb1c50 100644 --- a/c/tests/test_file_format.c +++ b/c/tests/test_file_format.c @@ -583,7 +583,6 @@ test_missing_required_columns(void) "mutations/node", "mutations/parent", "mutations/site", - "mutations/time", "nodes/flags", "nodes/individual", "nodes/population", diff --git a/c/tests/test_genotypes.c b/c/tests/test_genotypes.c index d1bafcf00b..b38882571a 100644 --- a/c/tests/test_genotypes.c +++ b/c/tests/test_genotypes.c @@ -649,8 +649,8 @@ test_single_tree_many_alleles(void) /* Add j mutations over a single node. */ for (j = 0; j < (tsk_id_t) num_alleles; j++) { /* When j = 0 we get a parent of -1, which is the NULL_NODE */ - ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 0, j - 1, 0, alleles, (tsk_size_t) j, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, j - 1, + TSK_UNKNOWN_TIME, alleles, (tsk_size_t) j, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); CU_ASSERT_EQUAL_FATAL(ret, 0); diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 9c19894b2f..9e87dafd4a 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -1189,10 +1189,14 @@ test_mutation_table(void) ret = tsk_mutation_table_truncate(&table, num_rows + 1); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_TABLE_POSITION); - /* Check all this again, except with parent == NULL and metadata == NULL. */ + /* Check all this again, except with parent == NULL, time == NULL + * and metadata == NULL. */ memset(parent, 0xff, num_rows * sizeof(tsk_id_t)); + for (j = 0; j < (tsk_id_t) num_rows; j++) { + time[j] = TSK_UNKNOWN_TIME; + } memset(metadata_offset, 0, (num_rows + 1) * sizeof(tsk_size_t)); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, NULL, time, + ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, NULL, NULL, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); @@ -1212,7 +1216,7 @@ test_mutation_table(void) CU_ASSERT_EQUAL(table.metadata_length, 0); /* Append another num_rows */ - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, NULL, time, + ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, NULL, NULL, derived_state, derived_state_offset, NULL, NULL); CU_ASSERT_EQUAL(ret, 0); CU_ASSERT_EQUAL(memcmp(table.site, site, num_rows * sizeof(tsk_id_t)), 0); @@ -1233,16 +1237,13 @@ test_mutation_table(void) CU_ASSERT_EQUAL(table.derived_state_length, 2 * num_rows); CU_ASSERT_EQUAL(table.metadata_length, 0); - /* Inputs except parent, metadata and metadata_offset cannot be NULL*/ + /* Inputs except parent, time, metadata and metadata_offset cannot be NULL*/ ret = tsk_mutation_table_set_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, NULL, - derived_state, derived_state_offset, metadata, metadata_offset); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_set_columns(&table, num_rows, site, node, parent, time, NULL, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); @@ -1256,16 +1257,13 @@ test_mutation_table(void) derived_state, derived_state_offset, metadata, NULL); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - /* Inputs except parent, metadata and metadata_offset cannot be NULL*/ + /* Inputs except parent, time, metadata and metadata_offset cannot be NULL*/ ret = tsk_mutation_table_append_columns(&table, num_rows, NULL, node, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, NULL, parent, time, derived_state, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); - ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, NULL, - derived_state, derived_state_offset, metadata, metadata_offset); - CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); ret = tsk_mutation_table_append_columns(&table, num_rows, site, node, parent, time, NULL, derived_state_offset, metadata, metadata_offset); CU_ASSERT_EQUAL(ret, TSK_ERR_BAD_PARAM_VALUE); @@ -3151,6 +3149,26 @@ test_table_collection_check_integrity(void) ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_MUTATIONS); + ret = tsk_mutation_table_clear(&tables.mutations); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_mutation_table_clear(&tables.mutations); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, NAN, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_table_collection_check_integrity(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); + ret = tsk_mutation_table_clear(&tables.mutations); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_mutation_table_add_row( @@ -3159,7 +3177,6 @@ test_table_collection_check_integrity(void) ret = tsk_table_collection_check_integrity(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_check_integrity(&tables, TSK_CHECK_MUTATION_TIME); - printf("%s", tsk_strerror(ret)); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_TIME_NONFINITE); ret = tsk_mutation_table_clear(&tables.mutations); @@ -3305,12 +3322,12 @@ test_table_collection_subset(void) ret = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 0, 0, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 0, 0, TSK_NULL, NAN, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, NULL, 0, NULL, 0); + ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, NAN, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_mutation_table_add_row( - &tables.mutations, 1, 1, TSK_NULL, NULL, 0, NULL, 0); + &tables.mutations, 1, 1, TSK_NULL, NAN, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); // empty nodes should get empty tables diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 9c59ad2644..e87d86086e 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -3792,6 +3792,7 @@ test_single_tree_compute_mutation_parents(void) /* Compute the mutation parents */ verify_compute_mutation_parents(&ts); + tsk_treeseq_free(&ts); /* Bad site reference */ tables.mutations.site[0] = -1; @@ -3855,7 +3856,6 @@ test_single_tree_compute_mutation_parents(void) CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 6); tsk_treeseq_free(&ts); - tsk_treeseq_free(&ts); tsk_table_collection_free(&tables); } diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 71edd30fb2..3d4fd9c50d 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -400,7 +400,7 @@ parse_mutations(const char *text, tsk_mutation_table_t *mutation_table) if (p != NULL) { parent = atoi(p); } - time = 0; + time = TSK_UNKNOWN_TIME; p = strtok(NULL, whitespace); if (p != NULL) { time = atof(p); @@ -518,11 +518,7 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod } } - ret = tsk_table_collection_build_index(&tables, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_compute_mutation_times(&tables, NULL, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_treeseq_init(ts, &tables, 0); + ret = tsk_treeseq_init(ts, &tables, TSK_BUILD_INDEXES); /* tsk_treeseq_print_state(ts, stdout); */ // printf("ret = %s\n", tsk_strerror(ret)); CU_ASSERT_EQUAL_FATAL(ret, 0); diff --git a/c/tskit/core.c b/c/tskit/core.c index 29b1a27d23..049bd658f5 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -288,16 +288,16 @@ tsk_strerror_internal(int err) ret = "Mutations must be provided in non-decreasing site order"; break; case TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE: - ret = "Mutations must have a time equal to or greater than that of their " - "node"; + ret = "A mutation's time must be >= the node time, or be marked as " + "'unknown'"; break; case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION: - ret = "Mutations must have a time equal to or less than that of their " - "parent mutation"; + ret = "A mutation's time must be <= the parent mutation time (if known), or " + "be marked as 'unknown'"; break; case TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE: - ret = "Mutations must have a time less than the parent node of " - "the edge they are on"; + ret = "A mutation's time must be < the parent node of the edge on which it " + "occurs, or be marked as 'unknown'"; break; /* Sample errors */ @@ -637,3 +637,15 @@ tsk_round(double x, unsigned int ndigits) } return z; } + +/* As NANs are not equal, use this function to check for equality to TSK_UNKNOWN_TIME */ +bool +tsk_is_unknown_time(double val) +{ + union { + uint64_t i; + double f; + } nan_union; + nan_union.f = val; + return nan_union.i == TSK_UNKNOWN_TIME_HEX; +} diff --git a/c/tskit/core.h b/c/tskit/core.h index 1f0e244d01..09fd9202af 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -34,7 +34,9 @@ extern "C" { #endif +#include #include +#include #include #ifdef __GNUC__ @@ -67,6 +69,23 @@ extern "C" { #define TSK_DBL_DECIMAL_DIG (DBL_DIG + 3) #endif +/* We define a specific NAN value for default mutation time which indicates + * the time is unknown. We use a specific value so that if mutation time is set to + * a NAN from a computation we can reject it. This specific value is a non-signalling + * NAN with the last six fraction bytes set to the ascii of "tskit!" + */ +#define TSK_UNKNOWN_TIME_HEX 0x7FF874736B697421ULL +static inline double +__tsk_nan_f(void) +{ + const union { + uint64_t i; + double f; + } nan_union = { .i = TSK_UNKNOWN_TIME_HEX }; + return nan_union.f; +} +#define TSK_UNKNOWN_TIME __tsk_nan_f() + // clang-format off /** @defgroup API_VERSION_GROUP API version macros. @@ -101,8 +120,8 @@ to the API or ABI are introduced, i.e., internal refactors of bugfixes. #define TSK_FILE_FORMAT_NAME "tskit.trees" #define TSK_FILE_FORMAT_NAME_LENGTH 11 -#define TSK_FILE_FORMAT_VERSION_MAJOR 13 -#define TSK_FILE_FORMAT_VERSION_MINOR 0 +#define TSK_FILE_FORMAT_VERSION_MAJOR 12 +#define TSK_FILE_FORMAT_VERSION_MINOR 3 /** @defgroup GENERAL_ERROR_GROUP General errors. @@ -318,6 +337,8 @@ extern void tsk_blkalloc_free(tsk_blkalloc_t *self); size_t tsk_search_sorted(const double *array, size_t size, double value); double tsk_round(double x, unsigned int ndigits); +bool tsk_is_unknown_time(double val); + #define TSK_UUID_SIZE 36 int tsk_generate_uuid(char *dest, int flags); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index fce14f870b..826c279018 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -2520,7 +2520,7 @@ tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_row int ret = 0; tsk_size_t j, derived_state_length, metadata_length; - if (site == NULL || node == NULL || time == NULL || derived_state == NULL + if (site == NULL || node == NULL || derived_state == NULL || derived_state_offset == NULL) { ret = TSK_ERR_BAD_PARAM_VALUE; goto out; @@ -2542,7 +2542,15 @@ tsk_mutation_table_append_columns(tsk_mutation_table_t *self, tsk_size_t num_row } else { memcpy(self->parent + self->num_rows, parent, num_rows * sizeof(tsk_id_t)); } - memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); + if (time == NULL) { + /* If time is NULL, set all times to TSK_UNKNOWN_TIME which is the + * default */ + for (j = 0; j < num_rows; j++) { + self->time[self->num_rows + j] = TSK_UNKNOWN_TIME; + } + } else { + memcpy(self->time + self->num_rows, time, num_rows * sizeof(double)); + } /* Metadata column */ if (metadata == NULL) { @@ -2846,7 +2854,8 @@ tsk_mutation_table_load(tsk_mutation_table_t *self, kastore_t *store) { "mutations/site", (void **) &site, &num_rows, 0, KAS_INT32, 0 }, { "mutations/node", (void **) &node, &num_rows, 0, KAS_INT32, 0 }, { "mutations/parent", (void **) &parent, &num_rows, 0, KAS_INT32, 0 }, - { "mutations/time", (void **) &time, &num_rows, 0, KAS_FLOAT64, 0 }, + { "mutations/time", (void **) &time, &num_rows, 0, KAS_FLOAT64, + TSK_COL_OPTIONAL }, { "mutations/derived_state", (void **) &derived_state, &derived_state_length, 0, KAS_UINT8, 0 }, { "mutations/derived_state_offset", (void **) &derived_state_offset, &num_rows, @@ -6623,18 +6632,20 @@ tsk_table_collection_check_integrity(tsk_table_collection_t *self, tsk_flags_t o } if (check_mutation_time) { mutation_time = self->mutations.time[j]; - if (!isfinite(mutation_time)) { - ret = TSK_ERR_TIME_NONFINITE; - goto out; - } - if (mutation_time < self->nodes.time[self->mutations.node[j]]) { - ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; - goto out; - } - if (parent_mut != TSK_NULL - && self->mutations.time[parent_mut] < mutation_time) { - ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; - goto out; + if (!tsk_is_unknown_time(mutation_time)) { + if (!isfinite(mutation_time)) { + ret = TSK_ERR_TIME_NONFINITE; + goto out; + } + if (mutation_time < self->nodes.time[self->mutations.node[j]]) { + ret = TSK_ERR_MUTATION_TIME_YOUNGER_THAN_NODE; + goto out; + } + if (parent_mut != TSK_NULL + && self->mutations.time[parent_mut] < mutation_time) { + ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_MUTATION; + goto out; + } } } } @@ -8059,8 +8070,8 @@ tsk_table_collection_subset( new_parent = mutation_map[mut.parent]; } ret = tsk_mutation_table_add_row(&self->mutations, site_map[site.id], - new_node, new_parent, mut.derived_state, mut.derived_state_length, - mut.metadata, mut.metadata_length); + new_node, new_parent, mut.time, mut.derived_state, + mut.derived_state_length, mut.metadata, mut.metadata_length); if (ret < 0) { goto out; } diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 648beee153..d922855f10 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -281,7 +281,8 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self) } while (site < num_sites && site_position[site] < tree_right) { while (mutation < num_mutations && mutation_site[mutation] == site) { - if (parent[mutation_node[mutation]] != TSK_NULL + if (!isnan(mutation_time[mutation]) + && parent[mutation_node[mutation]] != TSK_NULL && node_time[parent[mutation_node[mutation]]] <= mutation_time[mutation]) { ret = TSK_ERR_MUTATION_TIME_OLDER_THAN_PARENT_NODE; diff --git a/docs/data-model.rst b/docs/data-model.rst index 1cf32e436c..04a83927a3 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -637,9 +637,11 @@ requirements for a valid set of mutations are: - ``site`` must refer to a valid site ID; - ``node`` must refer to a valid node ID; -- ``time`` must be a finite value which is greater or equal to the mutation ``node``'s - ``time``, less than the ``node`` above the mutation's ``time`` and equal to or less - than the ``time`` of the ``parent`` mutation if this mutation has one. +- ``time`` must either be UNKNOWN_TIME (a NAN value which indicates + the time is unknown) or be a finite value which is greater or equal to the + mutation ``node``'s ``time``, less than the ``node`` above the mutation's + ``time`` and equal to or less than the ``time`` of the ``parent`` mutation + if this mutation has one. - ``parent`` must either be the null ID (-1) or a valid mutation ID within the current table @@ -1151,8 +1153,8 @@ Mutation text format The mutation text format must contain the columns ``site``, ``node`` and ``derived_state``. The ``time``, ``parent`` and ``metadata`` columns may also be optionally present (but ``parent`` must be specified if -more than one mutation occurs at the same site). If ``time`` is absent ``0`` will be -used to fill the column. See the +more than one mutation occurs at the same site). If ``time`` is absent +``UNKNOWN_TIME`` will be used to fill the column. See the :ref:`mutation table definitions ` for details on these columns. diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 2bfc5ba161..e39946e9c6 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -6,14 +6,6 @@ In development **Breaking changes** -- Mutations now have a double-precision floating-point ``time`` column. As this is a - mandatory column the file format version has been bumped to ``13.0``. - Pre-existing tree sequences can be upgraded using the ``tskit upgrade`` command-line - utility, which will assign times to mutations by spreading them evenly along the - edges on which they occur (using ``TableCollection.compute_mutation_times``). For - a tree sequence to be considered valid it must meet new criteria for mutation - times, see :ref:`sec_mutation_requirements`. - - The default display order for tree visualisations has been changed to ``minlex`` (see below) to stabilise the node ordering and to make trees more readily comparable. The old behaviour is still available with ``order="tree"``. @@ -24,8 +16,12 @@ In development **New features** -- Add time column to mutations, along with - ``TableCollection.compute_mutation_times``. (:user:`benjeffery`, :pr:`672`) +- Mutations now have an optional double-precision floating-point ``time`` column. + If not specified, this defaults to a particular NaN value (``tskit.UNKNOWN_TIME``) + indicating that the time is unknown. For a tree sequence to be considered valid + it must meet new criteria for mutation times, see :ref:`sec_mutation_requirements`. + Also added function ``TableCollection.compute_mutation_times``. + (:user:`benjeffery`, :pr:`672`) - Add support for trees with internal samples for the Kendall-Colijn tree distance metric. (:user:`daniel-goldstein`, :pr:`610`) diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index e201c924b9..d0deab7de4 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -287,9 +287,9 @@ make_mutation(tsk_mutation_t *mutation) if (metadata == NULL) { goto out; } - ret = Py_BuildValue("iids#iO", mutation->site, mutation->node, mutation->time, mutation->derived_state, + ret = Py_BuildValue("iis#iOd", mutation->site, mutation->node, mutation->derived_state, (Py_ssize_t) mutation->derived_state_length, mutation->parent, - metadata); + metadata, mutation->time); out: Py_XDECREF(metadata); return ret; @@ -1563,7 +1563,7 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea if (parent_input == NULL) { goto out; } - time_input = get_table_dict_value(dict, "time", true); + time_input = get_table_dict_value(dict, "time", false); if (time_input == NULL) { goto out; } @@ -1607,6 +1607,7 @@ parse_mutation_table_dict(tsk_mutation_table_t *table, PyObject *dict, bool clea if (node_array == NULL) { goto out; } + time_data = NULL; if (time_input != Py_None) { time_array = table_read_column_array(time_input, NPY_FLOAT64, &num_rows, true); @@ -4894,17 +4895,17 @@ MutationTable_add_row(MutationTable *self, PyObject *args, PyObject *kwds) int site; int node; int parent = TSK_NULL; - double time = TSK_NULL; + double time = TSK_UNKNOWN_TIME; char *derived_state; Py_ssize_t derived_state_length; PyObject *py_metadata = Py_None; char *metadata = NULL; Py_ssize_t metadata_length = 0; - static char *kwlist[] = {"site", "node", "time", "derived_state", "parent", "metadata", NULL}; + static char *kwlist[] = {"site", "node", "derived_state", "parent", "metadata", "time", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "iids#|iO", kwlist, - &site, &node, &time, &derived_state, &derived_state_length, &parent, - &py_metadata)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "iis#|iOd", kwlist, + &site, &node, &derived_state, &derived_state_length, &parent, + &py_metadata, &time)) { goto out; } if (MutationTable_check_state(self) != 0) { @@ -11168,6 +11169,10 @@ PyInit__tskit(void) PyModule_AddIntConstant(module, "NULL", TSK_NULL); PyModule_AddIntConstant(module, "MISSING_DATA", TSK_MISSING_DATA); + + PyObject *unknown_time = PyFloat_FromDouble(TSK_UNKNOWN_TIME); + PyModule_AddObject(module, "UNKNOWN_TIME", unknown_time); + /* Node flags */ PyModule_AddIntConstant(module, "NODE_IS_SAMPLE", TSK_NODE_IS_SAMPLE); /* Tree flags */ diff --git a/python/setup.py b/python/setup.py index 213c72a5ff..0a552c6f97 100644 --- a/python/setup.py +++ b/python/setup.py @@ -95,14 +95,7 @@ def finalize_options(self): packages=["tskit"], include_package_data=True, ext_modules=[_tskit_module], - install_requires=[ - "attrs>=19.1.0", - "h5py", - "jsonschema", - "kastore", - numpy_ver, - "svgwrite", - ], + install_requires=["attrs>=19.1.0", "h5py", "jsonschema", numpy_ver, "svgwrite"], entry_points={"console_scripts": ["tskit=tskit.cli:tskit_main"]}, project_urls={ "Bug Reports": "https://github.com/tskit-dev/tskit/issues", diff --git a/python/tests/__init__.py b/python/tests/__init__.py index 1eae7f5fd0..3e60939c75 100644 --- a/python/tests/__init__.py +++ b/python/tests/__init__.py @@ -20,113 +20,11 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import base64 -import json -import sys - -import msprime -import numpy as np import tskit from . import tsutil from .simplify import * # NOQA -if msprime.__version__ <= "0.7.4": - # As mutation time is introduced into tskit but not msprime we need to patch up - # msprime to return times, this should be removed once msprime 1.0 is released. - def get_tree_sequence(self, mutation_generator=None, provenance_record=None): - if mutation_generator is not None: - mutation_generator.generate(self.ll_tables) - tables_dict = self.ll_tables.asdict() - tables_dict["mutations"]["time"] = np.full( - tables_dict["mutations"]["node"].shape, -1, np.float64 - ) - tables = tskit.TableCollection.fromdict(tables_dict) - if provenance_record is not None: - tables.provenances.add_row(provenance_record) - if self.from_ts is None: - # Add the populations with metadata - assert len(tables.populations) == len(self.population_configurations) - tables.populations.clear() - for pop_config in self.population_configurations: - tables.populations.add_row(metadata=pop_config.encoded_metadata) - tables.build_index() - tables.compute_mutation_times() - return tables.tree_sequence() - - def mutate( - tree_sequence, - rate=None, - random_seed=None, - model=None, - keep=False, - start_time=None, - end_time=None, - ): - try: - tables = tree_sequence.tables - except AttributeError: - raise ValueError("First argument must be a TreeSequence instance.") - if random_seed is None: - random_seed = msprime.simulations._get_random_seed() - random_seed = int(random_seed) - - rng = msprime.mutations._msprime.RandomGenerator(random_seed) - if model is None: - model = msprime.InfiniteSites() - try: - alphabet = model.alphabet - except AttributeError: - raise TypeError("model must be an InfiniteSites instance") - if rate is None: - rate = 0 - rate = float(rate) - keep = bool(keep) - - parameters = { - "command": "mutate", - "rate": rate, - "random_seed": random_seed, - "keep": keep, - } - - if start_time is None: - start_time = -sys.float_info.max - else: - start_time = float(start_time) - parameters["start_time"] = start_time - - if end_time is None: - end_time = sys.float_info.max - else: - end_time = float(end_time) - parameters["end_time"] = end_time - provenance_dict = msprime.provenance.get_provenance_dict(parameters) - - if start_time > end_time: - raise ValueError("start_time must be <= end_time") - - mutation_generator = msprime.mutations._msprime.MutationGenerator( - rng, rate, alphabet=alphabet, start_time=start_time, end_time=end_time - ) - lwt = msprime.mutations._msprime.LightweightTableCollection() - lwt.fromdict(tables.asdict()) - mutation_generator.generate(lwt, keep=keep) - - tables_dict = lwt.asdict() - tables_dict["mutations"]["time"] = np.full( - tables_dict["mutations"]["node"].shape, -1, np.float64 - ) - tables = tskit.TableCollection.fromdict(tables_dict) - tables.provenances.add_row(json.dumps(provenance_dict)) - - tables.build_index() - tables.compute_mutation_times() - return tables.tree_sequence() - - msprime.simulations.Simulator.get_tree_sequence = get_tree_sequence - msprime.mutations.mutate = mutate - msprime.mutate = mutate - # TODO remove this code and refactor elsewhere. @@ -334,7 +232,7 @@ def __init__(self, tree_sequence, breakpoints=None): ll_ts = self._tree_sequence._ll_tree_sequence def make_mutation(id_): - site, node, time, derived_state, parent, metadata = ll_ts.get_mutation(id_) + site, node, derived_state, parent, metadata, time = ll_ts.get_mutation(id_) return tskit.Mutation( id_=id_, site=site, diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 1e45cc4ded..13ea65ef99 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -31,9 +31,7 @@ from unittest import mock import h5py -import kastore import msprime -import numpy import tskit import tskit.cli as cli @@ -647,47 +645,3 @@ def test_duplicate_positions_error(self): ["upgrade", self.legacy_file_name, self.current_file_name], ) self.assertEqual(mocked_exit.call_count, 1) - - def test_12_to_13(self): - ts = msprime.simulate(10, mutation_rate=10) - tc = ts.tables - tc.metadata_schema = tskit.MetadataSchema( - { - "codec": "struct", - "type": "object", - "properties": {"top-level": {"type": "string", "binaryFormat": "50p"}}, - } - ) - tc.metadata = {"top-level": "top-level-metadata"} - for table in [ - "individuals", - "nodes", - "edges", - "migrations", - "sites", - "mutations", - "populations", - ]: - t = getattr(tc, table) - t.packset_metadata([f"{table}-{i}".encode() for i in range(t.num_rows)]) - t.metadata_schema = tskit.MetadataSchema( - { - "codec": "struct", - "type": "object", - "properties": {table: {"type": "string", "binaryFormat": "50p"}}, - } - ) - ts = tc.tree_sequence() - ts.dump(self.legacy_file_name) - with kastore.load(self.legacy_file_name) as store: - all_data = dict(store) - del all_data["mutations/time"] - all_data["format/version"] = numpy.asarray([12, 0], dtype=numpy.uint32) - kastore.dump(all_data, self.legacy_file_name) - stdout, stderr = capture_output( - cli.tskit_main, ["upgrade", self.legacy_file_name, self.current_file_name], - ) - ts2 = tskit.load(self.current_file_name) - self.assertEqual(stdout, "") - self.assertEqual(stderr, "") - self.assertEqual(ts.tables, ts2.tables) diff --git a/python/tests/test_dict_encoding.py b/python/tests/test_dict_encoding.py index 2ec9ad147e..10bf013cbd 100644 --- a/python/tests/test_dict_encoding.py +++ b/python/tests/test_dict_encoding.py @@ -32,6 +32,7 @@ import _tskit as c_module import tskit +import tskit.util as util def get_example_tables(): @@ -523,6 +524,7 @@ def test_migrations(self): self.verify_offset_pair( tables, len(tables.migrations), "migrations", "metadata" ) + self.verify_optional_column(tables, len(tables.nodes), "nodes", "individual") self.verify_metadata_schema(tables, "migrations") def test_sites(self): @@ -538,10 +540,19 @@ def test_mutations(self): self.verify_required_columns( tables, "mutations", - ["site", "node", "time", "derived_state", "derived_state_offset"], + ["site", "node", "derived_state", "derived_state_offset"], ) self.verify_offset_pair(tables, len(tables.mutations), "mutations", "metadata") self.verify_metadata_schema(tables, "mutations") + # Verify optional time column + d = tables.asdict() + d["mutations"]["time"] = None + lwt = c_module.LightweightTableCollection() + lwt.fromdict(d) + out = lwt.asdict() + self.assertTrue( + all(util.is_unknown_time(val) for val in out["mutations"]["time"]) + ) def test_populations(self): tables = get_example_tables() diff --git a/python/tests/test_drawing.py b/python/tests/test_drawing.py index fed07c1293..71aa0e4f3f 100644 --- a/python/tests/test_drawing.py +++ b/python/tests/test_drawing.py @@ -74,8 +74,8 @@ def get_zero_edge_tree(self): tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row(site=0, node=0, time=0, derived_state="1") - tables.mutations.add_row(site=0, node=1, time=0, derived_state="1") + tables.mutations.add_row(site=0, node=0, derived_state="1") + tables.mutations.add_row(site=0, node=1, derived_state="1") return tables.tree_sequence().first() def get_zero_roots_tree(self): @@ -117,9 +117,7 @@ def get_mutations_over_roots_tree(self): for node in range(ts.num_nodes): site_id = tables.sites.add_row(x, ancestral_state="0") x += delta - tables.mutations.add_row( - site_id, node=node, time=ts.node(node).time, derived_state="1" - ) + tables.mutations.add_row(site_id, node=node, derived_state="1") ts = tables.tree_sequence() tree = ts.first() assert any(tree.parent(mut.node) == tskit.NULL for mut in tree.mutations()) @@ -195,8 +193,8 @@ def get_simple_ts(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 0 4 0.05 T -1 + site node derived_state parent + 0 4 T -1 """ ) return tskit.load_text( diff --git a/python/tests/test_file_format.py b/python/tests/test_file_format.py index b0fd722b69..14ac4b9f16 100644 --- a/python/tests/test_file_format.py +++ b/python/tests/test_file_format.py @@ -37,10 +37,11 @@ import tests.tsutil as tsutil import tskit import tskit.exceptions as exceptions +from tskit import UNKNOWN_TIME -CURRENT_FILE_MAJOR = 13 -CURRENT_FILE_MINOR = 0 +CURRENT_FILE_MAJOR = 12 +CURRENT_FILE_MINOR = 3 test_data_dir = os.path.join(os.path.dirname(__file__), "data") @@ -69,10 +70,8 @@ def general_mutation_example(): tables = ts.dump_tables() tables.sites.add_row(position=0, ancestral_state="A", metadata=b"{}") tables.sites.add_row(position=1, ancestral_state="C", metadata=b"{'id':1}") - tables.mutations.add_row(site=0, node=0, time=0, derived_state="T") - tables.mutations.add_row(site=1, node=0, time=0, derived_state="G") - tables.build_index() - tables.compute_mutation_times() + tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=1, node=0, derived_state="G") return tables.tree_sequence() @@ -168,11 +167,7 @@ def mutation_metadata_example(): tables = ts.dump_tables() tables.sites.add_row(0, ancestral_state="a") for j in range(10): - tables.mutations.add_row( - site=0, node=j, time=-1, derived_state="t", metadata=b"1234" - ) - tables.build_index() - tables.compute_mutation_times() + tables.mutations.add_row(site=0, node=j, derived_state="t", metadata=b"1234") return tables.tree_sequence() @@ -462,6 +457,8 @@ def test_unsupported_version(self): self.assertRaises(ValueError, tskit.dump_legacy, ts, self.temp_file, version=4) # Cannot read current files. ts.dump(self.temp_file) + # Catch Exception here because h5py throws different exceptions on py2 and py3 + self.assertRaises(Exception, tskit.load_legacy, self.temp_file) def test_no_version_number(self): root = h5py.File(self.temp_file, "w") @@ -677,6 +674,11 @@ def verify_dump_format(self, ts): self.assertTrue(np.array_equal(tables.mutations.site, store["mutations/site"])) self.assertTrue(np.array_equal(tables.mutations.node, store["mutations/node"])) + # Default mutation time is a NaN value so we want to check for + # bit equality, not numeric equality + self.assertEqual( + tables.mutations.time.tobytes(), store["mutations/time"].tobytes() + ) self.assertTrue( np.array_equal(tables.mutations.parent, store["mutations/parent"]) ) @@ -819,6 +821,32 @@ def test_empty_migration_metadata(self): ts3 = tskit.load(self.temp_file) self.assertEqual(ts1.tables, ts3.tables) + def test_empty_mutation_time(self): + ts1 = migration_example() + ts1.dump(self.temp_file) + ts2 = tskit.load(self.temp_file) + self.assertEqual(ts1.tables, ts2.tables) + self.assertEqual(len(ts1.tables.mutations.metadata), 0) + with kastore.load(self.temp_file) as store: + all_data = dict(store) + del all_data["mutations/time"] + kastore.dump(all_data, self.temp_file) + ts3 = tskit.load(self.temp_file) + self.assertEqual(ts1.tables, ts3.tables) + + +class TestMixedUnknownMutation(TestFileFormat): + def test_unknown_mutation(self): + ts1 = migration_example() + tables = ts1.tables + tables.compute_mutation_times() + mutations = tables.mutations.asdict() + mutations["time"][0] = UNKNOWN_TIME + tables.mutations.set_columns(**mutations) + tables.tree_sequence().dump(self.temp_file) + tables2 = tskit.load(self.temp_file).tables + self.assertEqual(tables, tables2) + class TestFileFormatErrors(TestFileFormat): """ @@ -831,7 +859,10 @@ def verify_missing_fields(self, ts): all_data = dict(store) for key in all_data.keys(): # We skip these keys as they are optional - if "metadata_schema" not in key and key != "metadata": + if "metadata_schema" not in key and key not in [ + "metadata", + "mutations/time", + ]: data = dict(all_data) del data[key] kastore.dump(data, self.temp_file) diff --git a/python/tests/test_genotypes.py b/python/tests/test_genotypes.py index 6dfcdc2703..584cc7610a 100644 --- a/python/tests/test_genotypes.py +++ b/python/tests/test_genotypes.py @@ -248,7 +248,7 @@ def test_many_alleles(self): for u in ts.samples(): if u != 0: self.assertEqual(var.alleles[var.genotypes[u]], alleles[0]) - tables.mutations.add_row(0, 0, 0, allele, parent=parent) + tables.mutations.add_row(0, 0, allele, parent=parent) parent += 1 num_alleles += 1 @@ -284,7 +284,7 @@ def test_many_alleles_missing_data(self): for u in samples[:-1]: if u != 0: self.assertEqual(var.alleles[var.genotypes[u]], alleles[0]) - tables.mutations.add_row(0, 0, 0, allele, parent=parent) + tables.mutations.add_row(0, 0, allele, parent=parent) parent += 1 num_alleles += 1 @@ -320,7 +320,7 @@ def test_recurrent_mutations_over_samples(self): position=j * ts.sequence_length / num_sites, ancestral_state="0" ) for u in range(ts.sample_size): - tables.mutations.add_row(site=j, node=u, time=0, derived_state="1") + tables.mutations.add_row(site=j, node=u, derived_state="1") ts = tables.tree_sequence() variants = list(ts.variants()) self.assertEqual(len(variants), num_sites) @@ -341,12 +341,8 @@ def test_recurrent_mutations_errors(self): tables.sites.clear() tables.mutations.clear() site = tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row( - site=site, node=u, time=ts.node(u).time, derived_state="1" - ) - tables.mutations.add_row( - site=site, node=sample, time=0, derived_state="1" - ) + tables.mutations.add_row(site=site, node=u, derived_state="1") + tables.mutations.add_row(site=site, node=sample, derived_state="1") ts_new = tables.tree_sequence() self.assertRaises(exceptions.LibraryError, list, ts_new.variants()) @@ -475,7 +471,7 @@ def test_mutation_over_isolated_sample_missing(self): tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) tables.nodes.add_row(tskit.NODE_IS_SAMPLE, 0) tables.sites.add_row(0.5, "A") - tables.mutations.add_row(0, 0, 0, "T") + tables.mutations.add_row(0, 0, "T") ts = tables.tree_sequence() variants = list(ts.variants()) self.assertEqual(len(variants), 1) @@ -572,7 +568,6 @@ def test_acgt_mutations(self): mutations.set_columns( site=mutations.site, node=mutations.node, - time=mutations.time, derived_state=np.zeros(ts.num_sites, dtype=np.int8) + ord("T"), derived_state_offset=np.arange(ts.num_sites + 1, dtype=np.uint32), ) @@ -610,7 +605,7 @@ def test_recurrent_mutations_over_samples(self): position=j * ts.sequence_length / num_sites, ancestral_state="0" ) for u in range(ts.sample_size): - tables.mutations.add_row(site=j, node=u, time=0, derived_state="1") + tables.mutations.add_row(site=j, node=u, derived_state="1") ts_new = tables.tree_sequence() ones = "1" * num_sites for h in ts_new.haplotypes(): @@ -624,15 +619,8 @@ def test_recurrent_mutations_errors(self): tables.sites.clear() tables.mutations.clear() site = tables.sites.add_row(position=0, ancestral_state="0") - tables.mutations.add_row( - site=site, node=u, time=ts.node(u).time, derived_state="1" - ) - tables.mutations.add_row( - site=site, - node=tree.root, - time=ts.node(tree.root).time, - derived_state="1", - ) + tables.mutations.add_row(site=site, node=u, derived_state="1") + tables.mutations.add_row(site=site, node=tree.root, derived_state="1") ts_new = tables.tree_sequence() self.assertRaises(exceptions.LibraryError, list, ts_new.haplotypes()) ts_new.haplotypes() @@ -780,7 +768,7 @@ def test_missing_data(self): tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.nodes.add_row(flags=tskit.NODE_IS_SAMPLE, time=0) tables.sites.add_row(0.5, "0") - tables.mutations.add_row(0, 0, 0, "1") + tables.mutations.add_row(0, 0, "1") ts = tables.tree_sequence() for impute in [True, False]: diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 2cd1a2eb12..84ce5421d6 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -48,6 +48,8 @@ import tests.simplify as simplify import tests.tsutil as tsutil import tskit +import tskit.util as util +from tskit import UNKNOWN_TIME def insert_uniform_mutations(tables, num_mutations, nodes): @@ -64,11 +66,8 @@ def insert_uniform_mutations(tables, num_mutations, nodes): site=j, derived_state="1", node=nodes[j % len(nodes)], - time=-1, metadata=json.dumps({"index": j}).encode(), ) - tables.build_index() - tables.compute_mutation_times() def get_table_collection_copy(tables, sequence_length): @@ -749,10 +748,11 @@ def test_compute_mutation_parent(self): def test_compute_mutation_time(self): for ts in get_example_tree_sequences(): tables = ts.dump_tables() - before = tables.mutations.time[:] + python_time = tsutil.compute_mutation_times(ts) tables.compute_mutation_times() - time = tables.mutations.time - self.assertTrue(np.array_equal(time, before)) + self.assertTrue( + np.allclose(python_time, tables.mutations.time, rtol=1e-15, atol=1e-15) + ) # Check we have valid times tables.tree_sequence() @@ -1107,7 +1107,6 @@ def test_simplify_bugs(self): sites=sites, mutations=mutations, strict=False, - compute_mutation_times=True, ) samples = list(ts.samples()) self.verify_simplify_equality(ts, samples) @@ -1677,7 +1676,12 @@ def convert(v): splits = line.split("\t") self.assertEqual(str(mutation.site), splits[0]) self.assertEqual(str(mutation.node), splits[1]) - self.assertEqual(str(mutation.time), splits[2]) + self.assertEqual( + "unknown" + if util.is_unknown_time(mutation.time) + else str(mutation.time), + splits[2], + ) self.assertEqual(str(mutation.derived_state), splits[3]) self.assertEqual(str(mutation.parent), splits[4]) self.assertEqual(tests.base64_encode(mutation.metadata), splits[5]) @@ -1748,7 +1752,8 @@ def verify_approximate_equality(self, ts1, ts2): checked += 1 self.assertEqual(s1.site, s2.site) self.assertEqual(s1.node, s2.node) - self.assertAlmostEqual(s1.time, s2.time) + if not (math.isnan(s1.time) and math.isnan(s2.time)): + self.assertAlmostEqual(s1.time, s2.time) self.assertEqual(s1.derived_state, s2.derived_state) self.assertEqual(s1.parent, s2.parent) self.assertEqual(s1.metadata, s2.metadata) @@ -2769,6 +2774,46 @@ def get_instances(self, n): for j in range(n) ] + def test_nan_equality(self): + a = tskit.Mutation( + id_=42, + site=42, + node=42, + time=UNKNOWN_TIME, + derived_state="A" * 42, + parent=42, + encoded_metadata=b"x" * 42, + metadata_decoder=lambda m: m.decode() + "decoded", + ) + b = tskit.Mutation( + id_=42, + site=42, + node=42, + derived_state="A" * 42, + parent=42, + encoded_metadata=b"x" * 42, + metadata_decoder=lambda m: m.decode() + "decoded", + ) + c = tskit.Mutation( + id_=42, + site=42, + node=42, + time=math.nan, + derived_state="A" * 42, + parent=42, + encoded_metadata=b"x" * 42, + metadata_decoder=lambda m: m.decode() + "decoded", + ) + self.assertTrue(a == a) + self.assertTrue(a == b) + self.assertFalse(a == c) + self.assertFalse(b == c) + self.assertFalse(a != a) + self.assertFalse(a != b) + self.assertTrue(a != c) + self.assertTrue(c != c) + self.assertFalse(c == c) + class TestMigrationContainer( unittest.TestCase, SimpleContainersMixin, SimpleContainersWithMetadataMixin diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 7fa0e55f41..c39ee9d4a7 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -365,6 +365,11 @@ def verify_dump_equality(self, ts): def test_dump_equality(self): for ts in self.get_example_tree_sequences(): + tables = _tskit.TableCollection(sequence_length=ts.get_sequence_length()) + ts.dump_tables(tables) + tables.compute_mutation_times() + ts = _tskit.TreeSequence() + ts.load_tables(tables) self.verify_dump_equality(ts) def verify_mutations(self, ts): @@ -1646,10 +1651,10 @@ def test_sites(self): ( site, node, - time, derived_state, parent, metadata, + time, ) = ts.get_mutation(mut_id) self.assertEqual(site, index) self.assertEqual(mutation_id, mut_id) @@ -1937,7 +1942,6 @@ def f(mutations): position = [] node = [] site = [] - time = [] ancestral_state = [] ancestral_state_offset = [0] derived_state = [] @@ -1945,7 +1949,6 @@ def f(mutations): for j, (p, n) in enumerate(mutations): site.append(j) position.append(p) - time.append(-1) ancestral_state.append("0") ancestral_state_offset.append(ancestral_state_offset[-1] + 1) derived_state.append("1") @@ -1964,7 +1967,6 @@ def f(mutations): dict( site=site, node=node, - time=time, derived_state=derived_state, derived_state_offset=derived_state_offset, parent=None, diff --git a/python/tests/test_metadata.py b/python/tests/test_metadata.py index 0d0cec4871..3bb5116c15 100644 --- a/python/tests/test_metadata.py +++ b/python/tests/test_metadata.py @@ -156,9 +156,7 @@ def test_mutations(self): pickled = pickle.dumps(metadata) tables.nodes.add_row(time=0) tables.sites.add_row(position=0.1, ancestral_state="A") - tables.mutations.add_row( - site=0, node=0, time=0, derived_state="T", metadata=pickled - ) + tables.mutations.add_row(site=0, node=0, derived_state="T", metadata=pickled) ts = tables.tree_sequence() mutation = ts.site(0).mutations[0] self.assertEqual(mutation.site, 0) diff --git a/python/tests/test_parsimony.py b/python/tests/test_parsimony.py index 7afb25127f..4ed3edc731 100644 --- a/python/tests/test_parsimony.py +++ b/python/tests/test_parsimony.py @@ -154,10 +154,7 @@ def fitch_map_mutations(tree, genotypes, alleles): state[root] = np.where(A[root] == 1)[0][0] mutations.append( tskit.Mutation( - node=root, - time=tree.tree_sequence.node(root).time, - parent=tskit.NULL, - derived_state=alleles[state[root]], + node=root, parent=tskit.NULL, derived_state=alleles[state[root]] ) ) parent = len(mutations) - 1 @@ -171,7 +168,6 @@ def fitch_map_mutations(tree, genotypes, alleles): mutations.append( tskit.Mutation( node=v, - time=tree.tree_sequence.node(v).time, parent=parent_mutation, derived_state=alleles[state[v]], ) @@ -753,7 +749,7 @@ def test_mutation_over_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=0, time=0, parent=-1, derived_state="1") + transitions[0], tskit.Mutation(node=0, parent=-1, derived_state="1") ) def test_mutation_over_5(self): @@ -762,8 +758,7 @@ def test_mutation_over_5(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], - tskit.Mutation(node=5, time=0.14567111023387, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=5, parent=-1, derived_state="1") ) def test_mutation_over_7(self): @@ -772,8 +767,7 @@ def test_mutation_over_7(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], - tskit.Mutation(node=7, time=0.43508024345063, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="1") ) def test_mutation_over_7_0(self): @@ -782,11 +776,10 @@ def test_mutation_over_7_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 2) self.assertEqual( - transitions[0], - tskit.Mutation(node=7, time=0.43508024345063, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="1") ) self.assertEqual( - transitions[1], tskit.Mutation(node=0, time=0, parent=0, derived_state="2") + transitions[1], tskit.Mutation(node=0, parent=0, derived_state="2") ) def test_mutation_over_7_0_alleles(self): @@ -798,14 +791,10 @@ def test_mutation_over_7_0_alleles(self): self.assertEqual(ancestral_state, "ANC") self.assertEqual(len(transitions), 2) self.assertEqual( - transitions[0], - tskit.Mutation( - node=7, time=0.43508024345063, parent=-1, derived_state="ONE" - ), + transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="ONE") ) self.assertEqual( - transitions[1], - tskit.Mutation(node=0, time=0, parent=0, derived_state="TWO"), + transitions[1], tskit.Mutation(node=0, parent=0, derived_state="TWO") ) def test_mutation_over_7_missing_data_0(self): @@ -814,8 +803,7 @@ def test_mutation_over_7_missing_data_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], - tskit.Mutation(node=7, time=0.43508024345063, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=7, parent=-1, derived_state="1") ) def test_mutation_over_leaf_sibling_missing(self): @@ -829,8 +817,7 @@ def test_mutation_over_leaf_sibling_missing(self): # data had the ancestral state. However, the number of *state changes* # is the same, which is what the algorithm is minimising. self.assertEqual( - transitions[0], - tskit.Mutation(node=6, time=0.21385545626353, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=6, parent=-1, derived_state="1") ) # Reverse is the same @@ -839,8 +826,7 @@ def test_mutation_over_leaf_sibling_missing(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], - tskit.Mutation(node=6, time=0.21385545626353, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=6, parent=-1, derived_state="1") ) def test_mutation_over_6_missing_data_0(self): @@ -849,8 +835,7 @@ def test_mutation_over_6_missing_data_0(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], - tskit.Mutation(node=6, time=0.21385545626353, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=6, parent=-1, derived_state="1") ) def test_mutation_over_0_missing_data_4(self): @@ -859,7 +844,7 @@ def test_mutation_over_0_missing_data_4(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 1) self.assertEqual( - transitions[0], tskit.Mutation(node=0, time=0, parent=-1, derived_state="1") + transitions[0], tskit.Mutation(node=0, parent=-1, derived_state="1") ) def test_multi_mutation_missing_data(self): @@ -868,11 +853,10 @@ def test_multi_mutation_missing_data(self): self.assertEqual(ancestral_state, "0") self.assertEqual(len(transitions), 2) self.assertEqual( - transitions[0], - tskit.Mutation(node=5, time=0.14567111023387, parent=-1, derived_state="1"), + transitions[0], tskit.Mutation(node=5, parent=-1, derived_state="1") ) self.assertEqual( - transitions[1], tskit.Mutation(node=1, time=0, parent=0, derived_state="2") + transitions[1], tskit.Mutation(node=1, parent=0, derived_state="2") ) diff --git a/python/tests/test_stats.py b/python/tests/test_stats.py index a0b3201c8a..3048fc997e 100644 --- a/python/tests/test_stats.py +++ b/python/tests/test_stats.py @@ -176,7 +176,7 @@ def test_tree_sequence_regular_mutations(self): for j in range(self.num_test_sites): site_id = len(t.sites) t.sites.add_row(position=j, ancestral_state="0") - t.mutations.add_row(site=site_id, time=0, derived_state="1", node=j) + t.mutations.add_row(site=site_id, derived_state="1", node=j) ts = t.tree_sequence() self.verify_matrix(ts) self.verify_max_distance(ts) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index c9ef7c0ea3..1469505701 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -26,6 +26,7 @@ """ import io import itertools +import math import pickle import random import unittest @@ -1051,38 +1052,36 @@ class TestMutationTable(unittest.TestCase, CommonTestsMixin, MetadataTestsMixin) def test_simple_example(self): t = tskit.MutationTable() - t.add_row(site=0, node=1, time=2, derived_state="3", parent=4, metadata=b"5") - t.add_row(1, 2, 3, "4", 5, b"\xf0") + t.add_row(site=0, node=1, derived_state="2", parent=3, metadata=b"4", time=5) + t.add_row(1, 2, "3", 4, b"\xf0", 6) s = str(t) self.assertGreater(len(s), 0) self.assertEqual(len(t), 2) - self.assertEqual(attr.astuple(t[0]), (0, 1, 2, "3", 4, b"5")) - self.assertEqual(attr.astuple(t[1]), (1, 2, 3, "4", 5, b"\xf0")) + self.assertEqual(attr.astuple(t[0]), (0, 1, "2", 3, b"4", 5)) + self.assertEqual(attr.astuple(t[1]), (1, 2, "3", 4, b"\xf0", 6)) self.assertEqual(t[0].site, 0) self.assertEqual(t[0].node, 1) - self.assertEqual(t[0].time, 2) - self.assertEqual(t[0].derived_state, "3") - self.assertEqual(t[0].parent, 4) - self.assertEqual(t[0].metadata, b"5") + self.assertEqual(t[0].derived_state, "2") + self.assertEqual(t[0].parent, 3) + self.assertEqual(t[0].metadata, b"4") + self.assertEqual(t[0].time, 5) self.assertEqual(t[0], t[-2]) self.assertEqual(t[1], t[-1]) self.assertRaises(IndexError, t.__getitem__, -3) def test_add_row_bad_data(self): t = tskit.MutationTable() - t.add_row(0, 0, 0, "A", 0) - with self.assertRaises(TypeError): - t.add_row("0", 0, 0, "A", 0) + t.add_row(0, 0, "A") with self.assertRaises(TypeError): - t.add_row(0, "0", 0, "A", 0) + t.add_row("0", 0, "A") with self.assertRaises(TypeError): - t.add_row(0, 0, "0", "A", 0) + t.add_row(0, "0", "A") with self.assertRaises(TypeError): - t.add_row(0, 0, 0, "A", 0, parent=None) + t.add_row(0, 0, "A", parent=None) with self.assertRaises(TypeError): - t.add_row(0, 0, 0, "A", 0, metadata=[0]) + t.add_row(0, 0, "A", metadata=[0]) with self.assertRaises(TypeError): - t.add_row(0, 0, 0, "A", "0") + t.add_row(0, 0, "A", time="A") def test_packset_derived_state(self): for num_rows in [0, 10, 100]: @@ -1266,10 +1265,10 @@ def verify_randomise_tables(self, ts): tables.mutations.add_row( site=site_id_map[m.site], node=m.node, - time=m.time, derived_state=m.derived_state, parent=m.parent, metadata=m.metadata, + time=m.time, ) if ts.num_sites > 1: # Verify that import fails for randomised sites @@ -1476,11 +1475,11 @@ def test_sort_mutations_stability(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 1 0 0 1 -1 - 1 1 0 1 -1 - 0 1 0 1 -1 - 0 0 0 1 -1 + site node derived_state parent + 1 0 1 -1 + 1 1 1 -1 + 0 1 1 -1 + 0 0 1 -1 """ ) ts = tskit.load_text( @@ -1569,8 +1568,8 @@ def test_sort_mutations_bad_parent_id(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 1 0 0 1 -2 + site node derived_state parent + 1 0 1 -2 """ ) self.assertRaises( @@ -1608,7 +1607,7 @@ def test_younger_than_node_below(self): tables.mutations.time = np.zeros(len(tables.mutations.time), dtype=np.float64) with self.assertRaisesRegex( _tskit.LibraryError, - "Mutations must have a time equal to or greater than that of their node", + "A mutation's time must be >= the node time, or be marked as 'unknown'", ): tables.tree_sequence() @@ -1620,8 +1619,8 @@ def test_older_than_node_above(self): ) with self.assertRaisesRegex( _tskit.LibraryError, - "Mutations must have a time less than the parent" - " node of the edge they are on", + "A mutation's time must be < the parent node of the edge on which it" + " occurs, or be marked as 'unknown'", ): tables.tree_sequence() @@ -1639,8 +1638,8 @@ def test_older_than_parent(self): tables.mutations.set_columns(**as_dict) with self.assertRaisesRegex( _tskit.LibraryError, - "Mutations must have a time equal to or less than that" - " of their parent mutation", + "A mutation's time must be < the parent node of the edge on which it" + " occurs, or be marked as 'unknown'", ): tables.tree_sequence() @@ -1695,7 +1694,12 @@ def test_node_times(self): bad_times = tables.nodes.time.copy() bad_times[-1] = np.inf tables.nodes.time = bad_times - self.assertRaises(_tskit.LibraryError, tables.tree_sequence) + with self.assertRaisesRegex(_tskit.LibraryError, "Times must be finite"): + tables.tree_sequence() + bad_times[-1] = math.nan + tables.nodes.time = bad_times + with self.assertRaisesRegex(_tskit.LibraryError, "Times must be finite"): + tables.tree_sequence() def test_mutation_times(self): ts = msprime.simulate(5, mutation_rate=1, random_seed=42) @@ -1705,6 +1709,11 @@ def test_mutation_times(self): tables.mutations.time = bad_times with self.assertRaisesRegex(_tskit.LibraryError, "Times must be finite"): tables.tree_sequence() + bad_times = tables.mutations.time.copy() + bad_times[-1] = math.nan + tables.mutations.time = bad_times + with self.assertRaisesRegex(_tskit.LibraryError, "Times must be finite"): + tables.tree_sequence() def test_individual(self): ts = msprime.simulate(12, mutation_rate=1, random_seed=42) @@ -1846,7 +1855,6 @@ def test_bad_mutation_nodes(self): mutations.set_columns( site=mutations.site, node=node, - time=mutations.time, derived_state=mutations.derived_state, derived_state_offset=mutations.derived_state_offset, ) @@ -1863,7 +1871,6 @@ def test_bad_mutation_sites(self): mutations.set_columns( site=site, node=mutations.node, - time=mutations.time, derived_state=mutations.derived_state, derived_state_offset=mutations.derived_state_offset, ) @@ -2441,10 +2448,7 @@ def test_multichar_ancestral_state(self): tables.sites.add_row(position=site.position, ancestral_state="0") for mutation in site.mutations: tables.mutations.add_row( - site=site_id, - node=mutation.node, - time=mutation.time, - derived_state="T" * site.id, + site=site_id, node=mutation.node, derived_state="T" * site.id, ) tables.deduplicate_sites() new_ts = tables.tree_sequence() @@ -2467,7 +2471,6 @@ def test_multichar_metadata(self): tables.mutations.add_row( site=site_id, node=mutation.node, - time=mutation.time, derived_state="1", metadata=b"T" * site.id, ) diff --git a/python/tests/test_topology.py b/python/tests/test_topology.py index d671b3cba1..e0d0b44bb8 100644 --- a/python/tests/test_topology.py +++ b/python/tests/test_topology.py @@ -97,7 +97,7 @@ def simple_keep_intervals(tables, intervals, simplify=True, record_provenance=Tr ) for m in site.mutations: tables.mutations.add_row( - site_id, m.node, m.time, m.derived_state, tskit.NULL, m.metadata + site_id, m.node, m.derived_state, tskit.NULL, m.metadata ) tables.build_index() tables.compute_mutation_parents() @@ -1756,7 +1756,7 @@ def test_one_node_zero_samples_sites(self): tables = tskit.TableCollection(sequence_length=1) tables.nodes.add_row(time=0, flags=0) tables.sites.add_row(position=0.5, ancestral_state="0") - tables.mutations.add_row(node=0, site=0, time=0, derived_state="1") + tables.mutations.add_row(site=0, derived_state="1", node=0) ts = tables.tree_sequence() self.assertEqual(ts.sequence_length, 1) self.assertEqual(ts.num_trees, 1) @@ -1812,7 +1812,7 @@ def test_one_node_one_sample_sites(self): tables = tskit.TableCollection(sequence_length=1) tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) tables.sites.add_row(position=0.5, ancestral_state="0") - tables.mutations.add_row(site=0, time=0, derived_state="1", node=0) + tables.mutations.add_row(site=0, derived_state="1", node=0) ts = tables.tree_sequence() self.assertEqual(ts.sequence_length, 1) self.assertEqual(ts.num_trees, 1) @@ -2230,7 +2230,6 @@ def test_simple_case(self): sites=io.StringIO(sites), mutations=io.StringIO(mutations), strict=False, - compute_mutation_times=True, ) self.assertEqual(ts.sample_size, 2) @@ -2398,11 +2397,11 @@ def test_simple_case(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 2 0 1 - 1 3 0 1 - 2 4 0 1 - 3 1 1 1 + site node derived_state + 0 2 1 + 1 3 1 + 2 4 1 + 3 1 1 """ ) ts = tskit.load_text( @@ -2975,9 +2974,9 @@ def test_single_binary_tree_simple_mutations(self): 1 0.2 0 """ mutations_before = """\ - site node time derived_state - 0 6 0 1 - 1 2 0 1 + site node derived_state + 0 6 1 + 1 2 1 """ # We sample 0 and 2 and 6, so we get @@ -2999,8 +2998,8 @@ def test_single_binary_tree_simple_mutations(self): 0 0.1 0 """ mutations_after = """\ - site node time derived_state - 0 2 0 1 + site node derived_state + 0 2 1 """ self.verify_simplify( samples=[0, 1, 6], @@ -3159,11 +3158,11 @@ def test_simple_case(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 0 0 1 - 1 1 0 1 - 2 3 0 1 - 3 4 0 1 + site node derived_state + 0 0 1 + 1 1 1 + 2 3 1 + 3 4 1 """ ) ts = tskit.load_text( @@ -3245,9 +3244,9 @@ def test_simplest_degenerate_case(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 0 0 1 - 1 1 0 1 + site node derived_state + 0 0 1 + 1 1 1 """ ) ts = tskit.load_text( @@ -3307,11 +3306,11 @@ def test_simplest_non_degenerate_case(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 0 0 1 - 1 1 0 1 - 2 2 0 1 - 3 3 0 1 + site node derived_state + 0 0 1 + 1 1 1 + 2 2 1 + 3 3 1 """ ) ts = tskit.load_text( @@ -3381,12 +3380,12 @@ def test_two_reducible_trees(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 0 0 1 - 1 1 0 1 - 2 2 0 1 - 3 3 0 1 - 4 8 0 1 + site node derived_state + 0 0 1 + 1 1 1 + 2 2 1 + 3 3 1 + 4 8 1 """ ) ts = tskit.load_text( @@ -3506,13 +3505,13 @@ def test_mutations_over_roots(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 0 0 1 - 1 1 0 1 - 2 3 1 1 - 3 4 2 1 - 4 2 0 1 - 5 5 2 1 + site node derived_state + 0 0 1 + 1 1 1 + 2 3 1 + 3 4 1 + 4 2 1 + 5 5 1 """ ) ts = tskit.load_text( @@ -3890,17 +3889,17 @@ def test_many_single_offspring(self): ) mutations = io.StringIO( """\ - site node derived_state time parent - 0 7 1 3 -1 - 0 10 0 1 0 - 0 2 1 0 1 - 1 0 1 0 -1 - 1 10 1 1 -1 - 2 8 1 2 -1 - 2 9 0 1 5 - 2 10 0 1 5 - 2 2 1 0 7 - 3 8 1 2 -1 + site node derived_state parent + 0 7 1 -1 + 0 10 0 0 + 0 2 1 1 + 1 0 1 -1 + 1 10 1 -1 + 2 8 1 -1 + 2 9 0 5 + 2 10 0 5 + 2 2 1 7 + 3 8 1 -1 """ ) ts = tskit.load_text(nodes, edges, sites, mutations, strict=False) @@ -4817,10 +4816,10 @@ def test_small_tree_mutations(self): tables.sites.add_row(position=0.5, ancestral_state="0") tables.sites.add_row(position=0.75, ancestral_state="0") tables.sites.add_row(position=0.8, ancestral_state="0") - tables.mutations.add_row(site=0, node=0, time=0, derived_state="1") - tables.mutations.add_row(site=1, node=2, time=0, derived_state="1") - tables.mutations.add_row(site=2, node=7, time=0.5, derived_state="1") - tables.mutations.add_row(site=3, node=0, time=0, derived_state="1") + tables.mutations.add_row(site=0, node=0, derived_state="1") + tables.mutations.add_row(site=1, node=2, derived_state="1") + tables.mutations.add_row(site=2, node=7, derived_state="1") + tables.mutations.add_row(site=3, node=0, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 4) self.assertEqual(ts.num_mutations, 4) @@ -4860,9 +4859,9 @@ def test_small_tree_fixed_sites(self): tables.sites.add_row(position=0.25, ancestral_state="0") tables.sites.add_row(position=0.5, ancestral_state="0") tables.sites.add_row(position=0.75, ancestral_state="0") - tables.mutations.add_row(site=0, node=2, time=0, derived_state="1") - tables.mutations.add_row(site=1, node=3, time=0, derived_state="1") - tables.mutations.add_row(site=2, node=6, time=0.22, derived_state="1") + tables.mutations.add_row(site=0, node=2, derived_state="1") + tables.mutations.add_row(site=1, node=3, derived_state="1") + tables.mutations.add_row(site=2, node=6, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 3) self.assertEqual(ts.num_mutations, 3) @@ -4880,7 +4879,7 @@ def test_small_tree_mutations_over_root(self): ) tables = ts.dump_tables() tables.sites.add_row(position=0.25, ancestral_state="0") - tables.mutations.add_row(site=0, node=8, time=1.65, derived_state="1") + tables.mutations.add_row(site=0, node=8, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 1) self.assertEqual(ts.num_mutations, 1) @@ -4900,8 +4899,8 @@ def test_small_tree_recurrent_mutations(self): tables = ts.dump_tables() # Add recurrent mutation on the root branches tables.sites.add_row(position=0.25, ancestral_state="0") - tables.mutations.add_row(site=0, node=6, time=0.22, derived_state="1") - tables.mutations.add_row(site=0, node=7, time=0.44, derived_state="1") + tables.mutations.add_row(site=0, node=6, derived_state="1") + tables.mutations.add_row(site=0, node=7, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 1) self.assertEqual(ts.num_mutations, 2) @@ -4921,9 +4920,9 @@ def test_small_tree_back_mutations(self): tables = ts.dump_tables() # Add a chain of mutations tables.sites.add_row(position=0.25, ancestral_state="0") - tables.mutations.add_row(site=0, node=7, time=0.44, derived_state="1") - tables.mutations.add_row(site=0, node=5, time=0.15, derived_state="0") - tables.mutations.add_row(site=0, node=1, time=0, derived_state="1") + tables.mutations.add_row(site=0, node=7, derived_state="1") + tables.mutations.add_row(site=0, node=5, derived_state="0") + tables.mutations.add_row(site=0, node=1, derived_state="1") ts = tables.tree_sequence() self.assertEqual(ts.num_sites, 1) self.assertEqual(ts.num_mutations, 3) @@ -5111,9 +5110,9 @@ def test_many_mutations_over_single_sample_ancestral_state(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 0 0 0 1 -1 - 0 0 0 0 0 + site node derived_state parent + 0 0 1 -1 + 0 0 0 0 """ ) ts = tskit.load_text( @@ -5151,10 +5150,10 @@ def test_many_mutations_over_single_sample_derived_state(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 0 0 0 1 -1 - 0 0 0 0 0 - 0 0 0 1 1 + site node derived_state parent + 0 0 1 -1 + 0 0 0 0 + 0 0 1 1 """ ) ts = tskit.load_text( @@ -5798,12 +5797,7 @@ def test_example(self): """ ) ts = tskit.load_text( - nodes=nodes, - edges=edges, - sites=sites, - mutations=mutations, - strict=False, - compute_mutation_times=True, + nodes=nodes, edges=edges, sites=sites, mutations=mutations, strict=False ) self.verify_parents(ts) @@ -6820,13 +6814,9 @@ def ts_with_4_sites(self): ts = msprime.simulate(8, random_seed=3) tables = ts.dump_tables() tables.sites.set_columns(np.arange(0, 1, 0.25), *tskit.pack_strings(["G"] * 4)) - tables.mutations.add_row( - site=1, node=ts.first().parent(0), time=-1, derived_state="C" - ) - tables.mutations.add_row(site=1, node=0, time=-1, derived_state="T", parent=0) - tables.mutations.add_row(site=2, node=1, time=-1, derived_state="A") - tables.build_index() - tables.compute_mutation_times() + tables.mutations.add_row(site=1, node=ts.first().parent(0), derived_state="C") + tables.mutations.add_row(site=1, node=0, derived_state="T", parent=0) + tables.mutations.add_row(site=2, node=1, derived_state="A") return tables.tree_sequence() def test_remove_by_index(self): @@ -6873,7 +6863,7 @@ def verify_removal(self, ts, remove_sites): tables = ts.dump_tables() tables.delete_sites(remove_sites) - # Make sure we've computed the mutation parents and times properly. + # Make sure we've computed the mutation parents properly. mutation_parent = tables.mutations.parent tables.compute_mutation_parents() self.assertTrue(np.array_equal(mutation_parent, tables.mutations.parent)) @@ -6889,7 +6879,6 @@ def verify_removal(self, ts, remove_sites): self.assertEqual(len(s1.mutations), len(s2.mutations)) for m1, m2 in zip(s1.mutations, s2.mutations): self.assertEqual(m1.node, m2.node) - self.assertEqual(m1.time, m2.time) self.assertEqual(m1.derived_state, m2.derived_state) self.assertEqual(m1.metadata, m2.metadata) @@ -6911,9 +6900,7 @@ def test_simple_mixed_length_states(self): tables = ts.dump_tables() for j in range(10): tables.sites.add_row(j, "X" * j) - tables.mutations.add_row( - site=j, node=j, time=0, derived_state="X" * (j + 1) - ) + tables.mutations.add_row(site=j, node=j, derived_state="X" * (j + 1)) ts = tables.tree_sequence() self.verify_removal(ts, [9]) @@ -7267,11 +7254,10 @@ def add_mutations(self, ts, position, ancestral_state, derived_states, nodes): tables = ts.dump_tables() site = tables.sites.add_row(position, ancestral_state) for state, node in zip(derived_states, nodes): - tables.mutations.add_row(site, node, -1, state) + tables.mutations.add_row(site, node, state) tables.sort() tables.build_index() tables.compute_mutation_parents() - tables.compute_mutation_times() return tables.tree_sequence() def verify_sites(self, source_tree, trimmed_tree, position_offset): diff --git a/python/tests/test_tree_stats.py b/python/tests/test_tree_stats.py index 7f77226cb6..27f66da41c 100644 --- a/python/tests/test_tree_stats.py +++ b/python/tests/test_tree_stats.py @@ -704,8 +704,8 @@ def test_ghost_allele(self): tables.nodes.add_row(flags=1, time=0) tables.nodes.add_row(flags=1, time=0) # The ghost mutation that's never seen in the genotypes - tables.mutations.add_row(site=0, node=0, time=0, derived_state="T") - tables.mutations.add_row(site=0, node=0, time=0, derived_state="G", parent=0) + tables.mutations.add_row(site=0, node=0, derived_state="T") + tables.mutations.add_row(site=0, node=0, derived_state="G", parent=0) ts = tables.tree_sequence() self.verify(ts) @@ -720,9 +720,9 @@ def test_ghost_allele_all_ancestral(self): tables.edges.add_row(0, 1, 2, 0) tables.edges.add_row(0, 1, 2, 1) tables.sites.add_row(position=0.5, ancestral_state="A") - tables.mutations.add_row(site=0, node=0, time=0, derived_state="T") + tables.mutations.add_row(site=0, node=0, derived_state="T") # Mutate back to the ancestral state so that all genotypes are zero - tables.mutations.add_row(site=0, node=0, time=0, derived_state="A", parent=0) + tables.mutations.add_row(site=0, node=0, derived_state="A", parent=0) ts = tables.tree_sequence() self.verify(ts) @@ -750,10 +750,8 @@ def test_non_sample_ancestry(self): # leaf so that samples are fixed at zero. tables.sites.add_row(position=0.25, ancestral_state="0") tables.sites.add_row(position=0.5, ancestral_state="0") - tables.mutations.add_row(site=0, node=4, time=-1, derived_state="1") - tables.mutations.add_row(site=1, node=6, time=-1, derived_state="1") - tables.build_index() - tables.compute_mutation_times() + tables.mutations.add_row(site=0, node=4, derived_state="1") + tables.mutations.add_row(site=1, node=6, derived_state="1") ts = tables.tree_sequence() self.verify(ts) @@ -4895,17 +4893,17 @@ def test_case_1(self): ) mutations = io.StringIO( """\ - site node time derived_state - 0 4 0.5 1 - 1 0 0 1 - 2 2 0 1 - 3 0 0 1 - 4 1 0 1 - 5 1 0 1 - 6 2 0 1 - 7 0 0 1 - 8 1 0 1 - 9 2 0 1 + site node derived_state + 0 4 1 + 1 0 1 + 2 2 1 + 3 0 1 + 4 1 1 + 5 1 1 + 6 2 1 + 7 0 1 + 8 1 1 + 9 2 1 """ ) ts = tskit.load_text( @@ -5103,9 +5101,9 @@ def test_case_odds_and_ends(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 0 0 0 1 -1 - 0 1 0 2 -1 + site node derived_state parent + 0 0 1 -1 + 0 1 2 -1 """ ) ts = tskit.load_text( @@ -5285,16 +5283,16 @@ def test_case_recurrent_muts(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 0 0 0 1 -1 - 0 0 0 2 0 - 0 0 0 0 1 - 0 1 0 2 -1 - 1 1 0 1 -1 - 1 2 0 1 -1 - 2 4 0.5 1 -1 - 2 1 0 2 6 - 2 2 0 3 6 + site node derived_state parent + 0 0 1 -1 + 0 0 2 0 + 0 0 0 1 + 0 1 2 -1 + 1 1 1 -1 + 1 2 1 -1 + 2 4 1 -1 + 2 1 2 6 + 2 2 3 6 """ ) ts = tskit.load_text( @@ -5406,14 +5404,14 @@ def test_case_2(self): ) mutations = io.StringIO( """\ - site node time derived_state parent - 0 0 0 1 -1 - 0 10 1 1 -1 - 0 0 0 0 0 - 1 8 2 1 -1 - 1 2 0 1 -1 - 2 8 2 1 -1 - 2 9 1 0 5 + site node derived_state parent + 0 0 1 -1 + 0 10 1 -1 + 0 0 0 0 + 1 8 1 -1 + 1 2 1 -1 + 2 8 1 -1 + 2 9 0 5 """ ) ts = tskit.load_text( diff --git a/python/tests/test_util.py b/python/tests/test_util.py index 20e85aa513..a9ed0b82e3 100644 --- a/python/tests/test_util.py +++ b/python/tests/test_util.py @@ -23,6 +23,7 @@ Tests for functions in util.py """ import itertools +import math import pickle import unittest @@ -30,6 +31,18 @@ import tests.tsutil as tsutil import tskit.util as util +from tskit import UNKNOWN_TIME + + +class TestUnknownTime(unittest.TestCase): + def test_unknown_time(self): + self.assertTrue(math.isnan(UNKNOWN_TIME)) + self.assertTrue(util.is_unknown_time(UNKNOWN_TIME)) + self.assertFalse(util.is_unknown_time(math.nan)) + self.assertFalse(util.is_unknown_time(np.nan)) + self.assertFalse(util.is_unknown_time(0)) + self.assertFalse(util.is_unknown_time(math.inf)) + self.assertFalse(util.is_unknown_time(1)) class TestNumpyArrayCasting(unittest.TestCase): diff --git a/python/tests/test_vcf.py b/python/tests/test_vcf.py index 4a31b60ab4..7872c55533 100644 --- a/python/tests/test_vcf.py +++ b/python/tests/test_vcf.py @@ -514,11 +514,11 @@ def test_many_alleles(self): tables.sites.add_row(0.5, "0") # 9 alleles should be fine for j in range(8): - tables.mutations.add_row(0, node=j, time=0, derived_state=str(j + 1)) + tables.mutations.add_row(0, node=j, derived_state=str(j + 1)) ts = tables.tree_sequence() ts.write_vcf(io.StringIO()) for j in range(9, 15): - tables.mutations.add_row(0, node=j, time=0, derived_state=str(j)) + tables.mutations.add_row(0, node=j, derived_state=str(j)) ts = tables.tree_sequence() with self.assertRaises(ValueError): ts.write_vcf(io.StringIO()) diff --git a/python/tests/test_wright_fisher.py b/python/tests/test_wright_fisher.py index 0844c12c0a..cbabf2859b 100644 --- a/python/tests/test_wright_fisher.py +++ b/python/tests/test_wright_fisher.py @@ -405,10 +405,6 @@ def verify_simplify(self, ts, new_ts, samples, node_map): self.assertEqual(node_map[mrca1], mrca2) mut_parent = tsutil.compute_mutation_parent(ts=ts) self.assertArrayEqual(mut_parent, ts.tables.mutations.parent) - mut_time = tsutil.compute_mutation_times(ts=ts) - self.assertTrue( - np.allclose(mut_time, ts.tables.mutations.time, rtol=1e-15, atol=1e-15) - ) def verify_haplotypes(self, ts, samples): """ diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 08448147a2..b76acdad9c 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -126,14 +126,11 @@ def insert_branch_mutations(ts, mutations_per_branch=1): mutation[u] = tables.mutations.add_row( site=site, node=u, - time=-1, derived_state=str(state[u]), parent=parent, ) parent = mutation[u] add_provenance(tables.provenances, "insert_branch_mutations") - tables.build_index() - tables.compute_mutation_times() return tables.tree_sequence() @@ -152,11 +149,9 @@ def insert_branch_sites(ts): for u in tree.nodes(): if tree.parent(u) != tskit.NULL: site = tables.sites.add_row(position=x, ancestral_state="0") - tables.mutations.add_row(site=site, node=u, time=-1, derived_state="1") + tables.mutations.add_row(site=site, node=u, derived_state="1") x += delta add_provenance(tables.provenances, "insert_branch_sites") - tables.build_index() - tables.compute_mutation_times() return tables.tree_sequence() @@ -181,12 +176,8 @@ def insert_multichar_mutations(ts, seed=1, max_len=10): derived_state = ancestral_state while ancestral_state == derived_state: derived_state = rng.choice(letters) * rng.randint(0, max_len) - tables.mutations.add_row( - site=site, node=u, time=tables.nodes[u].time, derived_state=derived_state - ) + tables.mutations.add_row(site=site, node=u, derived_state=derived_state) add_provenance(tables.provenances, "insert_multichar_mutations") - tables.build_index() - tables.compute_mutation_times() return tables.tree_sequence() @@ -339,7 +330,7 @@ def single_childify(ts): ) else: tables.mutations.add_row( - mut.site, u, mut.time, mut.derived_state, mut.parent, mut.metadata + mut.site, u, mut.derived_state, mut.parent, mut.metadata, mut.time ) tables.sort() add_provenance(tables.provenances, "insert_redundant_breakpoints") @@ -462,13 +453,13 @@ def generate_site_mutations( while x < branch_length: new_state = random.choice([s for s in states if s != state]) if multiple_per_node and (state != new_state): - mutation_table.add_row(site, u, -1, new_state, parent) + mutation_table.add_row(site, u, new_state, parent) parent = mutation_table.num_rows - 1 state = new_state x += random.expovariate(mu) else: if (not multiple_per_node) and (state != new_state): - mutation_table.add_row(site, u, -1, new_state, parent) + mutation_table.add_row(site, u, new_state, parent) parent = mutation_table.num_rows - 1 state = new_state stack.extend(reversed([(v, state, parent) for v in tree.children(u)])) @@ -500,8 +491,6 @@ def jukes_cantor(ts, num_sites, mu, multiple_per_node=True, seed=None): multiple_per_node=multiple_per_node, ) add_provenance(tables.provenances, "jukes_cantor") - tables.build_index() - tables.compute_mutation_times() new_ts = tables.tree_sequence() return new_ts @@ -530,15 +519,12 @@ def caterpillar_tree(n, num_sites=0, num_mutations=1): state = 0 for _ in range(num_mutations): state = (state + 1) % 2 - tables.mutations.add_row( - site=j, time=-1, derived_state=str(state), node=node - ) + tables.mutations.add_row(site=j, derived_state=str(state), node=node) node -= 1 tables.sort() tables.build_index() tables.compute_mutation_parents() - tables.compute_mutation_times() return tables.tree_sequence() diff --git a/python/tskit/__init__.py b/python/tskit/__init__.py index dd3d68e6a7..efdfd30820 100644 --- a/python/tskit/__init__.py +++ b/python/tskit/__init__.py @@ -46,6 +46,12 @@ #: the genotype integers 0, 1, 2, and 3, respectively. ALLELES_ACGT = ("A", "C", "G", "T") +#: Special NAN value used to indicate unknown mutation times +""" +Say what +""" +UNKNOWN_TIME = _tskit.UNKNOWN_TIME + from tskit.provenance import __version__ # NOQA from tskit.provenance import validate_provenance # NOQA from tskit.formats import * # NOQA diff --git a/python/tskit/formats.py b/python/tskit/formats.py index 6d22a794b5..12e31ddfb8 100644 --- a/python/tskit/formats.py +++ b/python/tskit/formats.py @@ -24,14 +24,11 @@ Module responsible for converting tree sequence files from older formats. """ -import collections import datetime import json import logging import h5py -import kastore -import numpy import numpy as np import tskit @@ -100,7 +97,6 @@ def _convert_hdf5_mutations( ) mutations.set_columns( node=node, - time=np.full(num_mutations, -1, dtype=np.float64), site=np.arange(num_mutations, dtype=np.int32), derived_state=ord("1") * np.ones(num_mutations, dtype=np.int8), derived_state_offset=np.arange(num_mutations + 1, dtype=np.uint32), @@ -172,8 +168,6 @@ def _load_legacy_hdf5_v2(root, remove_duplicate_positions): ) tables.provenances.add_row(_get_upgrade_provenance(root)) tables.sort() - tables.build_index() - tables.compute_mutation_times() return tables.tree_sequence() @@ -228,8 +222,6 @@ def _load_legacy_hdf5_v3(root, remove_duplicate_positions): tables.provenances.add_row(timestamp=old_timestamp, record=record) tables.provenances.add_row(_get_upgrade_provenance(root)) tables.sort() - tables.build_index() - tables.compute_mutation_times() return tables.tree_sequence() @@ -247,44 +239,16 @@ def load_legacy(filename, remove_duplicate_positions=False): 3: _load_legacy_hdf5_v3, 10: _load_legacy_hdf5_v10, } + root = h5py.File(filename, "r") + if "format_version" not in root.attrs: + raise ValueError("HDF5 file not in msprime format") + format_version = root.attrs["format_version"] + if format_version[0] not in loaders: + raise ValueError(f"Version {format_version} not supported for loading") try: - root = h5py.File(filename, "r") - if "format_version" not in root.attrs: - raise ValueError("HDF5 file not in msprime format") - format_version = root.attrs["format_version"] - if format_version[0] not in loaders: - raise ValueError(f"Version {format_version} not supported for loading") - try: - ts = loaders[format_version[0]](root, remove_duplicate_positions) - finally: - root.close() - except OSError as e: - # An exception like this means we have a kastore - if "file signature not found" not in str(e): - raise e - else: - with kastore.load(filename) as store: - data = dict(store) - # v12 and below have no mutation time - if "mutations/time" not in data: - data["mutations/time"] = np.full( - data["mutations/node"].shape, -1, dtype=np.float64 - ) - table_dict = collections.defaultdict(collections.defaultdict) - for key, value in data.items(): - if "/" in key: - table, col = key.split("/") - # kastore type for string columns doesn't match what `fromdict wants` - if value.dtype == np.uint8: - table_dict[table][col] = value.astype(np.int8) - else: - table_dict[table][col] = value - else: - table_dict[key] = value - tables = tskit.TableCollection.fromdict(table_dict) - tables.build_index() - tables.compute_mutation_times() - ts = tables.tree_sequence() + ts = loaders[format_version[0]](root, remove_duplicate_positions) + finally: + root.close() return ts @@ -577,7 +541,6 @@ def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False): tables.mutations.set_columns( site=mutations_group["site"], node=mutations_group["node"], - time=np.full(mutations_group["node"].shape, -1, dtype=np.float64), parent=mutations_group["parent"], derived_state=mutations_group["derived_state"], derived_state_offset=mutations_group["derived_state_offset"], @@ -599,8 +562,6 @@ def _load_legacy_hdf5_v10(root, remove_duplicate_positions=False): ) tables.provenances.add_row(_get_upgrade_provenance(root)) _set_populations(tables) - tables.build_index() - tables.compute_mutation_times() return tables.tree_sequence() diff --git a/python/tskit/tables.py b/python/tskit/tables.py index f922937176..547f84e279 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -38,6 +38,7 @@ import tskit.metadata as metadata import tskit.provenance as provenance import tskit.util as util +from tskit import UNKNOWN_TIME attr_options = {"slots": True, "frozen": True, "auto_attribs": True} @@ -105,10 +106,10 @@ class SiteTableRow: class MutationTableRow: site: int node: int - time: float derived_state: str parent: int metadata: bytes + time: float @attr.s(**attr_options) @@ -1408,7 +1409,9 @@ def _text_header_and_rows(self): ) return headers, rows - def add_row(self, site, node, time, derived_state, parent=-1, metadata=None): + def add_row( + self, site, node, derived_state, parent=-1, metadata=None, time=None, + ): """ Adds a new row to this :class:`MutationTable` and returns the ID of the corresponding mutation. Metadata, if specified, will be validated and encoded @@ -1417,16 +1420,24 @@ def add_row(self, site, node, time, derived_state, parent=-1, metadata=None): :param int site: The ID of the site that this mutation occurs at. :param int node: The ID of the first node inheriting this mutation. - :param float time: The occurence time for the new mutation. :param str derived_state: The state of the site at this mutation's node. :param int parent: The ID of the parent mutation. If not specified, defaults to :attr:`NULL`. :param object metadata: Any object that is valid metadata for the table's schema. :return: The ID of the newly added mutation. + :param float time: The occurrence time for the new mutation. If not specified, + defaults to ``UNKNOWN_TIME``, indicating the time is unknown. :rtype: int """ metadata = self.metadata_schema.validate_and_encode_row(metadata) - return self.ll_table.add_row(site, node, time, derived_state, parent, metadata) + return self.ll_table.add_row( + site, + node, + derived_state, + parent, + metadata, + UNKNOWN_TIME if time is None else time, + ) def set_columns( self, @@ -1444,9 +1455,9 @@ def set_columns( Sets the values for each column in this :class:`MutationTable` using the values in the specified arrays. Overwrites any data currently stored in the table. - The ``site``, ``node``, ``time``, ``derived_state`` and ``derived_state_offset`` + The ``site``, ``node``, ``derived_state`` and ``derived_state_offset`` parameters are mandatory, and must be 1D numpy arrays. The - ``site``, ``node`` and ``time`` (also ``parent``, if supplied) arrays + ``site`` and ``node`` (also ``parent`` and ``time``, if supplied) arrays must be of equal length, and determine the number of rows in the table. The ``derived_state`` and ``derived_state_offset`` parameters must be supplied together, and meet the requirements for @@ -1462,7 +1473,7 @@ def set_columns( :type site: numpy.ndarray, dtype=np.int32 :param node: The ID of the node each mutation is associated with. :type node: numpy.ndarray, dtype=np.int32 - :param time: The time values for each mutation. Required. + :param time: The time values for each mutation. :type time: numpy.ndarray, dtype=np.float64 :param derived_state: The flattened derived_state array. Required. :type derived_state: numpy.ndarray, dtype=np.int8 @@ -1502,10 +1513,10 @@ def append_columns( self, site, node, - time, derived_state, derived_state_offset, parent=None, + time=None, metadata=None, metadata_offset=None, ): @@ -1513,9 +1524,9 @@ def append_columns( Appends the specified arrays to the end of the columns of this :class:`MutationTable`. This allows many new rows to be added at once. - The ``site``, ``node``, ``time``, ``derived_state`` and ``derived_state_offset`` + The ``site``, ``node``, ``derived_state`` and ``derived_state_offset`` parameters are mandatory, and must be 1D numpy arrays. The - ``site``, ``node`` and ``time`` (also ``parent``, if supplied) arrays + ``site`` and ``node`` (also ``time`` and ``parent``, if supplied) arrays must be of equal length, and determine the number of additional rows to add to the table. The ``derived_state`` and ``derived_state_offset`` parameters must @@ -1532,7 +1543,7 @@ def append_columns( :type site: numpy.ndarray, dtype=np.int32 :param node: The ID of the node each mutation is associated with. :type node: numpy.ndarray, dtype=np.int32 - :param time: The time values for each mutation. Required. + :param time: The time values for each mutation. :type time: numpy.ndarray, dtype=np.float64 :param derived_state: The flattened derived_state array. Required. :type derived_state: numpy.ndarray, dtype=np.int8 diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 8b35c6ea47..d2e14d9548 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -27,6 +27,7 @@ import base64 import collections import concurrent.futures +import copy import functools import itertools import json @@ -50,6 +51,7 @@ import tskit.vcf as vcf from tskit import NODE_IS_SAMPLE from tskit import NULL +from tskit import UNKNOWN_TIME CoalescenceRecord = collections.namedtuple( @@ -337,6 +339,8 @@ class Mutation(SimpleContainerWithMetadata): To obtain further information about a node with a given ID, use :meth:`TreeSequence.node`. :vartype node: int + :ivar time: The occurrence time of this mutation. + :vartype node: float :ivar derived_state: The derived state for this mutation. This is the state inherited by nodes in the subtree rooted at this mutation's node, unless another mutation occurs. @@ -358,7 +362,7 @@ def __init__( id_=NULL, site=NULL, node=NULL, - time=NULL, + time=UNKNOWN_TIME, derived_state=None, parent=NULL, encoded_metadata=b"", @@ -373,6 +377,24 @@ def __init__( self._encoded_metadata = encoded_metadata self._metadata_decoder = metadata_decoder + def __eq__(self, other): + # We need to remove metadata and the decoder so we are just comparing + # the encoded metadata, along with the other attributes. + # We also need to remove time as we have to compare to unknown time. + other_ = copy.copy(other.__dict__) + other_["metadata"] = None + other_["_metadata_decoder"] = None + other_["time"] = None + self_ = copy.copy(self.__dict__) + self_["metadata"] = None + self_["_metadata_decoder"] = None + self_["time"] = None + return self_ == other_ and ( + self.time == other.time + # We need to special case unknown times as they are a nan value. + or (util.is_unknown_time(self.time) and util.is_unknown_time(other.time)) + ) + class Migration(SimpleContainerWithMetadata): """ @@ -2242,12 +2264,7 @@ def map_mutations(self, genotypes, alleles): # Translate back into string alleles ancestral_state = alleles[ancestral_state] mutations = [ - Mutation( - node=node, - time=self.tree_sequence.node(node).time, - derived_state=alleles[derived_state], - parent=parent, - ) + Mutation(node=node, derived_state=alleles[derived_state], parent=parent) for node, parent, derived_state in transitions ] return ancestral_state, mutations @@ -2526,7 +2543,7 @@ def parse_mutations( for the details of the required format and the :ref:`mutation table definition ` section for the required properties of the contents. Note that if the ``time`` column is missing it's - entries are filled with 0. + entries are filled with ``UNKNOWN_TIME``. See :func:`tskit.load_text` for a detailed explanation of the ``strict`` parameter. @@ -2569,7 +2586,10 @@ def parse_mutations( if len(tokens) >= 3: site = int(tokens[site_index]) node = int(tokens[node_index]) - time = float(tokens[time_index]) if time_index is not None else 0 + if time_index is None or tokens[time_index] == "unknown": + time = UNKNOWN_TIME + else: + time = float(tokens[time_index]) derived_state = tokens[derived_state_index] if parent_index is not None: parent = int(tokens[parent_index]) @@ -2642,7 +2662,6 @@ def load_text( strict=True, encoding="utf8", base64_metadata=True, - compute_mutation_times=False, ): """ Parses the tree sequence data from the specified file-like objects, and @@ -2701,8 +2720,6 @@ def load_text( :param str encoding: Encoding used for text representation. :param bool base64_metadata: If True, metadata is encoded using Base64 encoding; otherwise, as plain text. - :param bool compute_mutation_times: If True, mutation times are replaced with those - from :meth:`TableCollection.compute_mutation_times` :return: The tree sequence object containing the information stored in the specified file paths. :rtype: :class:`tskit.TreeSequence` @@ -2766,9 +2783,6 @@ def load_text( table=tc.populations, ) tc.sort() - if compute_mutation_times: - tc.build_index() - tc.compute_mutation_times() return tc.tree_sequence() @@ -3086,7 +3100,9 @@ def dump_text( ).format( site=mutation.site, node=mutation.node, - time=mutation.time, + time="unknown" + if util.is_unknown_time(mutation.time) + else mutation.time, derived_state=mutation.derived_state, parent=mutation.parent, metadata=metadata, @@ -3926,20 +3942,20 @@ def mutation(self, id_): ( site, node, - time, derived_state, parent, metadata, + time, ) = self._ll_tree_sequence.get_mutation(id_) return Mutation( id_=id_, site=site, node=node, - time=time, derived_state=derived_state, parent=parent, encoded_metadata=metadata, metadata_decoder=self.table_metadata_schemas.mutation.decode_row, + time=time, ) def site(self, id_): diff --git a/python/tskit/util.py b/python/tskit/util.py index 58ea1a2f87..e49f07fa7f 100644 --- a/python/tskit/util.py +++ b/python/tskit/util.py @@ -22,8 +22,20 @@ """ Module responsible for various utility functions used in other modules. """ +import struct + import numpy as np +from tskit import UNKNOWN_TIME + + +def is_unknown_time(time): + """ + As the default unknown mutation time is NAN equality always fails. This + method compares the bitfield. + """ + return struct.pack(">d", UNKNOWN_TIME) == struct.pack(">d", time) + def safe_np_int_cast(int_array, dtype, copy=False): """