diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 65f2722aa8..43c71b192d 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -11,6 +11,11 @@ current behaviour. (:user:`mufernando`, :user:`jeromekelleher`, :issue:`896`, :pr:`897`, :issue:`913`, :pr:`917`). +- Changed default behaviour of ``tsk_table_collection_clear`` to not clear + provenances and added ``options`` argument to optionally clear provenances + and schemas. + (:user:`benjeffery`, :issue:`929`, :pr:`1001`) + - Exposed ``tsk_table_collection_set_indexes`` to the API. (:user:`benjeffery`, :issue:`870`, :pr:`921`) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index f539e0744e..d8904b8a09 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -5142,7 +5142,7 @@ test_table_collection_union(void) &tables_copy, &tables_empty, node_mapping, TSK_UNION_NO_CHECK_SHARED); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); // self is empty - ret = tsk_table_collection_clear(&tables_copy); + ret = tsk_table_collection_clear(&tables_copy, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_union( &tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED); @@ -5290,6 +5290,118 @@ test_table_collection_union_errors(void) tsk_table_collection_free(&tables); } +static void +test_table_collection_clear_with_options(tsk_flags_t options) +{ + int ret; + tsk_table_collection_t tables; + bool clear_provenance = !!(options & TSK_CLEAR_PROVENANCE); + bool clear_metadata_schemas = !!(options & TSK_CLEAR_METADATA_SCHEMAS); + bool clear_ts_metadata = !!(options & TSK_CLEAR_TS_METADATA_AND_SCHEMA); + tsk_bookmark_t num_rows; + tsk_bookmark_t expected_rows = { .provenances = clear_provenance ? 0 : 1 }; + tsk_size_t expected_len = clear_metadata_schemas ? 0 : 4; + tsk_size_t expected_len_ts = clear_ts_metadata ? 
0 : 4; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + + ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, 1, 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 0, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + + ret = tsk_table_collection_build_index(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_individual_table_set_metadata_schema(&tables.individuals, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_node_table_set_metadata_schema(&tables.nodes, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_edge_table_set_metadata_schema(&tables.edges, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_migration_table_set_metadata_schema(&tables.migrations, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_site_table_set_metadata_schema(&tables.sites, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_mutation_table_set_metadata_schema(&tables.mutations, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = 
tsk_population_table_set_metadata_schema(&tables.populations, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_table_collection_set_metadata(&tables, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_set_metadata_schema(&tables, "test", 4); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_provenance_table_add_row(&tables.provenances, "today", 5, "test", 4); + CU_ASSERT_FATAL(ret >= 0); + + ret = tsk_table_collection_clear(&tables, options); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + ret = tsk_table_collection_record_num_rows(&tables, &num_rows); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(num_rows.individuals, expected_rows.individuals); + CU_ASSERT_EQUAL(num_rows.nodes, expected_rows.nodes); + CU_ASSERT_EQUAL(num_rows.edges, expected_rows.edges); + CU_ASSERT_EQUAL(num_rows.migrations, expected_rows.migrations); + CU_ASSERT_EQUAL(num_rows.sites, expected_rows.sites); + CU_ASSERT_EQUAL(num_rows.mutations, expected_rows.mutations); + CU_ASSERT_EQUAL(num_rows.populations, expected_rows.populations); + CU_ASSERT_EQUAL(num_rows.provenances, expected_rows.provenances); + + CU_ASSERT_FALSE(tsk_table_collection_has_index(&tables, 0)); + + CU_ASSERT_EQUAL(tables.individuals.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.nodes.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.edges.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.migrations.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.sites.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.mutations.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.populations.metadata_schema_length, expected_len); + CU_ASSERT_EQUAL(tables.metadata_schema_length, expected_len_ts); + CU_ASSERT_EQUAL(tables.metadata_length, expected_len_ts); + + tsk_table_collection_free(&tables); +} + +static void +test_table_collection_clear(void) +{ + test_table_collection_clear_with_options(0); + 
test_table_collection_clear_with_options(TSK_CLEAR_PROVENANCE); + test_table_collection_clear_with_options(TSK_CLEAR_METADATA_SCHEMAS); + test_table_collection_clear_with_options(TSK_CLEAR_TS_METADATA_AND_SCHEMA); + test_table_collection_clear_with_options( + TSK_CLEAR_PROVENANCE | TSK_CLEAR_METADATA_SCHEMAS); + test_table_collection_clear_with_options( + TSK_CLEAR_PROVENANCE | TSK_CLEAR_TS_METADATA_AND_SCHEMA); + test_table_collection_clear_with_options( + TSK_CLEAR_METADATA_SCHEMAS | TSK_CLEAR_TS_METADATA_AND_SCHEMA); + test_table_collection_clear_with_options(TSK_CLEAR_PROVENANCE + | TSK_CLEAR_METADATA_SCHEMAS + | TSK_CLEAR_TS_METADATA_AND_SCHEMA); +} + int main(int argc, char **argv) { @@ -5360,6 +5472,7 @@ main(int argc, char **argv) { "test_table_collection_subset_errors", test_table_collection_subset_errors }, { "test_table_collection_union", test_table_collection_union }, { "test_table_collection_union_errors", test_table_collection_union_errors }, + { "test_table_collection_clear", test_table_collection_clear }, { NULL, NULL }, }; diff --git a/c/tskit/tables.c b/c/tskit/tables.c index a874166889..629389713f 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6547,7 +6547,7 @@ simplifier_init(simplifier_t *self, const tsk_id_t *samples, size_t num_samples, ret = TSK_ERR_NO_MEMORY; goto out; } - ret = tsk_table_collection_clear(self->tables); + ret = tsk_table_collection_clear(self->tables, 0); if (ret != 0) { goto out; } @@ -7015,11 +7015,7 @@ simplifier_finalise_references(simplifier_t *self) } } - ret = tsk_provenance_table_copy( - &self->input_tables.provenances, &self->tables->provenances, TSK_NO_INIT); - if (ret != 0) { - goto out; - } + ret = 0; out: tsk_safe_free(population_referenced); tsk_safe_free(individual_referenced); @@ -9018,12 +9014,63 @@ tsk_table_collection_truncate(tsk_table_collection_t *tables, tsk_bookmark_t *po } int TSK_WARN_UNUSED -tsk_table_collection_clear(tsk_table_collection_t *self) 
+tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options) { - tsk_bookmark_t start; + int ret = 0; + bool clear_provenance = !!(options & TSK_CLEAR_PROVENANCE); + bool clear_metadata_schemas = !!(options & TSK_CLEAR_METADATA_SCHEMAS); + bool clear_ts_metadata = !!(options & TSK_CLEAR_TS_METADATA_AND_SCHEMA); + tsk_bookmark_t rows_to_retain + = { .provenances = clear_provenance ? 0 : self->provenances.num_rows }; + + ret = tsk_table_collection_truncate(self, &rows_to_retain); + if (ret != 0) { + goto out; + } - memset(&start, 0, sizeof(start)); - return tsk_table_collection_truncate(self, &start); + if (clear_metadata_schemas) { + ret = tsk_individual_table_set_metadata_schema(&self->individuals, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_node_table_set_metadata_schema(&self->nodes, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_edge_table_set_metadata_schema(&self->edges, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_migration_table_set_metadata_schema(&self->migrations, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_site_table_set_metadata_schema(&self->sites, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_mutation_table_set_metadata_schema(&self->mutations, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_population_table_set_metadata_schema(&self->populations, "", 0); + if (ret != 0) { + goto out; + } + } + + if (clear_ts_metadata) { + ret = tsk_table_collection_set_metadata(self, "", 0); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_set_metadata_schema(self, "", 0); + if (ret != 0) { + goto out; + } + } +out: + return ret; } static int @@ -9114,7 +9161,7 @@ tsk_table_collection_subset( if (ret != 0) { goto out; } - ret = tsk_table_collection_clear(self); + ret = tsk_table_collection_clear(self, 0); if (ret != 0) { goto out; } @@ -9199,13 +9246,7 @@ tsk_table_collection_subset( ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; goto out; } - - // provenance (new record is added in python) - 
ret = tsk_provenance_table_copy( - &tables.provenances, &self->provenances, TSK_NO_INIT); - if (ret < 0) { - goto out; - } + ret = 0; out: tsk_safe_free(node_map); diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 294577a5f1..6c5e8da9e1 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -727,6 +727,11 @@ typedef struct { #define TSK_CMP_IGNORE_METADATA (1 << 2) #define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3) +/* Flags for table collection clear */ +#define TSK_CLEAR_METADATA_SCHEMAS (1 << 0) +#define TSK_CLEAR_TS_METADATA_AND_SCHEMA (1 << 1) +#define TSK_CLEAR_PROVENANCE (1 << 2) + /****************************************************************************/ /* Function signatures */ /****************************************************************************/ @@ -2251,17 +2256,34 @@ int tsk_table_collection_init(tsk_table_collection_t *self, tsk_flags_t options) int tsk_table_collection_free(tsk_table_collection_t *self); /** -@brief Clears all tables in this table collection. +@brief Clears data tables (and optionally provenances and metadata) in +this table collection. @rst +By default this operation clears all tables except the provenance table, retaining +table metadata schemas and the tree-sequence level metadata and schema. + +**Options** + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_CLEAR_PROVENANCE + Additionally clear the provenance table +TSK_CLEAR_METADATA_SCHEMAS + Additionally clear the table metadata schemas +TSK_CLEAR_TS_METADATA_AND_SCHEMA + Additionally clear the tree-sequence metadata and schema + No memory is freed as a result of this operation; please use :c:func:`tsk_table_collection_free` to free internal resources. @endrst @param self A pointer to a tsk_table_collection_t object. +@param options Bitwise clearing options @return Return 0 on success or a negative value on failure.
*/ -int tsk_table_collection_clear(tsk_table_collection_t *self); +int tsk_table_collection_clear(tsk_table_collection_t *self, tsk_flags_t options); /** @brief Returns true if the data in the specified table collection is equal diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 8d01059ce2..2636684abd 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -45,6 +45,10 @@ reports the size in bytes of those objects. (:user:`jeromekelleher`, :user:`benjeffery`, :issue:`54`, :pr:`871`) +- Added ``TableCollection.clear`` to clear data table rows and optionally + provenances, table schemas and tree-sequence level metadata and schema. + (:user:`benjeffery`, :issue:`929`, :pr:`1001`) + **Breaking changes** - The argument to ``ts.dump`` and ``tskit.load`` has been renamed `file` from `path`. diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 424e0d109d..7e3d81ea5d 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -5469,6 +5469,45 @@ TableCollection_equals(TableCollection *self, PyObject *args, PyObject *kwds) return ret; } +static PyObject * +TableCollection_clear(TableCollection *self, PyObject *args, PyObject *kwds) +{ + int err; + PyObject *ret = NULL; + tsk_flags_t options = 0; + int clear_provenance = false; + int clear_metadata_schemas = false; + int clear_ts_metadata = false; + static char *kwlist[] = { "clear_provenance", "clear_metadata_schemas", + "clear_ts_metadata_and_schema", NULL }; + + if (TableCollection_check_state(self)) { + goto out; + } + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iii", kwlist, &clear_provenance, + &clear_metadata_schemas, &clear_ts_metadata)) { + goto out; + } + if (clear_provenance) { + options |= TSK_CLEAR_PROVENANCE; + } + if (clear_metadata_schemas) { + options |= TSK_CLEAR_METADATA_SCHEMAS; + } + if (clear_ts_metadata) { + options |= TSK_CLEAR_TS_METADATA_AND_SCHEMA; + } + + err = tsk_table_collection_clear(self->tables, options); + if (err != 0) { + handle_library_error(err); 
+ goto out; + } + ret = Py_BuildValue(""); +out: + return ret; +} + static PyObject * TableCollection_dump(TableCollection *self, PyObject *args, PyObject *kwds) { @@ -5649,6 +5688,10 @@ static PyMethodDef TableCollection_methods[] = { .ml_meth = (PyCFunction) TableCollection_has_index, .ml_flags = METH_NOARGS, .ml_doc = "Returns True if the TableCollection is indexed." }, + { .ml_name = "clear", + .ml_meth = (PyCFunction) TableCollection_clear, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = "Clears table contents, and optionally provenances and metadata" }, { .ml_name = "dump", .ml_meth = (PyCFunction) TableCollection_dump, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 1f5e69cbc0..978770f963 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -865,6 +865,45 @@ def test_load_tables_build_indexes(self): ts4.dump_tables(tables4) assert tables4.has_index() + def test_clear_table(self, ts_fixture): + tables = _tskit.TableCollection( + sequence_length=ts_fixture.get_sequence_length() + ) + ts_fixture.ll_tree_sequence.dump_tables(tables) + tables.clear() + data_tables = [ + "individuals", + "nodes", + "edges", + "migrations", + "sites", + "mutations", + "populations", + ] + for table in data_tables: + assert getattr(tables, f"{table}").num_rows == 0 + assert len(getattr(tables, f"{table}").metadata_schema) != 0 + assert tables.provenances.num_rows > 0 + assert len(tables.metadata) > 0 + assert len(tables.metadata_schema) > 0 + + tables.clear(clear_provenance=True) + assert tables.provenances.num_rows == 0 + for table in data_tables: + assert len(getattr(tables, f"{table}").metadata_schema) != 0 + assert len(tables.metadata) > 0 + assert len(tables.metadata_schema) > 0 + + tables.clear(clear_metadata_schemas=True) + for table in data_tables: + assert len(getattr(tables, f"{table}").metadata_schema) == 0 + assert len(tables.metadata) > 0 + assert 
len(tables.metadata_schema) > 0 + + tables.clear(clear_ts_metadata_and_schema=True) + assert len(tables.metadata) == 0 + assert len(tables.metadata_schema) == 0 + class StatsInterfaceMixin: """ diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index da6dba68f2..3591406371 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -2369,6 +2369,42 @@ def test_copy(self, ts_fixture): t1.edges.clear() assert t1 != t2 + def test_clear_table(self, ts_fixture): + tables = ts_fixture.dump_tables() + tables.clear() + data_tables = [ + "individuals", + "nodes", + "edges", + "migrations", + "sites", + "mutations", + "populations", + ] + for table in data_tables: + assert getattr(tables, f"{table}").num_rows == 0 + assert str(getattr(tables, f"{table}").metadata_schema) != "" + assert tables.provenances.num_rows > 0 + assert len(tables.metadata) > 0 + assert str(tables.metadata_schema) != "" + + tables.clear(clear_provenance=True) + assert tables.provenances.num_rows == 0 + for table in data_tables: + assert str(getattr(tables, f"{table}").metadata_schema) != "" + assert len(tables.metadata) > 0 + assert str(tables.metadata_schema) != "" + + tables.clear(clear_metadata_schemas=True) + for table in data_tables: + assert str(getattr(tables, f"{table}").metadata_schema) == "" + assert len(tables.metadata) > 0 + assert str(tables.metadata_schema) != "" + + tables.clear(clear_ts_metadata_and_schema=True) + assert len(tables.metadata) == 0 + assert str(tables.metadata_schema) == "" + def test_equals(self): pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)] migration_matrix = [[0, 1], [1, 0]] diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index b50c4c4f33..4eddc4cb3f 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -576,14 +576,7 @@ def py_subset(tables, nodes, record_provenance=True): if np.any(nodes > tables.nodes.num_rows) or np.any(nodes < 0): raise ValueError("Nodes out of bounds.") full =
tables.copy() - # there is no table collection clear in the py API - tables.nodes.clear() - tables.individuals.clear() - tables.populations.clear() - tables.edges.clear() - tables.migrations.clear() - tables.sites.clear() - tables.mutations.clear() + tables.clear() # mapping from old to new ids node_map = {} ind_map = {tskit.NULL: tskit.NULL} diff --git a/python/tskit/tables.py b/python/tskit/tables.py index f1a3a09796..b60445c541 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2879,6 +2879,29 @@ def trim(self, record_provenance=True): record=json.dumps(provenance.get_provenance_dict(parameters)) ) + def clear( + self, + clear_provenance=False, + clear_metadata_schemas=False, + clear_ts_metadata_and_schema=False, + ): + """ + Remove all rows of the data tables, and optionally also the provenance + rows, table metadata schemas and tree-sequence-level metadata and schema. + + :param bool clear_provenance: If ``True``, remove all rows of the provenance + table. (Default: ``False``). + :param bool clear_metadata_schemas: If ``True``, clear the table metadata + schemas. (Default: ``False``). + :param bool clear_ts_metadata_and_schema: If ``True``, clear the tree-sequence + level metadata and schema. (Default: ``False``). + """ + self._ll_tables.clear( + clear_provenance=clear_provenance, + clear_metadata_schemas=clear_metadata_schemas, + clear_ts_metadata_and_schema=clear_ts_metadata_and_schema, + ) + def has_index(self): """ Returns True if this TableCollection is indexed.