diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index 1d6a3d1d69..760e231dbf 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -55,6 +55,9 @@ In development. - Tables loaded from a file can now be edited in the same way as any other table collection (:user:`jeromekelleher`, :issue:`536`, :pr:`530`. +- Support for reading/writing to arbitrary file streams with the loadf/dumpf + variants for tree sequence and table collection load/dump + (:user:`jeromekelleher`, :user:`grahamgower`, :issue:`565`, :pr:`599`). **Deprecated** diff --git a/c/examples/Makefile b/c/examples/Makefile index 79fe3baa51..d3fddfba12 100644 --- a/c/examples/Makefile +++ b/c/examples/Makefile @@ -1,5 +1,5 @@ # Simple Makefile for building examples. -# This will build the examples in the current directory by compiling in the +# This will build the examples in the current directory by compiling in the # full tskit source into each of the examples. This is *not* recommended for # real projects! # @@ -8,28 +8,31 @@ # # **Note**: This repo uses git submodules, and these must be checked out # correctly for this makefile to work, e.g.: -# +# # $ git clone git@github.com:tskit-dev/tskit.git --recurse-submodules -# +# # See the documentation (https://tskit.readthedocs.io/en/stable/c-api.html) -# for more details on how to use the C API, and the tskit build examples -# repo (https://github.com/tskit-dev/tskit-build-examples) for examples +# for more details on how to use the C API, and the tskit build examples +# repo (https://github.com/tskit-dev/tskit-build-examples) for examples # of how to set up a production-ready build with tskit. 
# CFLAGS=-I../ -I../subprojects/kastore TSKIT_SOURCE=../tskit/*.c ../subprojects/kastore/kastore.c -all: tree_iteration haploid_wright_fisher tree_traversal +all: tree_iteration haploid_wright_fisher tree_traversal streaming + +tree_iteration: tree_iteration.c + ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lm -tree_iteration: tree_iteration.c +tree_traversal: tree_traversal.c ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lm -tree_traversal: tree_traversal.c +streaming: streaming.c ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lm # This needs GSL -haploid_wright_fisher: haploid_wright_fisher.c +haploid_wright_fisher: haploid_wright_fisher.c ${CC} ${CFLAGS} -o $@ $< ${TSKIT_SOURCE} -lgsl -lgslcblas -lm clean: diff --git a/c/examples/streaming.c b/c/examples/streaming.c new file mode 100644 index 0000000000..cef8eca69c --- /dev/null +++ b/c/examples/streaming.c @@ -0,0 +1,37 @@ +#include <stdio.h> +#include <stdlib.h> +#include <tskit/tables.h> + +#define check_tsk_error(val) \ + if (val < 0) { \ + fprintf(stderr, "Error: line %d: %s\n", __LINE__, tsk_strerror(val)); \ + exit(EXIT_FAILURE); \ + } + +int +main(int argc, char **argv) +{ + int ret; + int j = 0; + tsk_table_collection_t tables; + + ret = tsk_table_collection_init(&tables, 0); + check_tsk_error(ret); + + while (true) { + ret = tsk_table_collection_loadf(&tables, stdin, TSK_NO_INIT); + if (ret == TSK_ERR_EOF) { + break; + } + check_tsk_error(ret); + fprintf(stderr, "Tree sequence %d had %d mutations\n", j, + (int) tables.mutations.num_rows); + ret = tsk_mutation_table_truncate(&tables.mutations, 0); + check_tsk_error(ret); + ret = tsk_table_collection_dumpf(&tables, stdout, 0); + check_tsk_error(ret); + j++; + } + tsk_table_collection_free(&tables); + return EXIT_SUCCESS; +} diff --git a/c/meson.build b/c/meson.build index 861a15ccc1..3fee7085c8 100644 --- a/c/meson.build +++ b/c/meson.build @@ -99,6 +99,8 @@ if not meson.is_subproject() sources: ['examples/tree_iteration.c'], link_with: [tskit_lib], dependencies: lib_deps) executable('tree_traversal', 
sources: ['examples/tree_traversal.c'], link_with: [tskit_lib], dependencies: lib_deps) + executable('streaming', + sources: ['examples/streaming.c'], link_with: [tskit_lib], dependencies: lib_deps) gsl_dep = dependency('gsl', required: false) if gsl_dep.found() diff --git a/c/tests/meson-subproject/example.c b/c/tests/meson-subproject/example.c index 3d21448785..d9625e0b3a 100644 --- a/c/tests/meson-subproject/example.c +++ b/c/tests/meson-subproject/example.c @@ -52,7 +52,7 @@ test_load_error() printf("test_open_error\n"); tsk_treeseq_t ts; int ret = tsk_treeseq_load(&ts, "no such file", 0); - assert(tsk_is_kas_error(ret)); + assert(ret == TSK_ERR_IO); tsk_treeseq_free(&ts); } diff --git a/c/tests/test_core.c b/c/tests/test_core.c index eb8660c72a..ed509e803c 100644 --- a/c/tests/test_core.c +++ b/c/tests/test_core.c @@ -47,7 +47,7 @@ test_strerror(void) static void test_strerror_kastore(void) { - int kastore_errors[] = { KAS_ERR_NO_MEMORY, KAS_ERR_IO, KAS_ERR_KEY_NOT_FOUND }; + int kastore_errors[] = { KAS_ERR_NO_MEMORY, KAS_ERR_KEY_NOT_FOUND }; size_t j; int err; diff --git a/c/tests/test_file_format.c b/c/tests/test_file_format.c index 7d62404c15..988d25d0da 100644 --- a/c/tests/test_file_format.c +++ b/c/tests/test_file_format.c @@ -643,26 +643,155 @@ test_metadata_schemas_optional(void) } static void -test_table_collection_load_errors(void) +test_load_bad_file_formats(void) { tsk_table_collection_t tables; - int ret; + tsk_treeseq_t ts; + int ret, ret2; + off_t offset; + FILE *f; + + /* A zero byte file is TSK_ERR_EOF */ + f = fopen(_tmp_file_name, "w+"); + ret = tsk_table_collection_loadf(&tables, f, 0); + ret2 = tsk_treeseq_loadf(&ts, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, ret2); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); + fclose(f); + + for (offset = 1; offset < 100; offset++) { + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1.0; + ret 
= tsk_table_collection_dump(&tables, _tmp_file_name, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + truncate(_tmp_file_name, offset); + ret = tsk_table_collection_load(&tables, _tmp_file_name, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_BAD_FILE_FORMAT); + tsk_table_collection_free(&tables); + } +} + +static void +test_load_errors(void) +{ + tsk_table_collection_t tables; + tsk_treeseq_t ts; + int ret, ret2; const char *str; + FILE *f; ret = tsk_table_collection_load(&tables, "/", 0); + ret2 = tsk_treeseq_load(&ts, "/", 0); + CU_ASSERT_EQUAL_FATAL(ret, ret2); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); + str = tsk_strerror(ret); + CU_ASSERT_TRUE(strlen(str) > 0); + CU_ASSERT_STRING_EQUAL(str, strerror(EISDIR)); + tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); + + ret = tsk_table_collection_load(&tables, "/bin/theres_no_way_this_file_exists", 0); + ret2 = tsk_treeseq_load(&ts, "/bin/theres_no_way_this_file_exists", 0); + CU_ASSERT_EQUAL_FATAL(ret, ret2); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); + str = tsk_strerror(ret); + CU_ASSERT_TRUE(strlen(str) > 0); + CU_ASSERT_STRING_EQUAL(str, strerror(ENOENT)); + tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); + + ret = tsk_table_collection_load(&tables, "/bin/sh", 0); + ret2 = tsk_treeseq_load(&ts, "/bin/sh", 0); + CU_ASSERT_EQUAL_FATAL(ret, ret2); CU_ASSERT_TRUE(tsk_is_kas_error(ret)); - CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_IO); + CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_BAD_FILE_FORMAT); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); + tsk_table_collection_free(&tables); + /* open a file in the wrong mode */ + f = fopen(_tmp_file_name, "w"); + ret = tsk_table_collection_loadf(&tables, f, 0); + ret2 = tsk_treeseq_loadf(&ts, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, ret2); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); + str = tsk_strerror(ret); + CU_ASSERT_TRUE(strlen(str) > 0); + CU_ASSERT_STRING_EQUAL(str, strerror(EBADF)); 
tsk_table_collection_free(&tables); + tsk_treeseq_free(&ts); + fclose(f); +} + +static void +test_load_eof(void) +{ + tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); + tsk_table_collection_t tables; + int ret; + FILE *f; + + f = fopen(_tmp_file_name, "w+"); + CU_ASSERT_NOT_EQUAL(f, NULL); + ret = tsk_table_collection_loadf(&tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + fclose(f); + tsk_table_collection_free(&tables); + + /* Reading an empty file also returns EOF */ + ret = tsk_table_collection_load(&tables, _tmp_file_name, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + tsk_table_collection_free(&tables); + + f = fopen(_tmp_file_name, "w+"); + CU_ASSERT_NOT_EQUAL(f, NULL); + ret = tsk_treeseq_dumpf(ts, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Reading from the end of the stream gives EOF */ + ret = tsk_table_collection_loadf(&tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + tsk_table_collection_free(&tables); + + /* Reading the start of the stream is fine */ + fseek(f, 0, SEEK_SET); + ret = tsk_table_collection_loadf(&tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_table_collection_free(&tables); + + /* And we should be back to the end of the stream */ + ret = tsk_table_collection_loadf(&tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + tsk_table_collection_free(&tables); + + /* Trying to read the same end stream should give the same + * result. 
*/ + ret = tsk_table_collection_loadf(&tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + tsk_table_collection_free(&tables); + + /* A previously init'd tables should be good too */ + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_loadf(&tables, f, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_EOF); + tsk_table_collection_free(&tables); + + fclose(f); + tsk_treeseq_free(ts); + free(ts); } static void -test_table_collection_dump_errors(void) +test_dump_errors(void) { tsk_table_collection_t tables; int ret; + FILE *f; const char *str; ret = tsk_table_collection_init(&tables, 0); @@ -670,10 +799,27 @@ test_table_collection_dump_errors(void) tables.sequence_length = 1.0; ret = tsk_table_collection_dump(&tables, "/", 0); - CU_ASSERT_TRUE(tsk_is_kas_error(ret)); - CU_ASSERT_EQUAL_FATAL(ret ^ (1 << TSK_KAS_ERR_BIT), KAS_ERR_IO); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); + str = tsk_strerror(ret); + CU_ASSERT_TRUE(strlen(str) > 0); + CU_ASSERT_STRING_EQUAL(str, strerror(EISDIR)); + + /* We're assuming that we don't have write access to /bin, so don't run this + * as root! 
*/ + ret = tsk_table_collection_dump(&tables, "/bin/theres_no_way_this_file_exists", 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); + str = tsk_strerror(ret); + CU_ASSERT_TRUE(strlen(str) > 0); + CU_ASSERT_STRING_EQUAL(str, strerror(EACCES)); + + /* open a file in the wrong mode */ + f = fopen(_tmp_file_name, "r"); + ret = tsk_table_collection_dumpf(&tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_IO); str = tsk_strerror(ret); CU_ASSERT_TRUE(strlen(str) > 0); + CU_ASSERT_STRING_EQUAL(str, strerror(EBADF)); + fclose(f); /* We'd like to catch close errors also, but it's hard to provoke them * without intercepting calls to fclose() */ @@ -775,10 +921,12 @@ static void test_example_round_trip(void) { int ret; - tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3); + tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3); + tsk_treeseq_t ts2; tsk_table_collection_t t1, t2; + FILE *f; - ret = tsk_treeseq_copy_tables(ts, &t1, 0); + ret = tsk_treeseq_copy_tables(ts1, &t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_dump(&t1, _tmp_file_name, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -786,10 +934,111 @@ test_example_round_trip(void) CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2)); - tsk_table_collection_free(&t1); + /* Reading multiple times into the same tables with TSK_NO_INIT is supported. 
*/ + ret = tsk_table_collection_load(&t2, _tmp_file_name, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2)); tsk_table_collection_free(&t2); - tsk_treeseq_free(ts); - free(ts); + + /* Do the same thing with treeseq API */ + remove(_tmp_file_name); + ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_treeseq_load(&ts2, _tmp_file_name, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables)); + tsk_treeseq_free(&ts2); + + /* Use loadf form */ + f = fopen(_tmp_file_name, "w+"); + ret = tsk_table_collection_dumpf(&t1, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + fseek(f, 0, SEEK_SET); + ret = tsk_table_collection_loadf(&t2, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2)); + tsk_table_collection_free(&t2); + fclose(f); + + /* Do the same thing with treeseq API */ + f = fopen(_tmp_file_name, "w+"); + ret = tsk_treeseq_dumpf(ts1, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + fseek(f, 0, SEEK_SET); + ret = tsk_treeseq_loadf(&ts2, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables)); + tsk_treeseq_free(&ts2); + + fclose(f); + tsk_table_collection_free(&t1); + tsk_treeseq_free(ts1); + free(ts1); +} + +static void +test_multiple_round_trip(void) +{ + int ret; + tsk_size_t j; + tsk_size_t num_examples = 10; + tsk_treeseq_t *ts; + tsk_table_collection_t in_tables[num_examples]; + tsk_table_collection_t out_tables; + FILE *f = fopen(_tmp_file_name, "w+"); + + CU_ASSERT_NOT_EQUAL_FATAL(f, NULL); + + for (j = 0; j < num_examples; j++) { + ts = caterpillar_tree(5 + j, 3 + j, 3 + j); + ret = tsk_treeseq_copy_tables(ts, &in_tables[j], 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_treeseq_dumpf(ts, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(ts); + free(ts); + } + + fseek(f, 0, SEEK_SET); + for (j = 0; j < num_examples; j++) { + ret = 
tsk_table_collection_loadf(&out_tables, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&in_tables[j], &out_tables)); + tsk_table_collection_free(&out_tables); + } + + /* Can do the same with the same set of previously init'd tables. */ + ret = tsk_table_collection_init(&out_tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + fseek(f, 0, SEEK_SET); + for (j = 0; j < num_examples; j++) { + ret = tsk_table_collection_loadf(&out_tables, f, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&in_tables[j], &out_tables)); + } + tsk_table_collection_free(&out_tables); + + /* Can also read until EOF to do the same thing */ + ret = tsk_table_collection_init(&out_tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + fseek(f, 0, SEEK_SET); + j = 0; + while (true) { + ret = tsk_table_collection_loadf(&out_tables, f, TSK_NO_INIT); + if (ret == TSK_ERR_EOF) { + break; + } + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&in_tables[j], &out_tables)); + j++; + } + tsk_table_collection_free(&out_tables); + CU_ASSERT_EQUAL_FATAL(j, num_examples); + + for (j = 0; j < num_examples; j++) { + tsk_table_collection_free(&in_tables[j]); + } + fclose(f); } static void @@ -826,9 +1075,12 @@ main(int argc, char **argv) { "test_bad_offset_columns", test_bad_offset_columns }, { "test_metadata_schemas_optional", test_metadata_schemas_optional }, { "test_load_node_table_errors", test_load_node_table_errors }, - { "test_table_collection_load_errors", test_table_collection_load_errors }, - { "test_table_collection_dump_errors", test_table_collection_dump_errors }, + { "test_load_bad_file_formats", test_load_bad_file_formats }, + { "test_load_errors", test_load_errors }, + { "test_load_eof", test_load_eof }, + { "test_dump_errors", test_dump_errors }, { "test_example_round_trip", test_example_round_trip }, + { "test_multiple_round_trip", test_multiple_round_trip }, { "test_copy_store_drop_columns", 
test_copy_store_drop_columns }, { NULL, NULL }, }; diff --git a/c/tests/test_minimal_cpp.cpp b/c/tests/test_minimal_cpp.cpp index 3a9ce6afb6..a6d8d9bbf7 100644 --- a/c/tests/test_minimal_cpp.cpp +++ b/c/tests/test_minimal_cpp.cpp @@ -57,7 +57,7 @@ test_load_error() std::cout << "test_open_error" << endl; tsk_treeseq_t ts; int ret = tsk_treeseq_load(&ts, "no such file", 0); - assert(tsk_is_kas_error(ret)); + assert(ret == TSK_ERR_IO); tsk_treeseq_free(&ts); } diff --git a/c/tskit/core.c b/c/tskit/core.c index 0424cfcba9..317c2c8097 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -154,6 +154,9 @@ tsk_strerror_internal(int err) case TSK_ERR_GENERATE_UUID: ret = "Error generating UUID"; break; + case TSK_ERR_EOF: + ret = "End of file"; + break; /* File format errors */ case TSK_ERR_FILE_FORMAT: @@ -423,8 +426,14 @@ tsk_strerror_internal(int err) int tsk_set_kas_error(int err) { - /* Flip this bit. As the error is negative, this sets the bit to 0 */ - return err ^ (1 << TSK_KAS_ERR_BIT); + if (err == KAS_ERR_IO) { + /* If we've detected an IO error, report it as TSK_ERR_IO so that we have + * a consistent error code covering these situations */ + return TSK_ERR_IO; + } else { + /* Flip this bit. As the error is negative, this sets the bit to 0 */ + return err ^ (1 << TSK_KAS_ERR_BIT); + } } bool diff --git a/c/tskit/core.h b/c/tskit/core.h index 622f075dde..3d74499afa 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -119,6 +119,10 @@ An IO error occured. #define TSK_ERR_BUFFER_OVERFLOW -5 #define TSK_ERR_UNSUPPORTED_OPERATION -6 #define TSK_ERR_GENERATE_UUID -7 +/** +The file stream ended after reading zero bytes. 
+*/ +#define TSK_ERR_EOF -8 /** @} */ /** diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 607cd4ce42..6f5e8ef38e 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019 Tskit Developers + * Copyright (c) 2019-2020 Tskit Developers * Copyright (c) 2017-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -7052,7 +7052,10 @@ tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *s ret = TSK_ERR_FILE_FORMAT; goto out; } - + /* This is safe because either we are in a case where TSK_NO_INIT has been set + * and there is a valid pointer, or this is a fresh table collection where all + * pointers have been set to zero */ + tsk_safe_free(self->file_uuid); /* Allow space for \0 so we can print it as a string */ self->file_uuid = malloc(TSK_UUID_SIZE + 1); if (self->file_uuid == NULL) { @@ -7163,22 +7166,23 @@ tsk_table_collection_load_indexes(tsk_table_collection_t *self, kastore_t *store return ret; } -int TSK_WARN_UNUSED -tsk_table_collection_load( - tsk_table_collection_t *self, const char *filename, tsk_flags_t options) +static int TSK_WARN_UNUSED +tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file) { int ret = 0; kastore_t store; - memset(&store, 0, sizeof(store)); - - ret = tsk_table_collection_init(self, options); + ret = kastore_openf(&store, file, "r", KAS_READ_ALL); if (ret != 0) { - goto out; - } - ret = kastore_open(&store, filename, "r", KAS_READ_ALL); - if (ret != 0) { - ret = tsk_set_kas_error(ret); + if (ret == KAS_ERR_EOF) { + /* KAS_ERR_EOF means that we tried to read a store from the stream + * and we hit EOF immediately without reading any bytes. 
We signal + * this back to the client, which allows it to read an indefinite + * number of stores from a stream */ + ret = TSK_ERR_EOF; + } else { + ret = tsk_set_kas_error(ret); + } goto out; } ret = tsk_table_collection_read_format_data(self, &store); @@ -7233,6 +7237,61 @@ tsk_table_collection_load( return ret; } +int TSK_WARN_UNUSED +tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t options) +{ + int ret = 0; + + if (!(options & TSK_NO_INIT)) { + ret = tsk_table_collection_init(self, options); + if (ret != 0) { + goto out; + } + } + ret = tsk_table_collection_loadf_inited(self, file); + if (ret != 0) { + goto out; + } +out: + return ret; +} + +int TSK_WARN_UNUSED +tsk_table_collection_load( + tsk_table_collection_t *self, const char *filename, tsk_flags_t options) +{ + int ret = 0; + FILE *file = NULL; + + if (!(options & TSK_NO_INIT)) { + ret = tsk_table_collection_init(self, options); + if (ret != 0) { + goto out; + } + } + file = fopen(filename, "rb"); + if (file == NULL) { + ret = TSK_ERR_IO; + goto out; + } + ret = tsk_table_collection_loadf_inited(self, file); + if (ret != 0) { + goto out; + } + if (fclose(file) != 0) { + ret = TSK_ERR_IO; + goto out; + } + file = NULL; +out: + if (file != NULL) { + /* Ignore any additional errors we might get when closing the file + * in error conditions */ + fclose(file); + } + return ret; +} + static int TSK_WARN_UNUSED tsk_table_collection_write_format_data(tsk_table_collection_t *self, kastore_t *store) { @@ -7266,6 +7325,36 @@ tsk_table_collection_write_format_data(tsk_table_collection_t *self, kastore_t * int TSK_WARN_UNUSED tsk_table_collection_dump( tsk_table_collection_t *self, const char *filename, tsk_flags_t options) +{ + int ret = 0; + FILE *file = fopen(filename, "wb"); + + if (file == NULL) { + ret = TSK_ERR_IO; + goto out; + } + ret = tsk_table_collection_dumpf(self, file, options); + if (ret != 0) { + goto out; + } + if (fclose(file) != 0) { + ret = TSK_ERR_IO; + goto out; 
+ } + file = NULL; +out: + if (file != NULL) { + /* Ignore any additional errors we might get when closing the file + * in error conditions */ + fclose(file); + /* If an error occurred make sure that the filename is removed */ + remove(filename); + } + return ret; +} + +int TSK_WARN_UNUSED +tsk_table_collection_dumpf(tsk_table_collection_t *self, FILE *file, tsk_flags_t options) { int ret = 0; kastore_t store; @@ -7282,7 +7371,7 @@ tsk_table_collection_dump( } } - ret = kastore_open(&store, filename, "w", 0); + ret = kastore_openf(&store, file, "w", 0); if (ret != 0) { ret = tsk_set_kas_error(ret); goto out; @@ -7330,11 +7419,12 @@ tsk_table_collection_dump( if (ret != 0) { goto out; } + ret = kastore_close(&store); if (ret != 0) { ret = tsk_set_kas_error(ret); + goto out; } - out: /* It's safe to close a kastore twice. */ if (ret != 0) { diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 82604a1922..47d3d9e3cd 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -2095,15 +2095,22 @@ on and may change arbitrarily between versions. void tsk_table_collection_print_state(tsk_table_collection_t *self, FILE *out); /** -@brief Load a table collection from file. +@brief Load a table collection from a file path. @rst Loads the data from the specified file into this table collection. By default, the table collection is also initialised. +The resources allocated must be freed using +:c:func:`tsk_table_collection_free` even in error conditions. + If the :c:macro:`TSK_NO_INIT` option is set, the table collection is not initialised, allowing an already initialised table collection to be overwritten with the data from a file. +If the file contains multiple table collections, this function will load +the first. Please see the :c:func:`tsk_table_collection_loadf` for details +on how to sequentially load table collections from a stream. 
+ **Options** Options can be specified by providing one or more of the following bitwise @@ -2136,6 +2143,50 @@ TSK_NO_INIT int tsk_table_collection_load( tsk_table_collection_t *self, const char *filename, tsk_flags_t options); +/** +@brief Load a table collection from a stream. + +@rst +Loads a tables definition from the specified file stream to this table +collection. By default, the table collection is also initialised. +The resources allocated must be freed using +:c:func:`tsk_table_collection_free` even in error conditions. + +If the :c:macro:`TSK_NO_INIT` option is set, the table collection is +not initialised, allowing an already initialised table collection to +be overwritten with the data from a file. + +If the stream contains multiple table collection definitions, this function +will load the next table collection from the stream. If the stream contains no +more table collection definitions the error value :c:macro:`TSK_ERR_EOF` will +be returned. Note that EOF is only returned in the case where zero bytes are +read from the stream --- malformed files or other errors will result in +different error conditions. Please see the +:ref:`sec_c_api_examples_file_streaming` section for an example of how to +sequentially load tree sequences from a stream. + +**Options** + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_NO_INIT + Do not initialise this :c:type:`tsk_table_collection_t` before loading. + +@endrst + +@param self A pointer to an uninitialised tsk_table_collection_t object + if the TSK_NO_INIT option is not set (default), or an initialised + tsk_table_collection_t otherwise. +@param file A FILE stream opened in an appropriate mode for reading (e.g. + "r", "r+" or "w+") positioned at the beginning of a table collection + definition. +@param options Bitwise options. See above for details. +@return Return 0 on success or a negative value on failure. 
+*/ +int tsk_table_collection_loadf( + tsk_table_collection_t *self, FILE *file, tsk_flags_t options); + /** @brief Write a table collection to file. @@ -2151,6 +2202,9 @@ load time. This behaviour requires that the tables are sorted. If this automatic indexing is not desired, it can be disabled using the `TSK_NO_BUILD_INDEXES` option. +If an error occurs the file path is deleted, ensuring that only complete +and well formed files will be written. + **Options** Options can be specified by providing one or more of the following bitwise @@ -2179,13 +2233,44 @@ TSK_NO_BUILD_INDEXES @param self A pointer to an initialised tsk_table_collection_t object. @param filename A NULL terminated string containing the filename. -@param options Write options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. +@param options Bitwise options. See above for details. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_dump( tsk_table_collection_t *self, const char *filename, tsk_flags_t options); +/** +@brief Write a table collection to a stream. + +@rst +Writes the data from this table collection to the specified FILE stream. +Semantics are identical to :c:func:`tsk_table_collection_dump`. + +Please see the :ref:`sec_c_api_examples_file_streaming` section for an example +of how to sequentially dump and load tree sequences from a stream. + +**Options** + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_NO_BUILD_INDEXES + Do not build indexes for this table before writing to file. This is useful + if you wish to write unsorted tables to file, as building the indexes + will raise an error if the table is unsorted. + +@endrst + +@param self A pointer to an initialised tsk_table_collection_t object. +@param file A FILE stream opened in an appropriate mode for writing (e.g. + "w", "a", "r+" or "w+"). +@param options Bitwise options. See above for details. 
+@return Return 0 on success or a negative value on failure. +*/ + +int tsk_table_collection_dumpf( + tsk_table_collection_t *self, FILE *file, tsk_flags_t options); + /** @brief Record the number of rows in each table in the specified tsk_bookmark_t object. diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 4486dc716c..f7109bac1b 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -1,7 +1,7 @@ /* * MIT License * - * Copyright (c) 2019 Tskit Developers + * Copyright (c) 2019-2020 Tskit Developers * Copyright (c) 2015-2018 University of Oxford * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -353,8 +353,7 @@ tsk_treeseq_init_nodes(tsk_treeseq_t *self) * in this case because we have an independent copy. * - Need an option to take 'ownership' of the tables so that we keep the * tables and free them at the end of the treeseq's lifetime. This will be - * used in tsk_treeseq_load below, where we can take advantage of the read-only - * access directly into the store's memory and avoid copying the tree sequence. + * used in tsk_treeseq_load below where we can avoid copying the tree sequence. * - We should also allow a read-only "borrowed reference" where we use the * tables directly, but don't free it at the end. */ @@ -459,12 +458,43 @@ tsk_treeseq_load( return ret; } +int TSK_WARN_UNUSED +tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + tsk_table_collection_t tables; + + /* Need to make sure that we're zero'd out in case of error */ + memset(self, 0, sizeof(*self)); + ret = tsk_table_collection_loadf(&tables, file, 0); + if (ret != 0) { + goto out; + } + /* TODO the implementation is wasteful here, as we don't need to allocate + * a new table here but could load directly into the main table instead. + * See notes on the owned reference for treeseq_alloc above. 
+ */ + ret = tsk_treeseq_init(self, &tables, 0); + if (ret != 0) { + goto out; + } +out: + tsk_table_collection_free(&tables); + return ret; +} + int TSK_WARN_UNUSED tsk_treeseq_dump(tsk_treeseq_t *self, const char *filename, tsk_flags_t options) { return tsk_table_collection_dump(self->tables, filename, options); } +int TSK_WARN_UNUSED +tsk_treeseq_dumpf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options) +{ + return tsk_table_collection_dumpf(self->tables, file, options); +} + /* Simple attribute getters */ char * diff --git a/c/tskit/trees.h b/c/tskit/trees.h index d12b238836..1a41da1d86 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -220,8 +220,10 @@ int tsk_treeseq_init( tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); int tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options); +int tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options); int tsk_treeseq_dump(tsk_treeseq_t *self, const char *filename, tsk_flags_t options); +int tsk_treeseq_dumpf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options); int tsk_treeseq_copy_tables( tsk_treeseq_t *self, tsk_table_collection_t *tables, tsk_flags_t options); int tsk_treeseq_free(tsk_treeseq_t *self); diff --git a/docs/c-api.rst b/docs/c-api.rst index e8f3fb578b..2ba7c9ffd9 100644 --- a/docs/c-api.rst +++ b/docs/c-api.rst @@ -442,3 +442,37 @@ tree in three different ways: .. literalinclude:: ../c/examples/tree_traversal.c :language: c +.. _sec_c_api_examples_file_streaming: + +-------------- +File streaming +-------------- + +It is often useful to read tree sequence files from a stream rather than +from a fixed filename. This example shows how to do this using the +:c:func:`tsk_table_collection_loadf` and +:c:func:`tsk_table_collection_dumpf` functions. Here, we sequentially +load table collections from the ``stdin`` stream and write them +back out to ``stdout`` with their mutations removed. + +.. 
literalinclude:: ../c/examples/streaming.c + :language: c + +Note that we use the value :c:macro:`TSK_ERR_EOF` to detect when the stream +ends, as we don't know how many tree sequences to expect on the input. +In this case, :c:macro:`TSK_ERR_EOF` is not considered an error and we exit +normally. + +Running this program on some tree sequence files we might get:: + + $ cat tmp1.trees tmp2.trees | ./build/streaming > no_mutations.trees + Tree sequence 0 had 38 mutations + Tree sequence 1 had 132 mutations + +Then, running this program again on the output of the previous command, +we see that we now have two tree sequences with their mutations removed +stored in the file ``no_mutations.trees``:: + + $ ./build/streaming < no_mutations.trees > /dev/null + Tree sequence 0 had 0 mutations + Tree sequence 1 had 0 mutations diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 55d36c7f44..1c65d6c03a 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -10,6 +10,10 @@ In development (see below) to stabilise the node ordering and to make trees more readily comparable. The old behaviour is still available with ``order="tree"``. +- File system operations such as dump/load now raise an appropriate OSError + instead of tskit.FileFormatError. Loading from an empty file now raises + an EOFError. 
+ **New features** - Add support for trees with internal samples for the Kendall-Colijn tree distance diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 581dd817d5..64464025b4 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -219,6 +219,12 @@ handle_library_error(int err) case TSK_ERR_FILE_FORMAT: PyErr_SetString(TskitFileFormatError, tsk_strerror(err)); break; + case TSK_ERR_IO: + PyErr_SetFromErrno(PyExc_OSError); + break; + case TSK_ERR_EOF: + PyErr_Format(PyExc_EOFError, "End of file"); + break; default: PyErr_SetString(TskitLibraryError, tsk_strerror(err)); } diff --git a/python/tests/test_cli.py b/python/tests/test_cli.py index 405e4c3e09..13ea65ef99 100644 --- a/python/tests/test_cli.py +++ b/python/tests/test_cli.py @@ -553,7 +553,9 @@ def verify(self, command): with mock.patch("sys.exit", side_effect=TestException) as mocked_exit: with self.assertRaises(TestException): capture_output(cli.tskit_main, ["info", "/no/such/file"]) - mocked_exit.assert_called_once_with("Load error: No such file or directory") + mocked_exit.assert_called_once_with( + "Load error: [Errno 2] No such file or directory" + ) def test_info(self): self.verify("info") diff --git a/python/tests/test_file_format.py b/python/tests/test_file_format.py index b6107a761d..ee0c397003 100644 --- a/python/tests/test_file_format.py +++ b/python/tests/test_file_format.py @@ -983,8 +983,8 @@ def test_format_name_error(self): def test_load_bad_formats(self): # try loading a bunch of files in various formats. - # First, check the emtpy file. - self.assertRaises(exceptions.FileFormatError, tskit.load, self.temp_file) + # First, check the empty file. 
+ self.assertRaises(EOFError, tskit.load, self.temp_file) # Now some ascii text with open(self.temp_file, "wb") as f: f.write(b"Some ASCII text") diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 6947eefe72..8cdcad44a9 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -260,21 +260,19 @@ def loader(*args): self.assertRaises(TypeError, func, bad_type) # Try to dump/load files we don't have access to or don't exist. for f in ["/", "/test.trees", "/dir_does_not_exist/x.trees"]: - self.assertRaises(_tskit.FileFormatError, func, f) + self.assertRaises(OSError, func, f) try: func(f) - except _tskit.FileFormatError as e: + except OSError as e: message = str(e) self.assertGreater(len(message), 0) - # use a long filename and make sure we don't overflow error - # buffers f = "/" + 4000 * "x" - self.assertRaises(_tskit.FileFormatError, func, f) + self.assertRaises(OSError, func, f) try: func(f) - except _tskit.FileFormatError as e: + except OSError as e: message = str(e) - self.assertLess(len(message), 1024) + self.assertTrue(message.endswith("File name too long")) def test_initial_state(self): # Check the initial state to make sure that it is empty. diff --git a/python/tskit/cli.py b/python/tskit/cli.py index 2e1713ee97..f344b5d3c3 100644 --- a/python/tskit/cli.py +++ b/python/tskit/cli.py @@ -46,7 +46,7 @@ def sys_exit(message): def load_tree_sequence(path): try: return tskit.load(path) - except tskit.FileFormatError as e: + except OSError as e: sys_exit(f"Load error: {e}")