From ddd1e66d7dffa3d82286457c8bcd94ad99bdac6b Mon Sep 17 00:00:00 2001 From: Dominik Lohmann Date: Thu, 21 Dec 2023 22:53:18 +0100 Subject: [PATCH 1/2] Allow setting the header for csv, tsv, and ssv manually This adds a new `--header <header>
` option to the `xsv` parser, and by extension the `csv`, `tsv`, and `ssv` parsers. When set, the new option effectively adds a line at the top of the input. This is useful when parsing values from a file without a header, which is quite the common occurrence. --- changelog/next/features/3778--xsv-header.md | 2 + libtenzir/builtins/formats/xsv.cpp | 50 ++++++++++++--------- tenzir/integration/tests.yaml | 2 +- web/docs/formats/xsv.md | 15 ++++--- 4 files changed, 43 insertions(+), 26 deletions(-) create mode 100644 changelog/next/features/3778--xsv-header.md diff --git a/changelog/next/features/3778--xsv-header.md b/changelog/next/features/3778--xsv-header.md new file mode 100644 index 00000000000..320f64a76cd --- /dev/null +++ b/changelog/next/features/3778--xsv-header.md @@ -0,0 +1,2 @@ +The `csv`, `tsv`, `ssv` and `xsv` parsers now support setting the header line +manually with the `--header` option. diff --git a/libtenzir/builtins/formats/xsv.cpp b/libtenzir/builtins/formats/xsv.cpp index 0e3e56bc7c2..d5c6e37b8f5 100644 --- a/libtenzir/builtins/formats/xsv.cpp +++ b/libtenzir/builtins/formats/xsv.cpp @@ -38,6 +38,7 @@ struct xsv_options { char list_sep = {}; std::string null_value = {}; bool allow_comments = {}; + std::optional header = {}; static auto try_parse(parser_interface& p, std::string name, bool is_parser) -> xsv_options { @@ -47,8 +48,10 @@ struct xsv_options { auto field_sep_str = located{}; auto list_sep_str = located{}; auto null_value = located{}; + auto header = std::optional{}; if (is_parser) { parser.add("--allow-comments", allow_comments); + parser.add("--header", header, "
"); } parser.add(field_sep_str, ""); parser.add(list_sep_str, ""); @@ -93,15 +96,15 @@ struct xsv_options { .list_sep = *list_sep, .null_value = std::move(null_value.inner), .allow_comments = allow_comments, + .header = std::move(header), }; } friend auto inspect(auto& f, xsv_options& x) -> bool { - return f.object(x).fields(f.field("name", x.name), - f.field("field_sep", x.field_sep), - f.field("list_sep", x.list_sep), - f.field("null_value", x.null_value), - f.field("allow_comments", x.allow_comments)); + return f.object(x).fields( + f.field("name", x.name), f.field("field_sep", x.field_sep), + f.field("list_sep", x.list_sep), f.field("null_value", x.null_value), + f.field("allow_comments", x.allow_comments), f.field("header", x.header)); } }; @@ -245,22 +248,25 @@ auto parse_impl(generator> lines, // Parse header. auto it = lines.begin(); auto header = std::optional{}; - while (it != lines.end()) { - auto line = *it; - ++it; - if (not line) { - co_yield {}; - continue; + if (args.header) { + header = *args.header; + } else + while (it != lines.end()) { + auto line = *it; + ++it; + if (not line) { + co_yield {}; + continue; + } + if (line->empty()) + continue; + if (args.allow_comments && line->front() == '#') + continue; + header = line; + break; + if (not header) + co_return; } - if (line->empty()) - continue; - if (args.allow_comments && line->front() == '#') - continue; - header = line; - break; - } - if (not header) - co_return; const auto qqstring_value_parser = parsers::qqstr.then([](std::string in) { static auto unescaper = [](auto& f, auto l, auto out) { if (*f != '\\') { // Skip every non-escape character. @@ -493,7 +499,9 @@ class configured_xsv_plugin final : public virtual parser_parser_plugin, -> std::unique_ptr override { auto parser = argument_parser{name()}; bool allow_comments = {}; + std::optional header = {}; parser.add("--allow-comments", allow_comments); + parser.add("--header", header, "
"); parser.parse(p); return std::make_unique(xsv_options{ .name = std::string{Name.str()}, @@ -501,6 +509,7 @@ class configured_xsv_plugin final : public virtual parser_parser_plugin, .list_sep = ListSep, .null_value = std::string{Null.str()}, .allow_comments = allow_comments, + .header = std::move(header), }); } @@ -513,6 +522,7 @@ class configured_xsv_plugin final : public virtual parser_parser_plugin, .list_sep = ListSep, .null_value = std::string{Null.str()}, .allow_comments = false, + .header = {}, }); } diff --git a/tenzir/integration/tests.yaml b/tenzir/integration/tests.yaml index 3cfaaa15666..59b44720f13 100644 --- a/tenzir/integration/tests.yaml +++ b/tenzir/integration/tests.yaml @@ -933,7 +933,7 @@ tests: - command: exec 'read zeek-tsv | head 1 | write json -c | shell rev' input: data/zeek/conn.log.gz - command: exec 'shell "echo foo"' - - command: exec 'shell "{ echo \"#\"; seq 1 2 10; }" | read csv | write json -c' + - command: exec 'shell "{ seq 1 2 10; }" | read csv --header "#" | write json -c' Top and Rare Operators: fixture: ServerTester diff --git a/web/docs/formats/xsv.md b/web/docs/formats/xsv.md index 372178769a6..a5603da388c 100644 --- a/web/docs/formats/xsv.md +++ b/web/docs/formats/xsv.md @@ -12,10 +12,10 @@ Reads and writes lines with separated values. ## Synopsis ``` -csv [--allow-comments] -ssv [--allow-comments] -tsv [--allow-comments] -xsv [--allow-comments] +csv [--allow-comments] [--header
<header>] +ssv [--allow-comments] [--header <header>
] +tsv [--allow-comments] [--header <header>
] +xsv [--allow-comments] [--header <header>
] ``` ## Description @@ -70,10 +70,15 @@ Specifies the string that separates list elements *within* a field. Specifies the string that denotes an absent value. -### `--allow-comments` +### `--allow-comments` (Parser only) Treat lines beginning with `'#'` as comments. +### `--header <header>
` (Parser only) + +Use the manually provided header line instead of treating the first line as the +header. + ## Examples Read CSV from stdin: From f5ad565755ed6f8f5f6b4e49971f8d1ab92a4217 Mon Sep 17 00:00:00 2001 From: Dominik Lohmann Date: Mon, 8 Jan 2024 13:27:02 +0100 Subject: [PATCH 2/2] Make docs styling more consistent Co-authored-by: Matthias Vallentin --- web/docs/formats/xsv.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/web/docs/formats/xsv.md b/web/docs/formats/xsv.md index a5603da388c..a98d0d60ff0 100644 --- a/web/docs/formats/xsv.md +++ b/web/docs/formats/xsv.md @@ -70,11 +70,11 @@ Specifies the string that separates list elements *within* a field. Specifies the string that denotes an absent value. -### `--allow-comments` (Parser only) +### `--allow-comments` (Parser) Treat lines beginning with `'#'` as comments. -### `--header
<header>` (Parser only) +### `--header <header>
` (Parser) Use the manually provided header line instead of treating the first line as the header.