Skip to content

Commit

Permalink
Gracefully handle misaligned header and values in xsv parser (#3874)
Browse files Browse the repository at this point in the history
The `xsv` parser (and by extension the `csv`, `tsv`, and `ssv` parsers)
previously skipped lines that had a mismatch between the number of
values contained and the number of fields defined in the header. This is
now handled in a more lenient way in both directions:
1. If the header contains more fields than a row contains data, the
remaining fields are filled in as `null` values.
2. If a row contains more values than the header had fields defined for,
we dynamically expand the header with fields nammed `unnamed1`,
`unnammed2`, and so forth. This is hidden behind the option `--auto-expand`.

These two examples show the new behavior:

```
❯ echo "1,2,3,4,5" | tenzir -q 'read csv --header "foo,bar,baz" --auto-expand'
{
  "foo": 1,
  "bar": 2,
  "baz": 3,
  "unnamed1": 4,
  "unnamed2": 5
}
```

```
❯ echo "1" | tenzir -q 'read csv --header "foo,bar,baz"'
{
  "foo": 1,
  "bar": null,
  "baz": null
}
```

Fixes tenzir/issues#1500
  • Loading branch information
dominiklohmann committed Feb 4, 2024
2 parents 25ca707 + d328a78 commit faefc84
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 13 deletions.
5 changes: 5 additions & 0 deletions changelog/next/bug-fixes/3874--xsv-leniency.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
The `xsv` parser (and by extension the `csv`, `tsv`, and `ssv` parsers) skipped
lines that had a mismatch between the number of values contained and the number
of fields defined in the header. Instead, it now fills in `null` values for
missing values and, if the new `--auto-expand` option is set, also adds new
header fields for excess values.
38 changes: 29 additions & 9 deletions libtenzir/builtins/formats/xsv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include "tenzir/concept/printable/tenzir/json.hpp"
#include "tenzir/detail/string_literal.hpp"
#include "tenzir/detail/to_xsv_sep.hpp"
#include "tenzir/detail/zip_iterator.hpp"
#include "tenzir/parser_interface.hpp"
#include "tenzir/plugin.hpp"
#include "tenzir/series_builder.hpp"
Expand All @@ -38,6 +37,7 @@ struct xsv_options {
char list_sep = {};
std::string null_value = {};
bool allow_comments = {};
bool auto_expand = {};
std::optional<std::string> header = {};
bool no_header = {};

Expand All @@ -46,13 +46,15 @@ struct xsv_options {
auto parser
= argument_parser{"xsv", "https://docs.tenzir.com/next/formats/xsv"};
auto allow_comments = bool{};
auto auto_expand = bool{};
auto field_sep_str = located<std::string>{};
auto list_sep_str = located<std::string>{};
auto null_value = located<std::string>{};
auto header = std::optional<std::string>{};
auto no_header = bool{};
if (is_parser) {
parser.add("--allow-comments", allow_comments);
parser.add("--auto-expand", auto_expand);
parser.add("--header", header, "<header>");
} else {
parser.add("--no-header", no_header);
Expand Down Expand Up @@ -100,6 +102,7 @@ struct xsv_options {
.list_sep = *list_sep,
.null_value = std::move(null_value.inner),
.allow_comments = allow_comments,
.auto_expand = auto_expand,
.header = std::move(header),
.no_header = no_header,
};
Expand All @@ -109,7 +112,8 @@ struct xsv_options {
return f.object(x).fields(
f.field("name", x.name), f.field("field_sep", x.field_sep),
f.field("list_sep", x.list_sep), f.field("null_value", x.null_value),
f.field("allow_comments", x.allow_comments), f.field("header", x.header),
f.field("allow_comments", x.allow_comments),
f.field("auto_expand", x.auto_expand), f.field("header", x.header),
f.field("no_header", x.no_header));
}
};
Expand Down Expand Up @@ -365,22 +369,34 @@ auto parse_impl(generator<std::optional<std::string_view>> lines,
});
auto values_parser = (value_parser % args.field_sep);
auto values = std::vector<data>{};
if (!values_parser(*line, values)) {
if (not values_parser(*line, values)) {
diagnostic::warning("skips unparseable line")
.note("from `{}` parser", args.name)
.emit(ctrl.diagnostics());
continue;
}
if (values.size() != fields.size()) {
diagnostic::warning("skips unparseable line")
.hint("expected {} fields but got {}", fields.size(), values.size())
auto generated_field_id = 0;
if (args.auto_expand) {
while (fields.size() < values.size()) {
auto name = fmt::format("unnamed{}", ++generated_field_id);
if (std::find(fields.begin(), fields.end(), name) == fields.end()) {
fields.push_back(name);
}
}
} else if (fields.size() < values.size()) {
diagnostic::warning("skips {} excess values in line",
values.size() - fields.size())
.hint("use `--auto-expand` to add fields for excess values")
.note("from `{}` parser", args.name)
.emit(ctrl.diagnostics());
continue;
}
auto row = b.record();
for (const auto& [field, value] : detail::zip(fields, values)) {
auto result = row.field(field).try_data(value);
for (size_t i = 0; i < fields.size(); ++i) {
if (i >= values.size()) {
row.field(fields[i]).null();
continue;
}
auto result = row.field(fields[i]).try_data(values[i]);
if (not result) {
diagnostic::warning(result.error())
.note("from `{}` parser", args.name)
Expand Down Expand Up @@ -524,8 +540,10 @@ class configured_xsv_plugin final : public virtual parser_parser_plugin,
-> std::unique_ptr<plugin_parser> override {
auto parser = argument_parser{name()};
bool allow_comments = {};
bool auto_expand = {};
std::optional<std::string> header = {};
parser.add("--allow-comments", allow_comments);
parser.add("--auto-expand", auto_expand);
parser.add("--header", header, "<header>");
parser.parse(p);
return std::make_unique<xsv_parser>(xsv_options{
Expand All @@ -534,6 +552,7 @@ class configured_xsv_plugin final : public virtual parser_parser_plugin,
.list_sep = ListSep,
.null_value = std::string{Null.str()},
.allow_comments = allow_comments,
.auto_expand = auto_expand,
.header = std::move(header),
.no_header = false,
});
Expand All @@ -551,6 +570,7 @@ class configured_xsv_plugin final : public virtual parser_parser_plugin,
.list_sep = ListSep,
.null_value = std::string{Null.str()},
.allow_comments = false,
.auto_expand = false,
.header = {},
.no_header = no_header,
});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pipeline [
list_sep: 59,
null_value: "",
allow_comments: false,
auto_expand: false,
no_header: false
},
save file {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"foo": 1, "bar": 2, "baz": 3}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"foo": 1, "bar": 2, "baz": null}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"foo": 1, "bar": 2, "baz": 3}
warning: skips 2 excess values in line
= hint: use `--auto-expand` to add fields for excess values
= note: from `csv` parser
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"foo": 1, "bar": 2, "baz": 3, "unnamed1": 4, "unnamed2": 5}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"foo": 1, "unnamed1": 2, "baz": 3, "unnamed2": 4, "unnamed3": 5}
8 changes: 8 additions & 0 deletions tenzir/integration/tests/pipelines_local.bats
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,14 @@ setup() {
check tenzir "from ${INPUTSDIR}/xsv/nulls-and-escaping.csv read csv"
}

@test "read xsv auto expand" {
echo "1,2,3" | check tenzir 'read csv --header "foo,bar,baz"'
echo "1,2" | check tenzir 'read csv --header "foo,bar,baz"'
echo "1,2,3,4,5" | check tenzir 'read csv --header "foo,bar,baz"'
echo "1,2,3,4,5" | check tenzir 'read csv --header "foo,bar,baz" --auto-expand'
echo "1,2,3,4,5" | check tenzir 'read csv --header "foo,unnamed1,baz" --auto-expand'
}

# bats test_tags=pipelines, xsv
@test "Slice" {
check tenzir "from ${INPUTSDIR}/zeek/conn.log.gz read zeek-tsv | head 100 | enumerate | slice --begin 1"
Expand Down
13 changes: 9 additions & 4 deletions web/docs/formats/xsv.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ Reads and writes lines with separated values.
Parser:

```
csv [--allow-comments] [--header <header>]
ssv [--allow-comments] [--header <header>]
tsv [--allow-comments] [--header <header>]
xsv <field-sep> <list-sep> <null-value> [--allow-comments] [--header <header>]
csv [--allow-comments] [--auto-expand] [--header <header>]
ssv [--allow-comments] [--auto-expand] [--header <header>]
tsv [--allow-comments] [--auto-expand] [--header <header>]
xsv <field-sep> <list-sep> <null-value> [--allow-comments] [--auto-expand] [--header <header>]
```

Printer:
Expand Down Expand Up @@ -85,6 +85,11 @@ Specifies the string that denotes an absent value.

Treat lines beginning with `'#'` as comments.

### `--auto-expand (Parser)

Automatically add fields to the schema when encountering events with too many
values instead of dropping the excess values.

### `--header <header>` (Parser)

Use the manually provided header line instead of treating the first line as the
Expand Down

0 comments on commit faefc84

Please sign in to comment.