Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apache access log fix for non http requests #4228

Merged
39 changes: 38 additions & 1 deletion lib/scanner/csv-scanner/csv-scanner.c
Expand Up @@ -179,26 +179,63 @@ _parse_left_whitespace(CSVScanner *self)
/*
 * Consume one input character while inside a quoted column value.
 *
 * Depending on the configured dialect, the character at self->src is
 * handled as one of:
 *   - a backslash escape (CSV_SCANNER_ESCAPE_BACKSLASH): the character
 *     after the backslash is taken literally;
 *   - a backslash escape with C-style sequences
 *     (CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES): \a, \n, \r, \t and \v
 *     are translated to their control characters, any other escaped
 *     character (including the active quote) is taken literally;
 *   - a doubled quote (CSV_SCANNER_ESCAPE_DOUBLE_CHAR): two consecutive
 *     quote characters yield a single quote character;
 *   - the closing quote: quoted mode ends (current_quote is reset to 0),
 *     the quote is skipped and nothing is appended;
 *   - an ordinary character: appended unchanged.
 *
 * In every case except the closing quote, exactly one character is
 * appended to self->current_value and self->src ends up past the consumed
 * input.
 */
static void
_parse_character_with_quotation(CSVScanner *self)
{
  gchar ch;

  /*
   * The "*(self->src + 1)" checks below require a non-NUL character after
   * the backslash; otherwise the backslash is not treated as an escape by
   * these branches.
   */
  if (self->options->dialect == CSV_SCANNER_ESCAPE_BACKSLASH &&
      *self->src == '\\' &&
      *(self->src + 1))
    {
      /* skip the backslash, keep the escaped character verbatim */
      self->src++;
      ch = *self->src;
    }
  else if (self->options->dialect == CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES &&
           *self->src == '\\' &&
           *(self->src + 1))
    {
      self->src++;
      ch = *self->src;
      /* an escaped quote is always literal, even if the quote character
       * happens to be one of the sequence letters below */
      if (ch != self->current_quote)
        {
          switch (ch)
            {
            case 'a':
              ch = '\a';
              break;
            case 'n':
              ch = '\n';
              break;
            case 'r':
              ch = '\r';
              break;
            case 't':
              ch = '\t';
              break;
            case 'v':
              ch = '\v';
              break;
            default:
              /* unknown sequence: keep the escaped character as is */
              break;
            }
        }
    }
  else if (self->options->dialect == CSV_SCANNER_ESCAPE_DOUBLE_CHAR &&
           *self->src == self->current_quote &&
           *(self->src + 1) == self->current_quote)
    {
      /* doubled quote: skip the first one, append the second */
      self->src++;
      ch = *self->src;
    }
  else if (*self->src == self->current_quote)
    {
      /* unescaped closing quote: leave quoted mode without appending */
      self->current_quote = 0;
      self->src++;
      return;
    }
  else
    {
      ch = *self->src;
    }

  /* append the (possibly translated) character exactly once */
  g_string_append_c(self->current_value, ch);
  self->src++;
}

Expand Down
3 changes: 2 additions & 1 deletion lib/scanner/csv-scanner/csv-scanner.h
Expand Up @@ -32,7 +32,8 @@ typedef enum
{
CSV_SCANNER_ESCAPE_NONE,
CSV_SCANNER_ESCAPE_BACKSLASH,
CSV_SCANNER_ESCAPE_DOUBLE_CHAR
CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES,
CSV_SCANNER_ESCAPE_DOUBLE_CHAR,
} CSVScannerDialect;

#define CSV_SCANNER_STRIP_WHITESPACE 0x0001
Expand Down
84 changes: 84 additions & 0 deletions lib/scanner/csv-scanner/tests/test_csv_scanner.c
Expand Up @@ -275,6 +275,90 @@ Test(csv_scanner, greedy_column_null_value)
csv_scanner_deinit(&scanner);
}

/* With the escape-double-char dialect a doubled quote inside a quoted
 * value must collapse into a single quote character. */
Test(csv_scanner, escape_double_char)
{
  const gchar *column_names[] = { "foo", "bar", NULL };
  const gchar *input = "foo,\"this is a single quote \"\" character\"";

  _default_options_with_flags(column_names, CSV_SCANNER_STRIP_WHITESPACE);
  csv_scanner_options_set_dialect(&options, CSV_SCANNER_ESCAPE_DOUBLE_CHAR);
  csv_scanner_init(&scanner, &options, input);

  /* before the first scan the scanner already points at the first column */
  cr_expect(_column_name_equals("foo"));
  cr_expect(!_scan_complete());

  /* first column */
  cr_expect(_scan_next());
  cr_expect(_column_name_equals("foo"));
  cr_expect(!_scan_complete());

  /* second column: the quoted value with the doubled quote collapsed */
  cr_expect(_scan_next());
  cr_expect(_column_name_equals("bar"));
  cr_expect(_column_nv_equals("bar", "this is a single quote \" character"));
  cr_expect(!_scan_complete());

  /* no more columns: scanning stops and the column name is unset */
  cr_expect(!_scan_next());
  cr_expect(_scan_complete());
  cr_expect(_column_name_unset());

  csv_scanner_deinit(&scanner);
}

/* With the plain escape-backslash dialect the character following a
 * backslash is taken literally: \" yields a quote and \n yields the
 * letter 'n', not a newline. */
Test(csv_scanner, escape_backslash)
{
  const gchar *column_names[] = { "foo", "bar", NULL };
  const gchar *input = "foo,\"this is a single quote \\\" character\\n\"";

  _default_options_with_flags(column_names, CSV_SCANNER_STRIP_WHITESPACE);
  csv_scanner_options_set_dialect(&options, CSV_SCANNER_ESCAPE_BACKSLASH);
  csv_scanner_init(&scanner, &options, input);

  /* before the first scan the scanner already points at the first column */
  cr_expect(_column_name_equals("foo"));
  cr_expect(!_scan_complete());

  /* first column */
  cr_expect(_scan_next());
  cr_expect(_column_name_equals("foo"));
  cr_expect(!_scan_complete());

  /* second column: backslashes removed, escaped characters kept verbatim */
  cr_expect(_scan_next());
  cr_expect(_column_name_equals("bar"));
  cr_expect(_column_nv_equals("bar", "this is a single quote \" charactern"));
  cr_expect(!_scan_complete());

  /* no more columns: scanning stops and the column name is unset */
  cr_expect(!_scan_next());
  cr_expect(_scan_complete());
  cr_expect(_column_name_unset());

  csv_scanner_deinit(&scanner);
}

/* With the escape-backslash-with-sequences dialect the C-style sequences
 * \a \t \v \r \n are translated to control characters, while an escaped
 * quote stays a literal quote. */
Test(csv_scanner, escape_backslash_sequences)
{
  const gchar *column_names[] = { "foo", "bar", NULL };
  const gchar *input = "foo,\"\\\"\\a\\t\\v\\r\\n\\\"\"";

  _default_options_with_flags(column_names, CSV_SCANNER_STRIP_WHITESPACE);
  csv_scanner_options_set_dialect(&options, CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES);
  csv_scanner_init(&scanner, &options, input);

  /* before the first scan the scanner already points at the first column */
  cr_expect(_column_name_equals("foo"));
  cr_expect(!_scan_complete());

  /* first column */
  cr_expect(_scan_next());
  cr_expect(_column_name_equals("foo"));
  cr_expect(!_scan_complete());

  /* second column: every escape sequence decoded */
  cr_expect(_scan_next());
  cr_expect(_column_name_equals("bar"));
  cr_expect(_column_nv_equals("bar", "\"\a\t\v\r\n\""));
  cr_expect(!_scan_complete());

  /* no more columns: scanning stops and the column name is unset */
  cr_expect(!_scan_next());
  cr_expect(_scan_complete());
  cr_expect(_column_name_unset());

  csv_scanner_deinit(&scanner);
}

static void
setup(void)
{
Expand Down
1 change: 1 addition & 0 deletions modules/csvparser/csvparser-grammar.ym
Expand Up @@ -139,6 +139,7 @@ parser_csv_dialect
gint mode = csv_parser_lookup_dialect($1);
CHECK_ERROR(mode >= 0, @1, "unknown dialect() argument for csv-parser()");
free($1);
$$ = mode;
}

/* INCLUDE_RULES */
Expand Down
2 changes: 2 additions & 0 deletions modules/csvparser/csvparser.c
Expand Up @@ -242,6 +242,8 @@ csv_parser_lookup_dialect(const gchar *flag)
return CSV_SCANNER_ESCAPE_NONE;
else if (strcmp(flag, "escape-backslash") == 0)
return CSV_SCANNER_ESCAPE_BACKSLASH;
else if (strcmp(flag, "escape-backslash-with-sequences") == 0)
return CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES;
else if (strcmp(flag, "escape-double-char") == 0)
return CSV_SCANNER_ESCAPE_DOUBLE_CHAR;
return -1;
Expand Down
8 changes: 8 additions & 0 deletions news/bugfix-4228.md
@@ -0,0 +1,8 @@
`csv-parser()`: fixed the processing of the dialect() parameter, which was
not taken into consideration.

`apache-accesslog-parser()`: Apache may use backslash-style escapes in the
`request` field, so support it by setting the csv-parser() dialect to
`escape-backslash-with-sequences`. Also added validation that the
`rawrequest` field contains a valid HTTP request; `verb`, `request` and
`httpversion` are now extracted only if this is the case.
3 changes: 3 additions & 0 deletions news/feature-4228.md
@@ -0,0 +1,3 @@
`csv-parser()`: added a new dialect called `escape-backslash-with-sequences`,
which uses `\` as the escape character and also supports C-style escape
sequences, such as `\n` or `\r`.
39 changes: 25 additions & 14 deletions scl/apache/apache.conf
Expand Up @@ -40,7 +40,7 @@ block parser apache-accesslog-parser-vhost(prefix() template()) {
filter { match("^[A-Za-z0-9\-\._]+:[0-9]+ " template(`template`)); };
parser {
csv-parser(
dialect(escape-double-char)
dialect(escape-backslash-with-sequences)
flags(strip-whitespace)
delimiters(" ")
template(`template`)
Expand Down Expand Up @@ -69,7 +69,7 @@ block parser apache-accesslog-parser-combined(prefix() template()) {
parser {
csv-parser(
prefix(`prefix`)
dialect(escape-double-char)
dialect(escape-backslash-with-sequences)
flags(strip-whitespace)
delimiters(" ")
template(`template`)
Expand Down Expand Up @@ -98,20 +98,31 @@ block parser apache-accesslog-parser(prefix(".apache.") template("${MESSAGE}"))
# munge values to match Kibana/elastic schema and common to all
# supported formats.
parser {
csv-parser(
prefix(`prefix`)
template("${`prefix`rawrequest}")
delimiters(" ")
dialect(escape-none)
flags(strip-whitespace)
columns("verb", "request", "httpversion"));

date-parser(format("%d/%b/%Y:%H:%M:%S %z")
template("${`prefix`timestamp}"));
};
};

if {
# Sometimes the rawrequest is not a proper HTTP request, e.g. when someone
# submits a request like this:
#
# _default_:443 106.75.178.169 "-" - [22/Nov/2022:00:03:53 +0100] "{\"params\": [\"miner1\", \"bf\", \"00000001\", \"504e86ed\", \"b2957c02\"], \"id\": 4, \"method\": \"mining.submit\"}\n" 400 226 "-" "-"

parser {
csv-parser(
prefix(`prefix`)
template("${`prefix`rawrequest}")
delimiters(" ")
dialect(escape-none)
flags(strip-whitespace, drop-invalid)
columns("verb", "request", "httpversion"));

};

rewrite {
subst("^HTTP/(.*)$", "$1", value("`prefix`httpversion"));
};
};

rewrite {
subst("^HTTP/(.*)$", "$1", value("`prefix`httpversion"));
};
};
};