Skip to content

Commit

Permalink
Merge pull request #4228 from bazsi/apache-access-log-fix-for-non-htt…
Browse files Browse the repository at this point in the history
…p-requests

Apache access log fix for non http requests
  • Loading branch information
MrAnno committed Dec 5, 2022
2 parents aeeb499 + 6bd8cc2 commit 8fa4dc8
Show file tree
Hide file tree
Showing 10 changed files with 308 additions and 16 deletions.
39 changes: 38 additions & 1 deletion lib/scanner/csv-scanner/csv-scanner.c
Expand Up @@ -179,26 +179,63 @@ _parse_left_whitespace(CSVScanner *self)
/*
 * Consume one character of a quoted value from self->src.
 *
 * Depending on the configured dialect, the character at self->src is
 * handled as follows:
 *
 *   - CSV_SCANNER_ESCAPE_BACKSLASH: a backslash followed by a non-NUL
 *     character is dropped and the following character is taken literally.
 *
 *   - CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES: same as above, but the
 *     escaped character is additionally translated when it is one of the
 *     C-style sequences a, n, r, t or v; any other escaped character is
 *     taken literally.
 *
 *   - CSV_SCANNER_ESCAPE_DOUBLE_CHAR: two consecutive quote characters
 *     produce a single quote character.
 *
 *   - otherwise, the current quote character terminates the quoted section:
 *     self->current_quote is reset to 0, the quote is skipped and nothing is
 *     appended to the value.
 *
 *   - anything else is an ordinary character.
 *
 * Except for the closing quote case, exactly one character is appended to
 * self->current_value and self->src is left pointing after the consumed
 * input.
 */
static void
_parse_character_with_quotation(CSVScanner *self)
{
  gchar ch;

  /* quoted character */
  if (self->options->dialect == CSV_SCANNER_ESCAPE_BACKSLASH &&
      *self->src == '\\' &&
      *(self->src + 1))
    {
      /* skip the backslash, keep the escaped character as is */
      self->src++;
      ch = *self->src;
    }
  else if (self->options->dialect == CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES &&
           *self->src == '\\' &&
           *(self->src + 1))
    {
      self->src++;
      ch = *self->src;

      /* an escaped quote character is always literal, even if the quote
       * character happens to be one of the letters handled below */
      if (ch != self->current_quote)
        {
          switch (ch)
            {
            case 'a':
              ch = '\a';
              break;
            case 'n':
              ch = '\n';
              break;
            case 'r':
              ch = '\r';
              break;
            case 't':
              ch = '\t';
              break;
            case 'v':
              ch = '\v';
              break;
            default:
              /* unknown sequence: keep the escaped character literally */
              break;
            }
        }
    }
  else if (self->options->dialect == CSV_SCANNER_ESCAPE_DOUBLE_CHAR &&
           *self->src == self->current_quote &&
           *(self->src+1) == self->current_quote)
    {
      /* doubled quote: skip the first one, the second is the value */
      self->src++;
      ch = *self->src;
    }
  else if (*self->src == self->current_quote)
    {
      /* closing quote: leave quoted mode without appending anything */
      self->current_quote = 0;
      self->src++;
      return;
    }
  else
    {
      /* ordinary character */
      ch = *self->src;
    }

  /* single append point: always store the decoded character, never the raw
   * byte at self->src */
  g_string_append_c(self->current_value, ch);
  self->src++;
}

Expand Down
3 changes: 2 additions & 1 deletion lib/scanner/csv-scanner/csv-scanner.h
Expand Up @@ -32,7 +32,8 @@ typedef enum
{
CSV_SCANNER_ESCAPE_NONE,
CSV_SCANNER_ESCAPE_BACKSLASH,
CSV_SCANNER_ESCAPE_DOUBLE_CHAR
CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES,
CSV_SCANNER_ESCAPE_DOUBLE_CHAR,
} CSVScannerDialect;

#define CSV_SCANNER_STRIP_WHITESPACE 0x0001
Expand Down
84 changes: 84 additions & 0 deletions lib/scanner/csv-scanner/tests/test_csv_scanner.c
Expand Up @@ -275,6 +275,90 @@ Test(csv_scanner, greedy_column_null_value)
csv_scanner_deinit(&scanner);
}

/*
 * Checks the CSV_SCANNER_ESCAPE_DOUBLE_CHAR dialect: inside a quoted value,
 * two consecutive quote characters are expected to yield a single quote
 * character in the parsed column value.
 *
 * Unescaped, the scanned input is:
 *   foo,"this is a single quote "" character"
 */
Test(csv_scanner, escape_double_char)
{
const gchar *columns[] = { "foo", "bar", NULL };

_default_options_with_flags(columns, CSV_SCANNER_STRIP_WHITESPACE);

csv_scanner_options_set_dialect(&options, CSV_SCANNER_ESCAPE_DOUBLE_CHAR);
csv_scanner_init(&scanner, &options, "foo,\"this is a single quote \"\" character\"");

/* before the first scan: positioned on the first column, not complete */
cr_expect(_column_name_equals("foo"));
cr_expect(!_scan_complete());

/* first column */
cr_expect(_scan_next());
cr_expect(_column_name_equals("foo"));
cr_expect(!_scan_complete());

/* second column: the doubled quote must collapse into one quote */
cr_expect(_scan_next());
cr_expect(_column_name_equals("bar"));
cr_expect(_column_nv_equals("bar", "this is a single quote \" character"));
cr_expect(!_scan_complete());

/* go past the last column */
cr_expect(!_scan_next());
cr_expect(_scan_complete());
cr_expect(_column_name_unset());
csv_scanner_deinit(&scanner);
}

/*
 * Checks the CSV_SCANNER_ESCAPE_BACKSLASH dialect: a backslash makes the
 * following character literal, without interpreting C-style sequences.
 *
 * Unescaped, the quoted input value ends with a backslash followed by the
 * letter n; the expected column value ends with a plain letter n, not a
 * newline, and the backslash-escaped quote becomes a plain quote.
 */
Test(csv_scanner, escape_backslash)
{
const gchar *columns[] = { "foo", "bar", NULL };

_default_options_with_flags(columns, CSV_SCANNER_STRIP_WHITESPACE);

csv_scanner_options_set_dialect(&options, CSV_SCANNER_ESCAPE_BACKSLASH);
csv_scanner_init(&scanner, &options, "foo,\"this is a single quote \\\" character\\n\"");

/* before the first scan: positioned on the first column, not complete */
cr_expect(_column_name_equals("foo"));
cr_expect(!_scan_complete());

/* first column */
cr_expect(_scan_next());
cr_expect(_column_name_equals("foo"));
cr_expect(!_scan_complete());

/* second column: backslashes are dropped, escaped characters kept as is */
cr_expect(_scan_next());
cr_expect(_column_name_equals("bar"));
cr_expect(_column_nv_equals("bar", "this is a single quote \" charactern"));
cr_expect(!_scan_complete());

/* go past the last column */
cr_expect(!_scan_next());
cr_expect(_scan_complete());
cr_expect(_column_name_unset());
csv_scanner_deinit(&scanner);
}

/*
 * Checks the CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES dialect: the
 * backslash sequences a, t, v, r and n in the quoted input are expected to
 * be translated to the corresponding control characters, while a
 * backslash-escaped quote is expected to become a plain quote character.
 *
 * The quoted input value consists only of escape sequences: an escaped
 * quote, the five sequences above, and another escaped quote.
 */
Test(csv_scanner, escape_backslash_sequences)
{
const gchar *columns[] = { "foo", "bar", NULL };

_default_options_with_flags(columns, CSV_SCANNER_STRIP_WHITESPACE);

csv_scanner_options_set_dialect(&options, CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES);
csv_scanner_init(&scanner, &options, "foo,\"\\\"\\a\\t\\v\\r\\n\\\"\"");

/* before the first scan: positioned on the first column, not complete */
cr_expect(_column_name_equals("foo"));
cr_expect(!_scan_complete());

/* first column */
cr_expect(_scan_next());
cr_expect(_column_name_equals("foo"));
cr_expect(!_scan_complete());

/* second column: the expected value holds the real control characters */
cr_expect(_scan_next());
cr_expect(_column_name_equals("bar"));
cr_expect(_column_nv_equals("bar", "\"\a\t\v\r\n\""));
cr_expect(!_scan_complete());

/* go past the last column */
cr_expect(!_scan_next());
cr_expect(_scan_complete());
cr_expect(_column_name_unset());
csv_scanner_deinit(&scanner);
}

static void
setup(void)
{
Expand Down
1 change: 1 addition & 0 deletions modules/csvparser/csvparser-grammar.ym
Expand Up @@ -139,6 +139,7 @@ parser_csv_dialect
gint mode = csv_parser_lookup_dialect($1);
CHECK_ERROR(mode >= 0, @1, "unknown dialect() argument for csv-parser()");
free($1);
$$ = mode;
}

/* INCLUDE_RULES */
Expand Down
2 changes: 2 additions & 0 deletions modules/csvparser/csvparser.c
Expand Up @@ -242,6 +242,8 @@ csv_parser_lookup_dialect(const gchar *flag)
return CSV_SCANNER_ESCAPE_NONE;
else if (strcmp(flag, "escape-backslash") == 0)
return CSV_SCANNER_ESCAPE_BACKSLASH;
else if (strcmp(flag, "escape-backslash-with-sequences") == 0)
return CSV_SCANNER_ESCAPE_BACKSLASH_WITH_SEQUENCES;
else if (strcmp(flag, "escape-double-char") == 0)
return CSV_SCANNER_ESCAPE_DOUBLE_CHAR;
return -1;
Expand Down
8 changes: 8 additions & 0 deletions news/bugfix-4228.md
@@ -0,0 +1,8 @@
`csv-parser()`: fixed the processing of the dialect() parameter, which was
not taken into consideration.

`apache-accesslog-parser()`: Apache may use backslash-style escapes in the
`request` field; these are now supported by setting the csv-parser() dialect
to `escape-backslash-with-sequences`. Also added validation that the
`rawrequest` field contains a valid HTTP request: `verb`, `request` and
`httpversion` are only extracted if this is the case.
3 changes: 3 additions & 0 deletions news/feature-4228.md
@@ -0,0 +1,3 @@
`csv-parser()`: added a new dialect called `escape-backslash-with-sequences`,
which uses `\` as an escape character but also supports C-style escape
sequences, such as `\n` or `\r`.
39 changes: 25 additions & 14 deletions scl/apache/apache.conf
Expand Up @@ -40,7 +40,7 @@ block parser apache-accesslog-parser-vhost(prefix() template()) {
filter { match("^[A-Za-z0-9\-\._]+:[0-9]+ " template(`template`)); };
parser {
csv-parser(
dialect(escape-double-char)
dialect(escape-backslash-with-sequences)
flags(strip-whitespace)
delimiters(" ")
template(`template`)
Expand Down Expand Up @@ -69,7 +69,7 @@ block parser apache-accesslog-parser-combined(prefix() template()) {
parser {
csv-parser(
prefix(`prefix`)
dialect(escape-double-char)
dialect(escape-backslash-with-sequences)
flags(strip-whitespace)
delimiters(" ")
template(`template`)
Expand Down Expand Up @@ -98,20 +98,31 @@ block parser apache-accesslog-parser(prefix(".apache.") template("${MESSAGE}"))
# mungle values to match Kibana/elastic schema and common to all
# supported formats.
parser {
csv-parser(
prefix(`prefix`)
template("${`prefix`rawrequest}")
delimiters(" ")
dialect(escape-none)
flags(strip-whitespace)
columns("verb", "request", "httpversion"));

date-parser(format("%d/%b/%Y:%H:%M:%S %z")
template("${`prefix`timestamp}"));
};
};

if {
# Sometimes the rawrequest is not a proper HTTP request (e.g. when someone
# submits a request like this):
#
# _default_:443 106.75.178.169 "-" - [22/Nov/2022:00:03:53 +0100] "{\"params\": [\"miner1\", \"bf\", \"00000001\", \"504e86ed\", \"b2957c02\"], \"id\": 4, \"method\": \"mining.submit\"}\n" 400 226 "-" "-"

parser {
csv-parser(
prefix(`prefix`)
template("${`prefix`rawrequest}")
delimiters(" ")
dialect(escape-none)
flags(strip-whitespace, drop-invalid)
columns("verb", "request", "httpversion"));

};

rewrite {
subst("^HTTP/(.*)$", "$1", value("`prefix`httpversion"));
};
};

rewrite {
subst("^HTTP/(.*)$", "$1", value("`prefix`httpversion"));
};
};
};

0 comments on commit 8fa4dc8

Please sign in to comment.