Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(vrl): Support DataDog grok parser #7705

Closed
wants to merge 41 commits into from
Closed
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
46c8e60
Support DataDog grok parser
vladimir-dd May 31, 2021
a486b98
Generate VRL AST instead of VRL expressions
vladimir-dd Jun 16, 2021
9cfccfc
(enhancement: datadog grok) add more matchers
vladimir-dd Jun 22, 2021
ac9419f
nullIf
vladimir-dd Jun 22, 2021
5f49c90
json
vladimir-dd Jun 22, 2021
286d053
Merge branch 'master' into vladimir-dd/dd-grok-functions
vladimir-dd Jun 22, 2021
8bbb962
json; rubyhash
vladimir-dd Jun 22, 2021
4a0dfb6
lowercase; uppercase
vladimir-dd Jun 22, 2021
55a8450
decodeuricomponent
vladimir-dd Jun 22, 2021
7730c4f
warnings
vladimir-dd Jun 23, 2021
c68e099
fix test
vladimir-dd Jun 23, 2021
a83bb7c
typo
vladimir-dd Jun 23, 2021
0285e78
correct comments
vladimir-dd Jun 23, 2021
4698c4a
get rid of intermediate VRL representation
vladimir-dd Jun 28, 2021
1fd87ed
bench
vladimir-dd Jul 1, 2021
2237443
delete transform
vladimir-dd Jul 1, 2021
9ff4da9
cargo check
vladimir-dd Jul 1, 2021
3219767
address comments
vladimir-dd Jul 1, 2021
b8c17ca
merge master
vladimir-dd Jul 1, 2021
45c5b86
feat(remap): Support DataDog grok parser
vladimir-dd Jul 1, 2021
2b2a029
add tokio dependency back
vladimir-dd Jul 1, 2021
338743c
use char instead of string
vladimir-dd Jul 1, 2021
ca77c55
use custom value to get rid of vector-core dependency
vladimir-dd Jul 6, 2021
c46925d
cargo check
vladimir-dd Jul 6, 2021
0b41d66
clippy
vladimir-dd Jul 6, 2021
9b3672f
udeps
vladimir-dd Jul 6, 2021
9c2107e
rollback unnecessary changes
vladimir-dd Jul 6, 2021
f266bba
remove empty line
vladimir-dd Jul 6, 2021
0776fb4
exclude lalrpop-generated file from check-style
vladimir-dd Jul 6, 2021
8f1a43a
exclude lalrpop-generated file from sources
vladimir-dd Jul 6, 2021
5f4f158
correct lalrpop generation
vladimir-dd Jul 6, 2021
f791d81
move build.rs up
vladimir-dd Jul 6, 2021
3be14c7
correct example
vladimir-dd Jul 6, 2021
ee6e85c
fix VRL example
vladimir-dd Jul 6, 2021
1027beb
remove trailing spaces
vladimir-dd Jul 6, 2021
4b8f998
review suggestions
vladimir-dd Jul 23, 2021
a494d2c
Apply suggestions from code review
vladimir-dd Jul 23, 2021
5b4d761
Apply suggestions from code review
vladimir-dd Jul 23, 2021
215574a
inverse expect error messages
vladimir-dd Aug 10, 2021
732bafb
use match instead of if + is_some()
vladimir-dd Aug 10, 2021
5cbe134
[nit] alphabetical order
vladimir-dd Aug 10, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
81 changes: 79 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion Cargo.toml
Expand Up @@ -89,7 +89,9 @@ members = [
"lib/vrl/stdlib",
"lib/vrl/tests",
"lib/vrl/proptests",
"lib/datadog/search-syntax"
"lib/datadog/search-syntax",
"lib/datadog/grok",
"lib/parsing"
]

[dependencies]
Expand Down Expand Up @@ -193,6 +195,7 @@ vrl-compiler = { path = "lib/vrl/compiler", optional = true }
# Lookup
lookup = { path = "lib/lookup" }


# External libs
anyhow = { version = "1.0.41", default-features = false }
async-compression = { version = "0.3.7", default-features = false, features = ["tokio", "gzip", "zstd"] }
Expand Down
42 changes: 42 additions & 0 deletions lib/datadog/grok/Cargo.toml
@@ -0,0 +1,42 @@
[package]
name = "datadog-grok"
version = "0.1.0"
authors = ["Vector Contributors <vector@timber.io>"]
edition = "2018"
build = "src/build.rs" # LALRPOP preprocessing

[dependencies]
serde = { version = "1.0.126", default-features = false, features = ["derive"] }
serde_yaml = "0.8"
lalrpop-util = "0.19"
thiserror = "1"
lazy_static = "1.3.0"
regex = "1"
grok = "1"
indoc = { version = "1.0.3", default-features = false }
inventory = { version = "0.1.10", default-features = false }
async-trait = "0.1.50"
typetag = { version = "0.1.7", default-features = false }
derivative = { version = "2.2.0", default-features = false }
itertools = { version = "0.10.1", default-features = false }
serde_json = { version = "1.0.64", default-features = false }
percent-encoding = { version = "2.1.0", default-features = false }
bytes = "1.0.0"
enum_derive = "0.1.3"
custom_derive = "0.1.2"
strum = "0.21.0"
strum_macros = "0.21"
vladimir-dd marked this conversation as resolved.
Show resolved Hide resolved
tracing = { version = "0.1.26", default-features = false }
tokio = { version = "1", features = ["sync"] } # is it the right way to enable sync?

# Internal
shared = { path = "../../shared" }
lookup = { path = "../../lookup" }
vector_core = { path = "../../vector-core"}
parsing = { path = "../../parsing"}

[build-dependencies]
lalrpop = "0.19"

[dev-dependencies]
anyhow = "1"
64 changes: 64 additions & 0 deletions lib/datadog/grok/patterns/core
@@ -0,0 +1,64 @@
# Basic constructs
numberStr [+-]?(?>\d+(?:\.(?:\d*)?)?|\.\d+)
numberExtStr [+-]?(?>\d+(?:\.(?:\d*)?)?|\.\d+)(?:[eE][+-]?\d+)?
integerStr [+-]?\d+
integerExtStr [+-]?\d+(?:[eE][+-]?\d+)?
word \b\w+\b

doubleQuotedString "[^"]*"
singleQuotedString '[^']*'
quotedString (?>%{doubleQuotedString}|%{singleQuotedString})
qs %{quotedString}

uuid [A-Fa-f0-9]{8}-(?:[A-Fa-f0-9]{4}-){3}[A-Fa-f0-9]{12}

notSpace \S+
data .*?
greedyData .*
space \s+

# Username
user [\w.-]+

# Networking
# MAC addresses
ciscoMac (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4})
windowsMac (?:(?:[A-Fa-f0-9]{2}-){5}[A-Fa-f0-9]{2})
commonMac (?:(?:[A-Fa-f0-9]{2}:){5}[A-Fa-f0-9]{2})
mac (?:%{ciscoMac}|%{windowsMac}|%{commonMac})

# IP addresses
ipv6 ((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?
ipv4 (?<![0-9])(?:(?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2})[.](?:25[0-5]|2[0-4][0-9]|[0-1]?[0-9]{1,2}))(?![0-9])
ip (?:%{ipv6}|%{ipv4})

# Hostname
# We allow underscores in hostnames (https://issues.apache.org/bugzilla/show_bug.cgi?id=21133)
hostname \b(?:[0-9A-Za-z][0-9A-Za-z-_]{0,62})(?:\.(?:[0-9A-Za-z][0-9A-Za-z-_]{0,62}))*(\.?|\b)
host %{hostname}

# IP or host
ipOrHost (?:%{ip}|%{host})

# A valid port is any value from 1 to 65535; the pattern below is looser and accepts any 1-5 digit number without a leading zero
port [1-9]\d{0,4}
hostPort %{ipOrHost}:%{port}

# Paths
path (?:%{unixPath}|%{winPath})
unixPath (?>/(?>[\w_%!$@:.,~-]+|\\.)*)+
tty (?:/dev/(pts|tty([pq])?)(\w+)?/?(?:[0-9]+))
winPath (?>[A-Za-z]+:|\\)(?:\\[^\\?*]*)+
uriProto [A-Za-z]+(?:\+[A-Za-z+]+)?
uriHost %{ipOrHost}(?::%{port})?

# Uripath comes loosely from RFC1738, but mostly from what Firefox
# doesn't turn into %XX
uriPath (?:/[A-Za-z0-9$.+!*'(){},~:;=@#%_\-]*)+
#URIPARAM \?(?:[A-Za-z0-9]+(?:=(?:[^&]*))?(?:&(?:[A-Za-z0-9]+(?:=(?:[^&]*))?)?)*)?
uriParam \?[A-Za-z0-9$.+!*'|(){},~@#%&/=:;_?\-\[\]]*
uriPathParam %{uriPath}(?:%{uriParam})?
uri %{uriProto}://(?:%{user}(?::[^@]*)?@)?(?:%{uriHost})?(?:%{uriPathParam})?

# Log Levels
# LOGLEVEL ([Aa]lert|ALERT|[Tt]race|TRACE|[Dd]ebug|DEBUG|[Nn]otice|NOTICE|[Ii]nfo|INFO|[Ww]arn?(?:ing)?|WARN?(?:ING)?|[Ee]rr?(?:or)?|ERR?(?:OR)?|[Cc]rit?(?:ical)?|CRIT?(?:ICAL)?|[Ff]atal|FATAL|[Ss]evere|SEVERE|EMERG(?:ENCY)?|[Ee]merg(?:ency)?)
26 changes: 26 additions & 0 deletions lib/datadog/grok/src/ast.rs
@@ -0,0 +1,26 @@
use lookup::LookupBuf;
use vector_core::event::Value;

/// AST node pairing a match function with an optional destination.
///
/// NOTE(review): presumably this represents one `%{matcher:destination}` element of a
/// DataDog grok rule — confirm against the grammar in `parser.lalrpop`.
#[derive(Clone, Debug)]
pub struct GrokPattern {
/// Function (name plus optional arguments) attached to this pattern as its matcher.
pub match_fn: Function,
/// Optional destination for this pattern; `None` when no destination is attached.
pub destination: Option<Destination>,
}

/// AST node describing a destination: a lookup path plus an optional filter function.
///
/// NOTE(review): presumably the path is the event field the matched value is written to
/// and the filter post-processes that value — confirm against the code consuming this AST.
#[derive(Clone, Debug)]
pub struct Destination {
/// Owned lookup path associated with this destination.
pub path: LookupBuf,
/// Optional function attached to the destination; `None` when no filter is given.
pub filter_fn: Option<Function>,
}

/// AST node for a named function reference with an optional argument list.
///
/// Used for both `GrokPattern::match_fn` and `Destination::filter_fn`.
#[derive(Clone, Debug)]
pub struct Function {
/// Name of the function as an owned string.
pub name: String,
/// Arguments of the function; `None` when no argument list is present.
/// NOTE(review): whether `None` and `Some(vec![])` are treated differently cannot be
/// determined from this file — confirm against the parser.
pub args: Option<Vec<FunctionArgument>>,
}

/// A single argument of a [`Function`]: either a nested function or a plain value.
#[derive(Clone, Debug)]
pub enum FunctionArgument {
/// The argument is itself a function (which may carry its own arguments).
Function(Function),
/// The argument is a `vector_core` event `Value`.
Arg(Value),
}
41 changes: 41 additions & 0 deletions lib/datadog/grok/src/build.rs
@@ -0,0 +1,41 @@
extern crate lalrpop;

use std::fmt::Write as fmt_write;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
use std::{env, fs};

/// Build-script entry point.
///
/// Runs LALRPOP over the current directory, tells Cargo to re-run this script when
/// `src/parser.lalrpop` changes, and then generates the static grok pattern table.
///
/// # Panics
///
/// Panics if LALRPOP processing fails or if pattern generation fails.
fn main() {
    // Configure the LALRPOP preprocessor step by step instead of one builder chain.
    let mut lalrpop_config = lalrpop::Configuration::new();
    lalrpop_config.always_use_colors();
    lalrpop_config.process_current_dir().unwrap();

    println!("cargo:rerun-if-changed=src/parser.lalrpop");

    read_grok_patterns();
}

/// Reads grok patterns defined in the `patterns` folder into the static `PATTERNS` variable.
///
/// Every regular file directly inside `patterns` is read line by line. Blank lines and
/// lines starting with `#` are skipped; every other line must have the form
/// `ruleName definition`, split at the first space. The result is written to
/// `$OUT_DIR/patterns.rs` as
/// `static PATTERNS: &[(&str, &str)] = &[("ruleName", "definition"), ...];`.
///
/// Files are processed in sorted path order so the generated source is reproducible,
/// and names/definitions are emitted as escaped string literals so that any character
/// sequence (including `"#`) yields valid Rust.
///
/// # Panics
///
/// Panics (failing the build) if the `patterns` directory or one of its files cannot be
/// read, if a pattern line contains no space, if `OUT_DIR` is not set, or if
/// `patterns.rs` cannot be written.
fn read_grok_patterns() {
    // Once any `rerun-if-changed` directive is emitted, Cargo only watches the listed
    // paths, so the pattern directory has to be listed explicitly or edits to pattern
    // files would not regenerate `patterns.rs`.
    println!("cargo:rerun-if-changed=patterns");

    // `read_dir` yields entries in a platform-dependent order; sort them so the generated
    // file is deterministic. Non-regular entries (e.g. sub-directories) are skipped:
    // reading lines from a directory handle errors on every call and never terminates.
    let mut pattern_files: Vec<_> = fs::read_dir(Path::new("patterns"))
        .expect("can read 'patterns' dir")
        .map(|entry| entry.expect("can read 'patterns' dir").path())
        .filter(|path| path.is_file())
        .collect();
    pattern_files.sort();

    let mut output = "static PATTERNS: &[(&str, &str)] = &[\n".to_string();

    for path in &pattern_files {
        // Fail loudly instead of silently generating an incomplete pattern table.
        let file = File::open(path)
            .unwrap_or_else(|err| panic!("can open pattern file '{}': {}", path.display(), err));

        for line in BufReader::new(file).lines() {
            let line = line.unwrap_or_else(|err| {
                panic!("can read pattern file '{}': {}", path.display(), err)
            });
            if line.trim().is_empty() || line.starts_with('#') {
                continue;
            }

            let (key, value) =
                line.split_at(line.find(' ').expect("pattern is 'ruleName definition'"));
            // `value` still starts with the separating space; `{:?}` renders both parts
            // as properly escaped Rust string literals.
            writeln!(output, "\t({:?}, {:?}),", key, &value[1..]).expect("can append patterns");
        }
    }

    output.push_str("];\n");

    let out_dir = env::var("OUT_DIR").expect("OUT_DIR is defined");
    let dest_path = Path::new(&out_dir).join("patterns.rs");
    fs::write(dest_path, output).expect("'patterns.rs' is created");
}