Skip to content

Commit

Permalink
enhancement(redact): Add redactor option
Browse files Browse the repository at this point in the history
This option allows you to specify either a custom string or a hash function to use as the redactor,
in addition to the current behavior of replacing matches with the fixed string "[REDACTED]".

Fixes vectordotdev#633
  • Loading branch information
tmccombs committed Jan 12, 2024
1 parent 0265730 commit dd3708e
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 22 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ lua = ["dep:mlua"]
test = ["string_path"]

# All stdlib functions
stdlib = ["compiler", "core", "datadog", "dep:aes", "dep:chacha20poly1305", "dep:crypto_secretbox", "dep:ctr", "dep:cbc", "dep:cfb-mode", "dep:ofb", "dep:base16", "dep:nom", "dep:strip-ansi-escapes", "dep:utf8-width", "dep:hex", "dep:seahash", "dep:syslog_loose", "dep:hostname", "dep:zstd", "dep:quoted_printable", "dep:once_cell", "dep:base64", "dep:uuid", "dep:percent-encoding", "dep:uaparser", "dep:rust_decimal", "dep:indexmap", "dep:flate2", "dep:charset", "dep:data-encoding", "dep:hmac", "dep:sha-1", "dep:cidr-utils", "dep:sha-2", "dep:md-5", "dep:url", "dep:woothee", "dep:csv", "dep:roxmltree", "dep:rand", "dep:dns-lookup", "dep:sha-3", "dep:grok", "dep:community-id", "dep:snap"]
stdlib = ["compiler", "core", "datadog", "dep:aes", "dep:chacha20poly1305", "dep:crypto_secretbox", "dep:ctr", "dep:cbc", "dep:cfb-mode", "dep:ofb", "dep:base16", "dep:nom", "dep:strip-ansi-escapes", "dep:utf8-width", "dep:hex", "dep:seahash", "dep:syslog_loose", "dep:hostname", "dep:zstd", "dep:quoted_printable", "dep:once_cell", "dep:base64", "dep:uuid", "dep:percent-encoding", "dep:uaparser", "dep:rust_decimal", "dep:indexmap", "dep:flate2", "dep:charset", "dep:data-encoding", "dep:hmac", "dep:digest", "dep:sha-1", "dep:cidr-utils", "dep:sha-2", "dep:md-5", "dep:url", "dep:woothee", "dep:csv", "dep:roxmltree", "dep:rand", "dep:dns-lookup", "dep:sha-3", "dep:grok", "dep:community-id", "dep:snap"]

[dependencies]
cfg-if = "1.0.0"
Expand All @@ -68,6 +68,7 @@ csv = { version = "1.3", optional = true }
clap = { version = "4.4.14", features = ["derive"], optional = true }
codespan-reporting = {version = "0.11", optional = true }
data-encoding = { version = "2.5.0", optional = true }
digest = { version = "0.10", optional = true }
dyn-clone = { version = "1.0.16", default-features = false, optional = true }
exitcode = {version = "1", optional = true }
flate2 = { version = "1.0.28", default-features = false, features = ["default"], optional = true }
Expand Down
2 changes: 2 additions & 0 deletions changelog.d/633_enhancement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Added `redactor` option to `redact` to allow replacing with a custom string or hash of the redacted
content.
173 changes: 152 additions & 21 deletions src/stdlib/redact.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ use once_cell::sync::Lazy;
use std::{
borrow::Cow,
convert::{TryFrom, TryInto},
str::FromStr,
};

// https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s12.html
Expand Down Expand Up @@ -38,6 +37,11 @@ impl Function for Redact {
kind: kind::ARRAY,
required: true,
},
Parameter {
keyword: "redactor",
kind: kind::OBJECT | kind::BYTES,
required: false,
},
]
}

Expand All @@ -53,6 +57,7 @@ impl Function for Redact {
source: r#"redact({ "name": "John Doe", "ssn": "123-12-1234"}, filters: ["us_social_security_number"])"#,
result: Ok(r#"{ "name": "John Doe", "ssn": "[REDACTED]" }"#),
},
// TODO: redactor examples
]
}

Expand Down Expand Up @@ -88,7 +93,20 @@ impl Function for Redact {
})
.collect::<std::result::Result<Vec<Filter>, _>>()?;

let redactor = Redactor::Full;
let redactor = arguments
.optional_literal("redactor", state)?
.map(|value| {
value
.clone()
.try_into()
.map_err(|error| function::Error::InvalidArgument {
keyword: "redactor",
value,
error,
})
})
.transpose()?
.unwrap_or(Redactor::Full);

Ok(RedactFn {
value,
Expand All @@ -109,6 +127,9 @@ struct RedactFn {
}

fn redact(value: Value, filters: &[Filter], redactor: &Redactor) -> Value {
// Possible optimization: match on the redactor here and use different calls depending on
// its value, so that we don't have to do the comparison inside the replacement loop.
// That would complicate the code, though.
match value {
Value::Bytes(bytes) => {
let input = String::from_utf8_lossy(&bytes);
Expand Down Expand Up @@ -220,52 +241,162 @@ impl Filter {
patterns
.iter()
.fold(Cow::Borrowed(input), |input, pattern| match pattern {
Pattern::Regex(regex) => regex
.replace_all(&input, redactor.pattern())
.into_owned()
.into(),
Pattern::String(pattern) => {
input.replace(pattern, redactor.pattern()).into()
Pattern::Regex(regex) => {
regex.replace_all(&input, redactor).into_owned().into()
}
Pattern::String(pattern) => str_replace(&input, pattern, redactor).into(),
})
}
Filter::UsSocialSecurityNumber => {
US_SOCIAL_SECURITY_NUMBER.replace_all(input, redactor.pattern())
US_SOCIAL_SECURITY_NUMBER.replace_all(input, redactor)
}
}
}
}

/// Returns a copy of `haystack` in which every non-overlapping occurrence of the literal
/// `pattern` has been swapped for whatever `redactor` produces for that occurrence.
///
/// Unlike `str::replace`, the replacement is computed per match, which lets a hashing
/// redactor derive its output from the matched text.
fn str_replace(haystack: &str, pattern: &str, redactor: &Redactor) -> String {
    let mut output = String::new();
    // Byte offset just past the previous match; everything before it is already emitted.
    let mut cursor = 0;

    haystack
        .match_indices(pattern)
        .for_each(|(offset, matched)| {
            // Copy the untouched text between the previous match and this one.
            output.push_str(&haystack[cursor..offset]);
            redactor.replace_str(matched, &mut output);
            cursor = offset + matched.len();
        });

    // Trailing text after the final match (or the whole input when nothing matched).
    output.push_str(&haystack[cursor..]);
    output
}

/// The recipe for redacting the matched filters.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
#[derive(Debug, Default, Clone, PartialEq, Eq)]
enum Redactor {
/// Replace each match with the fixed `[REDACTED]` marker. This is the default.
#[default]
Full,
/// Replace with a fixed string
// Possible optimization: use Arc<str> instead of String to speed up cloning.
Text(String),
// Storing a plain function pointer simplifies the code, but the derived Debug
// implementation probably isn't very useful.
// Alternatively, we could have a separate variant for each hash algorithm/variant combination.
// We could also create a custom Debug implementation that compares the fn pointer
// to the function pointers we might use.
/// Replace with a hash of the redacted content
///
/// The function is called with the bytes of the matched text and returns the replacement.
Hash(fn(&[u8]) -> String),
}

impl Redactor {
fn pattern(&self) -> &str {
use Redactor::Full;
const REDACTED: &str = "[REDACTED]";

impl Redactor {
fn replace_str(&self, original: &str, dst: &mut String) {
match self {
Full => "[REDACTED]",
Redactor::Full => {
dst.push_str(REDACTED);
}
Redactor::Text(s) => {
dst.push_str(s);
}
Redactor::Hash(hash) => dst.push_str(&hash(original.as_bytes())),
}
}

/// Builds a [`Redactor`] from the object form of the `redactor` argument.
///
/// The object must contain a string `type` key selecting the strategy:
///
/// * `"full"` - replace matches with the fixed `[REDACTED]` marker.
/// * `"text"` - replace matches with the string stored under the required
///   `replacement` key (invalid UTF-8 is converted lossily).
/// * `"sha2"` - replace matches with a hex-encoded SHA-2 digest. The optional
///   `variant` key is one of `SHA-224`, `SHA-256`, `SHA-384`, `SHA-512`,
///   `SHA-512/224` or `SHA-512/256`; it defaults to `SHA-512/256`.
/// * `"sha3"` - replace matches with a hex-encoded SHA-3 digest. The optional
///   `variant` key is one of `SHA3-224`, `SHA3-256`, `SHA3-384` or `SHA3-512`;
///   it defaults to `SHA3-512`, matching the bare `"sha3"` string form.
///
/// # Errors
///
/// Returns a static message when `type` is missing, is not a string or is unknown,
/// when `replacement` is missing or not a string for a `text` redactor, or when
/// `variant` is not a string or names an unsupported algorithm.
fn from_object(obj: ObjectMap) -> std::result::Result<Self, &'static str> {
    // Borrow the type name instead of cloning it; it is only inspected below.
    let kind = match obj.get("type") {
        Some(Value::Bytes(bytes)) => bytes,
        Some(_) => return Err("type key in redactor must be a string"),
        None => return Err("redactor specified as objects must have type parameter"),
    };

    match kind.as_ref() {
        b"full" => Ok(Redactor::Full),
        b"text" => match obj.get("replacement") {
            Some(Value::Bytes(bytes)) => {
                Ok(Redactor::Text(String::from_utf8_lossy(bytes).into_owned()))
            }
            Some(_) => Err("`replacement` must be a string"),
            None => Err("text redactor must have `replacement` specified"),
        },
        b"sha2" => {
            let hash: fn(&[u8]) -> String = match obj.get("variant") {
                Some(variant) => match variant
                    .as_bytes()
                    .ok_or("`variant` must be a string")?
                    .as_ref()
                {
                    b"SHA-224" => hex_digest::<sha_2::Sha224>,
                    b"SHA-256" => hex_digest::<sha_2::Sha256>,
                    b"SHA-384" => hex_digest::<sha_2::Sha384>,
                    b"SHA-512" => hex_digest::<sha_2::Sha512>,
                    b"SHA-512/224" => hex_digest::<sha_2::Sha512_224>,
                    b"SHA-512/256" => hex_digest::<sha_2::Sha512_256>,
                    _ => return Err("invalid sha2 variant"),
                },
                None => hex_digest::<sha_2::Sha512_256>,
            };
            Ok(Redactor::Hash(hash))
        }
        b"sha3" => {
            let hash: fn(&[u8]) -> String = match obj.get("variant") {
                Some(variant) => match variant
                    .as_bytes()
                    .ok_or("`variant` must be a string")?
                    .as_ref()
                {
                    b"SHA3-224" => hex_digest::<sha_3::Sha3_224>,
                    b"SHA3-256" => hex_digest::<sha_3::Sha3_256>,
                    b"SHA3-384" => hex_digest::<sha_3::Sha3_384>,
                    b"SHA3-512" => hex_digest::<sha_3::Sha3_512>,
                    _ => return Err("invalid sha3 variant"),
                },
                // Default to a SHA-3 algorithm (not SHA-2) so the object form agrees with
                // the bare `"sha3"` string form.
                None => hex_digest::<sha_3::Sha3_512>,
            };
            Ok(Redactor::Hash(hash))
        }
        _ => Err("unknown `type` for `redactor`"),
    }
}
}

impl FromStr for Redactor {
type Err = &'static str;
impl regex::Replacer for &Redactor {
/// Appends the replacement for the current match to `dst`.
///
/// Only the whole match (capture group 0) is handed to the redactor; capture-group
/// expansion is not performed.
fn replace_append(&mut self, caps: &regex::Captures, dst: &mut String) {
    let whole_match = &caps[0];
    self.replace_str(whole_match, dst);
}

fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
use Redactor::Full;
/// Reports the constant replacement text when it does not depend on the match.
///
/// `Full` and `Text` always produce the same string, so it is returned borrowed;
/// `Hash` depends on the matched content and therefore yields `None`.
fn no_expansion(&mut self) -> Option<Cow<'_, str>> {
    let constant: &str = match self {
        Redactor::Full => REDACTED,
        Redactor::Text(text) => text.as_str(),
        Redactor::Hash(_) => return None,
    };
    Some(Cow::Borrowed(constant))
}
}

match s {
"full" => Ok(Full),
_ => Err("unknown redactor"),
/// Converts the literal passed as the `redactor` argument into a [`Redactor`].
///
/// An object is delegated to [`Redactor::from_object`]; a string must be one of the
/// shorthand names `full`, `sha2` (SHA-512/256) or `sha3` (SHA3-512). Any other kind
/// of value is rejected.
impl TryFrom<Value> for Redactor {
    type Error = &'static str;

    fn try_from(value: Value) -> std::result::Result<Self, Self::Error> {
        // Peel off the non-string cases first so the name lookup below stays flat.
        let name = match value {
            Value::Object(fields) => return Redactor::from_object(fields),
            Value::Bytes(name) => name,
            _ => return Err("unknown literal for redactor, must be redactor name or object"),
        };

        match name.as_ref() {
            b"full" => Ok(Redactor::Full),
            b"sha2" => Ok(Redactor::Hash(hex_digest::<sha_2::Sha512_256>)),
            b"sha3" => Ok(Redactor::Hash(hex_digest::<sha_3::Sha3_512>)),
            _ => Err("unknown name of redactor"),
        }
    }
}

/// Hashes `value` with the digest algorithm `T` and returns the result hex-encoded.
fn hex_digest<T>(value: &[u8]) -> String
where
    T: digest::Digest,
{
    let hashed = T::digest(value);
    hex::encode(hashed)
}

#[cfg(test)]
mod test {
use super::*;
Expand Down

0 comments on commit dd3708e

Please sign in to comment.