Skip to content

Commit

Permalink
Properly normalize attribute values
Browse files Browse the repository at this point in the history
closes #371
  • Loading branch information
dralley committed Apr 3, 2022
1 parent 8a74258 commit 401bb77
Showing 1 changed file with 119 additions and 1 deletion.
120 changes: 119 additions & 1 deletion src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,97 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
}
}

/// Normalizes a raw attribute value following the XML 1.0 specification,
/// section 3.3.3 "Attribute-Value Normalization".
///
/// The specification describes the algorithm as follows:
///
/// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling,
///    so the rest of this algorithm operates on text normalized in this way.
/// 2) Begin with a normalized value consisting of the empty string.
/// 3) For each character, entity reference, or character reference in the unnormalized attribute value,
///    beginning with the first and continuing to the last, do the following:
///    * For a character reference, append the referenced character to the normalized value.
///    * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
///    * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
///    * For another character, append the character to the normalized value.
///
/// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute
/// value by discarding any leading and trailing space (#x20) characters, and by replacing sequences of
/// space (#x20) characters by a single space (#x20) character.
///
/// Note that if the unnormalized attribute value contains a character reference to a white space character
/// other than space (#x20), the normalized value contains the referenced character itself (#xD, #xA or #x9).
/// This contrasts with the case where the unnormalized value contains a white space character (not a
/// reference), which is replaced with a space character (#x20) in the normalized value and also contrasts
/// with the case where the unnormalized value contains an entity reference whose replacement text contains a
/// white space character; being recursively processed, the white space character is replaced with a space
/// character (#x20) in the normalized value.
///
/// # What this function currently does
///
/// * Character references and entity references are NOT processed yet (see the TODO in the body).
/// * Leading and trailing whitespace-like bytes (`\n`, `\r`, `\t`, space) are removed.
/// * Every run of whitespace-like bytes inside the value is replaced by one space (`0x20`). This is the
///   additional processing the specification describes for non-CDATA attributes; it is applied
///   unconditionally here.
///
/// # Returns
///
/// `Cow::Borrowed` (a sub-slice of `attr`, or an empty slice when `attr` holds nothing but whitespace-like
/// bytes) when the trimmed value is already normalized, and `Cow::Owned` only when at least one byte had
/// to be replaced or dropped.
fn normalize_attribute_value(attr: &[u8]) -> Cow<[u8]> {
    // TODO: character references, entity references, error handling associated with those

    /// The four bytes the XML specification treats as white space.
    fn is_whitespace_like(byte: u8) -> bool {
        matches!(byte, b'\n' | b'\r' | b'\t' | b' ')
    }

    let begin = match attr.iter().position(|&b| !is_whitespace_like(b)) {
        Some(idx) => idx,
        // The entire value was whitespace-like characters (or empty).
        None => return Cow::Borrowed(b""),
    };
    // A non-whitespace byte exists, so the reverse search always succeeds. `begin` is used as the
    // fallback because it is always a valid index for the inclusive range below.
    let end = attr
        .iter()
        .rposition(|&b| !is_whitespace_like(b))
        .unwrap_or(begin);
    let trimmed = &attr[begin..=end];

    // A new buffer is only created once the output starts to differ from the input.
    let mut normalized: Option<Vec<u8>> = None;
    // `trimmed` starts with a non-whitespace byte, so we never start inside a whitespace run.
    let mut in_whitespace_run = false;

    for (idx, &byte) in trimmed.iter().enumerate() {
        if !is_whitespace_like(byte) {
            in_whitespace_run = false;
            if let Some(buf) = normalized.as_mut() {
                buf.push(byte);
            }
            continue;
        }

        if in_whitespace_run {
            // Second or later byte of a run: it is dropped. If we were still borrowing, everything
            // before this byte (including the single space that opened the run) is kept verbatim.
            if normalized.is_none() {
                let mut buf = Vec::with_capacity(trimmed.len());
                buf.extend_from_slice(&trimmed[..idx]);
                normalized = Some(buf);
            }
            continue;
        }

        // First byte of a run: it becomes exactly one space.
        in_whitespace_run = true;
        if let Some(buf) = normalized.as_mut() {
            buf.push(b' ');
        } else if byte != b' ' {
            // `\n`, `\r` or `\t` must be substituted, so the input can no longer be borrowed.
            let mut buf = Vec::with_capacity(trimmed.len());
            buf.extend_from_slice(&trimmed[..idx]);
            buf.push(b' ');
            normalized = Some(buf);
        }
        // Otherwise this is a plain space and the input still equals the output: keep borrowing.
    }

    match normalized {
        Some(buf) => Cow::Owned(buf),
        None => Cow::Borrowed(trimmed),
    }
}

impl<'a> Iterator for Attributes<'a> {
type Item = Result<Attribute<'a>>;
fn next(&mut self) -> Option<Self::Item> {
Expand All @@ -355,7 +446,7 @@ impl<'a> Iterator for Attributes<'a> {
($key:expr, $val:expr) => {
Some(Ok(Attribute {
key: &self.bytes[$key],
value: Cow::Borrowed(&self.bytes[$val]),
value: normalize_attribute_value(&self.bytes[$val]),
}))
};
}
Expand Down Expand Up @@ -513,4 +604,31 @@ mod tests {
assert_eq!(&*a.value, b"ee");
assert!(attributes.next().is_none());
}

#[test]
fn attribute_value_normalization() {
    /// Asserts that normalizing `input` produces exactly `expected`.
    #[track_caller]
    fn check(input: &[u8], expected: &[u8]) {
        assert_eq!(&*normalize_attribute_value(input), expected);
    }

    // An empty value stays empty.
    check(b"", b"");

    // Carriage return, tab and newline (0xD, 0x9, 0xA) each turn into a space character.
    check(b"\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta");

    // Whitespace at both ends is stripped...
    check(b" foo ", b"foo");
    // ...as is whitespace only at the front...
    check(b" bar", b"bar");
    // ...or only at the back.
    check(b"baz ", b"baz");

    // Runs of spaces collapse into a single space.
    check(b" foo bar baz ", b"foo bar baz");

    // Collapsing also applies when the run mixes spaces with \t, \r and \n.
    check(
        b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo foxtrot\r",
        b"foo bar baz delta echo foxtrot",
    );
}
}

0 comments on commit 401bb77

Please sign in to comment.