From f1e4bf150960fb81fd8e54791ef291dfa06ac64a Mon Sep 17 00:00:00 2001 From: Andrei Vasiliu Date: Fri, 17 Apr 2020 14:18:51 +0300 Subject: [PATCH 1/4] Fix tests on Windows --- tests/unit_tests.rs | 4 ++ tests/xmlrs_reader_tests.rs | 105 +++++++++++++++++++----------------- 2 files changed, 59 insertions(+), 50 deletions(-) diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index e8cdb009..da377b1b 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -241,6 +241,10 @@ fn test_writer_borrow() { #[test] fn test_writer_indent() { let txt = include_str!("../tests/documents/test_writer_indent.xml"); + // Normalize newlines on Windows to just \n, which is what the reader and + // writer use. + let normalized_txt = txt.replace("\r\n", "\n"); + let txt = normalized_txt.as_str(); let mut reader = Reader::from_str(txt); reader.trim_text(true); let mut writer = Writer::new_with_indent(Cursor::new(Vec::new()), b' ', 4); diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 10df35c9..257be377 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -7,8 +7,8 @@ use std::str::from_utf8; #[test] fn sample_1_short() { test( - include_bytes!("documents/sample_1.xml"), - include_bytes!("documents/sample_1_short.txt"), + include_str!("documents/sample_1.xml"), + include_str!("documents/sample_1_short.txt"), true, ); } @@ -16,8 +16,8 @@ fn sample_1_short() { #[test] fn sample_1_full() { test( - include_bytes!("documents/sample_1.xml"), - include_bytes!("documents/sample_1_full.txt"), + include_str!("documents/sample_1.xml"), + include_str!("documents/sample_1_full.txt"), false, ); } @@ -25,8 +25,8 @@ fn sample_1_full() { #[test] fn sample_2_short() { test( - include_bytes!("documents/sample_2.xml"), - include_bytes!("documents/sample_2_short.txt"), + include_str!("documents/sample_2.xml"), + include_str!("documents/sample_2_short.txt"), true, ); } @@ -34,8 +34,8 @@ fn sample_2_short() { #[test] fn sample_2_full() { test( - 
include_bytes!("documents/sample_2.xml"), - include_bytes!("documents/sample_2_full.txt"), + include_str!("documents/sample_2.xml"), + include_str!("documents/sample_2_full.txt"), false, ); } @@ -43,8 +43,8 @@ fn sample_2_full() { // #[test] // fn sample_3_short() { // test( -// include_bytes!("documents/sample_3.xml"), -// include_bytes!("documents/sample_3_short.txt"), +// include_str!("documents/sample_3.xml"), +// include_str!("documents/sample_3_short.txt"), // true // ); // } @@ -52,8 +52,8 @@ fn sample_2_full() { // #[test] // fn sample_3_full() { // test( -// include_bytes!("documents/sample_3.xml"), -// include_bytes!("documents/sample_3_full.txt"), +// include_str!("documents/sample_3.xml"), +// include_str!("documents/sample_3_full.txt"), // false // ); // } @@ -61,8 +61,8 @@ fn sample_2_full() { // #[test] // fn sample_4_short() { // test( -// include_bytes!("documents/sample_4.xml"), -// include_bytes!("documents/sample_4_short.txt"), +// include_str!("documents/sample_4.xml"), +// include_str!("documents/sample_4_short.txt"), // true // ); // } @@ -70,8 +70,8 @@ fn sample_2_full() { // #[test] // fn sample_4_full() { // test( -// include_bytes!("documents/sample_4.xml"), -// include_bytes!("documents/sample_4_full.txt"), +// include_str!("documents/sample_4.xml"), +// include_str!("documents/sample_4_full.txt"), // false // ); // @@ -80,8 +80,8 @@ fn sample_2_full() { #[test] fn sample_ns_short() { test( - include_bytes!("documents/sample_ns.xml"), - include_bytes!("documents/sample_ns_short.txt"), + include_str!("documents/sample_ns.xml"), + include_str!("documents/sample_ns_short.txt"), true, ); } @@ -89,8 +89,8 @@ fn sample_ns_short() { #[test] fn eof_1() { test( - br#""#, - br#" + r#""#, + r#" |Error: Unexpected token '--' "#, true, ); test( - br#""#, - br#" + r#""#, + r#" |Error: Unexpected token '--' "#, true, @@ -126,8 +126,8 @@ fn dashes_in_comments() { #[test] fn tabs_1() { test( - b"\t\t", - br#" + "\t\t", + r#" StartElement(a) 
EmptyElement(b) EndElement(a) @@ -142,8 +142,8 @@ fn issue_83_duplicate_attributes() { // Error when parsing attributes won't stop main event reader // as it is a lazy operation => add ending events test( - br#""#, - b" + r#""#, + " |StartElement(hello) |1:30 EmptyElement(some-tag, attr-error: error while parsing \ attribute at position 16: Duplicate attribute at position 9 and 16) @@ -157,14 +157,13 @@ fn issue_83_duplicate_attributes() { #[test] fn issue_93_large_characters_in_entity_references() { test( - r#"&𤶼;"#.as_bytes(), + r#"&𤶼;"#, r#" |StartElement(hello) |1:10 Error while escaping character at range 1..5: Unrecognized escape symbol: Ok("𤶼") |EndElement(hello) |EndDocument - "# - .as_bytes(), + "#, true, ) } @@ -172,8 +171,8 @@ fn issue_93_large_characters_in_entity_references() { #[test] fn issue_98_cdata_ending_with_right_bracket() { test( - br#""#, - br#" + r#""#, + r#" |StartElement(hello) |Characters() |CData(Foo [Bar]) @@ -188,8 +187,8 @@ fn issue_98_cdata_ending_with_right_bracket() { #[test] fn issue_105_unexpected_double_dash() { test( - br#"-- "#, - br#" + r#"-- "#, + r#" |StartElement(hello) |Characters(-- ) |EndElement(hello) @@ -199,8 +198,8 @@ fn issue_105_unexpected_double_dash() { ); test( - br#"--"#, - br#" + r#"--"#, + r#" |StartElement(hello) |Characters(--) |EndElement(hello) @@ -210,8 +209,8 @@ fn issue_105_unexpected_double_dash() { ); test( - br#"-->"#, - br#" + r#"-->"#, + r#" |StartElement(hello) |Characters(-->) |EndElement(hello) @@ -221,8 +220,8 @@ fn issue_105_unexpected_double_dash() { ); test( - br#""#, - br#" + r#""#, + r#" |StartElement(hello) |Characters() |CData(--) @@ -239,8 +238,8 @@ fn issue_attributes_have_no_default_namespace() { // At the moment, the 'test' method doesn't render namespaces for attribute names. // This test only checks whether the default namespace got applied to the EmptyElement. 
test( - br#""#, - br#" + r#""#, + r#" |EmptyElement({urn:foo}hello [x="y"]) |EndDocument "#, @@ -252,8 +251,8 @@ fn issue_attributes_have_no_default_namespace() { fn issue_default_namespace_on_outermost_element() { // Regression test test( - br#""#, - br#" + r#""#, + r#" |EmptyElement({urn:foo}hello) |EndDocument "#, @@ -264,10 +263,10 @@ fn issue_default_namespace_on_outermost_element() { #[test] fn default_namespace_applies_to_end_elem() { test( - br#" + r#" "#, - br#" + r#" |StartElement({urn:foo}hello [x="y"]) |EmptyElement({urn:foo}inner) |EndElement({urn:foo}hello) @@ -277,7 +276,13 @@ fn default_namespace_applies_to_end_elem() { ); } -fn test(input: &[u8], output: &[u8], is_short: bool) { +fn test(input: &str, output: &str, is_short: bool) { + // Normalize newlines on Windows to just \n, which is what the reader and + // writer use. + let input = input.replace("\r\n", "\n"); + let input = input.as_bytes(); + let output = output.replace("\r\n", "\n"); + let output = output.as_bytes(); let mut reader = Reader::from_reader(input); reader .trim_text(is_short) From cb2c67a6a9b8ba5b6420fa20b4983657cbe16fdc Mon Sep 17 00:00:00 2001 From: Andrei Vasiliu Date: Mon, 6 Apr 2020 13:21:38 +0300 Subject: [PATCH 2/4] Add BufferedInput trait, rework read_until/read_elem_until to return slices --- src/reader.rs | 556 +++++++++++++++++++++++++++++++------------------- 1 file changed, 350 insertions(+), 206 deletions(-) diff --git a/src/reader.rs b/src/reader.rs index 092d45ae..a9397470 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -210,26 +210,50 @@ impl Reader { /// return a `Text` event fn read_until_open<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { self.tag_state = TagState::Opened; - let buf_start = buf.len(); - match read_until(&mut self.reader, b'<', buf, &mut self.buf_position) { - Ok(0) => Ok(Event::Eof), - Ok(_) => { - let (start, len) = if self.trim_text { - match buf.iter().skip(buf_start).position(|&b| !is_whitespace(b)) { - Some(start) => ( - 
buf_start + start, - buf.iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| buf.len(), |p| p + 1), - ), - None => return self.read_event(buf), - } - } else { - (buf_start, buf.len()) - }; - Ok(Event::Text(BytesText::from_escaped(&buf[start..len]))) + + let skip_text = if self.trim_text { + self.reader.skip_whitespace(&mut self.buf_position)?; + + let start = match self.reader.peek_one() { + Ok(None) => return Ok(Event::Eof), + Ok(Some(byte)) => byte, + Err(e) => return Err(e), + }; + + if start == b'<' { + // Trimming whitespace skipped all text and reached a tag + self.reader + .read_bytes_until(b'<', buf, &mut self.buf_position)?; + true + } else { + false } - Err(e) => Err(e), + } else { + false + }; + + if skip_text { + return self.read_event(buf); + } else { + return match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => { + // Skip the ending '<' + let len = if self.trim_text { + bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1) + } else { + bytes.len() + }; + Ok(Event::Text(BytesText::from_escaped(&bytes[0..len]))) + } + Err(e) => Err(e), + }; } } @@ -238,39 +262,39 @@ impl Reader { self.tag_state = TagState::Closed; // need to read 1 character to decide whether pay special attention to attribute values - let buf_start = buf.len(); - let start = loop { - match self.reader.fill_buf() { - Ok(n) if n.is_empty() => return Ok(Event::Eof), - Ok(n) => { - // We intentionally don't `consume()` the byte, otherwise we would have to - // handle things like '<>' here already. - break n[0]; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => return Err(Error::Io(e)), - } + let start = match self.reader.peek_one() { + Ok(None) => return Ok(Event::Eof), + Ok(Some(byte)) => byte, + Err(e) => return Err(e), }; if start != b'/' && start != b'!' && start != b'?' 
{ - match read_elem_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => Ok(Event::Eof), - Ok(_) => { + match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => { // we already *know* that we are in this case - self.read_start(&buf[buf_start..]) + self.read_start(bytes) } Err(e) => Err(e), } + } else if start == b'!' { + match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_bang(bytes), + Err(e) => Err(e), + } } else { - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => Ok(Event::Eof), - Ok(_) => match start { - b'/' => self.read_end(&buf[buf_start..]), - b'!' => self.read_bang(buf_start, buf), - b'?' => self.read_question_mark(&buf[buf_start..]), + match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => match start { + b'/' => self.read_end(bytes), + b'?' 
=> self.read_question_mark(bytes), _ => unreachable!( - "We checked that `start` must be one of [/!?], was {:?} \ - instead.", + "We checked that `start` must be one of [/?], was {:?} \ + instead.", start ), }, @@ -325,80 +349,32 @@ impl Reader { /// /// Note: depending on the start of the Event, we may need to read more /// data, thus we need a mutable buffer - fn read_bang<'a, 'b>( - &'a mut self, - buf_start: usize, - buf: &'b mut Vec, - ) -> Result> { - if buf[buf_start..].starts_with(b"!--") { - while buf.len() < buf_start + 5 || !buf.ends_with(b"--") { - buf.push(b'>'); - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedEof("Comment".to_string())); - } - Ok(_) => (), - Err(e) => return Err(e.into()), - } - } - let len = buf.len(); + fn read_bang<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { + let len = buf.len(); + if buf.starts_with(b"!--") { + // FIXME: actually, isn't, it misses + debug_assert!(len >= 5, "Minimum length guaranteed by read_bang_elem"); if self.check_comments { // search if '--' not in comments - if let Some(p) = memchr::memchr_iter(b'-', &buf[buf_start + 3..len - 2]) - .position(|p| buf[buf_start + 3 + p + 1] == b'-') + if let Some(p) = + memchr::memchr_iter(b'-', &buf[3..len - 2]).position(|p| buf[3 + p + 1] == b'-') { - self.buf_position -= buf.len() - buf_start + p; + // FIXME: Should be `- p` + self.buf_position -= buf.len() + p; return Err(Error::UnexpectedToken("--".to_string())); } } - Ok(Event::Comment(BytesText::from_escaped( - &buf[buf_start + 3..len - 2], + Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) + } else if buf.starts_with(b"![CDATA[") { + debug_assert!(len >= 10, "Minimum length guaranteed by read_bang_elem"); + Ok(Event::CData(BytesText::from_escaped( + &buf[8..buf.len() - 2], ))) - } else if buf.len() >= buf_start + 8 { - match &buf[buf_start + 1..buf_start + 8] { - b"[CDATA[" => { - while 
buf.len() < 10 || !buf.ends_with(b"]]") { - buf.push(b'>'); - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedEof("CData".to_string())); - } - Ok(_) => (), - Err(e) => return Err(e), - } - } - Ok(Event::CData(BytesText::from_escaped( - &buf[buf_start + 8..buf.len() - 2], - ))) - } - b"DOCTYPE" => { - let mut count = buf.iter().skip(buf_start).filter(|&&b| b == b'<').count(); - while count > 0 { - buf.push(b'>'); - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedEof("DOCTYPE".to_string())); - } - Ok(n) => { - let start = buf.len() - n; - count += buf.iter().skip(start).filter(|&&b| b == b'<').count(); - count -= 1; - } - Err(e) => return Err(e), - } - } - Ok(Event::DocType(BytesText::from_escaped( - &buf[buf_start + 8..buf.len()], - ))) - } - _ => return Err(Error::UnexpectedBang), - } + } else if buf.starts_with(b"!DOCTYPE") { + debug_assert!(len >= 8, "Minimum length guaranteed by read_bang_elem"); + Ok(Event::DocType(BytesText::from_escaped(&buf[8..]))) } else { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedBang); + unreachable!("Proper bang start guaranteed by read_bang_elem"); } } @@ -898,21 +874,119 @@ impl<'a> Reader<&'a [u8]> { } } -/// read until `byte` is found or end of file -/// return the position of byte -#[inline] -fn read_until( - r: &mut R, - byte: u8, - buf: &mut Vec, - position: &mut usize, -) -> Result { - let mut read = 0; - let mut done = false; - while !done { - let used = { - let available = match r.fill_buf() { - Ok(n) if n.is_empty() => break, +trait BufferedInput<'r, B> { + fn read_bytes_until( + &mut self, + byte: u8, + buf: B, + position: &mut usize, + ) -> Result>; + + fn read_bang_element( + &mut self, + buf: &'r mut Vec, + position: &mut usize, + ) -> Result>; + + fn 
read_element(&mut self, buf: B, position: &mut usize) -> Result>; + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; + + fn peek_one(&mut self) -> Result>; +} + +/// Implementation of BufferedInput for any BufRead reader using a user-given +/// Vec as buffer that will be borrowed by events. +impl<'b, R: BufRead> BufferedInput<'b, &'b mut Vec> for R { + /// read until `byte` is found or end of file + /// return the position of byte + #[inline] + fn read_bytes_until( + &mut self, + byte: u8, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + match memchr::memchr(byte, available) { + Some(i) => { + buf.extend_from_slice(&available[..i]); + done = true; + i + 1 + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self.consume(used); + read += used; + } + *position += read; + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + fn read_bang_element( + &mut self, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + let start = buf.len(); + let mut read = 1; + buf.push(b'!'); + self.consume(1); + + enum BangType { + // + CData, + // + Comment, + // + DocType, + } + + let bang_type = match self.peek_one()? 
{ + Some(b'[') => BangType::CData, + Some(b'-') => BangType::Comment, + Some(b'D') => BangType::DocType, + Some(_) => return Err(Error::UnexpectedBang), + None => return Err(Error::UnexpectedEof("unknown".to_string())), + }; + + loop { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => { + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + let bang_str = match bang_type { + BangType::CData => "CData", + BangType::Comment => "Comment", + BangType::DocType => "DOCTYPE", + }; + return Err(Error::UnexpectedEof(bang_str.to_string())); + } Ok(n) => n, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => { @@ -921,103 +995,173 @@ fn read_until( } }; - match memchr::memchr(byte, available) { + match memchr::memchr(b'>', available) { Some(i) => { buf.extend_from_slice(&available[..i]); - done = true; - i + 1 + let used = i + 1; + self.consume(used); + read += used; + + let finished = match bang_type { + BangType::Comment => read >= 5 && buf.ends_with(b"--"), + BangType::CData => buf.ends_with(b"]]"), + BangType::DocType => { + // Inefficient, but unlikely to happen often + let open = buf.iter().skip(start).filter(|b| **b == b'<').count(); + let closed = buf.iter().skip(start).filter(|b| **b == b'>').count(); + open == closed + } + }; + + if finished { + break; + } else { + // '>' was omitted in the extend_from_slice above + buf.push(b'>'); + } } None => { buf.extend_from_slice(available); - available.len() + let used = available.len(); + self.consume(used); + read += used; } } - }; - r.consume(used); - read += used; - } - *position += read; - Ok(read) -} + } + *position += read; -/// Derived from `read_until`, but modified to handle XML attributes using a minimal state machine. 
-/// [W3C Extensible Markup Language (XML) 1.1 (2006)](https://www.w3.org/TR/xml11) -/// -/// Attribute values are defined as follows: -/// ```plain -/// AttValue := '"' (([^<&"]) | Reference)* '"' -/// | "'" (([^<&']) | Reference)* "'" -/// ``` -/// (`Reference` is something like `"`, but we don't care about escaped characters at this -/// level) -#[inline] -fn read_elem_until( - r: &mut R, - end_byte: u8, - buf: &mut Vec, - position: &mut usize, -) -> Result { - #[derive(Clone, Copy)] - enum State { - /// The initial state (inside element, but outside of attribute value) - Elem, - /// Inside a single-quoted attribute value - SingleQ, - /// Inside a double-quoted attribute value - DoubleQ, + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } } - let mut state = State::Elem; - let mut read = 0; - let mut done = false; - while !done { - let used = { - let available = match r.fill_buf() { - Ok(n) if n.is_empty() => return Ok(read), - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); + + /// Derived from `read_until`, but modified to handle XML attributes using a minimal state machine. 
+ /// [W3C Extensible Markup Language (XML) 1.1 (2006)](https://www.w3.org/TR/xml11) + /// + /// Attribute values are defined as follows: + /// ```plain + /// AttValue := '"' (([^<&"]) | Reference)* '"' + /// | "'" (([^<&']) | Reference)* "'" + /// ``` + /// (`Reference` is something like `"`, but we don't care about escaped characters at this + /// level) + #[inline] + fn read_element( + &mut self, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + #[derive(Clone, Copy)] + enum State { + /// The initial state (inside element, but outside of attribute value) + Elem, + /// Inside a single-quoted attribute value + SingleQ, + /// Inside a double-quoted attribute value + DoubleQ, + } + let mut state = State::Elem; + let mut read = 0; + let mut done = false; + let end_byte = b'>'; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => { + if read == 0 { + return Ok(None); + } else { + return Ok(Some(&buf[start..])); + } + } + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + let mut memiter = memchr::memchr3_iter(end_byte, b'\'', b'"', available); + let used: usize; + loop { + match memiter.next() { + Some(i) => { + state = match (state, available[i]) { + (State::Elem, b) if b == end_byte => { + // only allowed to match `end_byte` while we are in state `Elem` + buf.extend_from_slice(&available[..i]); + done = true; + used = i + 1; + break; + } + (State::Elem, b'\'') => State::SingleQ, + (State::Elem, b'\"') => State::DoubleQ, + + // the only end_byte that gets us out if the same character + (State::SingleQ, b'\'') | (State::DoubleQ, b'\"') => State::Elem, + + // all other bytes: no state change + _ => state, + }; + } + None => { + buf.extend_from_slice(available); + used = available.len(); + break; + } + } } + used }; + self.consume(used); + read += used; + } + *position += read; - let 
mut memiter = memchr::memchr3_iter(end_byte, b'\'', b'"', available); - let used: usize; - loop { - match memiter.next() { - Some(i) => { - state = match (state, available[i]) { - (State::Elem, b) if b == end_byte => { - // only allowed to match `end_byte` while we are in state `Elem` - buf.extend_from_slice(&available[..i]); - done = true; - used = i + 1; - break; - } - (State::Elem, b'\'') => State::SingleQ, - (State::Elem, b'\"') => State::DoubleQ, - - // the only end_byte that gets us out if the same character - (State::SingleQ, b'\'') | (State::DoubleQ, b'\"') => State::Elem, - - // all other bytes: no state change - _ => state, - }; - } - None => { - buf.extend_from_slice(available); - used = available.len(); - break; + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + /// Consume and discard all the whitespace until the next non-whitespace + /// character or EOF. + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + loop { + break match self.fill_buf() { + Ok(n) => { + let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); + if count > 0 { + self.consume(count); + *position += count; + continue; + } else { + Ok(()) } } - } - used - }; - r.consume(used); - read += used; + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + /// Return one character without consuming it, so that future `read_*` calls + /// will still include it. On EOF, return None. 
+ fn peek_one(&mut self) -> Result> { + loop { + break match self.fill_buf() { + Ok(n) if n.is_empty() => Ok(None), + Ok(n) => Ok(Some(n[0])), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } } - *position += read; - Ok(read) } /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) From f24ae9106dca2d14d5ade5959aef7c15ea4a1105 Mon Sep 17 00:00:00 2001 From: Andrei Vasiliu Date: Tue, 7 Apr 2020 20:56:01 +0300 Subject: [PATCH 3/4] Add read_event_unbuffered --- src/reader.rs | 233 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 199 insertions(+), 34 deletions(-) diff --git a/src/reader.rs b/src/reader.rs index a9397470..d5d420dc 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -61,9 +61,9 @@ enum TagState { /// buf.clear(); /// } /// ``` -pub struct Reader { +pub struct Reader { /// reader - reader: B, + reader: R, /// current buffer position, useful for debuging errors buf_position: usize, /// current state Open/Close @@ -93,9 +93,9 @@ pub struct Reader { is_encoding_set: bool, } -impl Reader { +impl Reader { /// Creates a `Reader` that reads from a reader implementing `BufRead`. 
- pub fn from_reader(reader: B) -> Reader { + pub fn from_reader(reader: R) -> Reader { Reader { reader: reader, opened_buffer: Vec::new(), @@ -126,7 +126,7 @@ impl Reader { /// [`Empty`]: events/enum.Event.html#variant.Empty /// [`Start`]: events/enum.Event.html#variant.Start /// [`End`]: events/enum.Event.html#variant.End - pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader { + pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader { self.expand_empty_elements = val; self } @@ -139,7 +139,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text(&mut self, val: bool) -> &mut Reader { + pub fn trim_text(&mut self, val: bool) -> &mut Reader { self.trim_text = val; self } @@ -155,7 +155,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader { + pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader { self.trim_markup_names_in_closing_tags = val; self } @@ -173,7 +173,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn check_end_names(&mut self, val: bool) -> &mut Reader { + pub fn check_end_names(&mut self, val: bool) -> &mut Reader { self.check_end_names = val; self } @@ -188,7 +188,7 @@ impl Reader { /// (`false` by default) /// /// [`Comment`]: events/enum.Event.html#variant.Comment - pub fn check_comments(&mut self, val: bool) -> &mut Reader { + pub fn check_comments(&mut self, val: bool) -> &mut Reader { self.check_comments = val; self } @@ -208,32 +208,21 @@ impl Reader { /// private function to read until '<' is found /// return a `Text` event - fn read_until_open<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + fn read_until_open<'i, B>(&mut self, buf: B) -> Result> + where + R: BufferedInput<'i, B> + { self.tag_state = TagState::Opened; let skip_text = if self.trim_text { 
self.reader.skip_whitespace(&mut self.buf_position)?; - - let start = match self.reader.peek_one() { - Ok(None) => return Ok(Event::Eof), - Ok(Some(byte)) => byte, - Err(e) => return Err(e), - }; - - if start == b'<' { - // Trimming whitespace skipped all text and reached a tag - self.reader - .read_bytes_until(b'<', buf, &mut self.buf_position)?; - true - } else { - false - } + self.reader.skip_one(b'<', &mut self.buf_position)? } else { false }; if skip_text { - return self.read_event(buf); + return self.read_event_buffered(buf); } else { return match self .reader @@ -258,7 +247,10 @@ impl Reader { } /// private function to read until '>' is found - fn read_until_close<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + fn read_until_close<'i, B>(&mut self, buf: B) -> Result> + where + R: BufferedInput<'i, B> + { self.tag_state = TagState::Closed; // need to read 1 character to decide whether pay special attention to attribute values @@ -497,6 +489,16 @@ impl Reader { /// println!("Text events: {:?}", txt); /// ``` pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + self.read_event_buffered(buf) + } + + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result> + where + R: BufferedInput<'i, B> + { let event = match self.tag_state { TagState::Opened => self.read_until_close(buf), TagState::Closed => self.read_until_open(buf), @@ -853,7 +855,7 @@ impl Reader { /// buf.clear(); /// } /// ``` - pub fn into_underlying_reader(self) -> B { + pub fn into_underlying_reader(self) -> R { self.reader } } @@ -872,6 +874,11 @@ impl<'a> Reader<&'a [u8]> { pub fn from_str(s: &'a str) -> Reader<&'a [u8]> { Reader::from_reader(s.as_bytes()) } + + /// Read an event that borrows from the input rather than a buffer. 
+ pub fn read_event_unbuffered(&mut self) -> Result> { + self.read_event_buffered(()) + } } trait BufferedInput<'r, B> { @@ -882,16 +889,14 @@ trait BufferedInput<'r, B> { position: &mut usize, ) -> Result>; - fn read_bang_element( - &mut self, - buf: &'r mut Vec, - position: &mut usize, - ) -> Result>; + fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result>; fn read_element(&mut self, buf: B, position: &mut usize) -> Result>; fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; + fn peek_one(&mut self) -> Result>; } @@ -972,7 +977,7 @@ impl<'b, R: BufRead> BufferedInput<'b, &'b mut Vec> for R { Some(b'-') => BangType::Comment, Some(b'D') => BangType::DocType, Some(_) => return Err(Error::UnexpectedBang), - None => return Err(Error::UnexpectedEof("unknown".to_string())), + None => return Err(Error::UnexpectedEof("Bang".to_string())), }; loop { @@ -1150,6 +1155,19 @@ impl<'b, R: BufRead> BufferedInput<'b, &'b mut Vec> for R { } } + /// Consume and discard one character if it matches the given byte. Return + /// true if it matched. + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + match self.peek_one()? { + Some(b) if b == byte => { + *position += 1; + self.consume(1); + Ok(true) + }, + _ => Ok(false), + } + } + /// Return one character without consuming it, so that future `read_*` calls /// will still include it. On EOF, return None. fn peek_one(&mut self) -> Result> { @@ -1164,6 +1182,153 @@ impl<'b, R: BufRead> BufferedInput<'b, &'b mut Vec> for R { } } +/// Implementation of BufferedInput for in-memory byte slices, which takes no +/// intermediate buffer (`()`): returned bytes borrow directly from the input. 
+impl<'a> BufferedInput<'a, ()> for &'a [u8] { + fn read_bytes_until( + &mut self, + byte: u8, + _buf: (), + position: &mut usize, + ) -> Result> { + if self.is_empty() { + return Ok(None); + } + + let i = memchr::memchr(byte, self).unwrap_or(self.len()); + + *position += i; + let bytes = &self[..i]; + // Skip the end byte too. + *self = &self[i + 1..]; + + return Ok(Some(bytes)); + } + + fn read_bang_element(&mut self, _buf: (), position: &mut usize) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + debug_assert_eq!(self[0], b'!'); + + enum BangType { + // + CData, + // + Comment, + // + DocType, + } + + let bang_type = match &self[1..].first() { + Some(b'[') => BangType::CData, + Some(b'-') => BangType::Comment, + Some(b'D') => BangType::DocType, + Some(_) => return Err(Error::UnexpectedBang), + None => return Err(Error::UnexpectedEof("Bang".to_string())), + }; + + for i in memchr::memchr_iter(b'>', self) { + let finished = match bang_type { + BangType::Comment => i >= 5 && self[..i].ends_with(b"--"), + BangType::CData => self[..i].ends_with(b"]]"), + BangType::DocType => { + // Inefficient, but unlikely to happen often + let open = self[..i].iter().filter(|b| **b == b'<').count(); + let closed = self[..i].iter().filter(|b| **b == b'>').count(); + open == closed + } + }; + + if finished { + *position += i; + let bytes = &self[..i]; + // Skip the '>' too. 
+ *self = &self[i+1..]; + return Ok(Some(bytes)); + } + } + + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + let bang_str = match bang_type { + BangType::CData => "CData", + BangType::Comment => "Comment", + BangType::DocType => "DOCTYPE", + }; + Err(Error::UnexpectedEof(bang_str.to_string())) + } + + fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { + if self.is_empty() { + return Ok(None); + } + + #[derive(Clone, Copy)] + enum State { + /// The initial state (inside element, but outside of attribute value) + Elem, + /// Inside a single-quoted attribute value + SingleQ, + /// Inside a double-quoted attribute value + DoubleQ, + } + let mut state = State::Elem; + + let end_byte = b'>'; + + for i in memchr::memchr3_iter(end_byte, b'\'', b'"', self) { + state = match (state, self[i]) { + (State::Elem, b) if b == end_byte => { + // only allowed to match `end_byte` while we are in state `Elem` + *position += i; + let bytes = &self[..i]; + // Skip the '>' too. + *self = &self[i + 1..]; + return Ok(Some(bytes)); + } + (State::Elem, b'\'') => State::SingleQ, + (State::Elem, b'\"') => State::DoubleQ, + + // the only end_byte that gets us out if the same character + (State::SingleQ, b'\'') | (State::DoubleQ, b'\"') => State::Elem, + + // all other bytes: no state change + _ => state, + }; + } + + // Note: Do not update position, so the error points to a sane place + // rather than at the EOF. 
+ Err(Error::UnexpectedEof("Element".to_string())) + + // FIXME: Figure out why the other one works without UnexpectedEof + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + let whitespaces = self + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or(self.len()); + *position += whitespaces; + *self = &self[whitespaces..]; + Ok(()) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + if self.first() == Some(&byte) { + *self = &self[1..]; + *position += 1; + Ok(true) + } else { + Ok(false) + } + } + + fn peek_one(&mut self) -> Result> { + Ok(self.first().copied()) + } +} + /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) fn is_whitespace(b: u8) -> bool { From 3c637bf6408c9b5260140e21e2970833db24314d Mon Sep 17 00:00:00 2001 From: Andrei Vasiliu Date: Fri, 17 Apr 2020 13:56:00 +0300 Subject: [PATCH 4/4] Add borrowing support to deserializer --- src/de/map.rs | 28 ++--- src/de/mod.rs | 299 ++++++++++++++++++++++++++++++++++++++-------- src/de/seq.rs | 13 +- src/de/var.rs | 27 ++--- src/events/mod.rs | 29 ++++- src/reader.rs | 78 ++++++++++-- 6 files changed, 368 insertions(+), 106 deletions(-) diff --git a/src/de/map.rs b/src/de/map.rs index 334a0f32..18b94c20 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -1,12 +1,11 @@ //! 
Serde `Deserializer` module use crate::{ - de::{escape::EscapedDeserializer, Deserializer, INNER_VALUE}, + de::{escape::EscapedDeserializer, Deserializer, BorrowingReader, INNER_VALUE}, errors::serialize::DeError, - events::{attributes::Attribute, BytesStart, Event}, + events::{BytesStart, Event}, }; use serde::de::{self, DeserializeSeed, IntoDeserializer}; -use std::io::BufRead; enum MapValue { Empty, @@ -16,16 +15,16 @@ enum MapValue { } /// A deserializer for `Attributes` -pub(crate) struct MapAccess<'a, R: BufRead> { - start: BytesStart<'static>, - de: &'a mut Deserializer, +pub(crate) struct MapAccess<'de, 'a, R: BorrowingReader<'de> + 'a> { + start: BytesStart<'de>, + de: &'a mut Deserializer<'de, R>, position: usize, value: MapValue, } -impl<'a, R: BufRead> MapAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> MapAccess<'de, 'a, R> { /// Create a new MapAccess - pub fn new(de: &'a mut Deserializer, start: BytesStart<'static>) -> Result { + pub fn new(de: &'a mut Deserializer<'de, R>, start: BytesStart<'de>) -> Result { let position = start.attributes().position; Ok(MapAccess { de, @@ -35,28 +34,25 @@ impl<'a, R: BufRead> MapAccess<'a, R> { }) } - fn next_attr(&mut self) -> Result, DeError> { + fn next_attr(&mut self) -> Result, Vec)>, DeError> { let mut attributes = self.start.attributes(); attributes.position = self.position; - let next_att = attributes.next(); + let next_att = attributes.next().transpose()?; self.position = attributes.position; - Ok(next_att.transpose()?) + Ok(next_att.map(|a| (a.key.to_owned(), a.value.into_owned()))) } } -impl<'a, 'de, R: BufRead> de::MapAccess<'de> for MapAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de> + 'a> de::MapAccess<'de> for MapAccess<'de, 'a, R> { type Error = DeError; fn next_key_seed>( &mut self, seed: K, ) -> Result, Self::Error> { - let attr_key_val = self - .next_attr()? 
- .map(|a| (a.key.to_owned(), a.value.into_owned())); let decoder = self.de.reader.decoder(); let has_value_field = self.de.has_value_field; - if let Some((key, value)) = attr_key_val { + if let Some((key, value)) = self.next_attr()? { // try getting map from attributes (key= "value") self.value = MapValue::Attribute { value }; seed.deserialize(EscapedDeserializer::new(key, decoder, false)) diff --git a/src/de/mod.rs b/src/de/mod.rs index 4f07303d..92131850 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -115,35 +115,56 @@ mod var; pub use crate::errors::serialize::DeError; use crate::{ events::{BytesStart, BytesText, Event}, - Reader, + Reader, reader::Decoder, }; -use serde::de::{self, DeserializeOwned}; +use serde::de::{self, Deserialize, DeserializeOwned}; use serde::forward_to_deserialize_any; use std::io::BufRead; +use std::borrow::Cow; const INNER_VALUE: &str = "$value"; /// An xml deserializer -pub struct Deserializer { - reader: Reader, - peek: Option>, +pub struct Deserializer<'de, R: BorrowingReader<'de>> { + reader: R, + peek: Option>, has_value_field: bool, } /// Deserialize a xml string -pub fn from_str(s: &str) -> Result { - from_reader(s.as_bytes()) +pub fn from_str<'de, T: Deserialize<'de>>(s: &'de str) -> Result { + from_bytes(s.as_bytes()) +} + +/// Deserialize a xml slice of bytes +pub fn from_bytes<'de, T: Deserialize<'de>>(s: &'de [u8]) -> Result { + let mut reader = Reader::from_bytes(s); + reader + .expand_empty_elements(true) + .check_end_names(true) + .trim_text(true); + let mut de = Deserializer::from_borrowing_reader( + SliceReader { reader } + ); + T::deserialize(&mut de) } /// Deserialize from a reader pub fn from_reader(reader: R) -> Result { - let mut de = Deserializer::from_reader(reader); + let mut reader = Reader::from_reader(reader); + reader + .expand_empty_elements(true) + .check_end_names(true) + .trim_text(true); + let mut de = Deserializer::from_borrowing_reader( + IoReader { reader, buf: Vec::new() } + ); 
T::deserialize(&mut de) } -impl Deserializer { +impl<'de, R: BorrowingReader<'de>> Deserializer<'de, R> { /// Get a new deserializer - pub fn new(reader: Reader) -> Self { + pub fn new(reader: R) -> Self { Deserializer { reader, peek: None, @@ -152,56 +173,43 @@ impl Deserializer { } /// Get a new deserializer from a regular BufRead - pub fn from_reader(reader: R) -> Self { - let mut reader = Reader::from_reader(reader); - reader - .expand_empty_elements(true) - .check_end_names(true) - .trim_text(true); + pub fn from_borrowing_reader(reader: R) -> Self { Self::new(reader) } - fn peek(&mut self) -> Result>, DeError> { + fn peek(&mut self) -> Result>, DeError> { if self.peek.is_none() { - self.peek = Some(self.next(&mut Vec::new())?); + self.peek = Some(self.next()?); } Ok(self.peek.as_ref()) } - fn next<'a>(&mut self, buf: &'a mut Vec) -> Result, DeError> { + fn next(&mut self) -> Result, DeError> { if let Some(e) = self.peek.take() { return Ok(e); } - loop { - let e = self.reader.read_event(buf)?; - match e { - Event::Start(_) | Event::End(_) | Event::Text(_) | Event::Eof | Event::CData(_) => { - return Ok(e.into_owned()) - } - _ => buf.clear(), - } - } + self.reader.next() } - fn next_start(&mut self, buf: &mut Vec) -> Result>, DeError> { + fn next_start(&mut self) -> Result>, DeError> { loop { - let e = self.next(buf)?; + let e = self.next()?; match e { Event::Start(e) => return Ok(Some(e)), Event::End(_) => return Err(DeError::End), Event::Eof => return Ok(None), - _ => buf.clear(), // ignore texts + _ => (), // ignore texts } } } - fn next_text<'a>(&mut self) -> Result, DeError> { - match self.next(&mut Vec::new())? { + fn next_text(&mut self) -> Result, DeError> { + match self.next()? 
{ Event::Text(e) | Event::CData(e) => Ok(e), Event::Eof => Err(DeError::Eof), Event::Start(e) => { // allow one nested level - let inner = self.next(&mut Vec::new())?; + let inner = self.next()?; let t = match inner { Event::Text(t) | Event::CData(t) => t, Event::Start(_) => return Err(DeError::Start), @@ -224,13 +232,13 @@ impl Deserializer { } fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { - let mut buf = Vec::new(); - match self.next(&mut buf)? { - Event::Start(e) => self.reader.read_to_end(e.name(), &mut Vec::new())?, + // First one might be in self.peek + match self.next()? { + Event::Start(e) => self.reader.read_to_end(e.name())?, Event::End(e) if e.name() == name => return Ok(()), - _ => buf.clear(), + _ => (), } - Ok(self.reader.read_to_end(name, &mut buf)?) + self.reader.read_to_end(name) } } @@ -240,17 +248,17 @@ macro_rules! deserialize_type { let txt = self.next_text()?; #[cfg(not(feature = "encoding"))] - let value = self.reader.decode(&*txt)?.parse()?; + let value = self.reader.decoder().decode(&*txt)?.parse()?; #[cfg(feature = "encoding")] - let value = self.reader.decode(&*txt).parse()?; + let value = self.reader.decoder().decode(&*txt).parse()?; visitor.$visit(value) } } } -impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { +impl<'de, 'a, R: BorrowingReader<'de>> de::Deserializer<'de> for &'a mut Deserializer<'de, R> { type Error = DeError; forward_to_deserialize_any! { newtype_struct identifier } @@ -261,7 +269,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { fields: &'static [&'static str], visitor: V, ) -> Result { - if let Some(e) = self.next_start(&mut Vec::new())? { + if let Some(e) = self.next_start()? 
{ let name = e.name().to_vec(); self.has_value_field = fields.contains(&INNER_VALUE); let map = map::MapAccess::new(self, e)?; @@ -313,14 +321,19 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { b"false" | b"0" | b"False" | b"FALSE" | b"f" | b"No" | b"NO" | b"no" | b"n" => { visitor.visit_bool(false) } - e => Err(DeError::InvalidBoolean(self.reader.decode(e)?.into())), + e => Err(DeError::InvalidBoolean( + self.reader.decoder().decode(e)?.into() + )), } } } fn deserialize_string>(self, visitor: V) -> Result { - let value = self.next_text()?.unescape_and_decode(&self.reader)?; - visitor.visit_string(value) + let text = self.next_text()?; + let unescaped = text.unescaped()?; + let decoded = self.reader.decoder().decode(&unescaped)?; + + visitor.visit_string(decoded.to_string()) } fn deserialize_char>(self, visitor: V) -> Result { @@ -328,7 +341,23 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } fn deserialize_str>(self, visitor: V) -> Result { - self.deserialize_string(visitor) + let text = self.next_text()?; + let unescaped = text.into_unescaped()?; + + match unescaped { + Cow::Borrowed(unescaped) => { + // FIXME: Encoding has Cow instead + let decoded = self.reader.decoder().decode(unescaped)?; + + visitor.visit_borrowed_str(decoded) + }, + Cow::Owned(unescaped) => { + // FIXME: Encoding has Cow instead + let decoded = self.reader.decoder().decode_owned(unescaped)?; + + visitor.visit_string(decoded) + }, + } } fn deserialize_bytes>(self, visitor: V) -> Result { @@ -340,8 +369,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } fn deserialize_unit>(self, visitor: V) -> Result { - let mut buf = Vec::new(); - match self.next(&mut buf)? { + match self.next()? 
{ Event::Start(s) => { self.read_to_end(s.name())?; visitor.visit_unit() @@ -402,7 +430,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } fn deserialize_ignored_any>(self, visitor: V) -> Result { - match self.next(&mut Vec::new())? { + match self.next()? { Event::Start(e) => self.read_to_end(e.name())?, Event::End(_) => return Err(DeError::End), _ => (), @@ -419,11 +447,180 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } } +/// A trait that borrows an XML reader that borrows from the input. For a &[u8] +/// input the events will borrow from that input, whereas with a BufRead input +/// all events will be converted to 'static, allocating whenever necessary. +pub trait BorrowingReader<'i> where Self: 'i { + /// Return an input-borrowing event. + fn next(&mut self) -> Result, DeError>; + + /// Skips until end element is found. Unlike `next()` it will not allocate + /// when it cannot satisfy the lifetime. + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError>; + + /// A copy of the reader's decoder used to decode strings. + fn decoder(&self) -> Decoder { + Decoder + } +} + +struct IoReader { + reader: Reader, + buf: Vec, +} + +impl<'i, R: BufRead + 'i> BorrowingReader<'i> for IoReader { + fn next(&mut self) -> Result, DeError> { + let event = loop { + let e = self.reader.read_event(&mut self.buf)?; + match e { + Event::Start(_) | Event::End(_) | Event::Text(_) | Event::Eof | Event::CData(_) => { + break Ok(e.into_owned()) + } + _ => self.buf.clear(), + } + }; + + self.buf.clear(); + + event + } + + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { + Ok(self.reader.read_to_end(name, &mut self.buf)?) 
+ } +} + +struct SliceReader<'de> { + reader: Reader<&'de [u8]>, +} + +impl<'de> BorrowingReader<'de> for SliceReader<'de> { + fn next(&mut self) -> Result, DeError> { + loop { + let e = self.reader.read_event_unbuffered()?; + match e { + Event::Start(_) | Event::End(_) | Event::Text(_) | Event::Eof | Event::CData(_) => { + break Ok(e) + } + _ => (), + } + } + } + + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { + Ok(self.reader.read_to_end_unbuffered(name)?) + } +} + #[cfg(test)] mod tests { use super::*; use serde::Deserialize; + #[test] + fn borrowing_reader_parity() { + let s = r##" + Some text + + + "##.as_bytes(); + + let mut reader1 = IoReader { + reader: Reader::from_reader(s), + buf: Vec::new() + }; + let mut reader2 = SliceReader { + reader: Reader::from_bytes(s), + }; + + loop { + let event1 = reader1.next().unwrap(); + let event2 = reader2.next().unwrap(); + + if let (Event::Eof, Event::Eof) = (&event1, &event2) { + break; + } + + assert_eq!(format!("{:?}", event1), format!("{:?}", event2)); + } + } + + #[test] + fn borrowing_reader_events() { + let s = r##" + Some text + + + + "##.as_bytes(); + + let mut reader = SliceReader { + reader: Reader::from_bytes(s), + }; + + reader.reader + .trim_text(true) + .expand_empty_elements(true) + .check_end_names(true); + + let mut events = Vec::new(); + + loop { + let event = reader.next().unwrap(); + if let Event::Eof = event { + break; + } + events.push(event); + } + + use crate::events::{BytesStart, BytesText, BytesEnd, Event::*}; + + assert_eq!(events, vec![ + Start(BytesStart::borrowed(br#"item name="hello" source="world.rs""#, 4)), + Text(BytesText::from_escaped(b"Some text".as_ref())), + End(BytesEnd::borrowed(b"item")), + Start(BytesStart::borrowed(b"item2", 5)), + End(BytesEnd::borrowed(b"item2")), + Start(BytesStart::borrowed(b"item3", 5)), + End(BytesEnd::borrowed(b"item3")), + Start(BytesStart::borrowed(br#"item4 value="world" "#, 5)), + End(BytesEnd::borrowed(b"item4")), + ]) + } + + 
#[test] + fn borrowing_read_to_end() { + let s = " "; + let mut reader = SliceReader { + reader: Reader::from_str(s), + }; + + reader.reader + .trim_text(true) + .expand_empty_elements(true) + .check_end_names(true); + + assert_eq!(reader.next().unwrap(), Event::Start(BytesStart::borrowed(b"item ", 4))); + reader.read_to_end(b"item").unwrap(); + assert_eq!(reader.next().unwrap(), Event::Eof); + } + + #[derive(Debug, Deserialize, PartialEq)] + struct BorrowedText<'a> { + #[serde(rename = "$value")] + text: &'a str, + } + + #[test] + fn string_borrow() { + let s = "Hello world"; + + let borrowed_item: BorrowedText = from_str(s).unwrap(); + + assert_eq!(borrowed_item.text, "Hello world"); + } + #[derive(Debug, Deserialize, PartialEq)] struct Item { name: String, @@ -436,7 +633,7 @@ mod tests { "##; - let item: Item = from_str(s).unwrap(); + let item: Item = from_reader(s.as_bytes()).unwrap(); assert_eq!( item, diff --git a/src/de/seq.rs b/src/de/seq.rs index eeb8224c..fc76a989 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -1,10 +1,9 @@ -use crate::de::{DeError, Deserializer}; +use crate::de::{DeError, Deserializer, BorrowingReader}; use crate::{ events::{BytesStart, Event}, reader::Decoder, }; use serde::de; -use std::io::BufRead; #[derive(Debug)] enum Names { @@ -27,15 +26,15 @@ impl Names { } /// A SeqAccess -pub struct SeqAccess<'a, R: BufRead> { - de: &'a mut Deserializer, +pub struct SeqAccess<'de, 'a, R: BorrowingReader<'de>> { + de: &'a mut Deserializer<'de, R>, max_size: Option, names: Names, } -impl<'a, R: BufRead> SeqAccess<'a, R> { +impl<'a, 'de, R: BorrowingReader<'de>> SeqAccess<'de, 'a, R> { /// Get a new SeqAccess - pub fn new(de: &'a mut Deserializer, max_size: Option) -> Result { + pub fn new(de: &'a mut Deserializer<'de, R>, max_size: Option) -> Result { let decoder = de.reader.decoder(); let names = if de.has_value_field { Names::Unknown @@ -58,7 +57,7 @@ impl<'a, R: BufRead> SeqAccess<'a, R> { } } -impl<'de, 'a, R: 'a + BufRead> 
de::SeqAccess<'de> for SeqAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> de::SeqAccess<'de> for SeqAccess<'de, 'a, R> { type Error = DeError; fn size_hint(&self) -> Option { diff --git a/src/de/var.rs b/src/de/var.rs index 781c7d96..7cfd22dc 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -1,35 +1,34 @@ use crate::{ - de::{escape::EscapedDeserializer, Deserializer}, + de::{escape::EscapedDeserializer, Deserializer, BorrowingReader}, errors::serialize::DeError, events::Event, }; use serde::de::{self, Deserializer as SerdeDeserializer}; -use std::io::BufRead; /// An enum access -pub struct EnumAccess<'a, R: BufRead> { - de: &'a mut Deserializer, +pub struct EnumAccess<'de, 'a, R: BorrowingReader<'de>> { + de: &'a mut Deserializer<'de, R>, } -impl<'a, R: BufRead> EnumAccess<'a, R> { - pub fn new(de: &'a mut Deserializer) -> Self { +impl<'de, 'a, R: BorrowingReader<'de>> EnumAccess<'de, 'a, R> { + pub fn new(de: &'a mut Deserializer<'de, R>) -> Self { EnumAccess { de } } } -impl<'de, 'a, R: 'a + BufRead> de::EnumAccess<'de> for EnumAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> de::EnumAccess<'de> for EnumAccess<'de, 'a, R> { type Error = DeError; - type Variant = VariantAccess<'a, R>; + type Variant = VariantAccess<'de, 'a, R>; fn variant_seed>( self, seed: V, - ) -> Result<(V::Value, VariantAccess<'a, R>), DeError> { + ) -> Result<(V::Value, VariantAccess<'de, 'a, R>), DeError> { let decoder = self.de.reader.decoder(); let de = match self.de.peek()? 
{ Some(Event::Text(t)) => EscapedDeserializer::new(t.to_vec(), decoder, true), Some(Event::Start(e)) => EscapedDeserializer::new(e.name().to_vec(), decoder, false), - Some(e) => return Err(DeError::InvalidEnum(e.to_owned())), + Some(e) => return Err(DeError::InvalidEnum(e.clone().into_owned())), None => return Err(DeError::Eof), }; let name = seed.deserialize(de)?; @@ -37,15 +36,15 @@ impl<'de, 'a, R: 'a + BufRead> de::EnumAccess<'de> for EnumAccess<'a, R> { } } -pub struct VariantAccess<'a, R: BufRead> { - de: &'a mut Deserializer, +pub struct VariantAccess<'de, 'a, R: BorrowingReader<'de>> { + de: &'a mut Deserializer<'de, R>, } -impl<'de, 'a, R: BufRead> de::VariantAccess<'de> for VariantAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> de::VariantAccess<'de> for VariantAccess<'de, 'a, R> { type Error = DeError; fn unit_variant(self) -> Result<(), DeError> { - match self.de.next(&mut Vec::new())? { + match self.de.next()? { Event::Start(e) => self.de.read_to_end(e.name()), Event::Text(_) => Ok(()), _ => unreachable!(), diff --git a/src/events/mod.rs b/src/events/mod.rs index 107207b9..3720cbdb 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -27,7 +27,7 @@ use memchr; /// [`local_name`]: #method.local_name /// [`unescaped`]: #method.unescaped /// [`attributes`]: #method.attributes -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct BytesStart<'a> { /// content of the element, before any utf8 conversion buf: Cow<'a, [u8]>, @@ -132,7 +132,7 @@ impl<'a> BytesStart<'a> { /// Returns an iterator over the attributes of this tag. pub fn attributes(&self) -> Attributes { - Attributes::new(self, self.name_len) + Attributes::new(&self.buf, self.name_len) } /// Returns an iterator over the HTML-like attributes of this tag (no mandatory quotes or `=`). @@ -242,7 +242,7 @@ impl<'a> std::fmt::Debug for BytesStart<'a> { /// An XML declaration (`Event::Decl`). 
/// /// [W3C XML 1.1 Prolog and Document Type Declaration](http://w3.org/TR/xml11/#sec-prolog-dtd) -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub struct BytesDecl<'a> { element: BytesStart<'a>, } @@ -364,7 +364,7 @@ impl<'a> BytesDecl<'a> { } /// A struct to manage `Event::End` events -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct BytesEnd<'a> { name: Cow<'a, [u8]>, } @@ -423,7 +423,7 @@ impl<'a> std::fmt::Debug for BytesEnd<'a> { } /// Data from various events (most notably, `Event::Text`). -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct BytesText<'a> { // Invariant: The content is always escaped. content: Cow<'a, [u8]>, @@ -480,6 +480,23 @@ impl<'a> BytesText<'a> { unescape(self).map_err(Error::EscapeError) } + /// gets escaped content + /// + /// Same as `unescaped()`, but will reuse the internal buffer when possible. + pub fn into_unescaped(self) -> Result> { + match self.content { + Cow::Owned(bytes) => { + // TODO: Make unescape accept a Cow and reuse the owned string + unescape(&bytes) + .map(|b| b.into_owned().into()) + .map_err(Error::EscapeError) + }, + Cow::Borrowed(bytes) => { + unescape(bytes).map_err(Error::EscapeError) + } + } + } + /// helper method to unescape then decode self using the reader encoding /// but without BOM (Byte order mark) /// @@ -559,7 +576,7 @@ impl<'a> std::fmt::Debug for BytesText<'a> { /// Event emitted by [`Reader::read_event`]. /// /// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { /// Start tag (with attributes) ``. 
Start(BytesStart<'a>), diff --git a/src/reader.rs b/src/reader.rs index d5d420dc..c1cc7763 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -63,7 +63,7 @@ enum TagState { /// ``` pub struct Reader { /// reader - reader: R, + pub(crate) reader: R, /// current buffer position, useful for debuging errors buf_position: usize, /// current state Open/Close @@ -208,9 +208,9 @@ impl Reader { /// private function to read until '<' is found /// return a `Text` event - fn read_until_open<'i, B>(&mut self, buf: B) -> Result> + fn read_until_open<'i, 'r, B>(&mut self, buf: B) -> Result> where - R: BufferedInput<'i, B> + R: BufferedInput<'i, 'r, B> { self.tag_state = TagState::Opened; @@ -247,9 +247,9 @@ impl Reader { } /// private function to read until '>' is found - fn read_until_close<'i, B>(&mut self, buf: B) -> Result> + fn read_until_close<'i, 'r, B>(&mut self, buf: B) -> Result> where - R: BufferedInput<'i, B> + R: BufferedInput<'i, 'r, B> { self.tag_state = TagState::Closed; @@ -495,9 +495,9 @@ impl Reader { /// Read text into the given buffer, and return an event that borrows from /// either that buffer or from the input itself, based on the type of the /// reader. - fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result> + fn read_event_buffered<'i, 'r, B>(&mut self, buf: B) -> Result> where - R: BufferedInput<'i, B> + R: BufferedInput<'i, 'r, B> { let event = match self.tag_state { TagState::Opened => self.read_until_close(buf), @@ -875,13 +875,45 @@ impl<'a> Reader<&'a [u8]> { Reader::from_reader(s.as_bytes()) } + /// Creates an XML reader from a slice of bytes. + pub fn from_bytes(s: &'a [u8]) -> Reader<&'a [u8]> { + Reader::from_reader(s) + } + /// Read an event that borrows from the input rather than a buffer. 
pub fn read_event_unbuffered(&mut self) -> Result> { self.read_event_buffered(()) } + + /// Reads until end element is found + /// + /// Manages nested cases where parent and child elements have the same name + pub fn read_to_end_unbuffered>(&mut self, end: K) -> Result<()> { + let mut depth = 0; + let end = end.as_ref(); + loop { + match self.read_event_unbuffered() { + Ok(Event::End(ref e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Start(ref e)) if e.name() == end => depth += 1, + Err(e) => return Err(e), + Ok(Event::Eof) => { + return Err(Error::UnexpectedEof(format!("", from_utf8(end)))); + } + _ => (), + } + } + } } -trait BufferedInput<'r, B> { +trait BufferedInput<'r, 'i, B> +where + Self: 'i +{ fn read_bytes_until( &mut self, byte: u8, @@ -898,11 +930,13 @@ trait BufferedInput<'r, B> { fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; fn peek_one(&mut self) -> Result>; + + fn input_borrowed(event: Event<'r>) -> Event<'i>; } /// Implementation of BufferedInput for any BufRead reader using a user-given /// Vec as buffer that will be borrowed by events. -impl<'b, R: BufRead> BufferedInput<'b, &'b mut Vec> for R { +impl<'b, 'i, R: BufRead + 'i> BufferedInput<'b, 'i, &'b mut Vec> for R { /// read until `byte` is found or end of file /// return the position of byte #[inline] @@ -1180,11 +1214,15 @@ impl<'b, R: BufRead> BufferedInput<'b, &'b mut Vec> for R { }; } } + + fn input_borrowed(event: Event<'b>) -> Event<'i> { + event.into_owned() + } } /// Implementation of BufferedInput for any BufRead reader using a user-given /// Vec as buffer that will be borrowed by events. -impl<'a> BufferedInput<'a, ()> for &'a [u8] { +impl<'a> BufferedInput<'a, 'a, ()> for &'a [u8] { fn read_bytes_until( &mut self, byte: u8, @@ -1199,8 +1237,14 @@ impl<'a> BufferedInput<'a, ()> for &'a [u8] { *position += i; let bytes = &self[..i]; - // Skip the end byte too. 
- *self = &self[i + 1..]; + let i = if i < self.len() { + // Skip the matched byte too. + i + 1 + } else { + // Unless we're at the end of the string + i + }; + *self = &self[i..]; return Ok(Some(bytes)); } @@ -1327,6 +1371,10 @@ impl<'a> BufferedInput<'a, ()> for &'a [u8] { fn peek_one(&mut self) -> Result> { Ok(self.first().copied()) } + + fn input_borrowed(event: Event<'a>) -> Event<'a> { + return event; + } } /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) @@ -1527,6 +1575,12 @@ impl Decoder { from_utf8(bytes).map_err(Error::Utf8) } + #[cfg(not(feature = "encoding"))] + pub fn decode_owned<'c>(&self, bytes: Vec) -> Result { + String::from_utf8(bytes) + .map_err(|e| Error::Utf8(e.utf8_error())) + } + #[cfg(feature = "encoding")] pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { self.encoding.decode(bytes).0