diff --git a/src/de/map.rs b/src/de/map.rs index 7ba17027..90387ca5 100644 --- a/src/de/map.rs +++ b/src/de/map.rs @@ -1,12 +1,11 @@ //! Serde `Deserializer` module use crate::{ - de::{escape::EscapedDeserializer, Deserializer, INNER_VALUE, UNFLATTEN_PREFIX}, + de::{escape::EscapedDeserializer, BorrowingReader, Deserializer, INNER_VALUE}, errors::serialize::DeError, - events::{attributes::Attribute, BytesStart, Event}, + events::{BytesStart, Event}, }; use serde::de::{self, DeserializeSeed, IntoDeserializer}; -use std::io::BufRead; enum MapValue { Empty, @@ -16,10 +15,10 @@ enum MapValue { } /// A deserializer for `Attributes` -pub(crate) struct MapAccess<'a, R: BufRead> { +pub(crate) struct MapAccess<'de, 'a, R: BorrowingReader<'de> + 'a> { /// Tag -- owner of attributes - start: BytesStart<'static>, - de: &'a mut Deserializer, + start: BytesStart<'de>, + de: &'a mut Deserializer<'de, R>, /// Position in flat byte slice of all attributes from which next /// attribute should be parsed. This field is required because we /// do not store reference to `Attributes` itself but instead create @@ -29,9 +28,9 @@ pub(crate) struct MapAccess<'a, R: BufRead> { value: MapValue, } -impl<'a, R: BufRead> MapAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> MapAccess<'de, 'a, R> { /// Create a new MapAccess - pub fn new(de: &'a mut Deserializer, start: BytesStart<'static>) -> Result { + pub fn new(de: &'a mut Deserializer<'de, R>, start: BytesStart<'de>) -> Result { let position = start.attributes().position; Ok(MapAccess { de, @@ -41,29 +40,25 @@ impl<'a, R: BufRead> MapAccess<'a, R> { }) } - fn next_attr(&mut self) -> Result, DeError> { + fn next_attr(&mut self) -> Result, Vec)>, DeError> { let mut attributes = self.start.attributes(); attributes.position = self.position; - let next_att = attributes.next(); + let next_att = attributes.next().transpose()?; self.position = attributes.position; - Ok(next_att.transpose()?) + Ok(next_att.map(|a| (a.key.to_owned(), a.value.into_owned()))) } } -impl<'a, 'de, R: BufRead> de::MapAccess<'de> for MapAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de> + 'a> de::MapAccess<'de> for MapAccess<'de, 'a, R> { type Error = DeError; fn next_key_seed>( &mut self, seed: K, ) -> Result, Self::Error> { - let attr_key_val = self - .next_attr()? - .map(|a| (a.key.to_owned(), a.value.into_owned())); let decoder = self.de.reader.decoder(); let has_value_field = self.de.has_value_field; - let has_unflatten_field = self.de.has_unflatten_field; - if let Some((key, value)) = attr_key_val { + if let Some((key, value)) = self.next_attr()? { // try getting map from attributes (key= "value") self.value = MapValue::Attribute { value }; seed.deserialize(EscapedDeserializer::new(key, decoder, false)) @@ -97,8 +92,12 @@ impl<'a, 'de, R: BufRead> de::MapAccess<'de> for MapAccess<'a, R> { } Some(Event::Start(e)) if has_unflatten_field => { self.value = MapValue::InnerValue; - let key = format!("{}{}", UNFLATTEN_PREFIX, String::from_utf8(e.local_name().to_vec()) - .expect("$unflatten= did not contain valid Rust identifier")); + let key = format!( + "{}{}", + UNFLATTEN_PREFIX, + String::from_utf8(e.local_name().to_vec()) + .expect("$unflatten= did not contain valid Rust identifier") + ); seed.deserialize(key.into_deserializer()).map(Some) } Some(Event::Start(e)) => { diff --git a/src/de/mod.rs b/src/de/mod.rs index a4fad679..a9cfe38d 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -115,26 +115,39 @@ mod var; pub use crate::errors::serialize::DeError; use crate::{ events::{BytesStart, BytesText, Event}, + reader::Decoder, Reader, }; -use serde::de::{self, DeserializeOwned}; +use serde::de::{self, Deserialize, DeserializeOwned}; use serde::serde_if_integer128; +use std::borrow::Cow; use std::io::BufRead; pub(crate) const INNER_VALUE: &str = "$value"; pub(crate) const UNFLATTEN_PREFIX: &str = "$unflatten="; /// An xml deserializer -pub struct Deserializer { - reader: Reader, - peek: Option>, +pub struct Deserializer<'de, R: BorrowingReader<'de>> { + reader: R, + peek: Option>, has_value_field: bool, has_unflatten_field: bool, } /// Deserialize an instance of type T from a string of XML text. -pub fn from_str(s: &str) -> Result { - from_reader(s.as_bytes()) +pub fn from_str<'de, T: Deserialize<'de>>(s: &'de str) -> Result { + from_bytes(s.as_bytes()) +} + +/// Deserialize a xml slice of bytes +pub fn from_bytes<'de, T: Deserialize<'de>>(s: &'de [u8]) -> Result { + let mut reader = Reader::from_bytes(s); + reader + .expand_empty_elements(true) + .check_end_names(true) + .trim_text(true); + let mut de = Deserializer::from_borrowing_reader(SliceReader { reader }); + T::deserialize(&mut de) } /// Deserialize an instance of type T from bytes of XML text. @@ -144,13 +157,21 @@ pub fn from_slice(b: &[u8]) -> Result { /// Deserialize from a reader pub fn from_reader(reader: R) -> Result { - let mut de = Deserializer::from_reader(reader); + let mut reader = Reader::from_reader(reader); + reader + .expand_empty_elements(true) + .check_end_names(true) + .trim_text(true); + let mut de = Deserializer::from_borrowing_reader(IoReader { + reader, + buf: Vec::new(), + }); T::deserialize(&mut de) } -impl Deserializer { +impl<'de, R: BorrowingReader<'de>> Deserializer<'de, R> { /// Get a new deserializer - pub fn new(reader: Reader) -> Self { + pub fn new(reader: R) -> Self { Deserializer { reader, peek: None, @@ -160,45 +181,32 @@ impl Deserializer { } /// Get a new deserializer from a regular BufRead - pub fn from_reader(reader: R) -> Self { - let mut reader = Reader::from_reader(reader); - reader - .expand_empty_elements(true) - .check_end_names(true) - .trim_text(true); + pub fn from_borrowing_reader(reader: R) -> Self { Self::new(reader) } - fn peek(&mut self) -> Result>, DeError> { + fn peek(&mut self) -> Result>, DeError> { if self.peek.is_none() { - self.peek = Some(self.next(&mut Vec::new())?); + self.peek = Some(self.next()?); } Ok(self.peek.as_ref()) } - fn next<'a>(&mut self, buf: &'a mut Vec) -> Result, DeError> { + fn next(&mut self) -> Result, DeError> { if let Some(e) = self.peek.take() { return Ok(e); } - loop { - let e = self.reader.read_event(buf)?; - match e { - Event::Start(_) | Event::End(_) | Event::Text(_) | Event::Eof | Event::CData(_) => { - return Ok(e.into_owned()) - } - _ => buf.clear(), - } - } + self.reader.next() } - fn next_start(&mut self, buf: &mut Vec) -> Result>, DeError> { + fn next_start(&mut self) -> Result>, DeError> { loop { - let e = self.next(buf)?; + let e = self.next()?; match e { Event::Start(e) => return Ok(Some(e)), Event::End(_) => return Err(DeError::End), Event::Eof => return Ok(None), - _ => buf.clear(), // ignore texts + _ => (), // ignore texts } } } @@ -210,13 +218,13 @@ impl Deserializer { /// |`text`|`text` |Complete tag consumed | /// |`` |empty slice|Virtual end tag not consumed| /// |`` |empty slice|Not consumed | - fn next_text<'a>(&mut self) -> Result, DeError> { - match self.next(&mut Vec::new())? { + fn next_text(&mut self) -> Result, DeError> { + match self.next()? { Event::Text(e) | Event::CData(e) => Ok(e), Event::Eof => Err(DeError::Eof), Event::Start(e) => { // allow one nested level - let inner = self.next(&mut Vec::new())?; + let inner = self.next()?; let t = match inner { Event::Text(t) | Event::CData(t) => t, Event::Start(_) => return Err(DeError::Start), @@ -239,13 +247,13 @@ impl Deserializer { } fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { - let mut buf = Vec::new(); - match self.next(&mut buf)? { - Event::Start(e) => self.reader.read_to_end(e.name(), &mut Vec::new())?, + // First one might be in self.peek + match self.next()? { + Event::Start(e) => self.reader.read_to_end(e.name())?, Event::End(e) if e.name() == name => return Ok(()), - _ => buf.clear(), + _ => (), } - Ok(self.reader.read_to_end(name, &mut buf)?) + self.reader.read_to_end(name) } } @@ -255,17 +263,17 @@ macro_rules! deserialize_type { let txt = self.next_text()?; #[cfg(not(feature = "encoding"))] - let value = self.reader.decode(&*txt)?.parse()?; + let value = self.reader.decoder().decode(&*txt)?.parse()?; #[cfg(feature = "encoding")] - let value = self.reader.decode(&*txt).parse()?; + let value = self.reader.decoder().decode(&*txt).parse()?; visitor.$visit(value) } }; } -impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { +impl<'de, 'a, R: BorrowingReader<'de>> de::Deserializer<'de> for &'a mut Deserializer<'de, R> { type Error = DeError; fn deserialize_struct>( @@ -274,7 +282,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { fields: &'static [&'static str], visitor: V, ) -> Result { - if let Some(e) = self.next_start(&mut Vec::new())? { + if let Some(e) = self.next_start()? { let name = e.name().to_vec(); self.has_value_field = fields.contains(&INNER_VALUE); self.has_unflatten_field = fields.iter().any(|elem| elem.starts_with(UNFLATTEN_PREFIX)); @@ -311,7 +319,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { #[cfg(feature = "encoding")] { #[cfg(feature = "encoding")] - let value = self.reader.decode(&*txt); + let value = self.reader.decoder().decode(&*txt); match value.as_ref() { "true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => { @@ -333,14 +341,17 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { b"false" | b"0" | b"False" | b"FALSE" | b"f" | b"No" | b"NO" | b"no" | b"n" => { visitor.visit_bool(false) } - e => Err(DeError::InvalidBoolean(self.reader.decode(e)?.into())), + e => Err(DeError::InvalidBoolean( + self.reader.decoder().decode(e)?.into(), + )), } } } fn deserialize_string>(self, visitor: V) -> Result { - let value = self.next_text()?.unescape_and_decode(&self.reader)?; - visitor.visit_string(value) + let text = self.next_text()?; + let string = text.decode_and_escape(self.reader.decoder())?; + visitor.visit_string(string.into_owned()) } fn deserialize_char>(self, visitor: V) -> Result { @@ -348,7 +359,12 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } fn deserialize_str>(self, visitor: V) -> Result { - self.deserialize_string(visitor) + let text = self.next_text()?; + let string = text.decode_and_escape(self.reader.decoder())?; + match string { + Cow::Borrowed(string) => visitor.visit_borrowed_str(string), + Cow::Owned(string) => visitor.visit_string(string), + } } fn deserialize_bytes>(self, visitor: V) -> Result { @@ -364,8 +380,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } fn deserialize_unit>(self, visitor: V) -> Result { - let mut buf = Vec::new(); - match self.next(&mut buf)? { + match self.next()? { Event::Start(s) => { self.read_to_end(s.name())?; visitor.visit_unit() @@ -438,7 +453,7 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } fn deserialize_ignored_any>(self, visitor: V) -> Result { - match self.next(&mut Vec::new())? { + match self.next()? { Event::Start(e) => self.read_to_end(e.name())?, Event::End(_) => return Err(DeError::End), _ => (), @@ -455,11 +470,202 @@ impl<'de, 'a, R: BufRead> de::Deserializer<'de> for &'a mut Deserializer { } } +/// A trait that borrows an XML reader that borrows from the input. For a &[u8] +/// input the events will borrow from that input, whereas with a BufRead input +/// all events will be converted to 'static, allocating whenever necessary. +pub trait BorrowingReader<'i> +where + Self: 'i, +{ + /// Return an input-borrowing event. + fn next(&mut self) -> Result, DeError>; + + /// Skips until end element is found. Unlike `next()` it will not allocate + /// when it cannot satisfy the lifetime. + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError>; + + /// A copy of the reader's decoder used to decode strings. + fn decoder(&self) -> Decoder; +} + +struct IoReader { + reader: Reader, + buf: Vec, +} + +impl<'i, R: BufRead + 'i> BorrowingReader<'i> for IoReader { + fn next(&mut self) -> Result, DeError> { + let event = loop { + let e = self.reader.read_event(&mut self.buf)?; + match e { + Event::Start(_) | Event::End(_) | Event::Text(_) | Event::Eof | Event::CData(_) => { + break Ok(e.into_owned()) + } + _ => self.buf.clear(), + } + }; + + self.buf.clear(); + + event + } + + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { + Ok(self.reader.read_to_end(name, &mut self.buf)?) + } + + fn decoder(&self) -> Decoder { + self.reader.decoder() + } +} + +struct SliceReader<'de> { + reader: Reader<&'de [u8]>, +} + +impl<'de> BorrowingReader<'de> for SliceReader<'de> { + fn next(&mut self) -> Result, DeError> { + loop { + let e = self.reader.read_event_unbuffered()?; + match e { + Event::Start(_) | Event::End(_) | Event::Text(_) | Event::Eof | Event::CData(_) => { + break Ok(e) + } + _ => (), + } + } + } + + fn read_to_end(&mut self, name: &[u8]) -> Result<(), DeError> { + Ok(self.reader.read_to_end_unbuffered(name)?) + } + + fn decoder(&self) -> Decoder { + self.reader.decoder() + } +} + #[cfg(test)] mod tests { use super::*; use serde::Deserialize; + #[test] + fn borrowing_reader_parity() { + let s = r##" + Some text + + + "## + .as_bytes(); + + let mut reader1 = IoReader { + reader: Reader::from_reader(s), + buf: Vec::new(), + }; + let mut reader2 = SliceReader { + reader: Reader::from_bytes(s), + }; + + loop { + let event1 = reader1.next().unwrap(); + let event2 = reader2.next().unwrap(); + + if let (Event::Eof, Event::Eof) = (&event1, &event2) { + break; + } + + assert_eq!(format!("{:?}", event1), format!("{:?}", event2)); + } + } + + #[test] + fn borrowing_reader_events() { + let s = r##" + Some text + + + + "## + .as_bytes(); + + let mut reader = SliceReader { + reader: Reader::from_bytes(s), + }; + + reader + .reader + .trim_text(true) + .expand_empty_elements(true) + .check_end_names(true); + + let mut events = Vec::new(); + + loop { + let event = reader.next().unwrap(); + if let Event::Eof = event { + break; + } + events.push(event); + } + + use crate::events::{BytesEnd, BytesStart, BytesText, Event::*}; + + assert_eq!( + events, + vec![ + Start(BytesStart::borrowed( + br#"item name="hello" source="world.rs""#, + 4 + )), + Text(BytesText::from_escaped(b"Some text".as_ref())), + End(BytesEnd::borrowed(b"item")), + Start(BytesStart::borrowed(b"item2", 5)), + End(BytesEnd::borrowed(b"item2")), + Start(BytesStart::borrowed(b"item3", 5)), + End(BytesEnd::borrowed(b"item3")), + Start(BytesStart::borrowed(br#"item4 value="world" "#, 5)), + End(BytesEnd::borrowed(b"item4")), + ] + ) + } + + #[test] + fn borrowing_read_to_end() { + let s = " "; + let mut reader = SliceReader { + reader: Reader::from_str(s), + }; + + reader + .reader + .trim_text(true) + .expand_empty_elements(true) + .check_end_names(true); + + assert_eq!( + reader.next().unwrap(), + Event::Start(BytesStart::borrowed(b"item ", 4)) + ); + reader.read_to_end(b"item").unwrap(); + assert_eq!(reader.next().unwrap(), Event::Eof); + } + + #[derive(Debug, Deserialize, PartialEq)] + struct BorrowedText<'a> { + #[serde(rename = "$value")] + text: &'a str, + } + + #[test] + fn string_borrow() { + let s = "Hello world"; + + let borrowed_item: BorrowedText = from_str(s).unwrap(); + + assert_eq!(borrowed_item.text, "Hello world"); + } + #[derive(Debug, Deserialize, PartialEq)] struct Item { name: String, @@ -472,7 +678,7 @@ mod tests { "##; - let item: Item = from_str(s).unwrap(); + let item: Item = from_reader(s.as_bytes()).unwrap(); assert_eq!( item, diff --git a/src/de/seq.rs b/src/de/seq.rs index eeb8224c..82848d42 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -1,10 +1,9 @@ -use crate::de::{DeError, Deserializer}; +use crate::de::{BorrowingReader, DeError, Deserializer}; use crate::{ events::{BytesStart, Event}, reader::Decoder, }; use serde::de; -use std::io::BufRead; #[derive(Debug)] enum Names { @@ -27,15 +26,15 @@ impl Names { } /// A SeqAccess -pub struct SeqAccess<'a, R: BufRead> { - de: &'a mut Deserializer, +pub struct SeqAccess<'de, 'a, R: BorrowingReader<'de>> { + de: &'a mut Deserializer<'de, R>, max_size: Option, names: Names, } -impl<'a, R: BufRead> SeqAccess<'a, R> { +impl<'a, 'de, R: BorrowingReader<'de>> SeqAccess<'de, 'a, R> { /// Get a new SeqAccess - pub fn new(de: &'a mut Deserializer, max_size: Option) -> Result { + pub fn new(de: &'a mut Deserializer<'de, R>, max_size: Option) -> Result { let decoder = de.reader.decoder(); let names = if de.has_value_field { Names::Unknown @@ -58,7 +57,7 @@ impl<'a, R: BufRead> SeqAccess<'a, R> { } } -impl<'de, 'a, R: 'a + BufRead> de::SeqAccess<'de> for SeqAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> de::SeqAccess<'de> for SeqAccess<'de, 'a, R> { type Error = DeError; fn size_hint(&self) -> Option { diff --git a/src/de/var.rs b/src/de/var.rs index 781c7d96..024508d9 100644 --- a/src/de/var.rs +++ b/src/de/var.rs @@ -1,35 +1,34 @@ use crate::{ - de::{escape::EscapedDeserializer, Deserializer}, + de::{escape::EscapedDeserializer, BorrowingReader, Deserializer}, errors::serialize::DeError, events::Event, }; use serde::de::{self, Deserializer as SerdeDeserializer}; -use std::io::BufRead; /// An enum access -pub struct EnumAccess<'a, R: BufRead> { - de: &'a mut Deserializer, +pub struct EnumAccess<'de, 'a, R: BorrowingReader<'de>> { + de: &'a mut Deserializer<'de, R>, } -impl<'a, R: BufRead> EnumAccess<'a, R> { - pub fn new(de: &'a mut Deserializer) -> Self { +impl<'de, 'a, R: BorrowingReader<'de>> EnumAccess<'de, 'a, R> { + pub fn new(de: &'a mut Deserializer<'de, R>) -> Self { EnumAccess { de } } } -impl<'de, 'a, R: 'a + BufRead> de::EnumAccess<'de> for EnumAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> de::EnumAccess<'de> for EnumAccess<'de, 'a, R> { type Error = DeError; - type Variant = VariantAccess<'a, R>; + type Variant = VariantAccess<'de, 'a, R>; fn variant_seed>( self, seed: V, - ) -> Result<(V::Value, VariantAccess<'a, R>), DeError> { + ) -> Result<(V::Value, VariantAccess<'de, 'a, R>), DeError> { let decoder = self.de.reader.decoder(); let de = match self.de.peek()? { Some(Event::Text(t)) => EscapedDeserializer::new(t.to_vec(), decoder, true), Some(Event::Start(e)) => EscapedDeserializer::new(e.name().to_vec(), decoder, false), - Some(e) => return Err(DeError::InvalidEnum(e.to_owned())), + Some(e) => return Err(DeError::InvalidEnum(e.clone().into_owned())), None => return Err(DeError::Eof), }; let name = seed.deserialize(de)?; @@ -37,15 +36,15 @@ impl<'de, 'a, R: 'a + BufRead> de::EnumAccess<'de> for EnumAccess<'a, R> { } } -pub struct VariantAccess<'a, R: BufRead> { - de: &'a mut Deserializer, +pub struct VariantAccess<'de, 'a, R: BorrowingReader<'de>> { + de: &'a mut Deserializer<'de, R>, } -impl<'de, 'a, R: BufRead> de::VariantAccess<'de> for VariantAccess<'a, R> { +impl<'de, 'a, R: BorrowingReader<'de>> de::VariantAccess<'de> for VariantAccess<'de, 'a, R> { type Error = DeError; fn unit_variant(self) -> Result<(), DeError> { - match self.de.next(&mut Vec::new())? { + match self.de.next()? { Event::Start(e) => self.de.read_to_end(e.name()), Event::Text(_) => Ok(()), _ => unreachable!(), diff --git a/src/events/mod.rs b/src/events/mod.rs index cf8620ef..d366f122 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -34,7 +34,6 @@ //! [`Writer`]: ../writer/struct.Writer.html //! [`Event`]: enum.Event.html - pub mod attributes; #[cfg(feature = "encoding_rs")] @@ -58,7 +57,7 @@ use memchr; /// [`local_name`]: #method.local_name /// [`unescaped`]: #method.unescaped /// [`attributes`]: #method.attributes -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct BytesStart<'a> { /// content of the element, before any utf8 conversion buf: Cow<'a, [u8]>, @@ -225,7 +224,7 @@ impl<'a> BytesStart<'a> { /// Returns an iterator over the attributes of this tag. pub fn attributes(&self) -> Attributes { - Attributes::new(self, self.name_len) + Attributes::new(&self.buf, self.name_len) } /// Returns an iterator over the HTML-like attributes of this tag (no mandatory quotes or `=`). @@ -377,7 +376,7 @@ impl<'a> std::fmt::Debug for BytesStart<'a> { /// An XML declaration (`Event::Decl`). /// /// [W3C XML 1.1 Prolog and Document Type Declaration](http://w3.org/TR/xml11/#sec-prolog-dtd) -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub struct BytesDecl<'a> { element: BytesStart<'a>, } @@ -499,7 +498,7 @@ impl<'a> BytesDecl<'a> { } /// A struct to manage `Event::End` events -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct BytesEnd<'a> { name: Cow<'a, [u8]>, } @@ -558,7 +557,7 @@ impl<'a> std::fmt::Debug for BytesEnd<'a> { } /// Data from various events (most notably, `Event::Text`). -#[derive(Clone)] +#[derive(Clone, Eq, PartialEq)] pub struct BytesText<'a> { // Invariant: The content is always escaped. content: Cow<'a, [u8]>, @@ -649,6 +648,56 @@ impl<'a> BytesText<'a> { do_unescape(self, custom_entities).map_err(Error::EscapeError) } + #[cfg(feature = "serialize")] + pub(crate) fn decode_and_escape( + &self, + decoder: crate::reader::Decoder, + ) -> Result> { + let decoded: Cow = match &self.content { + Cow::Borrowed(bytes) => { + #[cfg(feature = "encoding")] + { + decoder.decode(bytes) + } + #[cfg(not(feature = "encoding"))] + { + decoder.decode(bytes)?.into() + } + } + Cow::Owned(bytes) => { + #[cfg(feature = "encoding")] + let decoded = decoder.decode(bytes).into_owned(); + + #[cfg(not(feature = "encoding"))] + let decoded = decoder.decode(bytes)?.to_string(); + + decoded.into() + } + }; + + match decoded { + Cow::Borrowed(decoded) => { + let unescaped = + do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?; + match unescaped { + Cow::Borrowed(unescaped) => { + from_utf8(unescaped).map(|s| s.into()).map_err(Error::Utf8) + } + Cow::Owned(unescaped) => String::from_utf8(unescaped) + .map(|s| s.into()) + .map_err(|e| Error::Utf8(e.utf8_error())), + } + } + Cow::Owned(decoded) => { + let unescaped = + do_unescape(decoded.as_bytes(), None).map_err(Error::EscapeError)?; + String::from_utf8(unescaped.into_owned()) + .map(|s| s.into()) + .map_err(|e| Error::Utf8(e.utf8_error())) + } + } + } + /// helper method to unescape then decode self using the reader encoding /// but without BOM (Byte order mark) /// @@ -814,7 +863,7 @@ impl<'a> std::fmt::Debug for BytesText<'a> { /// Event emitted by [`Reader::read_event`]. /// /// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event -#[derive(Clone, Debug)] +#[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { /// Start tag (with attributes) ``. Start(BytesStart<'a>), diff --git a/src/reader.rs b/src/reader.rs index ddb672fb..22d1c173 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -61,9 +61,9 @@ enum TagState { /// } /// ``` #[derive(Clone)] -pub struct Reader { +pub struct Reader { /// reader - reader: B, + pub(crate) reader: R, /// current buffer position, useful for debuging errors buf_position: usize, /// current state Open/Close @@ -95,9 +95,9 @@ pub struct Reader { is_encoding_set: bool, } -impl Reader { +impl Reader { /// Creates a `Reader` that reads from a reader implementing `BufRead`. - pub fn from_reader(reader: B) -> Reader { + pub fn from_reader(reader: R) -> Reader { Reader { reader, opened_buffer: Vec::new(), @@ -129,7 +129,7 @@ impl Reader { /// [`Empty`]: events/enum.Event.html#variant.Empty /// [`Start`]: events/enum.Event.html#variant.Start /// [`End`]: events/enum.Event.html#variant.End - pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader { + pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader { self.expand_empty_elements = val; self } @@ -142,7 +142,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text(&mut self, val: bool) -> &mut Reader { + pub fn trim_text(&mut self, val: bool) -> &mut Reader { self.trim_text_start = val; self.trim_text_end = val; self @@ -155,7 +155,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text_end(&mut self, val: bool) -> &mut Reader { + pub fn trim_text_end(&mut self, val: bool) -> &mut Reader { self.trim_text_end = val; self } @@ -171,7 +171,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader { + pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader { self.trim_markup_names_in_closing_tags = val; self } @@ -189,7 +189,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn check_end_names(&mut self, val: bool) -> &mut Reader { + pub fn check_end_names(&mut self, val: bool) -> &mut Reader { self.check_end_names = val; self } @@ -204,7 +204,7 @@ impl Reader { /// (`false` by default) /// /// [`Comment`]: events/enum.Event.html#variant.Comment - pub fn check_comments(&mut self, val: bool) -> &mut Reader { + pub fn check_comments(&mut self, val: bool) -> &mut Reader { self.check_comments = val; self } @@ -224,74 +224,78 @@ impl Reader { /// private function to read until '<' is found /// return a `Text` event - fn read_until_open<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + fn read_until_open<'i, 'r, B>(&mut self, buf: B) -> Result> + where + R: BufferedInput<'i, 'r, B>, + { self.tag_state = TagState::Opened; - let buf_start = buf.len(); - match read_until(&mut self.reader, b'<', buf, &mut self.buf_position) { - Ok(0) => Ok(Event::Eof), - Ok(_) => { - let (start, len) = ( - buf_start - + if self.trim_text_start { - match buf.iter().skip(buf_start).position(|&b| !is_whitespace(b)) { - Some(start) => start, - None => return self.read_event(buf), - } - } else { - 0 - }, - if self.trim_text_end { - buf.iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| buf.len(), |p| p + 1) - } else { - buf.len() - }, - ); - Ok(Event::Text(BytesText::from_escaped(&buf[start..len]))) + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_buffered(buf); } + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(Some(bytes)) if self.trim_text_end => { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + Ok(Event::Text(BytesText::from_escaped(&bytes[..len]))) + } + Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))), + Ok(None) => Ok(Event::Eof), Err(e) => Err(e), } } /// private function to read until '>' is found - fn read_until_close<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + fn read_until_close<'i, 'r, B>(&mut self, buf: B) -> Result> + where + R: BufferedInput<'i, 'r, B>, + { self.tag_state = TagState::Closed; // need to read 1 character to decide whether pay special attention to attribute values - let buf_start = buf.len(); - let start = loop { - match self.reader.fill_buf() { - Ok(n) if n.is_empty() => return Ok(Event::Eof), - Ok(n) => { - // We intentionally don't `consume()` the byte, otherwise we would have to - // handle things like '<>' here already. - break n[0]; - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => return Err(Error::Io(e)), - } + let start = match self.reader.peek_one() { + Ok(None) => return Ok(Event::Eof), + Ok(Some(byte)) => byte, + Err(e) => return Err(e), }; if start != b'/' && start != b'!' && start != b'?' { - match read_elem_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => Ok(Event::Eof), - Ok(_) => { + match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => { // we already *know* that we are in this case - self.read_start(&buf[buf_start..]) + self.read_start(bytes) } Err(e) => Err(e), } + } else if start == b'!' { + match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_bang(bytes), + Err(e) => Err(e), + } } else { - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => Ok(Event::Eof), - Ok(_) => match start { - b'/' => self.read_end(&buf[buf_start..]), - b'!' => self.read_bang(buf_start, buf), - b'?' => self.read_question_mark(&buf[buf_start..]), + match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => match start { + b'/' => self.read_end(bytes), + b'?' => self.read_question_mark(bytes), _ => unreachable!( - "We checked that `start` must be one of [/!?], was {:?} \ - instead.", + "We checked that `start` must be one of [/?], was {:?} \ + instead.", start ), }, @@ -343,83 +347,33 @@ impl Reader { /// reads `BytesElement` starting with a `!`, /// return `Comment`, `CData` or `DocType` event - /// - /// Note: depending on the start of the Event, we may need to read more - /// data, thus we need a mutable buffer - fn read_bang<'a, 'b>( - &'a mut self, - buf_start: usize, - buf: &'b mut Vec, - ) -> Result> { - if buf[buf_start..].starts_with(b"!--") { - while buf.len() < buf_start + 5 || !buf.ends_with(b"--") { - buf.push(b'>'); - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedEof("Comment".to_string())); - } - Ok(_) => (), - Err(e) => return Err(e), - } - } - let len = buf.len(); + fn read_bang<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { + let uncased_starts_with = |string: &[u8], prefix: &[u8]| { + string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) + }; + + let len = buf.len(); + if buf.starts_with(b"!--") { + // FIXME: actually, isn't, it misses + debug_assert!(len >= 5, "Minimum length guaranteed by read_bang_elem"); if self.check_comments { // search if '--' not in comments - if let Some(p) = memchr::memchr_iter(b'-', &buf[buf_start + 3..len - 2]) - .position(|p| buf[buf_start + 3 + p + 1] == b'-') + if let Some(p) = + memchr::memchr_iter(b'-', &buf[3..len - 2]).position(|p| buf[3 + p + 1] == b'-') { - self.buf_position -= buf.len() - buf_start + p; + self.buf_position += buf.len() - p; return Err(Error::UnexpectedToken("--".to_string())); } } - Ok(Event::Comment(BytesText::from_escaped( - &buf[buf_start + 3..len - 2], - ))) - } else if buf.len() >= buf_start + 8 { - match &buf[buf_start + 1..buf_start + 8] { - b"[CDATA[" => { - while buf.len() < 10 || !buf.ends_with(b"]]") { - buf.push(b'>'); - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedEof("CData".to_string())); - } - Ok(_) => (), - Err(e) => return Err(e), - } - } - Ok(Event::CData(BytesText::from_plain( - &buf[buf_start + 8..buf.len() - 2], - ))) - } - x if x.eq_ignore_ascii_case(b"DOCTYPE") => { - let mut count = buf.iter().skip(buf_start).filter(|&&b| b == b'<').count(); - while count > 0 { - buf.push(b'>'); - match read_until(&mut self.reader, b'>', buf, &mut self.buf_position) { - Ok(0) => { - self.buf_position -= buf.len() - buf_start; - return Err(Error::UnexpectedEof("DOCTYPE".to_string())); - } - Ok(n) => { - let start = buf.len() - n; - count += buf.iter().skip(start).filter(|&&b| b == b'<').count(); - count -= 1; - } - Err(e) => return Err(e), - } - } - Ok(Event::DocType(BytesText::from_escaped( - &buf[buf_start + 8..buf.len()], - ))) - } - _ => Err(Error::UnexpectedBang), - } + Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) + } else if uncased_starts_with(buf, b"![CDATA[") { + debug_assert!(len >= 10, "Minimum length guaranteed by read_bang_elem"); + Ok(Event::CData(BytesText::from_plain(&buf[8..buf.len() - 2]))) + } else if uncased_starts_with(buf, b"!DOCTYPE") { + debug_assert!(len >= 8, "Minimum length guaranteed by read_bang_elem"); + Ok(Event::DocType(BytesText::from_escaped(&buf[8..]))) } else { - self.buf_position -= buf.len() - buf_start; - Err(Error::UnexpectedBang) + unreachable!("Proper bang start guaranteed by read_bang_elem"); } } @@ -541,7 +495,18 @@ impl Reader { /// println!("Found {} start events", count); /// println!("Text events: {:?}", txt); /// ``` + #[inline] pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + self.read_event_buffered(buf) + } + + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_buffered<'i, 'r, B>(&mut self, buf: B) -> Result> + where + R: BufferedInput<'i, 'r, B>, + { let event = match self.tag_state { TagState::Opened => self.read_until_close(buf), TagState::Closed => self.read_until_open(buf), @@ -898,7 +863,7 @@ impl Reader { /// buf.clear(); /// } /// ``` - pub fn into_underlying_reader(self) -> B { + pub fn into_underlying_reader(self) -> R { self.reader } } @@ -917,23 +882,159 @@ impl<'a> Reader<&'a [u8]> { pub fn from_str(s: &'a str) -> Reader<&'a [u8]> { Reader::from_reader(s.as_bytes()) } + + /// Creates an XML reader from a slice of bytes. + pub fn from_bytes(s: &'a [u8]) -> Reader<&'a [u8]> { + Reader::from_reader(s) + } + + /// Read an event that borrows from the input rather than a buffer. + #[inline] + pub fn read_event_unbuffered(&mut self) -> Result> { + self.read_event_buffered(()) + } + + /// Reads until end element is found + /// + /// Manages nested cases where parent and child elements have the same name + pub fn read_to_end_unbuffered>(&mut self, end: K) -> Result<()> { + let mut depth = 0; + let end = end.as_ref(); + loop { + match self.read_event_unbuffered() { + Ok(Event::End(ref e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Start(ref e)) if e.name() == end => depth += 1, + Err(e) => return Err(e), + Ok(Event::Eof) => { + return Err(Error::UnexpectedEof(format!("", from_utf8(end)))); + } + _ => (), + } + } + } } -/// read until `byte` is found or end of file -/// return the position of byte -#[inline] -fn read_until( - r: &mut R, - byte: u8, - buf: &mut Vec, - position: &mut usize, -) -> Result { - let mut read = 0; - let mut done = false; - while !done { - let used = { - let available = match r.fill_buf() { - Ok(n) if n.is_empty() => break, +trait BufferedInput<'r, 'i, B> +where + Self: 'i, +{ + fn read_bytes_until( + &mut self, + byte: u8, + buf: B, + position: &mut usize, + ) -> Result>; + + fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result>; + + fn read_element(&mut self, buf: B, position: &mut usize) -> Result>; + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; + + fn peek_one(&mut self) -> Result>; + + fn input_borrowed(event: Event<'r>) -> Event<'i>; +} + +/// Implementation of BufferedInput for any BufRead reader using a user-given +/// Vec as buffer that will be borrowed by events. +impl<'b, 'i, R: BufRead + 'i> BufferedInput<'b, 'i, &'b mut Vec> for R { + /// read until `byte` is found or end of file + /// return the position of byte + #[inline] + fn read_bytes_until( + &mut self, + byte: u8, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + match memchr::memchr(byte, available) { + Some(i) => { + buf.extend_from_slice(&available[..i]); + done = true; + i + 1 + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self.consume(used); + read += used; + } + *position += read; + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + fn read_bang_element( + &mut self, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + let start = buf.len(); + let mut read = 1; + buf.push(b'!'); + self.consume(1); + + enum BangType { + // + CData, + // + Comment, + // + DocType, + } + + let bang_type = match self.peek_one()? { + Some(b'[') => BangType::CData, + Some(b'-') => BangType::Comment, + Some(b'D') | Some(b'd') => BangType::DocType, + Some(_) => return Err(Error::UnexpectedBang), + None => return Err(Error::UnexpectedEof("Bang".to_string())), + }; + + loop { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => { + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + let bang_str = match bang_type { + BangType::CData => "CData", + BangType::Comment => "Comment", + BangType::DocType => "DOCTYPE", + }; + return Err(Error::UnexpectedEof(bang_str.to_string())); + } Ok(n) => n, Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, Err(e) => { @@ -942,103 +1043,347 @@ fn read_until( } }; - match memchr::memchr(byte, available) { + match memchr::memchr(b'>', available) { Some(i) => { buf.extend_from_slice(&available[..i]); - done = true; - i + 1 + let used = i + 1; + self.consume(used); + read += used; + + let finished = match bang_type { + BangType::Comment => read >= 5 && buf.ends_with(b"--"), + BangType::CData => buf.ends_with(b"]]"), + BangType::DocType => { + memchr::memchr2_iter(b'<', b'>', buf) + .map(|p| if buf[p] == b'<' { 1i32 } else { -1 }) + .sum::() + == 0 + } + }; + + if finished { + break; + } else { + // '>' was omitted in the extend_from_slice above + buf.push(b'>'); + } } None => { buf.extend_from_slice(available); - available.len() + let used = available.len(); + self.consume(used); + read += used; } } - }; - r.consume(used); - read += used; - } - *position += read; - Ok(read) -} + } + *position += read; -/// Derived from `read_until`, but modified to handle XML attributes using a minimal state machine. -/// [W3C Extensible Markup Language (XML) 1.1 (2006)](https://www.w3.org/TR/xml11) -/// -/// Attribute values are defined as follows: -/// ```plain -/// AttValue := '"' (([^<&"]) | Reference)* '"' -/// | "'" (([^<&']) | Reference)* "'" -/// ``` -/// (`Reference` is something like `"`, but we don't care about escaped characters at this -/// level) -#[inline] -fn read_elem_until( - r: &mut R, - end_byte: u8, - buf: &mut Vec, - position: &mut usize, -) -> Result { - #[derive(Clone, Copy)] - enum State { - /// The initial state (inside element, but outside of attribute value) - Elem, - /// Inside a single-quoted attribute value - SingleQ, - /// Inside a double-quoted attribute value - DoubleQ, + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } } - let mut state = State::Elem; - let mut read = 0; - let mut done = false; - while !done { - let used = { - let available = match r.fill_buf() { - Ok(n) if n.is_empty() => return Ok(read), - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); + + /// Derived from `read_until`, but modified to handle XML attributes using a minimal state machine. + /// [W3C Extensible Markup Language (XML) 1.1 (2006)](https://www.w3.org/TR/xml11) + /// + /// Attribute values are defined as follows: + /// ```plain + /// AttValue := '"' (([^<&"]) | Reference)* '"' + /// | "'" (([^<&']) | Reference)* "'" + /// ``` + /// (`Reference` is something like `"`, but we don't care about escaped characters at this + /// level) + #[inline] + fn read_element( + &mut self, + buf: &'b mut Vec, + position: &mut usize, + ) -> Result> { + #[derive(Clone, Copy)] + enum State { + /// The initial state (inside element, but outside of attribute value) + Elem, + /// Inside a single-quoted attribute value + SingleQ, + /// Inside a double-quoted attribute value + DoubleQ, + } + let mut state = State::Elem; + let mut read = 0; + let mut done = false; + let end_byte = b'>'; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => { + if read == 0 { + return Ok(None); + } else { + return Ok(Some(&buf[start..])); + } + } + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + let mut memiter = memchr::memchr3_iter(end_byte, b'\'', b'"', available); + let used: usize; + loop { + match memiter.next() { + Some(i) => { + state = match (state, available[i]) { + (State::Elem, b) if b == end_byte => { + // only allowed to match `end_byte` while we are in state `Elem` + buf.extend_from_slice(&available[..i]); + done = true; + used = i + 1; + break; + } + (State::Elem, b'\'') => State::SingleQ, + (State::Elem, b'\"') => State::DoubleQ, + + // the only end_byte that gets us out if the same character + (State::SingleQ, b'\'') | (State::DoubleQ, b'\"') => State::Elem, + + // all other bytes: no state change + _ => state, + }; + } + None => { + buf.extend_from_slice(available); + used = available.len(); + break; + } + } } + used }; + self.consume(used); + read += used; + } + *position += read; - let mut memiter = memchr::memchr3_iter(end_byte, b'\'', b'"', available); - let used: usize; - loop { - match memiter.next() { - Some(i) => { - state = match (state, available[i]) { - (State::Elem, b) if b == end_byte => { - // only allowed to match `end_byte` while we are in state `Elem` - buf.extend_from_slice(&available[..i]); - done = true; - used = i + 1; - break; - } - (State::Elem, b'\'') => State::SingleQ, - (State::Elem, b'\"') => State::DoubleQ, - - // the only end_byte that gets us out if the same character - (State::SingleQ, b'\'') | (State::DoubleQ, b'\"') => State::Elem, - - // all other bytes: no state change - _ => state, - }; - } - None => { - buf.extend_from_slice(available); - used = available.len(); - break; + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + /// Consume and discard all the whitespace until the next non-whitespace + /// character or EOF. + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + loop { + break match self.fill_buf() { + Ok(n) => { + let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); + if count > 0 { + self.consume(count); + *position += count; + continue; + } else { + Ok(()) } } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + /// Consume and discard one character if it matches the given byte. Return + /// true if it matched. + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + match self.peek_one()? { + Some(b) if b == byte => { + *position += 1; + self.consume(1); + Ok(true) } - used + _ => Ok(false), + } + } + + /// Return one character without consuming it, so that future `read_*` calls + /// will still include it. On EOF, return None. + fn peek_one(&mut self) -> Result> { + loop { + break match self.fill_buf() { + Ok(n) if n.is_empty() => Ok(None), + Ok(n) => Ok(Some(n[0])), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + fn input_borrowed(event: Event<'b>) -> Event<'i> { + event.into_owned() + } +} + +/// Implementation of BufferedInput for any BufRead reader using a user-given +/// Vec as buffer that will be borrowed by events. +impl<'a> BufferedInput<'a, 'a, ()> for &'a [u8] { + fn read_bytes_until( + &mut self, + byte: u8, + _buf: (), + position: &mut usize, + ) -> Result> { + if self.is_empty() { + return Ok(None); + } + + let i = memchr::memchr(byte, self).unwrap_or(self.len()); + + *position += i; + let bytes = &self[..i]; + let i = if i < self.len() { + // Skip the matched byte too. + i + 1 + } else { + // Unless we're at the end of the string + i + }; + *self = &self[i..]; + + return Ok(Some(bytes)); + } + + fn read_bang_element(&mut self, _buf: (), position: &mut usize) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + debug_assert_eq!(self[0], b'!'); + + enum BangType { + // + CData, + // + Comment, + // + DocType, + } + + let bang_type = match &self[1..].first() { + Some(b'[') => BangType::CData, + Some(b'-') => BangType::Comment, + Some(b'D') => BangType::DocType, + Some(_) => return Err(Error::UnexpectedBang), + None => return Err(Error::UnexpectedEof("Bang".to_string())), }; - r.consume(used); - read += used; + + for i in memchr::memchr_iter(b'>', self) { + let finished = match bang_type { + BangType::Comment => i >= 5 && self[..i].ends_with(b"--"), + BangType::CData => self[..i].ends_with(b"]]"), + BangType::DocType => { + // Inefficient, but unlikely to happen often + let open = self[..i].iter().filter(|b| **b == b'<').count(); + let closed = self[..i].iter().filter(|b| **b == b'>').count(); + open == closed + } + }; + + if finished { + *position += i; + let bytes = &self[..i]; + // Skip the '>' too. + *self = &self[i + 1..]; + return Ok(Some(bytes)); + } + } + + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + let bang_str = match bang_type { + BangType::CData => "CData", + BangType::Comment => "Comment", + BangType::DocType => "DOCTYPE", + }; + Err(Error::UnexpectedEof(bang_str.to_string())) + } + + fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { + if self.is_empty() { + return Ok(None); + } + + #[derive(Clone, Copy)] + enum State { + /// The initial state (inside element, but outside of attribute value) + Elem, + /// Inside a single-quoted attribute value + SingleQ, + /// Inside a double-quoted attribute value + DoubleQ, + } + let mut state = State::Elem; + + let end_byte = b'>'; + + for i in memchr::memchr3_iter(end_byte, b'\'', b'"', self) { + state = match (state, self[i]) { + (State::Elem, b) if b == end_byte => { + // only allowed to match `end_byte` while we are in state `Elem` + *position += i; + let bytes = &self[..i]; + // Skip the '>' too. + *self = &self[i + 1..]; + return Ok(Some(bytes)); + } + (State::Elem, b'\'') => State::SingleQ, + (State::Elem, b'\"') => State::DoubleQ, + + // the only end_byte that gets us out if the same character + (State::SingleQ, b'\'') | (State::DoubleQ, b'\"') => State::Elem, + + // all other bytes: no state change + _ => state, + }; + } + + // Note: Do not update position, so the error points to a sane place + // rather than at the EOF. + Err(Error::UnexpectedEof("Element".to_string())) + + // FIXME: Figure out why the other one works without UnexpectedEof + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + let whitespaces = self + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or(self.len()); + *position += whitespaces; + *self = &self[whitespaces..]; + Ok(()) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + if self.first() == Some(&byte) { + *self = &self[1..]; + *position += 1; + Ok(true) + } else { + Ok(false) + } + } + + fn peek_one(&mut self) -> Result> { + Ok(self.first().copied()) + } + + fn input_borrowed(event: Event<'a>) -> Event<'a> { + return event; } - *position += read; - Ok(read) } /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) @@ -1239,6 +1584,11 @@ impl Decoder { from_utf8(bytes).map_err(Error::Utf8) } + #[cfg(not(feature = "encoding"))] + pub fn decode_owned<'c>(&self, bytes: Vec) -> Result { + String::from_utf8(bytes).map_err(|e| Error::Utf8(e.utf8_error())) + } + #[cfg(feature = "encoding")] pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { self.encoding.decode(bytes).0 diff --git a/tests/serde_roundtrip.rs b/tests/serde_roundtrip.rs index cf0dbb1b..3e557f58 100644 --- a/tests/serde_roundtrip.rs +++ b/tests/serde_roundtrip.rs @@ -129,11 +129,13 @@ fn test_parse_unflatten_field() { #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] struct Unflatten { #[serde(rename = "$unflatten=NewKey")] - field: String + field: String, } let source = "Foo"; - let expected = Unflatten { field: "Foo".to_string() }; + let expected = Unflatten { + field: "Foo".to_string(), + }; let parsed: Unflatten = ::quick_xml::de::from_str(source).unwrap(); assert_eq!(&parsed, &expected); diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 34dc2342..3515666a 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -238,6 +238,10 @@ fn test_writer_borrow() -> Result<()> { #[test] fn test_writer_indent() -> Result<()> { let txt = include_str!("../tests/documents/test_writer_indent.xml"); + // Normalize newlines on Windows to just \n, which is what the reader and + // writer use. + let normalized_txt = txt.replace("\r\n", "\n"); + let txt = normalized_txt.as_str(); let mut reader = Reader::from_str(txt); reader.trim_text(true); let mut writer = Writer::new_with_indent(Cursor::new(Vec::new()), b' ', 4); diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 6150375d..3a0c705b 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -7,8 +7,8 @@ use std::str::from_utf8; #[test] fn sample_1_short() { test( - include_bytes!("documents/sample_1.xml"), - include_bytes!("documents/sample_1_short.txt"), + include_str!("documents/sample_1.xml"), + include_str!("documents/sample_1_short.txt"), true, ); } @@ -16,8 +16,8 @@ fn sample_1_short() { #[test] fn sample_1_full() { test( - include_bytes!("documents/sample_1.xml"), - include_bytes!("documents/sample_1_full.txt"), + include_str!("documents/sample_1.xml"), + include_str!("documents/sample_1_full.txt"), false, ); } @@ -25,8 +25,8 @@ fn sample_1_full() { #[test] fn sample_2_short() { test( - include_bytes!("documents/sample_2.xml"), - include_bytes!("documents/sample_2_short.txt"), + include_str!("documents/sample_2.xml"), + include_str!("documents/sample_2_short.txt"), true, ); } @@ -34,8 +34,8 @@ fn sample_2_short() { #[test] fn sample_2_full() { test( - include_bytes!("documents/sample_2.xml"), - include_bytes!("documents/sample_2_full.txt"), + include_str!("documents/sample_2.xml"), + include_str!("documents/sample_2_full.txt"), false, ); } @@ -47,7 +47,7 @@ fn sample_2_full() { // Unexpected event at line 6: // Expected: InvalidUtf8([10, 38, 110, 98, 115, 112, 59, 10]; invalid utf-8 sequence of 1 bytes from index 1) // Found: Characters( -// +// // ) // ``` #[ignore] @@ -74,8 +74,8 @@ fn html5() { // #[test] // fn sample_3_short() { // test( -// include_bytes!("documents/sample_3.xml"), -// include_bytes!("documents/sample_3_short.txt"), +// include_str!("documents/sample_3.xml"), +// include_str!("documents/sample_3_short.txt"), // true // ); // } @@ -83,8 +83,8 @@ fn html5() { // #[test] // fn sample_3_full() { // test( -// include_bytes!("documents/sample_3.xml"), -// include_bytes!("documents/sample_3_full.txt"), +// include_str!("documents/sample_3.xml"), +// include_str!("documents/sample_3_full.txt"), // false // ); // } @@ -92,8 +92,8 @@ fn html5() { // #[test] // fn sample_4_short() { // test( -// include_bytes!("documents/sample_4.xml"), -// include_bytes!("documents/sample_4_short.txt"), +// include_str!("documents/sample_4.xml"), +// include_str!("documents/sample_4_short.txt"), // true // ); // } @@ -101,8 +101,8 @@ fn html5() { // #[test] // fn sample_4_full() { // test( -// include_bytes!("documents/sample_4.xml"), -// include_bytes!("documents/sample_4_full.txt"), +// include_str!("documents/sample_4.xml"), +// include_str!("documents/sample_4_full.txt"), // false // ); // @@ -111,8 +111,8 @@ fn html5() { #[test] fn sample_ns_short() { test( - include_bytes!("documents/sample_ns.xml"), - include_bytes!("documents/sample_ns_short.txt"), + include_str!("documents/sample_ns.xml"), + include_str!("documents/sample_ns_short.txt"), true, ); } @@ -120,8 +120,8 @@ fn sample_ns_short() { #[test] fn eof_1() { test( - br#""#, - br#" + r#""#, + r#" |Error: Unexpected token '--' "#, true, ); test( - br#""#, - br#" + r#""#, + r#" |Error: Unexpected token '--' "#, true, @@ -157,8 +157,8 @@ fn dashes_in_comments() { #[test] fn tabs_1() { test( - b"\t\t", - br#" + "\t\t", + r#" StartElement(a) EmptyElement(b) EndElement(a) @@ -173,8 +173,8 @@ fn issue_83_duplicate_attributes() { // Error when parsing attributes won't stop main event reader // as it is a lazy operation => add ending events test( - br#""#, - b" + r#""#, + " |StartElement(hello) |1:30 EmptyElement(some-tag, attr-error: error while parsing \ attribute at position 16: Duplicate attribute at position 9 and 16) @@ -188,14 +188,13 @@ fn issue_83_duplicate_attributes() { #[test] fn issue_93_large_characters_in_entity_references() { test( - r#"&𤶼;"#.as_bytes(), + r#"&𤶼;"#, r#" |StartElement(hello) |1:10 FailedUnescape([38, 240, 164, 182, 188, 59]; Error while escaping character at range 1..5: Unrecognized escape symbol: Ok("𤶼")) |EndElement(hello) |EndDocument - "# - .as_bytes(), + "#, true, ) } @@ -203,8 +202,8 @@ fn issue_93_large_characters_in_entity_references() { #[test] fn issue_98_cdata_ending_with_right_bracket() { test( - br#""#, - br#" + r#""#, + r#" |StartElement(hello) |Characters() |CData(Foo [Bar]) @@ -219,8 +218,8 @@ fn issue_98_cdata_ending_with_right_bracket() { #[test] fn issue_105_unexpected_double_dash() { test( - br#"-- "#, - br#" + r#"-- "#, + r#" |StartElement(hello) |Characters(-- ) |EndElement(hello) @@ -230,8 +229,8 @@ fn issue_105_unexpected_double_dash() { ); test( - br#"--"#, - br#" + r#"--"#, + r#" |StartElement(hello) |Characters(--) |EndElement(hello) @@ -241,8 +240,8 @@ fn issue_105_unexpected_double_dash() { ); test( - br#"-->"#, - br#" + r#"-->"#, + r#" |StartElement(hello) |Characters(-->) |EndElement(hello) @@ -252,8 +251,8 @@ fn issue_105_unexpected_double_dash() { ); test( - br#""#, - br#" + r#""#, + r#" |StartElement(hello) |Characters() |CData(--) @@ -270,8 +269,8 @@ fn issue_attributes_have_no_default_namespace() { // At the moment, the 'test' method doesn't render namespaces for attribute names. // This test only checks whether the default namespace got applied to the EmptyElement. test( - br#""#, - br#" + r#""#, + r#" |EmptyElement({urn:foo}hello [x="y"]) |EndDocument "#, @@ -283,8 +282,8 @@ fn issue_attributes_have_no_default_namespace() { fn issue_default_namespace_on_outermost_element() { // Regression test test( - br#""#, - br#" + r#""#, + r#" |EmptyElement({urn:foo}hello) |EndDocument "#, @@ -295,10 +294,10 @@ fn issue_default_namespace_on_outermost_element() { #[test] fn default_namespace_applies_to_end_elem() { test( - br#" + r#" "#, - br#" + r#" |StartElement({urn:foo}hello [x="y"]) |EmptyElement({urn:foo}inner) |EndElement({urn:foo}hello) @@ -308,14 +307,20 @@ fn default_namespace_applies_to_end_elem() { ); } -fn test(input: &[u8], output: &[u8], is_short: bool) { - let mut reader = Reader::from_reader(input); +fn test(input: &str, output: &str, is_short: bool) { + // Normalize newlines on Windows to just \n, which is what the reader and + // writer use. + // let input = input.replace("\r\n", "\n"); + // let input = input.as_bytes(); + // let output = output.replace("\r\n", "\n"); + // let output = output.as_bytes(); + let mut reader = Reader::from_reader(input.as_bytes()); reader .trim_text(is_short) .check_comments(true) .expand_empty_elements(false); - let mut spec_lines = SpecIter(output).enumerate(); + let mut spec_lines = SpecIter(output.as_bytes()).enumerate(); let mut buf = Vec::new(); let mut ns_buffer = Vec::new();