From 22bbac1b758b0fd44a31844e86ebe17de2d59b64 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 15 Jun 2022 19:34:20 +0500 Subject: [PATCH 01/15] Use `Self` instead of explicitly named type where applicable --- src/reader.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/reader.rs b/src/reader.rs index 10f97439..6350534e 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -120,8 +120,8 @@ pub struct Reader { impl Reader { /// Creates a `Reader` that reads from a reader implementing `BufRead`. - pub fn from_reader(reader: R) -> Reader { - Reader { + pub fn from_reader(reader: R) -> Self { + Self { reader, opened_buffer: Vec::new(), opened_starts: Vec::new(), @@ -159,7 +159,7 @@ impl Reader { /// [`Empty`]: events/enum.Event.html#variant.Empty /// [`Start`]: events/enum.Event.html#variant.Start /// [`End`]: events/enum.Event.html#variant.End - pub fn expand_empty_elements(&mut self, val: bool) -> &mut Reader { + pub fn expand_empty_elements(&mut self, val: bool) -> &mut Self { self.expand_empty_elements = val; self } @@ -172,7 +172,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text(&mut self, val: bool) -> &mut Reader { + pub fn trim_text(&mut self, val: bool) -> &mut Self { self.trim_text_start = val; self.trim_text_end = val; self @@ -185,7 +185,7 @@ impl Reader { /// (`false` by default) /// /// [`Text`]: events/enum.Event.html#variant.Text - pub fn trim_text_end(&mut self, val: bool) -> &mut Reader { + pub fn trim_text_end(&mut self, val: bool) -> &mut Self { self.trim_text_end = val; self } @@ -201,7 +201,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Reader { + pub fn trim_markup_names_in_closing_tags(&mut self, val: bool) -> &mut Self { self.trim_markup_names_in_closing_tags = val; self } @@ -223,7 +223,7 @@ impl Reader { /// (`true` by default) /// /// [`End`]: events/enum.Event.html#variant.End - pub fn check_end_names(&mut self, val: bool) -> &mut Reader { + pub fn check_end_names(&mut self, val: bool) -> &mut Self { self.check_end_names = val; self } @@ -238,7 +238,7 @@ impl Reader { /// (`false` by default) /// /// [`Comment`]: events/enum.Event.html#variant.Comment - pub fn check_comments(&mut self, val: bool) -> &mut Reader { + pub fn check_comments(&mut self, val: bool) -> &mut Self { self.check_comments = val; self } @@ -916,22 +916,22 @@ impl Reader { impl Reader> { /// Creates an XML reader from a file path. - pub fn from_file>(path: P) -> Result>> { + pub fn from_file>(path: P) -> Result { let file = File::open(path).map_err(Error::Io)?; let reader = BufReader::new(file); - Ok(Reader::from_reader(reader)) + Ok(Self::from_reader(reader)) } } impl<'a> Reader<&'a [u8]> { /// Creates an XML reader from a string slice. - pub fn from_str(s: &'a str) -> Reader<&'a [u8]> { - Reader::from_reader(s.as_bytes()) + pub fn from_str(s: &'a str) -> Self { + Self::from_reader(s.as_bytes()) } /// Creates an XML reader from a slice of bytes. - pub fn from_bytes(s: &'a [u8]) -> Reader<&'a [u8]> { - Reader::from_reader(s) + pub fn from_bytes(s: &'a [u8]) -> Self { + Self::from_reader(s) } /// Read an event that borrows from the input rather than a buffer. From 3febf081b61653d84418d5a1850de2e6381d4dec Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 17 Jun 2022 23:34:50 +0500 Subject: [PATCH 02/15] Replace full path with import + short path --- src/events/mod.rs | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/events/mod.rs b/src/events/mod.rs index 227d55da..11977a04 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -38,7 +38,12 @@ pub mod attributes; #[cfg(feature = "encoding_rs")] use encoding_rs::Encoding; -use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Deref, str::from_utf8}; +use std::borrow::Cow; +use std::collections::HashMap; +use std::fmt::{self, Debug, Formatter}; +use std::io::BufRead; +use std::ops::Deref; +use std::str::from_utf8; use crate::escape::{do_unescape, escape, partial_escape}; use crate::name::{LocalName, QName}; @@ -368,8 +373,8 @@ impl<'a> BytesStart<'a> { } } -impl<'a> std::fmt::Debug for BytesStart<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesStart<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesStart {{ buf: ")?; write_cow_string(f, &self.buf)?; write!(f, ", name_len: {} }}", self.name_len) @@ -656,8 +661,8 @@ impl<'a> BytesEnd<'a> { } } -impl<'a> std::fmt::Debug for BytesEnd<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesEnd<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesEnd {{ name: ")?; write_cow_string(f, &self.name)?; write!(f, " }}") @@ -923,8 +928,8 @@ impl<'a> BytesText<'a> { } } -impl<'a> std::fmt::Debug for BytesText<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesText<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesText {{ content: ")?; write_cow_string(f, &self.content)?; write!(f, " }}") @@ -1036,8 +1041,8 @@ impl<'a> BytesCData<'a> { } } -impl<'a> std::fmt::Debug for BytesCData<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +impl<'a> Debug for BytesCData<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { write!(f, "BytesCData {{ content: ")?; write_cow_string(f, &self.content)?; write!(f, " }}") From 0e60d2dcef39d038067c4e65defcf07695e0f632 Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 17 Jun 2022 23:38:29 +0500 Subject: [PATCH 03/15] Move `Deref` impls to the corresponding structs --- src/events/mod.rs | 79 ++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/src/events/mod.rs b/src/events/mod.rs index 11977a04..6715659b 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -381,6 +381,14 @@ impl<'a> Debug for BytesStart<'a> { } } +impl<'a> Deref for BytesStart<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.buf + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// An XML declaration (`Event::Decl`). @@ -613,6 +621,14 @@ impl<'a> BytesDecl<'a> { } } +impl<'a> Deref for BytesDecl<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.element + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// A struct to manage `Event::End` events @@ -669,6 +685,14 @@ impl<'a> Debug for BytesEnd<'a> { } } +impl<'a> Deref for BytesEnd<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.name + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Data from various events (most notably, `Event::Text`) that stored in XML @@ -936,6 +960,14 @@ impl<'a> Debug for BytesText<'a> { } } +impl<'a> Deref for BytesText<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.content + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// CDATA content contains unescaped data from the reader. If you want to write them as a text, @@ -1049,6 +1081,14 @@ impl<'a> Debug for BytesCData<'a> { } } +impl<'a> Deref for BytesCData<'a> { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + &*self.content + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// Event emitted by [`Reader::read_event`]. @@ -1097,46 +1137,9 @@ impl<'a> Event<'a> { } } -//////////////////////////////////////////////////////////////////////////////////////////////////// - -impl<'a> Deref for BytesStart<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.buf - } -} - -impl<'a> Deref for BytesDecl<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.element - } -} - -impl<'a> Deref for BytesEnd<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.name - } -} - -impl<'a> Deref for BytesText<'a> { - type Target = [u8]; - fn deref(&self) -> &[u8] { - &*self.content - } -} - -impl<'a> Deref for BytesCData<'a> { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - &*self.content - } -} - impl<'a> Deref for Event<'a> { type Target = [u8]; + fn deref(&self) -> &[u8] { match *self { Event::Start(ref e) | Event::Empty(ref e) => &*e, From 7cf5f570617420f590bb25877c08a6018c23023c Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 15 Jun 2022 22:27:32 +0500 Subject: [PATCH 04/15] Split `Reader` implementation block into four sections: builders, getters, read methods and private methods --- src/reader.rs | 908 +++++++++++++++++++++++++------------------------- 1 file changed, 459 insertions(+), 449 deletions(-) diff --git a/src/reader.rs b/src/reader.rs index 6350534e..778c530d 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -118,6 +118,7 @@ pub struct Reader { is_encoding_set: bool, } +/// Builder methods impl Reader { /// Creates a `Reader` that reads from a reader implementing `BufRead`. pub fn from_reader(reader: R) -> Self { @@ -242,6 +243,76 @@ impl Reader { self.check_comments = val; self } +} + +/// Getters +impl Reader { + /// Consumes `Reader` returning the underlying reader + /// + /// Can be used to compute line and column of a parsing error position + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use std::{str, io::Cursor}; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); + /// let mut buf = Vec::new(); + /// + /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { + /// let end_pos = reader.buffer_position(); + /// let mut cursor = reader.into_inner(); + /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) + /// .expect("can't make a string"); + /// let mut line = 1; + /// let mut column = 0; + /// for c in s.chars() { + /// if c == '\n' { + /// line += 1; + /// column = 0; + /// } else { + /// column += 1; + /// } + /// } + /// (line, column) + /// } + /// + /// loop { + /// match reader.read_event(&mut buf) { + /// Ok(Event::Start(ref e)) => match e.name().as_ref() { + /// b"tag1" | b"tag2" => (), + /// tag => { + /// assert_eq!(b"tag3", tag); + /// assert_eq!((3, 22), into_line_and_column(reader)); + /// break; + /// } + /// }, + /// Ok(Event::Eof) => unreachable!(), + /// _ => (), + /// } + /// buf.clear(); + /// } + /// ``` + pub fn into_inner(self) -> R { + self.reader + } + + /// Gets a reference to the underlying reader. + pub fn get_ref(&self) -> &R { + &self.reader + } + + /// Gets a mutable reference to the underlying reader. + pub fn get_mut(&mut self) -> &mut R { + &mut self.reader + } /// Gets the current byte position in the input data. /// @@ -256,221 +327,148 @@ impl Reader { } } - /// private function to read until '<' is found - /// return a `Text` event - fn read_until_open<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Opened; - - if self.trim_text_start { - self.reader.skip_whitespace(&mut self.buf_position)?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if self.reader.skip_one(b'<', &mut self.buf_position)? { - return self.read_event_buffered(buf); - } + /// Resolves a potentially qualified **event name** into (namespace name, local name). + /// + /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined + /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix + /// can be defined on the same element as the attribute in question. + /// + /// *Unqualified* event inherits the current *default namespace*. + /// + /// # Lifetimes + /// + /// - `'n`: lifetime of an element name + /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored + #[inline] + pub fn event_namespace<'n, 'ns>( + &self, + name: QName<'n>, + namespace_buffer: &'ns [u8], + ) -> (ResolveResult<'ns>, LocalName<'n>) { + self.ns_resolver.resolve(name, namespace_buffer, true) + } - match self - .reader - .read_bytes_until(b'<', buf, &mut self.buf_position) - { - Ok(Some(bytes)) if self.trim_text_end => { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - Ok(Event::Text(BytesText::from_escaped(&bytes[..len]))) - } - Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))), - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } + /// Resolves a potentially qualified **attribute name** into (namespace name, local name). + /// + /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined + /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix + /// can be defined on the same element as the attribute in question. + /// + /// *Unqualified* attribute names do *not* inherit the current *default namespace*. + /// + /// # Lifetimes + /// + /// - `'n`: lifetime of an attribute + /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored + #[inline] + pub fn attribute_namespace<'n, 'ns>( + &self, + name: QName<'n>, + namespace_buffer: &'ns [u8], + ) -> (ResolveResult<'ns>, LocalName<'n>) { + self.ns_resolver.resolve(name, namespace_buffer, false) } - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. - fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Closed; + /// Returns the `Reader`s encoding. + /// + /// The used encoding may change after parsing the XML declaration. + /// + /// This encoding will be used by [`decode`]. + /// + /// [`decode`]: #method.decode + #[cfg(feature = "encoding")] + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } - match self.reader.peek_one() { - // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_start(bytes), - Err(e) => Err(e), - }, - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), + /// Get utf8 decoder + #[cfg(feature = "encoding")] + pub fn decoder(&self) -> Decoder { + Decoder { + encoding: self.encoding, } } - /// reads `BytesElement` starting with a `/`, - /// if `self.check_end_names`, checks that element matches last opened element - /// return `End` event - fn read_end<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { - // XML standard permits whitespaces after the markup name in closing tags. - // Let's strip them from the buffer before comparing tag names. - let name = if self.trim_markup_names_in_closing_tags { - if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) { - let (name, _) = buf[1..].split_at(pos_end_name + 1); - name - } else { - &buf[1..] - } - } else { - &buf[1..] - }; - if self.check_end_names { - let mismatch_err = |expected: &[u8], found: &[u8], buf_position: &mut usize| { - *buf_position -= buf.len(); - Err(Error::EndEventMismatch { - expected: from_utf8(expected).unwrap_or("").to_owned(), - found: from_utf8(found).unwrap_or("").to_owned(), - }) - }; - match self.opened_starts.pop() { - Some(start) => { - let expected = &self.opened_buffer[start..]; - if name != expected { - mismatch_err(expected, name, &mut self.buf_position) - } else { - self.opened_buffer.truncate(start); - Ok(Event::End(BytesEnd::borrowed(name))) - } - } - None => mismatch_err(b"", &buf[1..], &mut self.buf_position), - } - } else { - Ok(Event::End(BytesEnd::borrowed(name))) - } + /// Get utf8 decoder + #[cfg(not(feature = "encoding"))] + pub fn decoder(&self) -> Decoder { + Decoder } - /// reads `BytesElement` starting with a `!`, - /// return `Comment`, `CData` or `DocType` event - fn read_bang<'a, 'b>(&'a mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { - let uncased_starts_with = |string: &[u8], prefix: &[u8]| { - string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) - }; + /// Decodes a slice using the encoding specified in the XML declaration. + /// + /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the + /// `U+FFFD REPLACEMENT CHARACTER`. + /// + /// If no encoding is specified, defaults to UTF-8. + #[inline] + #[cfg(feature = "encoding")] + pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { + self.encoding.decode(bytes).0 + } - let len = buf.len(); - match bang_type { - BangType::Comment if buf.starts_with(b"!--") => { - if self.check_comments { - // search if '--' not in comments - if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) - .position(|p| buf[3 + p + 1] == b'-') - { - self.buf_position += len - p; - return Err(Error::UnexpectedToken("--".to_string())); - } - } - Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) - } - BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { - Ok(Event::CData(BytesCData::new(&buf[8..]))) - } - BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { - let start = buf[8..] - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or_else(|| len - 8); - debug_assert!(start < len - 8, "DocType must have a name"); - Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..]))) - } - _ => Err(bang_type.to_err()), - } + /// Decodes a UTF8 slice regardless of XML declaration. + /// + /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the + /// `U+FFFD REPLACEMENT CHARACTER`. + /// + /// # Note + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature + #[inline] + #[cfg(not(feature = "encoding"))] + pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { + from_utf8(bytes).map_err(Error::Utf8) } - /// reads `BytesElement` starting with a `?`, - /// return `Decl` or `PI` event - fn read_question_mark<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { - let len = buf.len(); - if len > 2 && buf[len - 1] == b'?' { - if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) { - let event = BytesDecl::from_start(BytesStart::borrowed(&buf[1..len - 1], 3)); - - // Try getting encoding from the declaration event - #[cfg(feature = "encoding")] - if let Some(enc) = event.encoder() { - self.encoding = enc; - self.is_encoding_set = true; - } - - Ok(Event::Decl(event)) - } else { - Ok(Event::PI(BytesText::from_escaped(&buf[1..len - 1]))) - } - } else { - self.buf_position -= len; - Err(Error::UnexpectedEof("XmlDecl".to_string())) + /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration. + /// + /// Decode `bytes` without BOM and with malformed sequences replaced with the + /// `U+FFFD REPLACEMENT CHARACTER`. + /// + /// If no encoding is specified, defaults to UTF-8. + #[inline] + #[cfg(feature = "encoding")] + pub fn decode_without_bom<'c>(&mut self, mut bytes: &'c [u8]) -> Cow<'c, str> { + if self.is_encoding_set { + return self.encoding.decode_with_bom_removal(bytes).0; } + if bytes.starts_with(b"\xEF\xBB\xBF") { + self.is_encoding_set = true; + bytes = &bytes[3..]; + } else if bytes.starts_with(b"\xFF\xFE") { + self.is_encoding_set = true; + self.encoding = UTF_16LE; + bytes = &bytes[2..]; + } else if bytes.starts_with(b"\xFE\xFF") { + self.is_encoding_set = true; + self.encoding = UTF_16BE; + bytes = &bytes[3..]; + }; + self.encoding.decode_without_bom_handling(bytes).0 } + /// Decodes a UTF8 slice without BOM (Byte order mark) regardless of XML declaration. + /// + /// Decode `bytes` without BOM and with malformed sequences replaced with the + /// `U+FFFD REPLACEMENT CHARACTER`. + /// + /// # Note + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature #[inline] - fn close_expanded_empty(&mut self) -> Result> { - self.tag_state = TagState::Closed; - let name = self - .opened_buffer - .split_off(self.opened_starts.pop().unwrap()); - Ok(Event::End(BytesEnd::owned(name))) - } - - /// reads `BytesElement` starting with any character except `/`, `!` or ``?` - /// return `Start` or `Empty` event - fn read_start<'a, 'b>(&'a mut self, buf: &'b [u8]) -> Result> { - // TODO: do this directly when reading bufreader ... - let len = buf.len(); - let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len); - if let Some(&b'/') = buf.last() { - let end = if name_end < len { name_end } else { len - 1 }; - if self.expand_empty_elements { - self.tag_state = TagState::Empty; - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&buf[..end]); - Ok(Event::Start(BytesStart::borrowed(&buf[..len - 1], end))) - } else { - Ok(Event::Empty(BytesStart::borrowed(&buf[..len - 1], end))) - } + #[cfg(not(feature = "encoding"))] + pub fn decode_without_bom<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { + if bytes.starts_with(b"\xEF\xBB\xBF") { + from_utf8(&bytes[3..]).map_err(Error::Utf8) } else { - if self.check_end_names { - self.opened_starts.push(self.opened_buffer.len()); - self.opened_buffer.extend(&buf[..name_end]); - } - Ok(Event::Start(BytesStart::borrowed(buf, name_end))) + from_utf8(bytes).map_err(Error::Utf8) } } +} +/// Read methods +impl Reader { /// Reads the next `Event`. /// /// This is the main entry point for reading XML `Event`s. @@ -515,72 +513,10 @@ impl Reader { /// println!("Text events: {:?}", txt); /// ``` #[inline] - pub fn read_event<'a, 'b>(&'a mut self, buf: &'b mut Vec) -> Result> { + pub fn read_event<'b>(&mut self, buf: &'b mut Vec) -> Result> { self.read_event_buffered(buf) } - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. - fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - let event = match self.tag_state { - TagState::Opened => self.read_until_close(buf), - TagState::Closed => self.read_until_open(buf), - TagState::Empty => self.close_expanded_empty(), - TagState::Exit => return Ok(Event::Eof), - }; - match event { - Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, - _ => {} - } - event - } - - /// Resolves a potentially qualified **event name** into (namespace name, local name). - /// - /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined - /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix - /// can be defined on the same element as the attribute in question. - /// - /// *Unqualified* event inherits the current *default namespace*. - /// - /// # Lifetimes - /// - /// - `'n`: lifetime of an element name - /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored - #[inline] - pub fn event_namespace<'n, 'ns>( - &self, - name: QName<'n>, - namespace_buffer: &'ns [u8], - ) -> (ResolveResult<'ns>, LocalName<'n>) { - self.ns_resolver.resolve(name, namespace_buffer, true) - } - - /// Resolves a potentially qualified **attribute name** into (namespace name, local name). - /// - /// *Qualified* attribute names have the form `prefix:local-name` where the`prefix` is defined - /// on any containing XML element via `xmlns:prefix="the:namespace:uri"`. The namespace prefix - /// can be defined on the same element as the attribute in question. - /// - /// *Unqualified* attribute names do *not* inherit the current *default namespace*. - /// - /// # Lifetimes - /// - /// - `'n`: lifetime of an attribute - /// - `'ns`: lifetime of a namespaces buffer, where all found namespaces are stored - #[inline] - pub fn attribute_namespace<'n, 'ns>( - &self, - name: QName<'n>, - namespace_buffer: &'ns [u8], - ) -> (ResolveResult<'ns>, LocalName<'n>) { - self.ns_resolver.resolve(name, namespace_buffer, false) - } - /// Reads the next event and resolves its namespace (if applicable). /// /// # Examples @@ -677,143 +613,46 @@ impl Reader { } } - /// Returns the `Reader`s encoding. - /// - /// The used encoding may change after parsing the XML declaration. - /// - /// This encoding will be used by [`decode`]. + /// Reads until end element is found /// - /// [`decode`]: #method.decode - #[cfg(feature = "encoding")] - pub fn encoding(&self) -> &'static Encoding { - self.encoding + /// Manages nested cases where parent and child elements have the same name + pub fn read_to_end>(&mut self, end: K, buf: &mut Vec) -> Result<()> { + let mut depth = 0; + let end = end.as_ref(); + loop { + match self.read_event(buf) { + Ok(Event::End(ref e)) if e.name().as_ref() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Start(ref e)) if e.name().as_ref() == end => depth += 1, + Err(e) => return Err(e), + Ok(Event::Eof) => { + return Err(Error::UnexpectedEof(format!("", from_utf8(end)))); + } + _ => (), + } + buf.clear(); + } } - /// Decodes a slice using the encoding specified in the XML declaration. - /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. + /// Reads optional text between start and end tags. /// - /// If no encoding is specified, defaults to UTF-8. - #[inline] - #[cfg(feature = "encoding")] - pub fn decode<'b, 'c>(&'b self, bytes: &'c [u8]) -> Cow<'c, str> { - self.encoding.decode(bytes).0 - } - - /// Decodes a UTF8 slice without BOM (Byte order mark) regardless of XML declaration. + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. /// - /// Decode `bytes` without BOM and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). /// - /// # Note + /// # Examples /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - #[cfg(not(feature = "encoding"))] - pub fn decode_without_bom<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - if bytes.starts_with(b"\xEF\xBB\xBF") { - from_utf8(&bytes[3..]).map_err(Error::Utf8) - } else { - from_utf8(bytes).map_err(Error::Utf8) - } - } - - /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration. - /// - /// Decode `bytes` without BOM and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// If no encoding is specified, defaults to UTF-8. - #[inline] - #[cfg(feature = "encoding")] - pub fn decode_without_bom<'b, 'c>(&'b mut self, mut bytes: &'c [u8]) -> Cow<'c, str> { - if self.is_encoding_set { - return self.encoding.decode_with_bom_removal(bytes).0; - } - if bytes.starts_with(b"\xEF\xBB\xBF") { - self.is_encoding_set = true; - bytes = &bytes[3..]; - } else if bytes.starts_with(b"\xFF\xFE") { - self.is_encoding_set = true; - self.encoding = UTF_16LE; - bytes = &bytes[2..]; - } else if bytes.starts_with(b"\xFE\xFF") { - self.is_encoding_set = true; - self.encoding = UTF_16BE; - bytes = &bytes[3..]; - }; - self.encoding.decode_without_bom_handling(bytes).0 - } - - /// Decodes a UTF8 slice regardless of XML declaration. - /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// # Note - /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - #[cfg(not(feature = "encoding"))] - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - from_utf8(bytes).map_err(Error::Utf8) - } - - /// Get utf8 decoder - #[cfg(feature = "encoding")] - pub fn decoder(&self) -> Decoder { - Decoder { - encoding: self.encoding, - } - } - - /// Get utf8 decoder - #[cfg(not(feature = "encoding"))] - pub fn decoder(&self) -> Decoder { - Decoder - } - - /// Reads until end element is found - /// - /// Manages nested cases where parent and child elements have the same name - pub fn read_to_end>(&mut self, end: K, buf: &mut Vec) -> Result<()> { - let mut depth = 0; - let end = end.as_ref(); - loop { - match self.read_event(buf) { - Ok(Event::End(ref e)) if e.name().as_ref() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Start(ref e)) if e.name().as_ref() == end => depth += 1, - Err(e) => return Err(e), - Ok(Event::Eof) => { - return Err(Error::UnexpectedEof(format!("", from_utf8(end)))); - } - _ => (), - } - buf.clear(); - } - } - - /// Reads optional text between start and end tags. - /// - /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a - /// `String`. If the next event is an [`End`] event, returns the empty string. In all other - /// cases, returns an error. - /// - /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 - /// if none is specified). - /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; /// /// let mut xml = Reader::from_reader(b" /// <b> @@ -845,72 +684,243 @@ impl Reader { self.read_to_end(end, buf)?; s } +} - /// Consumes `Reader` returning the underlying reader - /// - /// Can be used to compute line and column of a parsing error position - /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use std::{str, io::Cursor}; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); - /// let mut buf = Vec::new(); - /// - /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { - /// let end_pos = reader.buffer_position(); - /// let mut cursor = reader.into_inner(); - /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) - /// .expect("can't make a string"); - /// let mut line = 1; - /// let mut column = 0; - /// for c in s.chars() { - /// if c == '\n' { - /// line += 1; - /// column = 0; - /// } else { - /// column += 1; - /// } - /// } - /// (line, column) - /// } - /// - /// loop { - /// match reader.read_event(&mut buf) { - /// Ok(Event::Start(ref e)) => match e.name().as_ref() { - /// b"tag1" | b"tag2" => (), - /// tag => { - /// assert_eq!(b"tag3", tag); - /// assert_eq!((3, 22), into_line_and_column(reader)); - /// break; - /// } - /// }, - /// Ok(Event::Eof) => unreachable!(), - /// _ => (), - /// } - /// buf.clear(); - /// } - /// ``` - pub fn into_inner(self) -> R { - self.reader +/// Private methods +impl Reader { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_buffered<'i, B>(&mut self, buf: B) -> Result> + where + R: XmlSource<'i, B>, + { + let event = match self.tag_state { + TagState::Closed => self.read_until_open(buf), + TagState::Opened => self.read_until_close(buf), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event } - /// Gets a reference to the underlying reader. - pub fn get_ref(&self) -> &R { - &self.reader + /// private function to read until '<' is found + /// return a `Text` event + fn read_until_open<'i, B>(&mut self, buf: B) -> Result> + where + R: XmlSource<'i, B>, + { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_buffered(buf); + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(Some(bytes)) if self.trim_text_end => { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + Ok(Event::Text(BytesText::from_escaped(&bytes[..len]))) + } + Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))), + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } } - /// Gets a mutable reference to the underlying reader. - pub fn get_mut(&mut self) -> &mut R { - &mut self.reader + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. + fn read_until_close<'i, B>(&mut self, buf: B) -> Result> + where + R: XmlSource<'i, B>, + { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// reads `BytesElement` starting with a `!`, + /// return `Comment`, `CData` or `DocType` event + fn read_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { + let uncased_starts_with = |string: &[u8], prefix: &[u8]| { + string.len() >= prefix.len() && string[..prefix.len()].eq_ignore_ascii_case(prefix) + }; + + let len = buf.len(); + match bang_type { + BangType::Comment if buf.starts_with(b"!--") => { + if self.check_comments { + // search if '--' not in comments + if let Some(p) = memchr::memchr_iter(b'-', &buf[3..len - 2]) + .position(|p| buf[3 + p + 1] == b'-') + { + self.buf_position += len - p; + return Err(Error::UnexpectedToken("--".to_string())); + } + } + Ok(Event::Comment(BytesText::from_escaped(&buf[3..len - 2]))) + } + BangType::CData if uncased_starts_with(buf, b"![CDATA[") => { + Ok(Event::CData(BytesCData::new(&buf[8..]))) + } + BangType::DocType if uncased_starts_with(buf, b"!DOCTYPE") => { + let start = buf[8..] + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or_else(|| len - 8); + debug_assert!(start < len - 8, "DocType must have a name"); + Ok(Event::DocType(BytesText::from_escaped(&buf[8 + start..]))) + } + _ => Err(bang_type.to_err()), + } + } + + /// reads `BytesElement` starting with a `/`, + /// if `self.check_end_names`, checks that element matches last opened element + /// return `End` event + fn read_end<'b>(&mut self, buf: &'b [u8]) -> Result> { + // XML standard permits whitespaces after the markup name in closing tags. + // Let's strip them from the buffer before comparing tag names. + let name = if self.trim_markup_names_in_closing_tags { + if let Some(pos_end_name) = buf[1..].iter().rposition(|&b| !b.is_ascii_whitespace()) { + let (name, _) = buf[1..].split_at(pos_end_name + 1); + name + } else { + &buf[1..] + } + } else { + &buf[1..] + }; + if self.check_end_names { + let mismatch_err = |expected: &[u8], found: &[u8], buf_position: &mut usize| { + *buf_position -= buf.len(); + Err(Error::EndEventMismatch { + expected: from_utf8(expected).unwrap_or("").to_owned(), + found: from_utf8(found).unwrap_or("").to_owned(), + }) + }; + match self.opened_starts.pop() { + Some(start) => { + let expected = &self.opened_buffer[start..]; + if name != expected { + mismatch_err(expected, name, &mut self.buf_position) + } else { + self.opened_buffer.truncate(start); + Ok(Event::End(BytesEnd::borrowed(name))) + } + } + None => mismatch_err(b"", &buf[1..], &mut self.buf_position), + } + } else { + Ok(Event::End(BytesEnd::borrowed(name))) + } + } + + /// reads `BytesElement` starting with a `?`, + /// return `Decl` or `PI` event + fn read_question_mark<'b>(&mut self, buf: &'b [u8]) -> Result> { + let len = buf.len(); + if len > 2 && buf[len - 1] == b'?' { + if len > 5 && &buf[1..4] == b"xml" && is_whitespace(buf[4]) { + let event = BytesDecl::from_start(BytesStart::borrowed(&buf[1..len - 1], 3)); + + // Try getting encoding from the declaration event + #[cfg(feature = "encoding")] + if let Some(enc) = event.encoder() { + self.encoding = enc; + self.is_encoding_set = true; + } + + Ok(Event::Decl(event)) + } else { + Ok(Event::PI(BytesText::from_escaped(&buf[1..len - 1]))) + } + } else { + self.buf_position -= len; + Err(Error::UnexpectedEof("XmlDecl".to_string())) + } + } + + #[inline] + fn close_expanded_empty(&mut self) -> Result> { + self.tag_state = TagState::Closed; + let name = self + .opened_buffer + .split_off(self.opened_starts.pop().unwrap()); + Ok(Event::End(BytesEnd::owned(name))) + } + + /// reads `BytesElement` starting with any character except `/`, `!` or ``?` + /// return `Start` or `Empty` event + fn read_start<'b>(&mut self, buf: &'b [u8]) -> Result> { + // TODO: do this directly when reading bufreader ... + let len = buf.len(); + let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len); + if let Some(&b'/') = buf.last() { + let end = if name_end < len { name_end } else { len - 1 }; + if self.expand_empty_elements { + self.tag_state = TagState::Empty; + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(&buf[..end]); + Ok(Event::Start(BytesStart::borrowed(&buf[..len - 1], end))) + } else { + Ok(Event::Empty(BytesStart::borrowed(&buf[..len - 1], end))) + } + } else { + if self.check_end_names { + self.opened_starts.push(self.opened_buffer.len()); + self.opened_buffer.extend(&buf[..name_end]); + } + Ok(Event::Start(BytesStart::borrowed(buf, name_end))) + } } } From 7f2137dbc0b84f7f09ba159e7a1356239009d37f Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 16 Jun 2022 22:14:56 +0500 Subject: [PATCH 05/15] Eliminate pointless string recoding in tests --- tests/xmlrs_reader_tests.rs | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 12f86594..788eab99 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -420,15 +420,11 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { } } -fn namespace_name(n: ResolveResult, name: &[u8]) -> String { +fn namespace_name(n: ResolveResult, name: Cow) -> String { match n { // Produces string '{namespace}local_name' - ResolveResult::Bound(n) => format!( - "{{{}}}{}", - from_utf8(n.as_ref()).unwrap(), - from_utf8(name).unwrap() - ), - _ => from_utf8(name).unwrap().to_owned(), + ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name), + _ => name.to_string(), } } @@ -465,7 +461,7 @@ fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { match opt_event { Ok((n, Event::Start(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader).as_bytes()); + let name = namespace_name(n, decode(e.name().as_ref(), reader)); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name), Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs), @@ -473,7 +469,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::Empty(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader).as_bytes()); + let name = namespace_name(n, decode(e.name().as_ref(), reader)); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name), Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs), @@ -481,16 +477,13 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::End(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader).as_bytes()); + let name = namespace_name(n, decode(e.name().as_ref(), reader)); format!("EndElement({})", name) } Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()), Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()), Ok((_, Event::Text(ref e))) => match e.unescaped() { - Ok(c) => match from_utf8(decode(&*c, reader).as_bytes()) { - Ok(c) => format!("Characters({})", c), - Err(ref err) => format!("InvalidUtf8({:?}; {})", e.escaped(), err), - }, + Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()), Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err), }, Ok((_, Event::Decl(ref e))) => { From aa802bf48f20874d5678adcc7d169feca7e5f783 Mon Sep 17 00:00:00 2001 From: Mingun Date: Thu, 16 Jun 2022 22:37:06 +0500 Subject: [PATCH 06/15] Move decoding of a name into `namespace_name` to decrease redundancy in the code --- tests/xmlrs_reader_tests.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index 788eab99..e04e6669 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -1,5 +1,5 @@ use quick_xml::events::{BytesStart, Event}; -use quick_xml::name::ResolveResult; +use quick_xml::name::{QName, ResolveResult}; use quick_xml::{Reader, Result}; use std::borrow::Cow; use std::str::from_utf8; @@ -420,9 +420,10 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { } } -fn namespace_name(n: ResolveResult, name: Cow) -> String { +fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String { + let name = decode(name.as_ref(), reader); match n { - // Produces string '{namespace}local_name' + // Produces string '{namespace}prefixed_name' ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name), _ => name.to_string(), } @@ -461,7 +462,7 @@ fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { match opt_event { Ok((n, Event::Start(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader)); + let name = namespace_name(n, e.name(), reader); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name), Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs), @@ -469,7 +470,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::Empty(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader)); + let name = namespace_name(n, e.name(), reader); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name), Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs), @@ -477,7 +478,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::End(ref e))) => { - let name = namespace_name(n, decode(e.name().as_ref(), reader)); + let name = namespace_name(n, e.name(), reader); format!("EndElement({})", name) } Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()), From 493c75ab860c2d483a73bbc62b378dae7cb1a729 Mon Sep 17 00:00:00 2001 From: Mingun Date: Wed, 15 Jun 2022 23:16:20 +0500 Subject: [PATCH 07/15] Remove unused `Decoder::decode_owned` function Because `Decoder` type was private, hardly ever that someone use it --- Changelog.md | 4 ++++ src/reader.rs | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Changelog.md b/Changelog.md index 24690d08..063dbe00 100644 --- a/Changelog.md +++ b/Changelog.md @@ -56,6 +56,9 @@ - [#393]: Now `BytesStart::name()` and `BytesEnd::name()` returns `QName`, and `BytesStart::local_name()` and `BytesEnd::local_name()` returns `LocalName` +- [#191]: Remove unused `reader.decoder().decode_owned()`. If you ever used it, + use `String::from_utf8` instead (which that function did) + ### New Tests - [#9]: Added tests for incorrect nested tags in input @@ -66,6 +69,7 @@ [#8]: https://github.com/Mingun/fast-xml/pull/8 [#9]: https://github.com/Mingun/fast-xml/pull/9 +[#191]: https://github.com/tafia/quick-xml/issues/191 [#363]: https://github.com/tafia/quick-xml/issues/363 [#387]: https://github.com/tafia/quick-xml/pull/387 [#391]: https://github.com/tafia/quick-xml/pull/391 diff --git a/src/reader.rs b/src/reader.rs index 778c530d..d4496ee9 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1491,11 +1491,6 @@ impl Decoder { from_utf8(bytes).map_err(Error::Utf8) } - #[cfg(not(feature = "encoding"))] - pub fn decode_owned<'c>(&self, bytes: Vec) -> Result { - String::from_utf8(bytes).map_err(|e| Error::Utf8(e.utf8_error())) - } - #[cfg(feature = "encoding")] pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { self.encoding.decode(bytes).0 From 286367468a249853175bd6c0df61418d8ce35536 Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 14 Jun 2022 21:30:48 +0500 Subject: [PATCH 08/15] #180: Make `Decoder` struct public The method `Reader::decoder()` is public anyway, but its result type is not, which means that it cannot be used as method argument, and that is not good --- Changelog.md | 4 ++++ src/lib.rs | 2 +- src/reader.rs | 55 ++++++++++++++++++++++++++++----------------------- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/Changelog.md b/Changelog.md index 063dbe00..9074d52e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -17,6 +17,9 @@ - [#393]: New module `name` with `QName`, `LocalName`, `Namespace`, `Prefix` and `PrefixDeclaration` wrappers around byte arrays and `ResolveResult` with the result of namespace resolution +- [#180]: Make `Decoder` struct public. You already had access to it via the + `Reader::decoder()` method, but could not name it in the code. Now the preferred + way to access decoding functionality is via this struct ### Bug Fixes @@ -69,6 +72,7 @@ [#8]: https://github.com/Mingun/fast-xml/pull/8 [#9]: https://github.com/Mingun/fast-xml/pull/9 +[#180]: https://github.com/tafia/quick-xml/issues/180 [#191]: https://github.com/tafia/quick-xml/issues/191 [#363]: https://github.com/tafia/quick-xml/issues/363 [#387]: https://github.com/tafia/quick-xml/pull/387 diff --git a/src/lib.rs b/src/lib.rs index 122520f0..378488bb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -156,5 +156,5 @@ mod writer; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; -pub use crate::reader::Reader; +pub use crate::reader::{Decoder, Reader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader.rs b/src/reader.rs index d4496ee9..0174b227 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -381,20 +381,14 @@ impl Reader { self.encoding } - /// Get utf8 decoder - #[cfg(feature = "encoding")] + /// Get the decoder, used to decode bytes, read by this reader, to the strings. pub fn decoder(&self) -> Decoder { Decoder { + #[cfg(feature = "encoding")] encoding: self.encoding, } } - /// Get utf8 decoder - #[cfg(not(feature = "encoding"))] - pub fn decoder(&self) -> Decoder { - Decoder - } - /// Decodes a slice using the encoding specified in the XML declaration. /// /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the @@ -1473,47 +1467,58 @@ pub(crate) fn is_whitespace(b: u8) -> bool { } } -/// Utf8 Decoder -#[cfg(not(feature = "encoding"))] -#[derive(Clone, Copy, Debug)] -pub struct Decoder; +//////////////////////////////////////////////////////////////////////////////////////////////////// -/// Utf8 Decoder -#[cfg(feature = "encoding")] +/// Decoder of byte slices to the strings. This is lightweight object that can be copied. +/// +/// If feature `encoding` is enabled, this encoding taken from the `"encoding"` +/// XML declaration or assumes UTF-8, if XML has no declaration, encoding +/// key is not defined or contains unknown encoding. +/// +/// The library supports any UTF-8 compatible encodings that crate `encoding_rs` +/// is supported. [*UTF-16 is not supported at the present*][utf16]. +/// +/// If feature `encoding` is disabled, the decoder is always UTF-8 decoder: +/// any XML declarations are ignored. +/// +/// [utf16]: https://github.com/tafia/quick-xml/issues/158 #[derive(Clone, Copy, Debug)] pub struct Decoder { + #[cfg(feature = "encoding")] encoding: &'static Encoding, } +#[cfg(not(feature = "encoding"))] impl Decoder { - #[cfg(not(feature = "encoding"))] + /// Decodes specified bytes using UTF-8 encoding pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { from_utf8(bytes).map_err(Error::Utf8) } +} - #[cfg(feature = "encoding")] +#[cfg(feature = "encoding")] +impl Decoder { + /// Decodes specified bytes using encoding, declared in the XML, if it was + /// declared there, or UTF-8 otherwise pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { self.encoding.decode(bytes).0 } } -/// This implementation is required for tests of other parts of the library -#[cfg(test)] -#[cfg(feature = "serialize")] impl Decoder { - #[cfg(not(feature = "encoding"))] - pub(crate) fn utf8() -> Self { - Decoder - } - - #[cfg(feature = "encoding")] + /// This implementation is required for tests of other parts of the library + #[cfg(test)] + #[cfg(feature = "serialize")] pub(crate) fn utf8() -> Self { Decoder { + #[cfg(feature = "encoding")] encoding: encoding_rs::UTF_8, } } } +//////////////////////////////////////////////////////////////////////////////////////////////////// + #[cfg(test)] mod test { macro_rules! check { From 24fac6997f24b51c585996e9b75dc3dc5bbc0c04 Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 17 Jun 2022 00:44:47 +0500 Subject: [PATCH 09/15] Remove all `*_without_bom` functions from Attributes struct BOM cannot arise in the attribute values - this is by definition a mark that can be located only at the begin of the stream and every attribute value is inside the stream --- Changelog.md | 2 + src/events/attributes.rs | 94 ---------------------------------------- 2 files changed, 2 insertions(+), 94 deletions(-) diff --git a/Changelog.md b/Changelog.md index 9074d52e..de6ca929 100644 --- a/Changelog.md +++ b/Changelog.md @@ -61,6 +61,8 @@ - [#191]: Remove unused `reader.decoder().decode_owned()`. If you ever used it, use `String::from_utf8` instead (which that function did) +- [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless. + Use the same-named methods without that suffix instead. Attribute values cannot contain BOM ### New Tests diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 1d8e2583..feb7ea03 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -131,100 +131,6 @@ impl<'a> Attribute<'a> { do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) } - - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom( - &self, - reader: &Reader, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &mut Reader, - custom_entities: &HashMap, Vec>, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> XmlResult { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - #[cfg(feature = "encoding")] - fn do_unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult { - let decoded = reader.decode_without_bom(&*self.value); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_without_bom( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult { - let decoded = reader.decode_without_bom(&*self.value)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } } impl<'a> Debug for Attribute<'a> { From c8236e47f6a50599826d9e6a1db9f78aa863344b Mon Sep 17 00:00:00 2001 From: Mingun Date: Tue, 14 Jun 2022 19:46:11 +0500 Subject: [PATCH 10/15] Remove `reader.decode()` in flavor to `reader.decoder().decode()` Holding reference to a reader prevents some usages in the future Co-authored-by: Daniel Alley --- Changelog.md | 3 ++ src/events/attributes.rs | 4 +-- src/events/mod.rs | 8 ++--- src/reader.rs | 63 +++++++++++++++---------------------- tests/xmlrs_reader_tests.rs | 4 +-- 5 files changed, 36 insertions(+), 46 deletions(-) diff --git a/Changelog.md b/Changelog.md index de6ca929..cf82063d 100644 --- a/Changelog.md +++ b/Changelog.md @@ -63,6 +63,9 @@ use `String::from_utf8` instead (which that function did) - [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless. Use the same-named methods without that suffix instead. Attribute values cannot contain BOM +- [#191]: Remove `Reader::decode()`, it is replaced by `Decoder::decode()`. + Use `reader.decoder().decode(...)` instead of `reader.decode(...)` for now. + `Reader::encoding()` is replaced by `Decoder::encoding()` as well ### New Tests diff --git a/src/events/attributes.rs b/src/events/attributes.rs index feb7ea03..9625665c 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -114,7 +114,7 @@ impl<'a> Attribute<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> XmlResult { - let decoded = reader.decode(&*self.value); + let decoded = reader.decoder().decode(&*self.value); let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) @@ -126,7 +126,7 @@ impl<'a> Attribute<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> XmlResult { - let decoded = reader.decode(&*self.value)?; + let decoded = reader.decoder().decode(&*self.value)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) diff --git a/src/events/mod.rs b/src/events/mod.rs index 6715659b..ce6508bb 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -261,7 +261,7 @@ impl<'a> BytesStart<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - let decoded = reader.decode(&*self); + let decoded = reader.decoder().decode(&*self); let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) @@ -274,7 +274,7 @@ impl<'a> BytesStart<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - let decoded = reader.decode(&*self)?; + let decoded = reader.decoder().decode(&*self)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) @@ -928,7 +928,7 @@ impl<'a> BytesText<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - let decoded = reader.decode(&*self); + let decoded = reader.decoder().decode(&*self); let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) @@ -940,7 +940,7 @@ impl<'a> BytesText<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - let decoded = reader.decode(&*self)?; + let decoded = reader.decoder().decode(&*self)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) diff --git a/src/reader.rs b/src/reader.rs index 0174b227..24209e95 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -369,19 +369,13 @@ impl Reader { self.ns_resolver.resolve(name, namespace_buffer, false) } - /// Returns the `Reader`s encoding. - /// - /// The used encoding may change after parsing the XML declaration. + /// Get the decoder, used to decode bytes, read by this reader, to the strings. /// - /// This encoding will be used by [`decode`]. + /// If `encoding` feature is enabled, the used encoding may change after + /// parsing the XML declaration, otherwise encoding is fixed to UTF-8. /// - /// [`decode`]: #method.decode - #[cfg(feature = "encoding")] - pub fn encoding(&self) -> &'static Encoding { - self.encoding - } - - /// Get the decoder, used to decode bytes, read by this reader, to the strings. + /// If `encoding` feature is enabled and no encoding is specified in declaration, + /// defaults to UTF-8. pub fn decoder(&self) -> Decoder { Decoder { #[cfg(feature = "encoding")] @@ -389,32 +383,6 @@ impl Reader { } } - /// Decodes a slice using the encoding specified in the XML declaration. - /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// If no encoding is specified, defaults to UTF-8. - #[inline] - #[cfg(feature = "encoding")] - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { - self.encoding.decode(bytes).0 - } - - /// Decodes a UTF8 slice regardless of XML declaration. - /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// # Note - /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - #[cfg(not(feature = "encoding"))] - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - from_utf8(bytes).map_err(Error::Utf8) - } - /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration. /// /// Decode `bytes` without BOM and with malformed sequences replaced with the @@ -1490,7 +1458,14 @@ pub struct Decoder { #[cfg(not(feature = "encoding"))] impl Decoder { - /// Decodes specified bytes using UTF-8 encoding + /// Decodes a UTF8 slice regardless of XML declaration. + /// + /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the + /// `U+FFFD REPLACEMENT CHARACTER`. + /// + /// # Note + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { from_utf8(bytes).map_err(Error::Utf8) } @@ -1498,8 +1473,20 @@ impl Decoder { #[cfg(feature = "encoding")] impl Decoder { + /// Returns the `Reader`s encoding. + /// + /// This encoding will be used by [`decode`]. + /// + /// [`decode`]: Self::decode + pub fn encoding(&self) -> &'static Encoding { + self.encoding + } + /// Decodes specified bytes using encoding, declared in the XML, if it was /// declared there, or UTF-8 otherwise + /// + /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the + /// `U+FFFD REPLACEMENT CHARACTER`. pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { self.encoding.decode(bytes).0 } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index e04e6669..c6eaadc9 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -451,10 +451,10 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result { // FIXME: The public API differs based on the "encoding" feature fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { #[cfg(feature = "encoding")] - let decoded = reader.decode(text); + let decoded = reader.decoder().decode(text); #[cfg(not(feature = "encoding"))] - let decoded = Cow::Borrowed(reader.decode(text).unwrap()); + let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap()); decoded } From 9f1e655460022db0157e6c75bd8844f3e5c0d0c0 Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 17 Jun 2022 00:48:50 +0500 Subject: [PATCH 11/15] #191: Add new event `StartText` which contains text before the first XML element --- Changelog.md | 2 + benches/bench.rs | 7 +-- src/de/mod.rs | 8 +++ src/events/mod.rs | 98 ++++++++++++++++++++++++++++++++++++- src/reader.rs | 91 ++++++++++++++++++++++++++++------ src/writer.rs | 1 + tests/test.rs | 10 +++- tests/unit_tests.rs | 23 ++++++--- tests/xmlrs_reader_tests.rs | 1 + 9 files changed, 212 insertions(+), 29 deletions(-) diff --git a/Changelog.md b/Changelog.md index cf82063d..2deb883c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -20,6 +20,8 @@ - [#180]: Make `Decoder` struct public. You already had access to it via the `Reader::decoder()` method, but could not name it in the code. Now the preferred way to access decoding functionality is via this struct +- [#191]: New event variant `StartText` emitted for bytes before the XML declaration + or a start comment or a tag. For streams with BOM this event will contain a BOM ### Bug Fixes diff --git a/benches/bench.rs b/benches/bench.rs index 389af6f4..29c9f90f 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,6 +1,7 @@ use criterion::{self, criterion_group, criterion_main, Criterion}; use pretty_assertions::assert_eq; use quick_xml::events::Event; +use quick_xml::name::QName; use quick_xml::Reader; static SAMPLE: &[u8] = include_bytes!("../tests/sample_rss.xml"); @@ -173,7 +174,7 @@ fn bytes_text_unescaped(c: &mut Criterion) { /// Benchmarks, how fast individual event parsed fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); - group.bench_function("Text", |b| { + group.bench_function("StartText", |b| { let src = "Hello world!".repeat(512 / 12).into_bytes(); let mut buf = Vec::with_capacity(1024); b.iter(|| { @@ -181,7 +182,7 @@ fn one_event(c: &mut Criterion) { let mut nbtxt = criterion::black_box(0); r.check_end_names(false).check_comments(false); match r.read_event(&mut buf) { - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::StartText(e)) => nbtxt += e.unescaped().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; @@ -310,7 +311,7 @@ fn attributes(c: &mut Criterion) { let mut buf = Vec::new(); loop { match r.read_event(&mut buf) { - Ok(Event::Empty(e)) if e.name() == b"player" => { + Ok(Event::Empty(e)) if e.name() == QName(b"player") => { for name in ["num", "status", "avg"] { if let Some(_attr) = e.try_get_attribute(name).unwrap() { count += 1 diff --git a/src/de/mod.rs b/src/de/mod.rs index 3529f329..c7adec3e 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -952,6 +952,10 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { let event = loop { let e = self.reader.read_event(&mut self.buf)?; match e { + //TODO: Probably not the best idea treat StartText as usual text + // Usually this event will represent a BOM + // Changing this requires review of the serde-de::top_level::one_element test + Event::StartText(e) => break Ok(DeEvent::Text(e.into_owned().into())), Event::Start(e) => break Ok(DeEvent::Start(e.into_owned())), Event::End(e) => break Ok(DeEvent::End(e.into_owned())), Event::Text(e) => break Ok(DeEvent::Text(e.into_owned())), @@ -992,6 +996,10 @@ impl<'de> XmlRead<'de> for SliceReader<'de> { loop { let e = self.reader.read_event_unbuffered()?; match e { + //TODO: Probably not the best idea treat StartText as usual text + // Usually this event will represent a BOM + // Changing this requires review of the serde-de::top_level::one_element test + Event::StartText(e) => break Ok(DeEvent::Text(e.into())), Event::Start(e) => break Ok(DeEvent::Start(e)), Event::End(e) => break Ok(DeEvent::End(e)), Event::Text(e) => break Ok(DeEvent::Text(e)), diff --git a/src/events/mod.rs b/src/events/mod.rs index ce6508bb..70460dbc 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -54,6 +54,46 @@ use attributes::{Attribute, Attributes}; #[cfg(feature = "serialize")] use crate::escape::EscapeError; +/// Text that appeared before an XML declaration, a start element or a comment. +/// +/// In well-formed XML it could contain a Byte-Order-Mark (BOM). If this event +/// contains something else except BOM, the XML should be considered ill-formed. +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct BytesStartText<'a> { + content: BytesText<'a>, +} + +impl<'a> BytesStartText<'a> { + /// Converts the event into an owned event. + pub fn into_owned(self) -> BytesStartText<'static> { + BytesStartText { + content: self.content.into_owned(), + } + } + + /// Extracts the inner `Cow` from the `BytesStartText` event container. + #[inline] + pub fn into_inner(self) -> Cow<'a, [u8]> { + self.content.into_inner() + } +} + +impl<'a> Deref for BytesStartText<'a> { + type Target = BytesText<'a>; + + fn deref(&self) -> &Self::Target { + &self.content + } +} + +impl<'a> From> for BytesStartText<'a> { + fn from(content: BytesText<'a>) -> Self { + Self { content } + } +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// + /// Opening tag data (`Event::Start`), with optional attributes. /// /// ``. @@ -968,6 +1008,12 @@ impl<'a> Deref for BytesText<'a> { } } +impl<'a> From> for BytesText<'a> { + fn from(content: BytesStartText<'a>) -> Self { + content.content + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// CDATA content contains unescaped data from the reader. If you want to write them as a text, @@ -1092,10 +1138,56 @@ impl<'a> Deref for BytesCData<'a> { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Event emitted by [`Reader::read_event`]. -/// -/// [`Reader::read_event`]: ../reader/struct.Reader.html#method.read_event #[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { + /// Text that appeared before the first opening tag or an [XML declaration]. + /// [According to the XML standard][std], no text allowed before the XML + /// declaration. However, if there is a BOM in the stream, some data may be + /// present. + /// + /// When this event is generated, it is the very first event emitted by the + /// [`Reader`], and there can be the only one such event. + /// + /// The [`Writer`] writes content of this event "as is" without encoding or + /// escaping. If you write it, it should be written first and only one time + /// (but writer does not enforce that). + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use std::borrow::Cow; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// // XML in UTF-8 with BOM + /// let xml = b"\xEF\xBB\xBF"; + /// let mut reader = Reader::from_bytes(xml); + /// let mut events_processed = 0; + /// loop { + /// match reader.read_event_unbuffered() { + /// Ok(Event::StartText(e)) => { + /// assert_eq!(events_processed, 0); + /// // Content contains BOM + /// assert_eq!(e.into_inner(), Cow::Borrowed(b"\xEF\xBB\xBF")); + /// } + /// Ok(Event::Decl(_)) => { + /// assert_eq!(events_processed, 1); + /// } + /// Ok(Event::Eof) => { + /// assert_eq!(events_processed, 2); + /// break; + /// } + /// e => panic!("Unexpected event {:?}", e), + /// } + /// events_processed += 1; + /// } + /// ``` + /// + /// [XML declaration]: Event::Decl + /// [std]: https://www.w3.org/TR/xml11/#NT-document + /// [`Writer`]: crate::writer::Writer + StartText(BytesStartText<'a>), /// Start tag (with attributes) ``. Start(BytesStart<'a>), /// End tag ``. @@ -1123,6 +1215,7 @@ impl<'a> Event<'a> { /// buffer used when reading but incurring a new, separate allocation. pub fn into_owned(self) -> Event<'static> { match self { + Event::StartText(e) => Event::StartText(e.into_owned()), Event::Start(e) => Event::Start(e.into_owned()), Event::End(e) => Event::End(e.into_owned()), Event::Empty(e) => Event::Empty(e.into_owned()), @@ -1142,6 +1235,7 @@ impl<'a> Deref for Event<'a> { fn deref(&self) -> &[u8] { match *self { + Event::StartText(ref e) => &*e, Event::Start(ref e) | Event::Empty(ref e) => &*e, Event::End(ref e) => &*e, Event::Text(ref e) => &*e, diff --git a/src/reader.rs b/src/reader.rs index 24209e95..ba8a468f 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -14,12 +14,46 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +/// Possible reader states. The state transition diagram (`true` and `false` shows +/// value of [`Reader::expand_empty_elements()`] option): +/// +/// ```mermaid +/// flowchart LR +/// subgraph _ +/// direction LR +/// +/// Init -- "(no event)"\nStartText --> Opened +/// Opened -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> Closed +/// Closed -- "#lt;false#gt;\n(no event)"\nText --> Opened +/// end +/// Closed -- "#lt;true#gt;"\nStart --> Empty +/// Empty -- End --> Closed +/// _ -. Eof .-> Exit +/// ``` #[derive(Clone)] enum TagState { + /// Initial state in which reader stay after creation. Transition from that + /// state could produce a `StartText`, `Decl`, `Comment` or `Start` event. + /// The next state is always `Opened`. The reader will never return to this + /// state. The event emitted during transition to `Opened` is a `StartEvent` + /// if the first symbol not `<`, otherwise no event are emitted. + Init, + /// State after seeing the `<` symbol. Depending on the next symbol all other + /// events (except `StartText`) could be generated. + /// + /// After generating ane event the reader moves to the `Closed` state. Opened, + /// State in which reader searches the `<` symbol of a markup. All bytes before + /// that symbol will be returned in the [`Event::Text`] event. After that + /// the reader moves to the `Opened` state. Closed, + /// This state is used only if option `expand_empty_elements` is set to `true`. + /// Reader enters to this state when it is in a `Closed` state and emits an + /// [`Event::Start`] event. The next event emitted will be an [`Event::End`], + /// after which reader returned to the `Closed` state. Empty, - /// Either Eof or Errored + /// Reader enters this state when `Eof` event generated or an error occurred. + /// This is the last state, the reader stay in it forever. Exit, } @@ -126,7 +160,7 @@ impl Reader { reader, opened_buffer: Vec::new(), opened_starts: Vec::new(), - tag_state: TagState::Closed, + tag_state: TagState::Init, expand_empty_elements: false, trim_text_start: false, trim_text_end: false, @@ -658,7 +692,8 @@ impl Reader { R: XmlSource<'i, B>, { let event = match self.tag_state { - TagState::Closed => self.read_until_open(buf), + TagState::Init => self.read_until_open(buf, true), + TagState::Closed => self.read_until_open(buf, false), TagState::Opened => self.read_until_close(buf), TagState::Empty => self.close_expanded_empty(), TagState::Exit => return Ok(Event::Eof), @@ -670,9 +705,10 @@ impl Reader { event } - /// private function to read until '<' is found - /// return a `Text` event - fn read_until_open<'i, B>(&mut self, buf: B) -> Result> + /// Read until '<' is found and moves reader to an `Opened` state. + /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> where R: XmlSource<'i, B>, { @@ -691,15 +727,24 @@ impl Reader { .reader .read_bytes_until(b'<', buf, &mut self.buf_position) { - Ok(Some(bytes)) if self.trim_text_end => { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - Ok(Event::Text(BytesText::from_escaped(&bytes[..len]))) + Ok(Some(bytes)) => { + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) } - Ok(Some(bytes)) => Ok(Event::Text(BytesText::from_escaped(bytes))), Ok(None) => Ok(Event::Eof), Err(e) => Err(e), } @@ -2250,6 +2295,16 @@ mod test { use crate::reader::Reader; use pretty_assertions::assert_eq; + #[test] + fn start_text() { + let mut reader = Reader::from_str("bom"); + + assert_eq!( + reader.read_event_buffered($buf).unwrap(), + Event::StartText(BytesText::from_escaped(b"bom".as_ref()).into()) + ); + } + #[test] fn declaration() { let mut reader = Reader::from_str(""); @@ -2313,9 +2368,15 @@ mod test { ); } + /// Text event cannot be generated without preceding event of another type #[test] fn text() { - let mut reader = Reader::from_str("text"); + let mut reader = Reader::from_str("text"); + + assert_eq!( + reader.read_event_buffered($buf).unwrap(), + Event::Empty(BytesStart::borrowed_name(b"tag")) + ); assert_eq!( reader.read_event_buffered($buf).unwrap(), diff --git a/src/writer.rs b/src/writer.rs index 9b4f484a..d6fcf960 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -91,6 +91,7 @@ impl Writer { pub fn write_event<'a, E: AsRef>>(&mut self, event: E) -> Result<()> { let mut next_should_line_break = true; let result = match *event.as_ref() { + Event::StartText(ref e) => self.write(&e), Event::Start(ref e) => { let result = self.write_wrapped(b"<", e, b">"); if let Some(i) = self.indent.as_mut() { diff --git a/tests/test.rs b/tests/test.rs index 37426391..cf547c8e 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -181,7 +181,10 @@ fn fuzz_101() { fn test_no_trim() { let mut reader = Reader::from_str(" text "); - assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); + assert!(matches!( + reader.read_event_unbuffered().unwrap(), + StartText(_) + )); assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_))); @@ -193,7 +196,10 @@ fn test_trim_end() { let mut reader = Reader::from_str(" text "); reader.trim_text_end(true); - assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); + assert!(matches!( + reader.read_event_unbuffered().unwrap(), + StartText(_) + )); assert!(matches!(reader.read_event_unbuffered().unwrap(), Start(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), Text(_))); assert!(matches!(reader.read_event_unbuffered().unwrap(), End(_))); diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 21f53fb4..ca081079 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -29,7 +29,7 @@ macro_rules! next_eq_content { ($r:expr, $t:tt, $bytes:expr) => { let mut buf = Vec::new(); match $r.read_event(&mut buf).unwrap() { - $t(ref e) if &**e == $bytes => (), + $t(ref e) if e.as_ref() == $bytes => (), e => panic!( "expecting {}({:?}), found {:?}", stringify!($t), @@ -42,6 +42,7 @@ macro_rules! next_eq_content { } macro_rules! next_eq { + ($r:expr, StartText, $bytes:expr) => (next_eq_content!($r, StartText, $bytes);); ($r:expr, Start, $bytes:expr) => (next_eq_name!($r, Start, $bytes);); ($r:expr, End, $bytes:expr) => (next_eq_name!($r, End, $bytes);); ($r:expr, Empty, $bytes:expr) => (next_eq_name!($r, Empty, $bytes);); @@ -794,7 +795,9 @@ fn test_unescape_and_decode_without_bom_removes_utf8_bom() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } @@ -813,12 +816,14 @@ fn test_unescape_and_decode_without_bom_removes_utf16be_bom() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } } - assert_eq!(txt[0], ""); + assert_eq!(Some(txt[0].as_ref()), Some("")); } #[test] @@ -832,12 +837,14 @@ fn test_unescape_and_decode_without_bom_removes_utf16le_bom() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } } - assert_eq!(txt[0], ""); + assert_eq!(Some(txt[0].as_ref()), Some("")); } #[test] @@ -853,7 +860,9 @@ fn test_unescape_and_decode_without_bom_does_nothing_if_no_bom_exists() { loop { match reader.read_event(&mut buf) { - Ok(Event::Text(e)) => txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()), + Ok(Event::StartText(e)) => { + txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + } Ok(Event::Eof) => break, _ => (), } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index c6eaadc9..a3f66bcb 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -461,6 +461,7 @@ fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { match opt_event { + Ok((_, Event::StartText(_))) => "StartText".to_string(), Ok((n, Event::Start(ref e))) => { let name = namespace_name(n, e.name(), reader); match make_attrs(e) { From 15f5075774a0057c786897c5c433d435e296e6ef Mon Sep 17 00:00:00 2001 From: Mingun Date: Fri, 17 Jun 2022 21:31:09 +0500 Subject: [PATCH 12/15] Autodetect encoding automatically when start parsing instead of doing that on demand even at inappropriate time --- Changelog.md | 13 ++++- src/events/mod.rs | 113 ++++++----------------------------- src/reader.rs | 139 +++++++++++++++++++++++++++----------------- tests/unit_tests.rs | 134 ++++++++++++++++++++++-------------------- 4 files changed, 188 insertions(+), 211 deletions(-) diff --git a/Changelog.md b/Changelog.md index 2deb883c..f83cd351 100644 --- a/Changelog.md +++ b/Changelog.md @@ -65,9 +65,18 @@ use `String::from_utf8` instead (which that function did) - [#191]: Remove `*_without_bom` methods from the `Attributes` struct because they are useless. Use the same-named methods without that suffix instead. Attribute values cannot contain BOM -- [#191]: Remove `Reader::decode()`, it is replaced by `Decoder::decode()`. - Use `reader.decoder().decode(...)` instead of `reader.decode(...)` for now. +- [#191]: Remove `Reader::decode()` and `Reader::decode_without_bom()`, they are replaced by + `Decoder::decode()` and `Decoder::decode_with_bom_removal()`. + Use `reader.decoder().decode_*(...)` instead of `reader.decode_*(...)` for now. `Reader::encoding()` is replaced by `Decoder::encoding()` as well +- [#191]: Remove poorly designed `BytesText::unescape_and_decode_without_bom()` and + `BytesText::unescape_and_decode_without_bom_with_custom_entities()`. Although these methods worked + as expected, this was only due to good luck. They was replaced by the + `BytesStartText::decode_with_bom_removal()`: + - conceptually, you should decode BOM only for the first `Text` event from the + reader (since now `StartText` event is emitted instead for this) + - text before the first tag is not an XML content at all, so it is meaningless + to try to unescape something in it ### New Tests diff --git a/src/events/mod.rs b/src/events/mod.rs index 70460dbc..45010749 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -45,10 +45,11 @@ use std::io::BufRead; use std::ops::Deref; use std::str::from_utf8; +use crate::errors::{Error, Result}; use crate::escape::{do_unescape, escape, partial_escape}; use crate::name::{LocalName, QName}; +use crate::reader::{Decoder, Reader}; use crate::utils::write_cow_string; -use crate::{errors::Error, errors::Result, reader::Reader}; use attributes::{Attribute, Attributes}; #[cfg(feature = "serialize")] @@ -76,6 +77,22 @@ impl<'a> BytesStartText<'a> { pub fn into_inner(self) -> Cow<'a, [u8]> { self.content.into_inner() } + + /// Decodes bytes of event, stripping byte order mark (BOM) if it is presented + /// in the event. + /// + /// This method does not unescapes content, because no escape sequences can + /// appeared in the BOM or in the text before the first tag. + pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result { + //TODO: Fix lifetime issue - it should be possible to borrow string + #[cfg(feature = "encoding")] + let decoded = decoder.decode_with_bom_removal(&*self); + + #[cfg(not(feature = "encoding"))] + let decoded = decoder.decode_with_bom_removal(&*self)?; + + Ok(decoded.to_string()) + } } impl<'a> Deref for BytesStartText<'a> { @@ -840,100 +857,6 @@ impl<'a> BytesText<'a> { do_unescape(self, custom_entities).map_err(Error::EscapeError) } - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom( - &self, - reader: &Reader, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, None) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(feature = "encoding")] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &mut Reader, - custom_entities: &HashMap, Vec>, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - /// helper method to unescape then decode self using the reader encoding with custom entities - /// but without BOM (Byte order mark) - /// - /// for performance reasons (could avoid allocating a `String`), - /// it might be wiser to manually use - /// 1. BytesText::unescaped() - /// 2. Reader::decode(...) - /// - /// # Pre-condition - /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(not(feature = "encoding"))] - pub fn unescape_and_decode_without_bom_with_custom_entities( - &self, - reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> Result { - self.do_unescape_and_decode_without_bom(reader, Some(custom_entities)) - } - - #[cfg(feature = "encoding")] - fn do_unescape_and_decode_without_bom( - &self, - reader: &mut Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { - let decoded = reader.decode_without_bom(&*self); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_without_bom( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { - let decoded = reader.decode_without_bom(&*self)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - /// helper method to unescape then decode self using the reader encoding /// /// for performance reasons (could avoid allocating a `String`), diff --git a/src/reader.rs b/src/reader.rs index ba8a468f..2d2e99ab 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -6,7 +6,7 @@ use std::io::{self, BufRead, BufReader}; use std::{fs::File, path::Path, str::from_utf8}; #[cfg(feature = "encoding")] -use encoding_rs::{Encoding, UTF_16BE, UTF_16LE}; +use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; use crate::errors::{Error, Result}; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; @@ -416,51 +416,6 @@ impl Reader { encoding: self.encoding, } } - - /// Decodes a slice using without BOM (Byte order mark) the encoding specified in the XML declaration. - /// - /// Decode `bytes` without BOM and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// If no encoding is specified, defaults to UTF-8. - #[inline] - #[cfg(feature = "encoding")] - pub fn decode_without_bom<'c>(&mut self, mut bytes: &'c [u8]) -> Cow<'c, str> { - if self.is_encoding_set { - return self.encoding.decode_with_bom_removal(bytes).0; - } - if bytes.starts_with(b"\xEF\xBB\xBF") { - self.is_encoding_set = true; - bytes = &bytes[3..]; - } else if bytes.starts_with(b"\xFF\xFE") { - self.is_encoding_set = true; - self.encoding = UTF_16LE; - bytes = &bytes[2..]; - } else if bytes.starts_with(b"\xFE\xFF") { - self.is_encoding_set = true; - self.encoding = UTF_16BE; - bytes = &bytes[3..]; - }; - self.encoding.decode_without_bom_handling(bytes).0 - } - - /// Decodes a UTF8 slice without BOM (Byte order mark) regardless of XML declaration. - /// - /// Decode `bytes` without BOM and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// # Note - /// - /// If you instead want to use XML declared encoding, use the `encoding` feature - #[inline] - #[cfg(not(feature = "encoding"))] - pub fn decode_without_bom<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - if bytes.starts_with(b"\xEF\xBB\xBF") { - from_utf8(&bytes[3..]).map_err(Error::Utf8) - } else { - from_utf8(bytes).map_err(Error::Utf8) - } - } } /// Read methods @@ -728,6 +683,14 @@ impl Reader { .read_bytes_until(b'<', buf, &mut self.buf_position) { Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = encoding; + self.is_encoding_set = true; + } + } + let content = if self.trim_text_end { // Skip the ending '< let len = bytes @@ -1503,17 +1466,29 @@ pub struct Decoder { #[cfg(not(feature = "encoding"))] impl Decoder { - /// Decodes a UTF8 slice regardless of XML declaration. + /// Decodes a UTF8 slice regardless of XML declaration and ignoring BOM if + /// it is present in the `bytes`. /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - /// - /// # Note + /// Returns an error in case of malformed sequences in the `bytes`. /// /// If you instead want to use XML declared encoding, use the `encoding` feature pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { from_utf8(bytes).map_err(Error::Utf8) } + + /// Decodes a slice regardless of XML declaration with BOM removal if + /// it is present in the `bytes`. + /// + /// Returns an error in case of malformed sequences in the `bytes`. + /// + /// If you instead want to use XML declared encoding, use the `encoding` feature + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> { + if bytes.starts_with(b"\xEF\xBB\xBF") { + from_utf8(&bytes[3..]).map_err(Error::Utf8) + } else { + from_utf8(bytes).map_err(Error::Utf8) + } + } } #[cfg(feature = "encoding")] @@ -1532,9 +1507,22 @@ impl Decoder { /// /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the /// `U+FFFD REPLACEMENT CHARACTER`. - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Cow<'c, str> { + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> { self.encoding.decode(bytes).0 } + + /// Decodes a slice with BOM removal if it is present in the `bytes` using + /// the reader encoding. + /// + /// If this method called after reading XML declaration with the `"encoding"` + /// key, then this encoding is used, otherwise UTF-8 is used. + /// + /// If XML declaration is absent in the XML, UTF-8 is used. + /// + /// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> { + self.encoding.decode_with_bom_removal(bytes).0 + } } impl Decoder { @@ -1549,6 +1537,53 @@ impl Decoder { } } +/// Automatic encoding detection of XML files based using the [recommended algorithm] +/// (https://www.w3.org/TR/xml11/#sec-guessing) +/// +/// The algorithm suggests examine up to the first 4 bytes to determine encoding +/// according to the following table: +/// +/// | Bytes |Detected encoding +/// |-------------|------------------------------------------ +/// |`00 00 FE FF`|UCS-4, big-endian machine (1234 order) +/// |`FF FE 00 00`|UCS-4, little-endian machine (4321 order) +/// |`00 00 FF FE`|UCS-4, unusual octet order (2143) +/// |`FE FF 00 00`|UCS-4, unusual octet order (3412) +/// |`FE FF ## ##`|UTF-16, big-endian +/// |`FF FE ## ##`|UTF-16, little-endian +/// |`EF BB BF` |UTF-8 +/// |-------------|------------------------------------------ +/// |`00 00 00 3C`|UCS-4 or similar (use declared encoding to find the exact one), in big-endian (1234) +/// |`3C 00 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in little-endian (4321) +/// |`00 00 3C 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (2143) +/// |`00 3C 00 00`|UCS-4 or similar (use declared encoding to find the exact one), in unusual byte orders (3412) +/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one) +/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one) +/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably +/// |`4C 6F A7 94`|EBCDIC (in some flavor; the full encoding declaration must be read to tell which code page is in use) +/// |_Other_ |UTF-8 without an encoding declaration, or else the data stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, or enclosed in a wrapper of some kind +/// +/// Because [`encoding_rs`] crate supported only subset of those encodings, only +/// supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE. +/// +/// If encoding is detected, `Some` is returned, otherwise `None` is returned. +#[cfg(feature = "encoding")] +fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { + match bytes { + // with BOM + _ if bytes.starts_with(&[0xFE, 0xFF]) => Some(UTF_16BE), + _ if bytes.starts_with(&[0xFF, 0xFE]) => Some(UTF_16LE), + _ if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) => Some(UTF_8), + + // without BOM + _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(UTF_16BE), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(UTF_16LE), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2 + _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => Some(UTF_8), // Some ASCII compatible + + _ => None, + } +} + //////////////////////////////////////////////////////////////////////////////////////////////////// #[cfg(test)] diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index ca081079..39af252d 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -782,90 +782,100 @@ fn test_closing_bracket_in_single_quote_mixed() { next_eq!(r, End, b"a"); } -#[test] -#[cfg(not(feature = "encoding"))] -fn test_unescape_and_decode_without_bom_removes_utf8_bom() { - let input: &str = std::str::from_utf8(b"\xEF\xBB\xBF").unwrap(); +mod decode_with_bom_removal { + use super::*; + use pretty_assertions::assert_eq; - let mut reader = Reader::from_str(&input); - reader.trim_text(true); + #[test] + #[cfg(not(feature = "encoding"))] + fn removes_utf8_bom() { + let input: &str = std::str::from_utf8(b"\xEF\xBB\xBF").unwrap(); - let mut txt = Vec::new(); - let mut buf = Vec::new(); + let mut reader = Reader::from_str(&input); + reader.trim_text(true); - loop { - match reader.read_event(&mut buf) { - Ok(Event::StartText(e)) => { - txt.push(e.unescape_and_decode_without_bom(&reader).unwrap()) + let mut txt = Vec::new(); + let mut buf = Vec::new(); + + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), } - Ok(Event::Eof) => break, - _ => (), } + assert_eq!(txt, vec![""]); } - assert_eq!(txt, vec![""]); -} -#[test] -#[cfg(feature = "encoding")] -fn test_unescape_and_decode_without_bom_removes_utf16be_bom() { - let mut reader = Reader::from_file("./tests/documents/utf16be.xml").unwrap(); - reader.trim_text(true); - - let mut txt = Vec::new(); - let mut buf = Vec::new(); + /// Test is disabled: the input started with `[FE FF 00 3C 00 3F ...]` and currently + /// quick-xml captures `[FE FF 00]` as a `StartText` event, because it is stopped + /// at byte `<` (0x3C). That sequence represents UTF-16 BOM (=BE) and a first byte + /// of the `<` symbol, encoded in UTF-16 BE (`00 3C`). + #[test] + #[cfg(feature = "encoding")] + #[ignore = "Non-ASCII compatible encodings not properly supported yet. See https://github.com/tafia/quick-xml/issues/158"] + fn removes_utf16be_bom() { + let mut reader = Reader::from_file("./tests/documents/utf16be.xml").unwrap(); + reader.trim_text(true); + + let mut txt = Vec::new(); + let mut buf = Vec::new(); - loop { - match reader.read_event(&mut buf) { - Ok(Event::StartText(e)) => { - txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), } - Ok(Event::Eof) => break, - _ => (), } + assert_eq!(Some(txt[0].as_ref()), Some("")); } - assert_eq!(Some(txt[0].as_ref()), Some("")); -} -#[test] -#[cfg(feature = "encoding")] -fn test_unescape_and_decode_without_bom_removes_utf16le_bom() { - let mut reader = Reader::from_file("./tests/documents/utf16le.xml").unwrap(); - reader.trim_text(true); + #[test] + #[cfg(feature = "encoding")] + fn removes_utf16le_bom() { + let mut reader = Reader::from_file("./tests/documents/utf16le.xml").unwrap(); + reader.trim_text(true); - let mut txt = Vec::new(); - let mut buf = Vec::new(); + let mut txt = Vec::new(); + let mut buf = Vec::new(); - loop { - match reader.read_event(&mut buf) { - Ok(Event::StartText(e)) => { - txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), } - Ok(Event::Eof) => break, - _ => (), } + assert_eq!(Some(txt[0].as_ref()), Some("")); } - assert_eq!(Some(txt[0].as_ref()), Some("")); -} -#[test] -#[cfg(not(feature = "encoding"))] -fn test_unescape_and_decode_without_bom_does_nothing_if_no_bom_exists() { - let input: &str = std::str::from_utf8(b"").unwrap(); + #[test] + #[cfg(not(feature = "encoding"))] + fn does_nothing_if_no_bom_exists() { + let input: &str = std::str::from_utf8(b"").unwrap(); - let mut reader = Reader::from_str(&input); - reader.trim_text(true); + let mut reader = Reader::from_str(&input); + reader.trim_text(true); - let mut txt = Vec::new(); - let mut buf = Vec::new(); + let mut txt = Vec::new(); + let mut buf = Vec::new(); - loop { - match reader.read_event(&mut buf) { - Ok(Event::StartText(e)) => { - txt.push(e.unescape_and_decode_without_bom(&mut reader).unwrap()) + loop { + match reader.read_event(&mut buf) { + Ok(Event::StartText(e)) => { + txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()) + } + Ok(Event::Eof) => break, + _ => (), } - Ok(Event::Eof) => break, - _ => (), } + assert!(txt.is_empty()); } - assert!(txt.is_empty()); } From 8d5aaaab866c02c609a1e1f4cd4acad4613de700 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Jun 2022 02:04:04 +0500 Subject: [PATCH 13/15] Fix #262: Decrease code redundancy by merging encoding-related methods --- src/events/attributes.rs | 14 +++----------- src/events/mod.rs | 29 ++++++----------------------- 2 files changed, 9 insertions(+), 34 deletions(-) diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 9625665c..d7facd6f 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -108,25 +108,17 @@ impl<'a> Attribute<'a> { } /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - #[cfg(feature = "encoding")] fn do_unescape_and_decode_value( &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> XmlResult { + #[cfg(feature = "encoding")] let decoded = reader.decoder().decode(&*self.value); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_value( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult { + #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self.value)?; + let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) diff --git a/src/events/mod.rs b/src/events/mod.rs index 45010749..93da82ff 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -311,27 +311,18 @@ impl<'a> BytesStart<'a> { self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities)) } - #[cfg(feature = "encoding")] #[inline] fn do_unescape_and_decode_with_custom_entities( &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { + #[cfg(feature = "encoding")] let decoded = reader.decoder().decode(&*self); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - #[cfg(not(feature = "encoding"))] - #[inline] - fn do_unescape_and_decode_with_custom_entities( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { + #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; + let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) @@ -885,25 +876,17 @@ impl<'a> BytesText<'a> { self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities)) } - #[cfg(feature = "encoding")] fn do_unescape_and_decode_with_custom_entities( &self, reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { + #[cfg(feature = "encoding")] let decoded = reader.decoder().decode(&*self); - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) - } - #[cfg(not(feature = "encoding"))] - fn do_unescape_and_decode_with_custom_entities( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result { + #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; + let unescaped = do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) From be9ec0c9f4c4dd5e81329184de8efce856f07487 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Jun 2022 02:05:33 +0500 Subject: [PATCH 14/15] Use `?` operator instead of `map_err` --- src/de/mod.rs | 2 +- src/errors.rs | 38 ++++++++++++++++++++++++++++++++------ src/events/attributes.rs | 5 ++--- src/events/mod.rs | 12 +++++------- src/reader.rs | 11 ++++++----- src/se/mod.rs | 3 +-- 6 files changed, 47 insertions(+), 24 deletions(-) diff --git a/src/de/mod.rs b/src/de/mod.rs index c7adec3e..53c5ffd0 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -624,7 +624,7 @@ where allow_start: bool, ) -> Result, DeError> { match self.next()? { - DeEvent::Text(e) if unescape => e.unescape().map_err(|e| DeError::InvalidXml(e.into())), + DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into), DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())), DeEvent::CData(e) => Ok(e), DeEvent::Start(e) if allow_start => { diff --git a/src/errors.rs b/src/errors.rs index 79eb7899..01c960c0 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -4,6 +4,7 @@ use crate::escape::EscapeError; use crate::events::attributes::AttrError; use crate::utils::write_byte_string; use std::str::Utf8Error; +use std::string::FromUtf8Error; /// The error type used by this crate. #[derive(Debug)] @@ -53,6 +54,14 @@ impl From for Error { } } +impl From for Error { + /// Creates a new `Error::Utf8` from the given error + #[inline] + fn from(error: FromUtf8Error) -> Error { + error.utf8_error().into() + } +} + impl From for Error { /// Creates a new `Error::EscapeError` from the given error #[inline] @@ -227,6 +236,7 @@ pub mod serialize { } impl From for DeError { + #[inline] fn from(e: Error) -> Self { Self::InvalidXml(e) } @@ -239,15 +249,17 @@ pub mod serialize { } } - impl From for DeError { - fn from(e: ParseIntError) -> Self { - Self::InvalidInt(e) + impl From for DeError { + #[inline] + fn from(e: Utf8Error) -> Self { + Self::InvalidXml(e.into()) } } - impl From for DeError { - fn from(e: ParseFloatError) -> Self { - Self::InvalidFloat(e) + impl From for DeError { + #[inline] + fn from(e: FromUtf8Error) -> Self { + Self::InvalidXml(e.into()) } } @@ -257,4 +269,18 @@ pub mod serialize { Self::InvalidXml(e.into()) } } + + impl From for DeError { + #[inline] + fn from(e: ParseIntError) -> Self { + Self::InvalidInt(e) + } + } + + impl From for DeError { + #[inline] + fn from(e: ParseFloatError) -> Self { + Self::InvalidFloat(e) + } + } } diff --git a/src/events/attributes.rs b/src/events/attributes.rs index d7facd6f..1e059d5e 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -119,9 +119,8 @@ impl<'a> Attribute<'a> { #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self.value)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) + let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + Ok(String::from_utf8(unescaped.into_owned())?) } } diff --git a/src/events/mod.rs b/src/events/mod.rs index 93da82ff..e4865c3a 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -323,9 +323,8 @@ impl<'a> BytesStart<'a> { #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) + let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + Ok(String::from_utf8(unescaped.into_owned())?) } /// Edit the name of the BytesStart in-place @@ -512,7 +511,7 @@ impl<'a> BytesDecl<'a> { Some(Ok(a)) if a.key.as_ref() == b"version" => Ok(a.value), // first attribute was not "version" Some(Ok(a)) => { - let found = from_utf8(a.key.as_ref()).map_err(Error::Utf8)?.to_string(); + let found = from_utf8(a.key.as_ref())?.to_string(); Err(Error::XmlDeclWithoutVersion(Some(found))) } // error parsing attributes @@ -887,9 +886,8 @@ impl<'a> BytesText<'a> { #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; - let unescaped = - do_unescape(decoded.as_bytes(), custom_entities).map_err(Error::EscapeError)?; - String::from_utf8(unescaped.into_owned()).map_err(|e| Error::Utf8(e.utf8_error())) + let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + Ok(String::from_utf8(unescaped.into_owned())?) } /// Gets escaped content. diff --git a/src/reader.rs b/src/reader.rs index 2d2e99ab..752b07b0 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1473,7 +1473,7 @@ impl Decoder { /// /// If you instead want to use XML declared encoding, use the `encoding` feature pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - from_utf8(bytes).map_err(Error::Utf8) + Ok(from_utf8(bytes)?) } /// Decodes a slice regardless of XML declaration with BOM removal if @@ -1483,11 +1483,12 @@ impl Decoder { /// /// If you instead want to use XML declared encoding, use the `encoding` feature pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> { - if bytes.starts_with(b"\xEF\xBB\xBF") { - from_utf8(&bytes[3..]).map_err(Error::Utf8) + let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { + &bytes[3..] } else { - from_utf8(bytes).map_err(Error::Utf8) - } + bytes + }; + self.decode(bytes) } } diff --git a/src/se/mod.rs b/src/se/mod.rs index 92a97c40..bb737802 100644 --- a/src/se/mod.rs +++ b/src/se/mod.rs @@ -23,8 +23,7 @@ pub fn to_writer(writer: W, value: &S) -> Result<(), DeE pub fn to_string(value: &S) -> Result { let mut writer = Vec::new(); to_writer(&mut writer, value)?; - let s = String::from_utf8(writer).map_err(|e| crate::errors::Error::Utf8(e.utf8_error()))?; - Ok(s) + Ok(String::from_utf8(writer)?) } /// A Serializer From d49a4b8289c52afa81b6a8560b4ab08b88c6e5dd Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 19 Jun 2022 02:31:25 +0500 Subject: [PATCH 15/15] Fix #180: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is disabled Co-authored-by: Daniel Alley --- Changelog.md | 10 ++++++++ src/de/escape.rs | 7 ------ src/de/mod.rs | 2 +- src/de/seq.rs | 8 +------ src/errors.rs | 14 ++++++----- src/events/attributes.rs | 4 ---- src/events/mod.rs | 27 +-------------------- src/reader.rs | 47 +++++++++++++++++++++++++++---------- tests/xmlrs_reader_tests.rs | 30 +++++++---------------- 9 files changed, 65 insertions(+), 84 deletions(-) diff --git a/Changelog.md b/Changelog.md index f83cd351..64bb395f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -78,6 +78,16 @@ - text before the first tag is not an XML content at all, so it is meaningless to try to unescape something in it +- [#180]: Eliminated the differences in the decoding API when feature `encoding` enabled and when it is + disabled. Signatures of functions are now the same regardless of whether or not the feature is + enabled, and an error will be returned instead of performing replacements for invalid characters + in both cases. + + Previously, if the `encoding` feature was enabled, decoding functions would return `Result>` + while without this feature they would return `Result<&str>`. With this change, only `Result>` + is returned regardless of the status of the feature. +- [#180]: Error variant `Error::Utf8` replaced by `Error::NonDecodable` + ### New Tests - [#9]: Added tests for incorrect nested tags in input diff --git a/src/de/escape.rs b/src/de/escape.rs index 28f99364..e47109e0 100644 --- a/src/de/escape.rs +++ b/src/de/escape.rs @@ -45,12 +45,8 @@ macro_rules! deserialize_num { where V: Visitor<'de>, { - #[cfg(not(feature = "encoding"))] let value = self.decoder.decode(self.escaped_value.as_ref())?.parse()?; - #[cfg(feature = "encoding")] - let value = self.decoder.decode(self.escaped_value.as_ref()).parse()?; - visitor.$visit(value) } }; @@ -71,11 +67,8 @@ impl<'de, 'a> serde::Deserializer<'de> for EscapedDeserializer<'a> { V: Visitor<'de>, { let unescaped = self.unescaped()?; - #[cfg(not(feature = "encoding"))] let value = self.decoder.decode(&unescaped)?; - #[cfg(feature = "encoding")] - let value = self.decoder.decode(&unescaped); visitor.visit_str(&value) } diff --git a/src/de/mod.rs b/src/de/mod.rs index 53c5ffd0..e252ada7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -337,7 +337,7 @@ where { #[cfg(feature = "encoding")] { - let value = decoder.decode(value); + let value = decoder.decode(value)?; // No need to unescape because valid boolean representations cannot be escaped match value.as_ref() { "true" | "1" | "True" | "TRUE" | "t" | "Yes" | "YES" | "yes" | "y" => { diff --git a/src/de/seq.rs b/src/de/seq.rs index 958f172d..fe4559bd 100644 --- a/src/de/seq.rs +++ b/src/de/seq.rs @@ -2,8 +2,6 @@ use crate::de::{DeError, DeEvent, Deserializer, XmlRead}; use crate::events::BytesStart; use crate::reader::Decoder; use serde::de::{DeserializeSeed, SeqAccess}; -#[cfg(not(feature = "encoding"))] -use std::borrow::Cow; /// Check if tag `start` is included in the `fields` list. `decoder` is used to /// get a string representation of a tag. @@ -14,11 +12,7 @@ pub fn not_in( start: &BytesStart, decoder: Decoder, ) -> Result { - #[cfg(not(feature = "encoding"))] - let tag = Cow::Borrowed(decoder.decode(start.name().into_inner())?); - - #[cfg(feature = "encoding")] - let tag = decoder.decode(start.name().into_inner()); + let tag = decoder.decode(start.name().into_inner())?; Ok(fields.iter().all(|&field| field != tag.as_ref())) } diff --git a/src/errors.rs b/src/errors.rs index 01c960c0..c1af08af 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -11,8 +11,9 @@ use std::string::FromUtf8Error; pub enum Error { /// IO error Io(::std::io::Error), - /// Utf8 error - Utf8(Utf8Error), + /// Input decoding error. If `encoding` feature is disabled, contains `None`, + /// otherwise contains the UTF-8 decoding error + NonDecodable(Option), /// Unexpected End of File UnexpectedEof(String), /// End event mismatch @@ -47,10 +48,10 @@ impl From<::std::io::Error> for Error { } impl From for Error { - /// Creates a new `Error::Utf8` from the given error + /// Creates a new `Error::NonDecodable` from the given error #[inline] fn from(error: Utf8Error) -> Error { - Error::Utf8(error) + Error::NonDecodable(Some(error)) } } @@ -86,7 +87,8 @@ impl std::fmt::Display for Error { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { Error::Io(e) => write!(f, "I/O error: {}", e), - Error::Utf8(e) => write!(f, "UTF8 error: {}", e), + Error::NonDecodable(None) => write!(f, "Malformed input, decoding impossible"), + Error::NonDecodable(Some(e)) => write!(f, "Malformed UTF-8 input: {}", e), Error::UnexpectedEof(e) => write!(f, "Unexpected EOF during reading {}", e), Error::EndEventMismatch { expected, found } => { write!(f, "Expecting found ", expected, found) @@ -118,7 +120,7 @@ impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { Error::Io(e) => Some(e), - Error::Utf8(e) => Some(e), + Error::NonDecodable(Some(e)) => Some(e), Error::InvalidAttr(e) => Some(e), Error::EscapeError(e) => Some(e), _ => None, diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 1e059d5e..d6331bc7 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -113,10 +113,6 @@ impl<'a> Attribute<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> XmlResult { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(&*self.value); - - #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self.value)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; diff --git a/src/events/mod.rs b/src/events/mod.rs index e4865c3a..0d799e7a 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -85,10 +85,6 @@ impl<'a> BytesStartText<'a> { /// appeared in the BOM or in the text before the first tag. pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result { //TODO: Fix lifetime issue - it should be possible to borrow string - #[cfg(feature = "encoding")] - let decoded = decoder.decode_with_bom_removal(&*self); - - #[cfg(not(feature = "encoding"))] let decoded = decoder.decode_with_bom_removal(&*self)?; Ok(decoded.to_string()) @@ -317,10 +313,6 @@ impl<'a> BytesStart<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(&*self); - - #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; @@ -880,10 +872,6 @@ impl<'a> BytesText<'a> { reader: &Reader, custom_entities: Option<&HashMap, Vec>>, ) -> Result { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(&*self); - - #[cfg(not(feature = "encoding"))] let decoded = reader.decoder().decode(&*self)?; let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; @@ -1000,21 +988,8 @@ impl<'a> BytesCData<'a> { #[cfg(feature = "serialize")] pub(crate) fn decode(&self, decoder: crate::reader::Decoder) -> Result> { Ok(match &self.content { - Cow::Borrowed(bytes) => { - #[cfg(feature = "encoding")] - { - decoder.decode(bytes) - } - #[cfg(not(feature = "encoding"))] - { - decoder.decode(bytes)?.into() - } - } + Cow::Borrowed(bytes) => decoder.decode(bytes)?, Cow::Owned(bytes) => { - #[cfg(feature = "encoding")] - let decoded = decoder.decode(bytes).into_owned(); - - #[cfg(not(feature = "encoding"))] let decoded = decoder.decode(bytes)?.to_string(); decoded.into() diff --git a/src/reader.rs b/src/reader.rs index 752b07b0..4c556a8b 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,6 +1,5 @@ //! A module to handle `Reader` -#[cfg(feature = "encoding")] use std::borrow::Cow; use std::io::{self, BufRead, BufReader}; use std::{fs::File, path::Path, str::from_utf8}; @@ -1472,8 +1471,9 @@ impl Decoder { /// Returns an error in case of malformed sequences in the `bytes`. /// /// If you instead want to use XML declared encoding, use the `encoding` feature - pub fn decode<'c>(&self, bytes: &'c [u8]) -> Result<&'c str> { - Ok(from_utf8(bytes)?) + #[inline] + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + Ok(Cow::Borrowed(from_utf8(bytes)?)) } /// Decodes a slice regardless of XML declaration with BOM removal if @@ -1482,7 +1482,7 @@ impl Decoder { /// Returns an error in case of malformed sequences in the `bytes`. /// /// If you instead want to use XML declared encoding, use the `encoding` feature - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result<&'b str> { + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { let bytes = if bytes.starts_with(b"\xEF\xBB\xBF") { &bytes[3..] } else { @@ -1504,12 +1504,18 @@ impl Decoder { } /// Decodes specified bytes using encoding, declared in the XML, if it was - /// declared there, or UTF-8 otherwise + /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present + /// in the `bytes`. /// - /// Decode `bytes` with BOM sniffing and with malformed sequences replaced with the - /// `U+FFFD REPLACEMENT CHARACTER`. - pub fn decode<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> { - self.encoding.decode(bytes).0 + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result> { + match self + .encoding + .decode_without_bom_handling_and_without_replacement(bytes) + { + None => Err(Error::NonDecodable(None)), + Some(s) => Ok(s), + } } /// Decodes a slice with BOM removal if it is present in the `bytes` using @@ -1520,9 +1526,26 @@ impl Decoder { /// /// If XML declaration is absent in the XML, UTF-8 is used. /// - /// Any malformed sequences replaced with the `U+FFFD REPLACEMENT CHARACTER`. - pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Cow<'b, str> { - self.encoding.decode_with_bom_removal(bytes).0 + /// Returns an error in case of malformed sequences in the `bytes`. + pub fn decode_with_bom_removal<'b>(&self, bytes: &'b [u8]) -> Result> { + self.decode(self.remove_bom(bytes)) + } + /// Copied from [`Encoding::decode_with_bom_removal`] + #[inline] + fn remove_bom<'b>(&self, bytes: &'b [u8]) -> &'b [u8] { + use encoding_rs::*; + + if self.encoding == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") { + return &bytes[3..]; + } + if self.encoding == UTF_16LE && bytes.starts_with(b"\xFF\xFE") { + return &bytes[2..]; + } + if self.encoding == UTF_16BE && bytes.starts_with(b"\xFE\xFF") { + return &bytes[2..]; + } + + bytes } } diff --git a/tests/xmlrs_reader_tests.rs b/tests/xmlrs_reader_tests.rs index a3f66bcb..a23bb03c 100644 --- a/tests/xmlrs_reader_tests.rs +++ b/tests/xmlrs_reader_tests.rs @@ -1,7 +1,6 @@ use quick_xml::events::{BytesStart, Event}; use quick_xml::name::{QName, ResolveResult}; -use quick_xml::{Reader, Result}; -use std::borrow::Cow; +use quick_xml::{Decoder, Reader, Result}; use std::str::from_utf8; #[test] @@ -381,7 +380,7 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { loop { buf.clear(); let event = reader.read_namespaced_event(&mut buf, &mut ns_buffer); - let line = xmlrs_display(event, &reader); + let line = xmlrs_display(event, reader.decoder()); if let Some((n, spec)) = spec_lines.next() { if spec.trim() == "EndDocument" { break; @@ -420,8 +419,8 @@ fn test_bytes(input: &[u8], output: &[u8], is_short: bool) { } } -fn namespace_name(n: ResolveResult, name: QName, reader: &Reader<&[u8]>) -> String { - let name = decode(name.as_ref(), reader); +fn namespace_name(n: ResolveResult, name: QName, decoder: Decoder) -> String { + let name = decoder.decode(name.as_ref()).unwrap(); match n { // Produces string '{namespace}prefixed_name' ResolveResult::Bound(n) => format!("{{{}}}{}", from_utf8(n.as_ref()).unwrap(), name), @@ -448,22 +447,11 @@ fn make_attrs(e: &BytesStart) -> ::std::result::Result { Ok(atts.join(", ")) } -// FIXME: The public API differs based on the "encoding" feature -fn decode<'a>(text: &'a [u8], reader: &Reader<&[u8]>) -> Cow<'a, str> { - #[cfg(feature = "encoding")] - let decoded = reader.decoder().decode(text); - - #[cfg(not(feature = "encoding"))] - let decoded = Cow::Borrowed(reader.decoder().decode(text).unwrap()); - - decoded -} - -fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8]>) -> String { +fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, decoder: Decoder) -> String { match opt_event { Ok((_, Event::StartText(_))) => "StartText".to_string(), Ok((n, Event::Start(ref e))) => { - let name = namespace_name(n, e.name(), reader); + let name = namespace_name(n, e.name(), decoder); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("StartElement({})", &name), Ok(ref attrs) => format!("StartElement({} [{}])", &name, &attrs), @@ -471,7 +459,7 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::Empty(ref e))) => { - let name = namespace_name(n, e.name(), reader); + let name = namespace_name(n, e.name(), decoder); match make_attrs(e) { Ok(ref attrs) if attrs.is_empty() => format!("EmptyElement({})", &name), Ok(ref attrs) => format!("EmptyElement({} [{}])", &name, &attrs), @@ -479,13 +467,13 @@ fn xmlrs_display(opt_event: Result<(ResolveResult, Event)>, reader: &Reader<&[u8 } } Ok((n, Event::End(ref e))) => { - let name = namespace_name(n, e.name(), reader); + let name = namespace_name(n, e.name(), decoder); format!("EndElement({})", name) } Ok((_, Event::Comment(ref e))) => format!("Comment({})", from_utf8(e).unwrap()), Ok((_, Event::CData(ref e))) => format!("CData({})", from_utf8(e).unwrap()), Ok((_, Event::Text(ref e))) => match e.unescaped() { - Ok(c) => format!("Characters({})", decode(c.as_ref(), reader).as_ref()), + Ok(c) => format!("Characters({})", decoder.decode(c.as_ref()).unwrap()), Err(ref err) => format!("FailedUnescape({:?}; {})", e.escaped(), err), }, Ok((_, Event::Decl(ref e))) => {