Skip to content

Commit

Permalink
Add reusable parser for XML element and use it internally
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun authored and dralley committed Jun 9, 2024
1 parent 02de8a5 commit 0a6ecd6
Show file tree
Hide file tree
Showing 6 changed files with 132 additions and 49 deletions.
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ resolve predefined entities.
- `quick_xml::escape::resolve_xml_entity`
- `quick_xml::escape::resolve_html5_entity`
- [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
- [#754]: Added parser for elements: `quick_xml::reader::ElementParser`.

### Bug Fixes

Expand Down Expand Up @@ -101,6 +102,7 @@ resolve predefined entities.
[#743]: https://github.com/tafia/quick-xml/pull/743
[#748]: https://github.com/tafia/quick-xml/pull/748
[#753]: https://github.com/tafia/quick-xml/pull/753
[#754]: https://github.com/tafia/quick-xml/pull/754
[`DeEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.DeEvent.html
[`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
[`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html
Expand Down
4 changes: 1 addition & 3 deletions src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,7 @@ use crate::errors::{Error, Result, SyntaxError};
use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{
is_whitespace, BangType, NsReader, ParseState, ReadElementState, Reader, Span,
};
use crate::reader::{is_whitespace, BangType, ElementParser, NsReader, ParseState, Reader, Span};

/// A struct for read XML asynchronously from an [`AsyncBufRead`].
///
Expand Down
13 changes: 7 additions & 6 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::path::Path;
use crate::errors::{Error, Result, SyntaxError};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, ReadElementState, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, ElementParser, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
Expand Down Expand Up @@ -190,19 +190,20 @@ macro_rules! impl_buffered_source {
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<&'b [u8]> {
let mut state = ReadElementState::Elem;
let mut parser = ElementParser::default();
let mut read = 0;

let start = buf.len();
loop {
match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(available) => {
if let Some((consumed, used)) = state.change(available) {
buf.extend_from_slice(consumed);
if let Some(used) = parser.feed(available) {
buf.extend_from_slice(&available[..used]);

self $(.$reader)? .consume(used);
read += used;
// +1 for `>` which we do not include
self $(.$reader)? .consume(used + 1);
read += used + 1;

// Position now just after the `>` symbol
*position += read;
Expand Down
113 changes: 113 additions & 0 deletions src/reader/element.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
//! Contains a parser for an XML element.

/// A parser that searches for a `>` symbol in a slice outside of quoted regions.
///
/// The parser considers two quoted regions: a double-quoted (`"..."`) and
/// a single-quoted (`'...'`) region. Matches found inside those regions are not
/// considered as results. Each region starts and ends with its quote symbol,
/// which cannot be escaped (but can be encoded as an XML character entity or
/// a named entity; such an encoding does not contain literal quotes anyway).
///
/// To use the parser, create an instance of it and [`feed`] data into it.
/// After a successful search the parser returns [`Some`] with the position of
/// the found symbol, counted from the start of the chunk passed to that
/// [`feed`] call. If the search is unsuccessful, [`None`] is returned. You
/// would typically expect a positive result, so you should keep feeding
/// new data until you get one.
///
/// NOTE: after a successful match the parser is not reset and should not be
/// used anymore. Create a new parser if you want to perform a new search.
///
/// # Example
///
/// ```
/// # use quick_xml::reader::ElementParser;
/// # use pretty_assertions::assert_eq;
/// let mut parser = ElementParser::default();
///
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<my-element"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some >"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
/// // ^ ^
/// // 0 8
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ElementParser {
/// The initial state (inside an element, but outside of an attribute value).
Outside,
/// Inside a single-quoted region (`'...'`).
SingleQ,
/// Inside a double-quoted region (`"..."`).
DoubleQ,
}

impl ElementParser {
    /// Scans `bytes` for a `>` that lies outside of any quoted region.
    ///
    /// Returns `Some(i)`, where `i` is the index of that `>` inside `bytes`,
    /// or `None` if no such `>` was found in `bytes`. In the latter case the
    /// quoting state reached at the end of `bytes` is remembered in `self`,
    /// so the search can be continued by feeding the next chunk.
    #[inline]
    pub fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
        // Only `>`, `'` and `"` can influence the result, so jump directly
        // from one such byte to the next.
        for pos in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
            let byte = bytes[pos];
            match *self {
                Self::Outside => match byte {
                    // `>` terminates the element only outside of quotes
                    b'>' => return Some(pos),
                    b'\'' => *self = Self::SingleQ,
                    b'"' => *self = Self::DoubleQ,
                    _ => {}
                },
                // A quoted region is closed only by the quote that opened it;
                // every other byte leaves the state untouched
                Self::SingleQ => {
                    if byte == b'\'' {
                        *self = Self::Outside;
                    }
                }
                Self::DoubleQ => {
                    if byte == b'"' {
                        *self = Self::Outside;
                    }
                }
            }
        }
        None
    }
}

impl Default for ElementParser {
#[inline]
fn default() -> Self {
Self::Outside
}
}

#[test]
fn parse() {
    use pretty_assertions::assert_eq;
    use ElementParser::*;

    /// Feeds `bytes` into `parser` once.
    ///
    /// Returns `Ok(pos)` with the position in the buffer where the element ends.
    ///
    /// Returns `Err(internal_state)` if parsing is not done yet.
    fn parse_element(bytes: &[u8], mut parser: ElementParser) -> Result<usize, ElementParser> {
        let found = parser.feed(bytes);
        // `parser` is read only after `feed` has updated it
        found.ok_or(parser)
    }

    // (input, state before feeding, expected outcome)
    let cases: [(&[u8], ElementParser, Result<usize, ElementParser>); 15] = [
        // Empty input never changes the state
        (b"", Outside, Err(Outside)),
        (b"", SingleQ, Err(SingleQ)),
        (b"", DoubleQ, Err(DoubleQ)),
        // A single quote toggles only the single-quoted region
        (b"'", Outside, Err(SingleQ)),
        (b"'", SingleQ, Err(Outside)),
        (b"'", DoubleQ, Err(DoubleQ)),
        // A double quote toggles only the double-quoted region
        (b"\"", Outside, Err(DoubleQ)),
        (b"\"", SingleQ, Err(SingleQ)),
        (b"\"", DoubleQ, Err(Outside)),
        // `>` is a match only outside of quoted regions
        (b">", Outside, Ok(0)),
        (b">", SingleQ, Err(SingleQ)),
        (b">", DoubleQ, Err(DoubleQ)),
        // Two quotes followed by `>`
        (b"''>", Outside, Ok(2)),
        (b"''>", SingleQ, Err(SingleQ)),
        (b"''>", DoubleQ, Err(DoubleQ)),
    ];

    for (input, initial, expected) in cases.iter().copied() {
        assert_eq!(
            parse_element(input, initial),
            expected,
            "input `{}` fed in state {:?}",
            String::from_utf8_lossy(input),
            initial
        );
    }
}
36 changes: 2 additions & 34 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -426,11 +426,13 @@ macro_rules! read_to_end {
#[cfg(feature = "async-tokio")]
mod async_tokio;
mod buffered_reader;
mod element;
mod ns_reader;
mod pi;
mod slice_reader;
mod state;

pub use element::ElementParser;
pub use ns_reader::NsReader;
pub use pi::PiParser;

Expand Down Expand Up @@ -986,40 +988,6 @@ impl BangType {
}
}

/// State machine for the [`XmlSource::read_element`].
///
/// Tracks whether the bytes seen so far are inside a quoted attribute value,
/// so that a `>` inside quotes is not taken as the end of the element.
#[derive(Clone, Copy)]
enum ReadElementState {
/// The initial state (inside an element, but outside of an attribute value)
Elem,
/// Inside a single-quoted attribute value
SingleQ,
/// Inside a double-quoted attribute value
DoubleQ,
}
impl ReadElementState {
    /// Advances the state machine over `chunk`.
    ///
    /// When a `>` outside of quotes is found, returns a tuple of the part of
    /// `chunk` before that `>` and the position just after it. Returns `None`
    /// if `chunk` contains no such symbol; the state reached at the end of
    /// `chunk` is kept in `self`.
    #[inline(always)]
    fn change<'b>(&mut self, chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
        for pos in memchr::memchr3_iter(b'>', b'\'', b'"', chunk) {
            let byte = chunk[pos];
            match *self {
                Self::Elem => match byte {
                    // `>` ends the element only while we are in state `Elem`
                    b'>' => return Some((&chunk[..pos], pos + 1)),
                    b'\'' => *self = Self::SingleQ,
                    b'"' => *self = Self::DoubleQ,
                    _ => {}
                },
                // Only the quote that opened the value closes it
                Self::SingleQ if byte == b'\'' => *self = Self::Elem,
                Self::DoubleQ if byte == b'"' => *self = Self::Elem,
                // Any other byte inside a quoted value: no state change
                Self::SingleQ | Self::DoubleQ => {}
            }
        }
        None
    }
}

/// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab)
#[inline]
pub(crate) const fn is_whitespace(b: u8) -> bool {
Expand Down
13 changes: 7 additions & 6 deletions src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use encoding_rs::{Encoding, UTF_8};
use crate::errors::{Error, Result, SyntaxError};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, PiParser, ReadElementState, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, ElementParser, PiParser, Reader, Span, XmlSource};

/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
/// This implementation supports not using an intermediate buffer as the byte slice
Expand Down Expand Up @@ -312,12 +312,13 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
}

fn read_element(&mut self, _buf: (), position: &mut usize) -> Result<&'a [u8]> {
let mut state = ReadElementState::Elem;
let mut parser = ElementParser::default();

if let Some((bytes, i)) = state.change(self) {
// Position now just after the `>` symbol
*position += i;
*self = &self[i..];
if let Some(i) = parser.feed(self) {
// +1 for `>` which we do not include
*position += i + 1;
let bytes = &self[..i];
*self = &self[i + 1..];
return Ok(bytes);
}

Expand Down

0 comments on commit 0a6ecd6

Please sign in to comment.