Skip to content

Commit

Permalink
Add some documentation to internal macros and functions that implemen…
Browse files Browse the repository at this point in the history
…ts parser
  • Loading branch information
Mingun authored and dralley committed Mar 12, 2023
1 parent 078cd4c commit b1a23a1
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 6 deletions.
2 changes: 2 additions & 0 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,11 @@ macro_rules! impl_buffered_source {
self $(.$reader)? .consume(used);
read += used;

// Position now just after the `>` symbol
*position += read;
break;
} else {
// The `>` symbol not yet found, continue reading
buf.extend_from_slice(available);

let used = available.len();
Expand Down
41 changes: 38 additions & 3 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ macro_rules! read_event_impl {
) => {{
let event = loop {
match $self.parser.state {
ParseState::Init => {
ParseState::Init => { // Go to OpenedTag state
// If the encoding is set explicitly, we do not need to detect it. For example,
// explicit UTF-8 is set automatically if the Reader was created using `from_str`.
// But we still need to remove BOM for consistency with no encoding
Expand All @@ -184,19 +184,21 @@ macro_rules! read_event_impl {
#[cfg(not(feature = "encoding"))]
$reader.remove_utf8_bom() $(.$await)? ?;

// Go to OpenedTag state
match $self.$read_until_open($buf) $(.$await)? {
Ok(Ok(ev)) => break Ok(ev),
Ok(Err(b)) => $buf = b,
Err(err) => break Err(err),
}
},
ParseState::ClosedTag => {
ParseState::ClosedTag => { // Go to OpenedTag state
match $self.$read_until_open($buf) $(.$await)? {
Ok(Ok(ev)) => break Ok(ev),
Ok(Err(b)) => $buf = b,
Err(err) => break Err(err),
}
},
// Go to ClosedTag state in next two arms
ParseState::OpenedTag => break $self.$read_until_close($buf) $(.$await)?,
ParseState::Empty => break $self.parser.close_expanded_empty(),
ParseState::Exit => break Ok(Event::Eof),
Expand All @@ -210,6 +212,15 @@ macro_rules! read_event_impl {
}};
}

/// Reads bytes up to `<` and skips it. If the current byte (after skipping all space
/// characters if [`Parser::trim_text_start`] is `true`) is already `<`, then
/// returns the next event, otherwise stays at the position just after the `<` symbol.
///
/// Moves parser to the `OpenedTag` state.
///
/// This code is executed in two cases:
/// - after start of parsing just after skipping BOM if it is present
/// - after parsing `</tag>` or `<tag>`
macro_rules! read_until_open {
(
$self:ident, $buf:ident,
Expand All @@ -225,20 +236,42 @@ macro_rules! read_until_open {

// If we are already at the `<` symbol, do not try to return an empty Text event
if $reader.skip_one(b'<', &mut $self.parser.offset) $(.$await)? ? {
// Pass $buf to the next iteration of the parsing loop
return Ok(Err($buf));
}

match $reader
.read_bytes_until(b'<', $buf, &mut $self.parser.offset)
$(.$await)?
{
// Return Text event with `bytes` content
Ok(Some(bytes)) => $self.parser.read_text(bytes).map(Ok),
Ok(None) => Ok(Ok(Event::Eof)),
Err(e) => Err(e),
}
}};
}

/// Read bytes up to the `>` and skip it. This method is expected to be called
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!` |[`Comment`], [`CData`] or [`DocType`]
/// |`/` |[`End`]
/// |`?` |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `ClosedTag` state.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
(
$self:ident, $buf:ident,
Expand Down Expand Up @@ -371,10 +404,12 @@ enum ParseState {
/// that symbol will be returned in the [`Event::Text`] event. After that
/// the reader moves to the `OpenedTag` state.
ClosedTag,
/// This state is used only if option `expand_empty_elements` is set to `true`.
/// This state is used only if option [`expand_empty_elements`] is set to `true`.
/// The reader enters this state when it is in a `ClosedTag` state and emits an
/// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
/// after which the reader returns to the `ClosedTag` state.
///
/// [`expand_empty_elements`]: Parser::expand_empty_elements
Empty,
/// The reader enters this state when an `Eof` event is generated or an error occurs.
/// This is the last state; the reader stays in it forever.
Expand Down
8 changes: 5 additions & 3 deletions src/reader/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -204,13 +204,15 @@ impl Parser {
}
}

/// reads `BytesElement` starting with any character except `/`, `!` or ``?`
/// return `Start` or `Empty` event
/// Converts content of a tag to a `Start` or an `Empty` event
///
/// # Parameters
/// - `buf`: Content of a tag between `<` and `>`
pub fn read_start<'b>(&mut self, buf: &'b [u8]) -> Result<Event<'b>> {
// TODO: do this directly when reading bufreader ...
let len = buf.len();
let name_end = buf.iter().position(|&b| is_whitespace(b)).unwrap_or(len);
if let Some(&b'/') = buf.last() {
// This is a self-closed tag `<something/>`
let end = if name_end < len { name_end } else { len - 1 };
if self.expand_empty_elements {
self.state = ParseState::Empty;
Expand Down
1 change: 1 addition & 0 deletions src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,7 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
let mut state = ReadElementState::Elem;

if let Some((bytes, i)) = state.change(self) {
// Position now just after the `>` symbol
*position += i;
*self = &self[i..];
return Ok(Some(bytes));
Expand Down

0 comments on commit b1a23a1

Please sign in to comment.