Skip to content

Commit

Permalink
Merge pull request #755 from Mingun/fix-trim-end
Browse files Browse the repository at this point in the history
Fix incorrect missing of trimming all-space text events when `trim_text_start = false` and `trim_text_end = true`
  • Loading branch information
Mingun committed Jun 14, 2024
2 parents a44792f + 7558577 commit 65bf651
Show file tree
Hide file tree
Showing 6 changed files with 248 additions and 213 deletions.
4 changes: 4 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@

### Bug Fixes

- [#755]: Fix incorrect missing of trimming all-space text events when
`trim_text_start = false` and `trim_text_end = true`.

### Misc Changes

- [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type.

[#650]: https://github.com/tafia/quick-xml/issues/650
[#755]: https://github.com/tafia/quick-xml/pull/755


## 0.32.0 -- 2024-06-10
Expand Down
15 changes: 2 additions & 13 deletions src/reader/async_tokio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ use crate::events::Event;
use crate::name::{QName, ResolveResult};
use crate::reader::buffered_reader::impl_buffered_source;
use crate::reader::{
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, Reader, Span,
is_whitespace, BangType, ElementParser, NsReader, ParseState, Parser, PiParser, ReadTextResult,
Reader, Span,
};

/// A struct for read XML asynchronously from an [`AsyncBufRead`].
Expand Down Expand Up @@ -77,7 +78,6 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
read_event_impl!(
self, buf,
TokioAdapter(&mut self.reader),
read_until_open_async,
read_until_close_async,
await
)
Expand Down Expand Up @@ -141,17 +141,6 @@ impl<R: AsyncBufRead + Unpin> Reader<R> {
Ok(read_to_end!(self, end, buf, read_event_into_async, { buf.clear(); }, await))
}

/// Read until '<' is found, moves reader to an `OpenedTag` state and returns a `Text` event.
///
/// Returns inner `Ok` if the loop should be broken and an event returned.
/// Returns inner `Err` with the same `buf` because Rust borrowck stumbles upon this case in particular.
async fn read_until_open_async<'b>(
&mut self,
buf: &'b mut Vec<u8>,
) -> Result<std::result::Result<Event<'b>, &'b mut Vec<u8>>> {
read_until_open!(self, buf, TokioAdapter(&mut self.reader), read_event_into_async, await)
}

/// Private function to read until `>` is found. This function expects that
/// it was called just after encounter a `<` symbol.
async fn read_until_close_async<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
Expand Down
140 changes: 89 additions & 51 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ use std::path::Path;
use crate::errors::{Error, Result};
use crate::events::Event;
use crate::name::QName;
use crate::reader::{is_whitespace, BangType, Parser, Reader, Span, XmlSource};
use crate::reader::{is_whitespace, BangType, Parser, ReadTextResult, Reader, Span, XmlSource};

macro_rules! impl_buffered_source {
($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
#[cfg(not(feature = "encoding"))]
#[inline]
$($async)? fn remove_utf8_bom(&mut self) -> Result<()> {
$($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
use crate::encoding::UTF8_BOM;

loop {
Expand All @@ -26,14 +26,14 @@ macro_rules! impl_buffered_source {
Ok(())
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}

#[cfg(feature = "encoding")]
#[inline]
$($async)? fn detect_encoding(&mut self) -> Result<Option<&'static encoding_rs::Encoding>> {
$($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
Expand All @@ -43,54 +43,106 @@ macro_rules! impl_buffered_source {
Ok(None)
},
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}

#[inline]
$($async)? fn read_text $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> ReadTextResult<'b, &'b mut Vec<u8>> {
let mut read = 0;
let start = buf.len();
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return ReadTextResult::Err(e);
}
};

match memchr::memchr(b'<', available) {
Some(0) => {
self $(.$reader)? .consume(1);
*position += 1;
return ReadTextResult::Markup(buf);
}
Some(i) => {
buf.extend_from_slice(&available[..i]);

let used = i + 1;
self $(.$reader)? .consume(used);
read += used;

*position += read;
return ReadTextResult::UpToMarkup(&buf[start..]);
}
None => {
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
}
}

*position += read;
ReadTextResult::UpToEof(&buf[start..])
}

#[inline]
$($async)? fn read_bytes_until $(<$lf>)? (
&mut self,
byte: u8,
buf: &'b mut Vec<u8>,
position: &mut usize,
) -> Result<(&'b [u8], bool)> {
) -> io::Result<(&'b [u8], bool)> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

let mut read = 0;
let mut done = false;
let start = buf.len();
while !done {
let used = {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(Error::Io(e.into()));
}
};

match memchr::memchr(byte, available) {
Some(i) => {
buf.extend_from_slice(&available[..i]);
done = true;
i + 1
}
None => {
buf.extend_from_slice(available);
available.len()
}
loop {
let available = match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) if n.is_empty() => break,
Ok(n) => n,
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => {
*position += read;
return Err(e);
}
};
self $(.$reader)? .consume(used);
read += used;

match memchr::memchr(byte, available) {
Some(i) => {
buf.extend_from_slice(&available[..i]);

let used = i + 1;
self $(.$reader)? .consume(used);
read += used;

*position += read;
return Ok((&buf[start..], true));
}
None => {
buf.extend_from_slice(available);

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
}
}
}
*position += read;

Ok((&buf[start..], done))
*position += read;
Ok((&buf[start..], false))
}

#[inline]
Expand Down Expand Up @@ -188,7 +240,7 @@ macro_rules! impl_buffered_source {
}

#[inline]
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> {
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => {
Expand All @@ -202,32 +254,18 @@ macro_rules! impl_buffered_source {
}
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}

#[inline]
$($async)? fn skip_one(&mut self, byte: u8) -> Result<bool> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

match self.peek_one() $(.$await)? ? {
Some(b) if b == byte => {
self $(.$reader)? .consume(1);
Ok(true)
}
_ => Ok(false),
}
}

#[inline]
$($async)? fn peek_one(&mut self) -> Result<Option<u8>> {
$($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => Ok(n.first().cloned()),
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Err(e) => Err(Error::Io(e.into())),
Err(e) => Err(e),
};
}
}
Expand Down
Loading

0 comments on commit 65bf651

Please sign in to comment.