Permalink
Cannot retrieve contributors at this time
Join GitHub today
GitHub is home to over 28 million developers working together to host and review code, manage projects, and build software together.
Sign up
Fetching contributors…
//! `Bytelines` is a simple library crate which offers line iteration for
//! `BufRead` via `&[u8]` rather than `String`.
//!
//! Due to the removal of checking for `String` validity, this is typically
//! much faster for reading in raw data and much more flexible. The APIs
//! offered in this crate are intended to function exactly the same as the
//! `lines` function inside the `BufRead` trait, except that the bytes which
//! precede the line delimiter are not validated.
//!
//! Performance of [ByteLines](struct.ByteLines.html) is very close to that of
//! writing a `loop` manually, whereas [RefByteLines](struct.RefByteLines.html)
//! is practically identical due to the avoidance of "unnecessary" allocations.
| #![doc(html_root_url = "https://docs.rs/bytelines/1.0.1")] | |
| use std::io::BufRead; | |
| use std::marker::PhantomData; | |
| /// Represents anything which can provide iterators of byte lines. | |
| pub trait ByteLinesReader<'a, B> | |
| where | |
| B: BufRead, | |
| { | |
| /// Returns an iterator over the lines of this reader (as `Vec<u8>`). | |
| /// | |
| /// Just like the equivalent in the standard library, the iterator returned | |
| /// from this function will yield instances of `io::Result<String>`. Each | |
| /// string returned will not have a newline byte (the 0xA byte) or CRLF | |
| /// (0xD, 0xA bytes) at the end. | |
| fn byte_lines(self) -> ByteLines<'a, B>; | |
| /// Returns an iterator over the lines of this reader (as `&[u8]`). | |
| /// | |
| /// This method operates in the same way as [byte_lines](#method.byte_lines), | |
| /// except that the iterated values are references to the internal byte buffer. | |
| /// Due to this, you can only safely hold a single line at any given time, and | |
| /// as such this method is marked as `unsafe`. If you're using usual loop syntax | |
| /// of `for $x in $y` your code will not come across this unsafe contract. | |
| /// | |
| /// When performance is important, this method should be used rather than | |
| /// [byte_lines](#method.byte_lines) as there is only a single buffer | |
| /// allocation (disregarding any potential resizing that may be required), | |
| /// whereas [byte_lines](#method.byte_lines) will allocate a `Vec<u8>` for | |
| /// each input line and provide ownership. | |
| unsafe fn ref_byte_lines(self) -> RefByteLines<'a, B>; | |
| } | |
| /// Blanket implementation for all `BufRead`. | |
| impl<'a, B> ByteLinesReader<'a, B> for B | |
| where | |
| B: BufRead, | |
| { | |
| /// Returns an iterator over the lines of this reader (as `Vec<u8>`). | |
| fn byte_lines(self) -> ByteLines<'a, Self> { | |
| ByteLines { | |
| inner: unsafe { self.ref_byte_lines() }, | |
| } | |
| } | |
| /// Returns an iterator over the lines of this reader (as `&[u8]`). | |
| unsafe fn ref_byte_lines(self) -> RefByteLines<'a, Self> { | |
| RefByteLines { | |
| buffer: Vec::new(), | |
| reader: self, | |
| marker: PhantomData, | |
| } | |
| } | |
| } | |
| /// Provides a safe iterator over lines of input as byte vectors (`Vec<u8>`). | |
| /// | |
| /// Internally, this iterator delegates to `RefByteLines` - the only difference | |
| /// being that this iterator will allocate a vector for each reference returned, | |
| /// thus making ownership clear and avoiding any issues with data races. | |
| pub struct ByteLines<'a, B> | |
| where | |
| B: BufRead, | |
| { | |
| inner: RefByteLines<'a, B>, | |
| } | |
/// Provides an iterator over lines of input as byte slices (`&[u8]`).
///
/// Every slice handed out refers to the one internal buffer of this iterator,
/// and that buffer is cleared and refilled each time `next()` is called. A
/// slice from an earlier call can therefore not be relied upon once `next()`
/// has been called again, which is why this iterator requires opting in to the
/// use of unsafe code. It should only be used with the traditional
/// `for $x in $y` syntax, where a single line is held at any given time.
///
/// Here is a demonstration of this issue in action, using a very basic clash
/// between two lines of the same length. Note that you might (in some cases)
/// get mixed input when going from a longer line to a shorter one.
///
/// ```rust
/// use bytelines::*;
/// use std::fs::File;
/// use std::io::BufReader;
///
/// unsafe {
///     // construct our iterator from our file input
///     let file = File::open("./res/numbers.txt").unwrap();
///     let mut iter = BufReader::new(file).ref_byte_lines();
///
///     // take the first line from the input
///     let line1 = iter.next();
///     println!("{:?}", line1); // equivalent to bytes of "0"
///
///     // take the second line from the input
///     let line2 = iter.next();
///     println!("{:?}", line2); // equivalent to bytes of "1"
///     println!("{:?}", line1); // also now equivalent to bytes of "1"
/// }
/// ```
///
/// This implementation is much more memory efficient than `ByteLines` (and
/// more performant), and so should be used in performance critical code
/// blocks. As a small aside, `ByteLines` simply delegates to this struct
/// internally and adds an allocation per line on top to make ownership explicit.
pub struct RefByteLines<'a, B: BufRead> {
    /// Storage for the most recently read line; reused for every line.
    buffer: Vec<u8>,
    /// Carries the lifetime `'a` which is attached to the yielded slices.
    marker: PhantomData<&'a B>,
    /// The source the lines are read from.
    reader: B,
}
| /// Wrapping iterator to enforce ownership. | |
| impl<'a, B> Iterator for ByteLines<'a, B> | |
| where | |
| B: BufRead, | |
| { | |
| type Item = Result<Vec<u8>, std::io::Error>; | |
| /// Retrieves the next line in the iterator (if any). | |
| fn next(&mut self) -> Option<Result<Vec<u8>, std::io::Error>> { | |
| self.inner.next().map(|r| r.map(|s| s.to_vec())) | |
| } | |
| } | |
| /// Base iterator for line retrieval. | |
| impl<'a, B> Iterator for RefByteLines<'a, B> | |
| where | |
| B: BufRead, | |
| { | |
| type Item = Result<&'a [u8], std::io::Error>; | |
| /// Retrieves the next line in the iterator (if any). | |
| fn next(&mut self) -> Option<Result<&'a [u8], std::io::Error>> { | |
| // clear the main buffer | |
| self.buffer.clear(); | |
| // iterate every line coming from the reader (but as bytes) | |
| match self.reader.read_until(b'\n', &mut self.buffer) { | |
| // short circuit on error | |
| Err(e) => Some(Err(e)), | |
| // no input, done | |
| Ok(0) => None, | |
| // bytes! | |
| Ok(mut n) => { | |
| // always "pop" the delim | |
| if self.buffer[n - 1] == b'\n' { | |
| n -= 1; | |
| // also "pop" a leading \r | |
| if self.buffer[n - 1] == b'\r' { | |
| n -= 1; | |
| } | |
| } | |
| // Here's the fun unsafe section; in order to provide a reference and avoid allocation, | |
| // we need to extend the lifetime and so we do so here. This means that you're open to | |
| // data races in the case you call `next` on an iterator twice, and maintain the values | |
| // of each retrieved line (as the former will be invalidated to point to the bytes of | |
| // the second). To avoid this, simply always use `for $x in $y` syntax when using this | |
| // type of iteration directly (as you're never going to hold two lines at once). | |
| unsafe { | |
| Some(Ok(std::mem::transmute::<&[u8], &'a [u8]>( | |
| &self.buffer[..n], | |
| ))) | |
| } | |
| } | |
| } | |
| } | |
| } | |
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::BufReader;

    /// Reads `./res/numbers.txt` through `byte_lines` and checks that its
    /// first nine lines are the decimal numbers 0 through 8.
    #[test]
    fn test_basic_iterator() {
        let reader = BufReader::new(File::open("./res/numbers.txt").unwrap());

        // decode every line as UTF-8; any read or decoding failure fails the test
        let mut lines: Vec<String> = Vec::new();
        for raw in reader.byte_lines() {
            lines.push(String::from_utf8(raw.unwrap()).unwrap());
        }

        // slicing panics (failing the test) if fewer than nine lines were read
        let expected: Vec<String> = (0..9).map(|i| i.to_string()).collect();
        assert_eq!(&lines[..9], &expected[..]);
    }
}