Permalink
Find file Copy path
1244 lines (1092 sloc) 39.3 KB
// Copyright 2016 The xi-editor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! A rope data structure with a line count metric and (soon) other useful
//! info.
use std::borrow::Cow;
use std::cmp::{max, min};
use std::fmt;
use std::ops::Add;
use std::str;
use std::str::FromStr;
use std::string::ParseError;
use crate::delta::{Delta, DeltaElement};
use crate::interval::{Interval, IntervalBounds};
use crate::tree::{Cursor, Leaf, Metric, Node, NodeInfo, TreeBuilder};
use bytecount;
use memchr::{memchr, memrchr};
use serde::de::{Deserialize, Deserializer};
use serde::ser::{Serialize, SerializeStruct, SerializeTupleVariant, Serializer};
use unicode_segmentation::GraphemeCursor;
use unicode_segmentation::GraphemeIncomplete;
const MIN_LEAF: usize = 511;
const MAX_LEAF: usize = 1024;
/// A rope data structure.
///
/// A [rope](https://en.wikipedia.org/wiki/Rope_(data_structure)) is a data structure
/// for strings, specialized for incremental editing operations. Most operations
/// (such as insert, delete, substring) are O(log n). This module provides an immutable
/// (also known as [persistent](https://en.wikipedia.org/wiki/Persistent_data_structure))
/// version of Ropes, and if there are many copies of similar strings, the common parts
/// are shared.
///
/// Internally, the implementation uses reference counting (not thread safe, though
/// it would be easy enough to modify to use `Arc` instead of `Rc` if that were
/// required). Mutations are generally copy-on-write, though in-place edits are
/// supported as an optimization when only one reference exists, making the
/// implementation as efficient as a mutable version.
///
/// Also note: in addition to the `From` traits described below, this module
/// implements `From<Rope> for String` and `From<&Rope> for String`, for easy
/// conversions in both directions.
///
/// # Examples
///
/// Create a `Rope` from a `String`:
///
/// ```rust
/// # use xi_rope::Rope;
/// let a = Rope::from("hello ");
/// let b = Rope::from("world");
/// assert_eq!("hello world", String::from(a.clone() + b.clone()));
/// assert!("hello world" == String::from(a + b));
/// ```
///
/// Get a slice of a `Rope`:
///
/// ```rust
/// # use xi_rope::Rope;
/// let a = Rope::from("hello world");
/// let b = a.slice(1..9);
/// assert_eq!("ello wor", String::from(&b));
/// let c = b.slice(1..7);
/// assert_eq!("llo wo", String::from(c));
/// ```
///
/// Replace part of a `Rope`:
///
/// ```rust
/// # use xi_rope::Rope;
/// let mut a = Rope::from("hello world");
/// a.edit(1..9, "era");
/// assert_eq!("herald", String::from(a));
/// ```
pub type Rope = Node<RopeInfo>;
/// Represents a transform from one rope to another.
pub type RopeDelta = Delta<RopeInfo>;
/// An element in a `RopeDelta`.
pub type RopeDeltaElement = DeltaElement<RopeInfo>;
impl Leaf for String {
fn len(&self) -> usize {
self.len()
}
fn is_ok_child(&self) -> bool {
self.len() >= MIN_LEAF
}
fn push_maybe_split(&mut self, other: &String, iv: Interval) -> Option<String> {
//println!("push_maybe_split [{}] [{}] {:?}", self, other, iv);
let (start, end) = iv.start_end();
self.push_str(&other[start..end]);
if self.len() <= MAX_LEAF {
None
} else {
let splitpoint = find_leaf_split_for_merge(self);
let right_str = self[splitpoint..].to_owned();
self.truncate(splitpoint);
self.shrink_to_fit();
Some(right_str)
}
}
}
#[derive(Clone, Copy)]
pub struct RopeInfo {
lines: usize,
utf16_size: usize,
}
impl NodeInfo for RopeInfo {
type L = String;
fn accumulate(&mut self, other: &Self) {
self.lines += other.lines;
self.utf16_size += other.utf16_size;
}
fn compute_info(s: &String) -> Self {
RopeInfo { lines: count_newlines(s), utf16_size: count_utf16_code_units(s) }
}
fn identity() -> Self {
RopeInfo { lines: 0, utf16_size: 0 }
}
}
//TODO: document metrics, based on https://github.com/google/xi-editor/issues/456
//See ../docs/MetricsAndBoundaries.md for more information.
/// This metric let us walk utf8 text by code point.
///
/// `BaseMetric` implements the trait [Metric]. Both its _measured unit_ and
/// its _base unit_ are utf8 code unit.
///
/// Offsets that do not correspond to codepoint boundaries are _invalid_, and
/// calling functions that assume valid offsets with invalid offets will panic
/// in debug mode.
///
/// Boundary is atomic and determined by codepoint boundary. Atomicity is
/// implicit, because offsets between two utf8 code units that form a code
/// point is considered invalid. For example, if a string starts with a
/// 0xC2 byte, then `offset=1` is invalid.
#[derive(Clone, Copy)]
pub struct BaseMetric(());
impl Metric<RopeInfo> for BaseMetric {
fn measure(_: &RopeInfo, len: usize) -> usize {
len
}
fn to_base_units(s: &String, in_measured_units: usize) -> usize {
debug_assert!(s.is_char_boundary(in_measured_units));
in_measured_units
}
fn from_base_units(s: &String, in_base_units: usize) -> usize {
debug_assert!(s.is_char_boundary(in_base_units));
in_base_units
}
fn is_boundary(s: &String, offset: usize) -> bool {
s.is_char_boundary(offset)
}
fn prev(s: &String, offset: usize) -> Option<usize> {
if offset == 0 {
// I think it's a precondition that this will never be called
// with offset == 0, but be defensive.
None
} else {
let mut len = 1;
while !s.is_char_boundary(offset - len) {
len += 1;
}
Some(offset - len)
}
}
fn next(s: &String, offset: usize) -> Option<usize> {
if offset == s.len() {
// I think it's a precondition that this will never be called
// with offset == s.len(), but be defensive.
None
} else {
let b = s.as_bytes()[offset];
Some(offset + len_utf8_from_first_byte(b))
}
}
fn can_fragment() -> bool {
false
}
}
/// Given the inital byte of a UTF-8 codepoint, returns the number of
/// bytes required to represent the codepoint.
/// RFC reference : https://tools.ietf.org/html/rfc3629#section-4
pub fn len_utf8_from_first_byte(b: u8) -> usize {
match b {
b if b < 0x80 => 1,
b if b < 0xe0 => 2,
b if b < 0xf0 => 3,
_ => 4,
}
}
#[derive(Clone, Copy)]
pub struct LinesMetric(usize); // number of lines
/// Measured unit is newline amount.
/// Base unit is utf8 code unit.
/// Boundary is trailing and determined by a newline char.
impl Metric<RopeInfo> for LinesMetric {
fn measure(info: &RopeInfo, _: usize) -> usize {
info.lines
}
fn is_boundary(s: &String, offset: usize) -> bool {
if offset == 0 {
// shouldn't be called with this, but be defensive
false
} else {
s.as_bytes()[offset - 1] == b'\n'
}
}
fn to_base_units(s: &String, in_measured_units: usize) -> usize {
let mut offset = 0;
for _ in 0..in_measured_units {
match memchr(b'\n', &s.as_bytes()[offset..]) {
Some(pos) => offset += pos + 1,
_ => panic!("to_base_units called with arg too large"),
}
}
offset
}
fn from_base_units(s: &String, in_base_units: usize) -> usize {
count_newlines(&s[..in_base_units])
}
fn prev(s: &String, offset: usize) -> Option<usize> {
memrchr(b'\n', &s.as_bytes()[..offset]).map(|pos| pos + 1)
}
fn next(s: &String, offset: usize) -> Option<usize> {
memchr(b'\n', &s.as_bytes()[offset..]).map(|pos| offset + pos + 1)
}
fn can_fragment() -> bool {
true
}
}
#[derive(Clone, Copy)]
pub struct Utf16CodeUnitsMetric(usize);
impl Metric<RopeInfo> for Utf16CodeUnitsMetric {
fn measure(info: &RopeInfo, _: usize) -> usize {
info.utf16_size
}
fn is_boundary(s: &String, offset: usize) -> bool {
s.is_char_boundary(offset)
}
fn to_base_units(s: &String, in_measured_units: usize) -> usize {
let mut cur_len_utf16 = 0;
let mut cur_len_utf8 = 0;
for u in s.chars() {
if cur_len_utf16 >= in_measured_units {
break;
}
cur_len_utf16 += u.len_utf16();
cur_len_utf8 += u.len_utf8();
}
cur_len_utf8
}
fn from_base_units(s: &String, in_base_units: usize) -> usize {
count_utf16_code_units(&s[..in_base_units])
}
fn prev(s: &String, offset: usize) -> Option<usize> {
if offset == 0 {
// I think it's a precondition that this will never be called
// with offset == 0, but be defensive.
None
} else {
let mut len = 1;
while !s.is_char_boundary(offset - len) {
len += 1;
}
Some(offset - len)
}
}
fn next(s: &String, offset: usize) -> Option<usize> {
if offset == s.len() {
// I think it's a precondition that this will never be called
// with offset == s.len(), but be defensive.
None
} else {
let b = s.as_bytes()[offset];
Some(offset + len_utf8_from_first_byte(b))
}
}
fn can_fragment() -> bool {
false
}
}
// Low level functions
pub fn count_newlines(s: &str) -> usize {
bytecount::count(s.as_bytes(), b'\n')
}
fn count_utf16_code_units(s: &str) -> usize {
let mut utf16_count = 0;
for &b in s.as_bytes() {
if (b as i8) >= -0x40 {
utf16_count += 1;
}
if b >= 0xf0 {
utf16_count += 1;
}
}
utf16_count
}
fn find_leaf_split_for_bulk(s: &str) -> usize {
find_leaf_split(s, MIN_LEAF)
}
fn find_leaf_split_for_merge(s: &str) -> usize {
find_leaf_split(s, max(MIN_LEAF, s.len() - MAX_LEAF))
}
// Try to split at newline boundary (leaning left), if not, then split at codepoint
fn find_leaf_split(s: &str, minsplit: usize) -> usize {
let mut splitpoint = min(MAX_LEAF, s.len() - MIN_LEAF);
match memrchr(b'\n', &s.as_bytes()[minsplit - 1..splitpoint]) {
Some(pos) => minsplit + pos,
None => {
while !s.is_char_boundary(splitpoint) {
splitpoint -= 1;
}
splitpoint
}
}
}
// Additional APIs custom to strings
impl FromStr for Rope {
type Err = ParseError;
fn from_str(s: &str) -> Result<Rope, Self::Err> {
let mut b = TreeBuilder::new();
b.push_str(s);
Ok(b.build())
}
}
impl Serialize for Rope {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
serializer.serialize_str(&String::from(self))
}
}
impl<'de> Deserialize<'de> for Rope {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
let s = String::deserialize(deserializer)?;
Ok(Rope::from(s))
}
}
impl Serialize for DeltaElement<RopeInfo> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
match *self {
DeltaElement::Copy(ref start, ref end) => {
let mut el = serializer.serialize_tuple_variant("DeltaElement", 0, "copy", 2)?;
el.serialize_field(start)?;
el.serialize_field(end)?;
el.end()
}
DeltaElement::Insert(ref node) => {
serializer.serialize_newtype_variant("DeltaElement", 1, "insert", node)
}
}
}
}
impl Serialize for Delta<RopeInfo> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let mut delta = serializer.serialize_struct("Delta", 2)?;
delta.serialize_field("els", &self.els)?;
delta.serialize_field("base_len", &self.base_len)?;
delta.end()
}
}
impl<'de> Deserialize<'de> for Delta<RopeInfo> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
// NOTE: we derive to an interim representation and then convert
// that into our actual target.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
enum RopeDeltaElement_ {
Copy(usize, usize),
Insert(String),
}
#[derive(Serialize, Deserialize)]
struct RopeDelta_ {
els: Vec<RopeDeltaElement_>,
base_len: usize,
}
impl From<RopeDeltaElement_> for DeltaElement<RopeInfo> {
fn from(elem: RopeDeltaElement_) -> DeltaElement<RopeInfo> {
match elem {
RopeDeltaElement_::Copy(start, end) => DeltaElement::Copy(start, end),
RopeDeltaElement_::Insert(s) => DeltaElement::Insert(Rope::from(s)),
}
}
}
impl From<RopeDelta_> for Delta<RopeInfo> {
fn from(mut delta: RopeDelta_) -> Delta<RopeInfo> {
Delta {
els: delta.els.drain(..).map(DeltaElement::from).collect(),
base_len: delta.base_len,
}
}
}
let d = RopeDelta_::deserialize(deserializer)?;
Ok(Delta::from(d))
}
}
impl Rope {
/// Edit the string, replacing the byte range [`start`..`end`] with `new`.
///
/// Time complexity: O(log n)
#[deprecated(since = "0.3.0", note = "Use Rope::edit instead")]
pub fn edit_str<T: IntervalBounds>(&mut self, iv: T, new: &str) {
self.edit(iv, new)
}
/// Returns a new Rope with the contents of the provided range.
pub fn slice<T: IntervalBounds>(&self, iv: T) -> Rope {
self.subseq(iv)
}
// encourage callers to use Cursor instead?
/// Determine whether `offset` lies on a codepoint boundary.
pub fn is_codepoint_boundary(&self, offset: usize) -> bool {
let mut cursor = Cursor::new(self, offset);
cursor.is_boundary::<BaseMetric>()
}
/// Return the offset of the codepoint before `offset`.
pub fn prev_codepoint_offset(&self, offset: usize) -> Option<usize> {
let mut cursor = Cursor::new(self, offset);
cursor.prev::<BaseMetric>()
}
/// Return the offset of the codepoint after `offset`.
pub fn next_codepoint_offset(&self, offset: usize) -> Option<usize> {
let mut cursor = Cursor::new(self, offset);
cursor.next::<BaseMetric>()
}
pub fn prev_grapheme_offset(&self, offset: usize) -> Option<usize> {
let mut cursor = Cursor::new(self, offset);
cursor.prev_grapheme()
}
pub fn next_grapheme_offset(&self, offset: usize) -> Option<usize> {
let mut cursor = Cursor::new(self, offset);
cursor.next_grapheme()
}
/// Return the line number corresponding to the byte index `offset`.
///
/// The line number is 0-based, thus this is equivalent to the count of newlines
/// in the slice up to `offset`.
///
/// Time complexity: O(log n)
///
/// # Panics
///
/// This function will panic if `offset > self.len()`. Callers are expected to
/// validate their input.
pub fn line_of_offset(&self, offset: usize) -> usize {
self.convert_metrics::<BaseMetric, LinesMetric>(offset)
}
/// Return the byte offset corresponding to the line number `line`.
/// If `line` is equal to one plus the current number of lines,
/// this returns the offset of the end of the rope. Arguments higher
/// than this will panic.
///
/// The line number is 0-based.
///
/// Time complexity: O(log n)
///
/// # Panics
///
/// This function will panic if `line > self.measure::<LinesMetric>() + 1`.
/// Callers are expected to validate their input.
pub fn offset_of_line(&self, line: usize) -> usize {
let max_line = self.measure::<LinesMetric>() + 1;
if line > max_line {
panic!("line number {} beyond last line {}", line, max_line);
} else if line == max_line {
return self.len();
}
self.convert_metrics::<LinesMetric, BaseMetric>(line)
}
/// Returns an iterator over chunks of the rope.
///
/// Each chunk is a `&str` slice borrowed from the rope's storage. The size
/// of the chunks is indeterminate but for large strings will generally be
/// in the range of 511-1024 bytes.
///
/// The empty string will yield a single empty slice. In all other cases, the
/// slices will be nonempty.
///
/// Time complexity: technically O(n log n), but the constant factor is so
/// tiny it is effectively O(n). This iterator does not allocate.
pub fn iter_chunks<T: IntervalBounds>(&self, range: T) -> ChunkIter {
let Interval { start, end } = range.into_interval(self.len());
ChunkIter { cursor: Cursor::new(self, start), end }
}
/// An iterator over the raw lines. The lines, except the last, include the
/// terminating newline.
///
/// The return type is a `Cow<str>`, and in most cases the lines are slices
/// borrowed from the rope.
pub fn lines_raw<T: IntervalBounds>(&self, range: T) -> LinesRaw {
LinesRaw { inner: self.iter_chunks(range), fragment: "" }
}
/// An iterator over the lines of a rope.
///
/// Lines are ended with either Unix (`\n`) or MS-DOS (`\r\n`) style line endings.
/// The line ending is stripped from the resulting string. The final line ending
/// is optional.
///
/// The return type is a `Cow<str>`, and in most cases the lines are slices borrowed
/// from the rope.
///
/// The semantics are intended to match `str::lines()`.
pub fn lines<T: IntervalBounds>(&self, range: T) -> Lines {
Lines { inner: self.lines_raw(range) }
}
// callers should be encouraged to use cursor instead
pub fn byte_at(&self, offset: usize) -> u8 {
let cursor = Cursor::new(self, offset);
let (leaf, pos) = cursor.get_leaf().unwrap();
leaf.as_bytes()[pos]
}
pub fn slice_to_cow<T: IntervalBounds>(&self, range: T) -> Cow<str> {
let mut iter = self.iter_chunks(range);
let first = iter.next();
let second = iter.next();
match (first, second) {
(None, None) => Cow::from(""),
(Some(s), None) => Cow::from(s),
(Some(one), Some(two)) => {
let mut result = [one, two].concat();
for chunk in iter {
result.push_str(chunk);
}
Cow::from(result)
}
(None, Some(_)) => unreachable!(),
}
}
}
// should make this generic, but most leaf types aren't going to be sliceable
pub struct ChunkIter<'a> {
cursor: Cursor<'a, RopeInfo>,
end: usize,
}
impl<'a> Iterator for ChunkIter<'a> {
type Item = &'a str;
fn next(&mut self) -> Option<&'a str> {
if self.cursor.pos() >= self.end {
return None;
}
let (leaf, start_pos) = self.cursor.get_leaf().unwrap();
let len = min(self.end - self.cursor.pos(), leaf.len() - start_pos);
self.cursor.next_leaf();
Some(&leaf[start_pos..start_pos + len])
}
}
impl TreeBuilder<RopeInfo> {
/// Push a string on the accumulating tree in the naive way.
///
/// Splits the provided string in chunks that fit in a leaf
/// and pushes the leaves one by one onto the tree by calling.
pub fn push_str(&mut self, mut s: &str) {
if s.len() <= MAX_LEAF {
if !s.is_empty() {
self.push_leaf(s.to_owned());
}
return;
}
while !s.is_empty() {
let splitpoint = if s.len() > MAX_LEAF { find_leaf_split_for_bulk(s) } else { s.len() };
self.push_leaf(s[..splitpoint].to_owned());
s = &s[splitpoint..];
}
}
/// Push a string on the accumulating tree in an optimized fashion.
///
/// Splits the string into leaves first and
/// then pushes all the leaves onto the accumulating tree in one go.
///
/// Note: this is only used in tests.
#[doc(hidden)]
pub fn push_str_stacked(&mut self, s: &str) {
let leaves = split_as_leaves(s);
self.push_leaves(leaves);
}
}
fn split_as_leaves(mut s: &str) -> Vec<String> {
let mut nodes = Vec::new();
while !s.is_empty() {
let splitpoint = if s.len() > MAX_LEAF { find_leaf_split_for_bulk(s) } else { s.len() };
nodes.push(s[..splitpoint].to_owned());
s = &s[splitpoint..];
}
nodes
}
impl<T: AsRef<str>> From<T> for Rope {
fn from(s: T) -> Rope {
Rope::from_str(s.as_ref()).unwrap()
}
}
impl From<Rope> for String {
// maybe explore grabbing leaf? would require api in tree
fn from(r: Rope) -> String {
String::from(&r)
}
}
impl<'a> From<&'a Rope> for String {
fn from(r: &Rope) -> String {
r.slice_to_cow(..).into_owned()
}
}
impl fmt::Display for Rope {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
for s in self.iter_chunks(..) {
write!(f, "{}", s)?;
}
Ok(())
}
}
impl fmt::Debug for Rope {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if f.alternate() {
write!(f, "{}", String::from(self))
} else {
write!(f, "Rope({:?})", String::from(self))
}
}
}
impl Add<Rope> for Rope {
type Output = Rope;
fn add(self, rhs: Rope) -> Rope {
let mut b = TreeBuilder::new();
b.push(self);
b.push(rhs);
b.build()
}
}
//additional cursor features
impl<'a> Cursor<'a, RopeInfo> {
/// Get previous codepoint before cursor position, and advance cursor backwards.
pub fn prev_codepoint(&mut self) -> Option<char> {
self.prev::<BaseMetric>();
if let Some((l, offset)) = self.get_leaf() {
l[offset..].chars().next()
} else {
None
}
}
/// Get next codepoint after cursor position, and advance cursor.
pub fn next_codepoint(&mut self) -> Option<char> {
if let Some((l, offset)) = self.get_leaf() {
self.next::<BaseMetric>();
l[offset..].chars().next()
} else {
None
}
}
pub fn next_grapheme(&mut self) -> Option<usize> {
let (mut l, mut offset) = self.get_leaf()?;
let mut pos = self.pos();
while offset < l.len() && !l.is_char_boundary(offset) {
pos -= 1;
offset -= 1;
}
let mut leaf_offset = pos - offset;
let mut c = GraphemeCursor::new(pos, self.total_len(), true);
let mut next_boundary = c.next_boundary(&l, leaf_offset);
while let Err(incomp) = next_boundary {
if let GraphemeIncomplete::PreContext(_) = incomp {
let (pl, poffset) = self.prev_leaf()?;
c.provide_context(&pl, self.pos() - poffset);
} else if incomp == GraphemeIncomplete::NextChunk {
self.set(pos);
let (nl, noffset) = self.next_leaf()?;
l = nl;
leaf_offset = self.pos() - noffset;
pos = leaf_offset + nl.len();
} else {
return None;
}
next_boundary = c.next_boundary(&l, leaf_offset);
}
next_boundary.unwrap_or(None)
}
pub fn prev_grapheme(&mut self) -> Option<usize> {
let (mut l, mut offset) = self.get_leaf()?;
let mut pos = self.pos();
while offset < l.len() && !l.is_char_boundary(offset) {
pos += 1;
offset += 1;
}
let mut leaf_offset = pos - offset;
let mut c = GraphemeCursor::new(pos, l.len() + leaf_offset, true);
let mut prev_boundary = c.prev_boundary(&l, leaf_offset);
while let Err(incomp) = prev_boundary {
if let GraphemeIncomplete::PreContext(_) = incomp {
let (pl, poffset) = self.prev_leaf()?;
c.provide_context(&pl, self.pos() - poffset);
} else if incomp == GraphemeIncomplete::PrevChunk {
self.set(pos);
let (pl, poffset) = self.prev_leaf()?;
l = pl;
leaf_offset = self.pos() - poffset;
pos = leaf_offset + pl.len();
} else {
return None;
}
prev_boundary = c.prev_boundary(&l, leaf_offset);
}
prev_boundary.unwrap_or(None)
}
}
// line iterators
pub struct LinesRaw<'a> {
inner: ChunkIter<'a>,
fragment: &'a str,
}
fn cow_append<'a>(a: Cow<'a, str>, b: &'a str) -> Cow<'a, str> {
if a.is_empty() {
Cow::from(b)
} else {
Cow::from(a.into_owned() + b)
}
}
impl<'a> Iterator for LinesRaw<'a> {
type Item = Cow<'a, str>;
fn next(&mut self) -> Option<Cow<'a, str>> {
let mut result = Cow::from("");
loop {
if self.fragment.is_empty() {
match self.inner.next() {
Some(chunk) => self.fragment = chunk,
None => return if result.is_empty() { None } else { Some(result) },
}
if self.fragment.is_empty() {
// can only happen on empty input
return None;
}
}
match memchr(b'\n', self.fragment.as_bytes()) {
Some(i) => {
result = cow_append(result, &self.fragment[..=i]);
self.fragment = &self.fragment[i + 1..];
return Some(result);
}
None => {
result = cow_append(result, self.fragment);
self.fragment = "";
}
}
}
}
}
pub struct Lines<'a> {
inner: LinesRaw<'a>,
}
impl<'a> Iterator for Lines<'a> {
type Item = Cow<'a, str>;
fn next(&mut self) -> Option<Cow<'a, str>> {
match self.inner.next() {
Some(Cow::Borrowed(mut s)) => {
if s.ends_with('\n') {
s = &s[..s.len() - 1];
if s.ends_with('\r') {
s = &s[..s.len() - 1];
}
}
Some(Cow::from(s))
}
Some(Cow::Owned(mut s)) => {
if s.ends_with('\n') {
let _ = s.pop();
if s.ends_with('\r') {
let _ = s.pop();
}
}
Some(Cow::from(s))
}
None => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use serde_test::{assert_tokens, Token};
#[test]
fn replace_small() {
let mut a = Rope::from("hello world");
a.edit(1..9, "era");
assert_eq!("herald", String::from(a));
}
#[test]
fn lines_raw_small() {
let a = Rope::from("a\nb\nc");
assert_eq!(vec!["a\n", "b\n", "c"], a.lines_raw(..).collect::<Vec<_>>());
assert_eq!(vec!["a\n", "b\n", "c"], a.lines_raw(..).collect::<Vec<_>>());
let a = Rope::from("a\nb\n");
assert_eq!(vec!["a\n", "b\n"], a.lines_raw(..).collect::<Vec<_>>());
let a = Rope::from("\n");
assert_eq!(vec!["\n"], a.lines_raw(..).collect::<Vec<_>>());
let a = Rope::from("");
assert_eq!(0, a.lines_raw(..).count());
}
#[test]
fn lines_small() {
let a = Rope::from("a\nb\nc");
assert_eq!(vec!["a", "b", "c"], a.lines(..).collect::<Vec<_>>());
assert_eq!(String::from(&a).lines().collect::<Vec<_>>(), a.lines(..).collect::<Vec<_>>());
let a = Rope::from("a\nb\n");
assert_eq!(vec!["a", "b"], a.lines(..).collect::<Vec<_>>());
assert_eq!(String::from(&a).lines().collect::<Vec<_>>(), a.lines(..).collect::<Vec<_>>());
let a = Rope::from("\n");
assert_eq!(vec![""], a.lines(..).collect::<Vec<_>>());
assert_eq!(String::from(&a).lines().collect::<Vec<_>>(), a.lines(..).collect::<Vec<_>>());
let a = Rope::from("");
assert_eq!(0, a.lines(..).count());
assert_eq!(String::from(&a).lines().collect::<Vec<_>>(), a.lines(..).collect::<Vec<_>>());
let a = Rope::from("a\r\nb\r\nc");
assert_eq!(vec!["a", "b", "c"], a.lines(..).collect::<Vec<_>>());
assert_eq!(String::from(&a).lines().collect::<Vec<_>>(), a.lines(..).collect::<Vec<_>>());
let a = Rope::from("a\rb\rc");
assert_eq!(vec!["a\rb\rc"], a.lines(..).collect::<Vec<_>>());
assert_eq!(String::from(&a).lines().collect::<Vec<_>>(), a.lines(..).collect::<Vec<_>>());
}
#[test]
fn lines_med() {
let mut a = String::new();
let mut b = String::new();
let line_len = MAX_LEAF + MIN_LEAF - 1;
for _ in 0..line_len {
a.push('a');
b.push('b');
}
a.push('\n');
b.push('\n');
let r = Rope::from(&a[..MAX_LEAF]);
let r = r + Rope::from(String::from(&a[MAX_LEAF..]) + &b[..MIN_LEAF]);
let r = r + Rope::from(&b[MIN_LEAF..]);
//println!("{:?}", r.iter_chunks().collect::<Vec<_>>());
assert_eq!(vec![a.as_str(), b.as_str()], r.lines_raw(..).collect::<Vec<_>>());
assert_eq!(vec![&a[..line_len], &b[..line_len]], r.lines(..).collect::<Vec<_>>());
assert_eq!(String::from(&r).lines().collect::<Vec<_>>(), r.lines(..).collect::<Vec<_>>());
// additional tests for line indexing
assert_eq!(a.len(), r.offset_of_line(1));
assert_eq!(r.len(), r.offset_of_line(2));
assert_eq!(0, r.line_of_offset(a.len() - 1));
assert_eq!(1, r.line_of_offset(a.len()));
assert_eq!(1, r.line_of_offset(r.len() - 1));
assert_eq!(2, r.line_of_offset(r.len()));
}
#[test]
fn append_large() {
let mut a = Rope::from("");
let mut b = String::new();
for i in 0..5_000 {
let c = i.to_string() + "\n";
b.push_str(&c);
a = a + Rope::from(&c);
}
assert_eq!(b, String::from(a));
}
#[test]
fn prev_codepoint_offset_small() {
let a = Rope::from("a\u{00A1}\u{4E00}\u{1F4A9}");
assert_eq!(Some(6), a.prev_codepoint_offset(10));
assert_eq!(Some(3), a.prev_codepoint_offset(6));
assert_eq!(Some(1), a.prev_codepoint_offset(3));
assert_eq!(Some(0), a.prev_codepoint_offset(1));
assert_eq!(None, a.prev_codepoint_offset(0));
let b = a.slice(1..10);
assert_eq!(Some(5), b.prev_codepoint_offset(9));
assert_eq!(Some(2), b.prev_codepoint_offset(5));
assert_eq!(Some(0), b.prev_codepoint_offset(2));
assert_eq!(None, b.prev_codepoint_offset(0));
}
#[test]
fn next_codepoint_offset_small() {
let a = Rope::from("a\u{00A1}\u{4E00}\u{1F4A9}");
assert_eq!(Some(10), a.next_codepoint_offset(6));
assert_eq!(Some(6), a.next_codepoint_offset(3));
assert_eq!(Some(3), a.next_codepoint_offset(1));
assert_eq!(Some(1), a.next_codepoint_offset(0));
assert_eq!(None, a.next_codepoint_offset(10));
let b = a.slice(1..10);
assert_eq!(Some(9), b.next_codepoint_offset(5));
assert_eq!(Some(5), b.next_codepoint_offset(2));
assert_eq!(Some(2), b.next_codepoint_offset(0));
assert_eq!(None, b.next_codepoint_offset(9));
}
#[test]
fn prev_grapheme_offset() {
// A with ring, hangul, regional indicator "US"
let a = Rope::from("A\u{030a}\u{110b}\u{1161}\u{1f1fa}\u{1f1f8}");
assert_eq!(Some(9), a.prev_grapheme_offset(17));
assert_eq!(Some(3), a.prev_grapheme_offset(9));
assert_eq!(Some(0), a.prev_grapheme_offset(3));
assert_eq!(None, a.prev_grapheme_offset(0));
}
#[test]
fn next_grapheme_offset() {
// A with ring, hangul, regional indicator "US"
let a = Rope::from("A\u{030a}\u{110b}\u{1161}\u{1f1fa}\u{1f1f8}");
assert_eq!(Some(3), a.next_grapheme_offset(0));
assert_eq!(Some(9), a.next_grapheme_offset(3));
assert_eq!(Some(17), a.next_grapheme_offset(9));
assert_eq!(None, a.next_grapheme_offset(17));
}
#[test]
fn next_grapheme_offset_with_ris_of_leaf_boundaries() {
let s1 = "\u{1f1fa}\u{1f1f8}".repeat(100);
let a = Rope::concat(
Rope::from(s1.clone()),
Rope::concat(
Rope::from(String::from(s1.clone()) + "\u{1f1fa}"),
Rope::from(s1.clone()),
),
);
for i in 1..(s1.len() * 3) {
assert_eq!(Some((i - 1) / 8 * 8), a.prev_grapheme_offset(i));
assert_eq!(Some(i / 8 * 8 + 8), a.next_grapheme_offset(i));
}
for i in (s1.len() * 3 + 1)..(s1.len() * 3 + 4) {
assert_eq!(Some(s1.len() * 3), a.prev_grapheme_offset(i));
assert_eq!(Some(s1.len() * 3 + 4), a.next_grapheme_offset(i));
}
assert_eq!(None, a.prev_grapheme_offset(0));
assert_eq!(Some(8), a.next_grapheme_offset(0));
assert_eq!(Some(s1.len() * 3), a.prev_grapheme_offset(s1.len() * 3 + 4));
assert_eq!(None, a.next_grapheme_offset(s1.len() * 3 + 4));
}
#[test]
fn test_ser_de() {
let rope = Rope::from("a\u{00A1}\u{4E00}\u{1F4A9}");
assert_tokens(&rope, &[Token::Str("a\u{00A1}\u{4E00}\u{1F4A9}")]);
assert_tokens(&rope, &[Token::String("a\u{00A1}\u{4E00}\u{1F4A9}")]);
assert_tokens(&rope, &[Token::BorrowedStr("a\u{00A1}\u{4E00}\u{1F4A9}")]);
}
#[test]
fn line_of_offset_small() {
let a = Rope::from("a\nb\nc");
assert_eq!(0, a.line_of_offset(0));
assert_eq!(0, a.line_of_offset(1));
assert_eq!(1, a.line_of_offset(2));
assert_eq!(1, a.line_of_offset(3));
assert_eq!(2, a.line_of_offset(4));
assert_eq!(2, a.line_of_offset(5));
let b = a.slice(2..4);
assert_eq!(0, b.line_of_offset(0));
assert_eq!(0, b.line_of_offset(1));
assert_eq!(1, b.line_of_offset(2));
}
#[test]
fn offset_of_line_small() {
let a = Rope::from("a\nb\nc");
assert_eq!(0, a.offset_of_line(0));
assert_eq!(2, a.offset_of_line(1));
assert_eq!(4, a.offset_of_line(2));
assert_eq!(5, a.offset_of_line(3));
let b = a.slice(2..4);
assert_eq!(0, b.offset_of_line(0));
assert_eq!(2, b.offset_of_line(1));
}
#[test]
fn eq_small() {
let a = Rope::from("a");
let a2 = Rope::from("a");
let b = Rope::from("b");
let empty = Rope::from("");
assert!(a == a2);
assert!(a != b);
assert!(a != empty);
assert!(empty == empty);
assert!(a.slice(0..0) == empty);
}
#[test]
fn eq_med() {
let mut a = String::new();
let mut b = String::new();
let line_len = MAX_LEAF + MIN_LEAF - 1;
for _ in 0..line_len {
a.push('a');
b.push('b');
}
a.push('\n');
b.push('\n');
let r = Rope::from(&a[..MAX_LEAF]);
let r = r + Rope::from(String::from(&a[MAX_LEAF..]) + &b[..MIN_LEAF]);
let r = r + Rope::from(&b[MIN_LEAF..]);
let a_rope = Rope::from(&a);
let b_rope = Rope::from(&b);
assert!(r != a_rope);
assert!(r.clone().slice(..a.len()) == a_rope);
assert!(r.clone().slice(a.len()..) == b_rope);
assert!(r == a_rope.clone() + b_rope.clone());
assert!(r != b_rope + a_rope);
}
#[test]
fn line_offsets() {
let rope = Rope::from("hi\ni'm\nfour\nlines");
assert_eq!(rope.offset_of_line(0), 0);
assert_eq!(rope.offset_of_line(1), 3);
assert_eq!(rope.line_of_offset(0), 0);
assert_eq!(rope.line_of_offset(3), 1);
// interior of first line should be first line
assert_eq!(rope.line_of_offset(1), 0);
// interior of last line should be last line
assert_eq!(rope.line_of_offset(15), 3);
assert_eq!(rope.offset_of_line(4), rope.len());
}
#[test]
#[should_panic]
fn line_of_offset_panic() {
let rope = Rope::from("hi\ni'm\nfour\nlines");
rope.line_of_offset(20);
}
#[test]
#[should_panic]
fn offset_of_line_panic() {
let rope = Rope::from("hi\ni'm\nfour\nlines");
rope.offset_of_line(5);
}
#[test]
fn utf16_code_units_metric() {
let rope = Rope::from("hi\ni'm\nfour\nlines");
let utf16_units = rope.measure::<Utf16CodeUnitsMetric>();
assert_eq!(utf16_units, 17);
// position after 'f' in four
let utf8_offset = 9;
let utf16_units = rope.convert_metrics::<BaseMetric, Utf16CodeUnitsMetric>(utf8_offset);
assert_eq!(utf16_units, 9);
let utf8_offset = rope.convert_metrics::<Utf16CodeUnitsMetric, BaseMetric>(utf16_units);
assert_eq!(utf8_offset, 9);
let rope_with_emoji = Rope::from("hi\ni'm\n😀 four\nlines");
let utf16_units = rope_with_emoji.measure::<Utf16CodeUnitsMetric>();
assert_eq!(utf16_units, 20);
// position after 'f' in four
let utf8_offset = 13;
let utf16_units =
rope_with_emoji.convert_metrics::<BaseMetric, Utf16CodeUnitsMetric>(utf8_offset);
assert_eq!(utf16_units, 11);
let utf8_offset =
rope_with_emoji.convert_metrics::<Utf16CodeUnitsMetric, BaseMetric>(utf16_units);
assert_eq!(utf8_offset, 13);
//for next line
let utf8_offset = 19;
let utf16_units =
rope_with_emoji.convert_metrics::<BaseMetric, Utf16CodeUnitsMetric>(utf8_offset);
assert_eq!(utf16_units, 17);
let utf8_offset =
rope_with_emoji.convert_metrics::<Utf16CodeUnitsMetric, BaseMetric>(utf16_units);
assert_eq!(utf8_offset, 19);
}
#[test]
fn slice_to_cow_small_string() {
let short_text = "hi, i'm a small piece of text.";
let rope = Rope::from(short_text);
let cow = rope.slice_to_cow(..);
assert!(short_text.len() <= 1024);
assert_eq!(cow, Cow::Borrowed(short_text) as Cow<str>);
}
#[test]
fn slice_to_cow_long_string_long_slice() {
// 32 char long string, repeat it 33 times so it is longer than 1024 bytes
let long_text =
"1234567812345678123456781234567812345678123456781234567812345678".repeat(33);
let rope = Rope::from(&long_text);
let cow = rope.slice_to_cow(..);
assert!(long_text.len() > 1024);
assert_eq!(cow, Cow::Owned(long_text) as Cow<str>);
}
#[test]
fn slice_to_cow_long_string_short_slice() {
// 32 char long string, repeat it 33 times so it is longer than 1024 bytes
let long_text =
"1234567812345678123456781234567812345678123456781234567812345678".repeat(33);
let rope = Rope::from(&long_text);
let cow = rope.slice_to_cow(..500);
assert!(long_text.len() > 1024);
assert_eq!(cow, Cow::Borrowed(&long_text[..500]));
}
}