Fix line endings around element breaks in text

Closes GH-3.
syntax-tree · Jan 5, 2023 · 387eff4
1 parent 4de6078
commit 387eff4
Show file tree

Hide file tree

Showing 2 changed files with 141 additions and 30 deletions.
diff --git a/lib/index.js b/lib/index.js
@@ -14,12 +14,16 @@
  *   Any parent.
  * @typedef {'normal' | 'pre' | 'nowrap' | 'pre-wrap'} Whitespace
  *   Valid and useful whitespace values (from CSS).
- * @typedef {boolean} BreakValue
- *   Whether there was a break.
- * @typedef {1 | 2} BreakNumber
- *   Specific break.
+ * @typedef {0 | 1 | 2} BreakNumber
+ *   Specific break:
+ *
+ *   *   `0` — space
+ *   *   `1` — line ending
+ *   *   `2` — blank line
  * @typedef {'\n'} BreakForce
  *   Forced break.
+ * @typedef {boolean} BreakValue
+ *   Whether there was a break.
  * @typedef {BreakValue | BreakNumber | undefined} BreakBefore
  *   Any value for a break before.
  * @typedef {BreakValue | BreakNumber | BreakForce | undefined} BreakAfter
@@ -156,6 +160,9 @@ export function toText(tree, options = {}) {
     breakAfter: false
   })
 
+  /** @type {Array<string | BreakNumber>} */
+  const results = []
+
   // Treat `text` and `comment` as having normal white-space.
   // This deviates from the spec as in the DOM the node’s `.data` has to be
   // returned.
@@ -165,7 +172,13 @@ export function toText(tree, options = {}) {
   // Nodes without children are treated as a void element, so `doctype` is thus
   // ignored.
   if (tree.type === 'text' || tree.type === 'comment') {
-    return collectText(tree, {whitespace, breakBefore: true, breakAfter: true})
+    results.push(
+      ...collectText(tree, {
+        whitespace,
+        breakBefore: true,
+        breakAfter: true
+      })
+    )
   }
 
   // 1.  If this element is not being rendered, or if the user agent is a
@@ -179,8 +192,6 @@ export function toText(tree, options = {}) {
   //     Important: we’ll have to account for this later though.
 
   // 2.  Let results be a new empty list.
-  /** @type {Array<string | BreakNumber>} */
-  let results = []
   let index = -1
 
   // 3.  For each child node node of this element:
@@ -190,9 +201,9 @@ export function toText(tree, options = {}) {
     //      Each item in results will either be a JavaScript string or a
     //      positive integer (a required line break count).
     // 3.2. For each item item in current, append item to results.
-    results = results.concat(
+    results.push(
       // @ts-expect-error Looks like a parent.
-      innerTextCollection(children[index], tree, {
+      ...innerTextCollection(children[index], tree, {
         whitespace,
         breakBefore: index ? undefined : block,
         breakAfter:
@@ -221,8 +232,11 @@ export function toText(tree, options = {}) {
     if (typeof value === 'number') {
       if (count !== undefined && value > count) count = value
     } else if (value) {
-      if (count) result.push('\n'.repeat(count))
-      count = 0
+      if (count !== undefined && count > -1) {
+        result.push('\n'.repeat(count) || ' ')
+      }
+
+      count = -1
       result.push(value)
     }
   }
@@ -245,11 +259,9 @@ function innerTextCollection(node, parent, info) {
   }
 
   if (node.type === 'text') {
-    return [
-      info.whitespace === 'normal'
-        ? collectText(node, info)
-        : collectPreText(node)
-    ]
+    return info.whitespace === 'normal'
+      ? collectText(node, info)
+      : collectPreText(node)
   }
 
   return []
@@ -259,8 +271,11 @@ function innerTextCollection(node, parent, info) {
  * Collect an element.
  *
  * @param {Element} node
+ *   Element node.
  * @param {Parent} parent
  * @param {CollectionInfo} info
+ *   Info on current collection.
+ * @returns {Array<string | BreakNumber>}
  */
 function collectElement(node, parent, info) {
   // First we infer the `white-space` property.
@@ -376,18 +391,21 @@ function collectElement(node, parent, info) {
  * See: <https://drafts.csswg.org/css-text/#white-space-phase-1>
  *
  * @param {Text | Comment} node
+ *   Text node.
  * @param {CollectionInfo} info
- * @returns {string}
+ *   Info on current collection.
+ * @returns {Array<string | BreakNumber>}
+ *   Result.
  */
 function collectText(node, info) {
   const value = String(node.value)
   /** @type {Array<string>} */
   const lines = []
-  /** @type {Array<string>} */
+  /** @type {Array<string | BreakNumber>} */
   const result = []
   let start = 0
 
-  while (start < value.length) {
+  while (start <= value.length) {
     searchLineFeeds.lastIndex = start
 
     const match = searchLineFeeds.exec(value)
@@ -397,14 +415,14 @@ function collectText(node, info) {
       // Any sequence of collapsible spaces and tabs immediately preceding or
       // following a segment break is removed.
       trimAndCollapseSpacesAndTabs(
-        // [...] ignoring bidi formatting characters (characters with the
+        // […] ignoring bidi formatting characters (characters with the
         // Bidi_Control property [UAX9]: ALM, LTR, RTL, LRE-RLO, LRI-PDI) as if
         // they were not there.
         value
           .slice(start, end)
           .replace(/[\u061C\u200E\u200F\u202A-\u202E\u2066-\u2069]/g, ''),
-        info.breakBefore,
-        info.breakAfter
+        start === 0 ? info.breakBefore : true,
+        end === value.length ? info.breakAfter : true
       )
     )
 
@@ -417,7 +435,8 @@ function collectText(node, info) {
   // Any collapsible segment break immediately following another collapsible
   // segment break is removed
   let index = -1
-  let join = ''
+  /** @type {BreakNumber | undefined} */
+  let join
 
   while (++index < lines.length) {
     // *   If the character immediately before or immediately after the segment
@@ -429,7 +448,7 @@ function collectText(node, info) {
         lines[index + 1].charCodeAt(0) === 0x200b) /* ZWSP */
     ) {
       result.push(lines[index])
-      join = ''
+      join = undefined
     }
 
     // *   Otherwise, if the East Asian Width property [UAX11] of both the
@@ -449,21 +468,30 @@ function collectText(node, info) {
 
     // *   Otherwise, the segment break is converted to a space (U+0020).
     else if (lines[index]) {
-      if (join) result.push(join)
+      if (typeof join === 'number') result.push(join)
       result.push(lines[index])
-      join = ' '
+      join = 0
+    } else if (index === 0 || index === lines.length - 1) {
+      // If this line is empty, and it’s the first or last, add a space.
+      // Note that this function is only called in normal whitespace, so we
+      // don’t worry about `pre`.
+      result.push(0)
     }
   }
 
-  return result.join('')
+  return result
 }
 
 /**
- * @param {Text | Comment} node
- * @returns {string}
+ * Collect a text node as “pre” whitespace.
+ *
+ * @param {Text} node
+ *   Text node.
+ * @returns {Array<string | BreakNumber>}
+ *   Result.
  */
 function collectPreText(node) {
-  return String(node.value)
+  return [String(node.value)]
 }
 
 /**
@@ -475,9 +503,13 @@ function collectPreText(node) {
  *     but retains its soft wrap opportunity, if any.)
  *
  * @param {string} value
+ *   Value to collapse.
  * @param {BreakBefore} breakBefore
+ *   Whether there was a break before.
  * @param {BreakAfter} breakAfter
+ *   Whether there was a break after.
  * @returns {string}
+ *   Result.
  */
 function trimAndCollapseSpacesAndTabs(value, breakBefore, breakAfter) {
   /** @type {Array<string>} */
@@ -515,11 +547,16 @@ function trimAndCollapseSpacesAndTabs(value, breakBefore, breakAfter) {
 }
 
 /**
+ * Figure out the whitespace of a node.
+ *
  * We don’t support void elements here (so `nobr wbr` -> `normal` is ignored).
  *
  * @param {Node} node
+ *   Node (typically `Element`).
  * @param {CollectionInfo} info
+ *   Info on current collection.
  * @returns {Whitespace}
+ *   Applied whitespace.
  */
 function inferWhitespace(node, info) {
   if (node.type === 'element') {

diff --git a/test.js b/test.js
@@ -319,3 +319,77 @@ test('non-normal white-space', () => {
     'should support a `textarea` element'
   )
 })
+
+test('more whitespace', () => {
+  assert.equal(
+    toText(h('p', ['A\n', h('span', 'b')])),
+    'A b',
+    'should support line endings around element breaks (1)'
+  )
+
+  assert.equal(
+    toText(h('p', ['A\nb', h('span', 'c')])),
+    'A bc',
+    'should support line endings around element breaks (2)'
+  )
+
+  assert.equal(
+    toText(h('p', ['A', h('span', '\nb')])),
+    'A b',
+    'should support line endings around element breaks (3)'
+  )
+
+  assert.equal(
+    toText(h('p', ['A\n', h('span', '\nb')])),
+    'A b',
+    'should support line endings around element breaks (4)'
+  )
+
+  assert.equal(
+    toText(h('p', [h('span', 'A\n'), h('span', 'b')])),
+    'A b',
+    'should support line endings around element breaks (5)'
+  )
+
+  assert.equal(
+    toText(h('p', [h('span', 'A'), h('span', '\nb')])),
+    'A b',
+    'should support line endings around element breaks (6)'
+  )
+
+  assert.equal(
+    toText(h('p', [h('span', 'A\n'), h('span', '\nb')])),
+    'A b',
+    'should support line endings around element breaks (7)'
+  )
+
+  assert.equal(
+    toText(h('p', [h('span', 'A\n'), 'b'])),
+    'A b',
+    'should support line endings around element breaks (8)'
+  )
+
+  assert.equal(
+    toText(h('p', [h('span', 'A'), '\nb'])),
+    'A b',
+    'should support line endings around element breaks (9)'
+  )
+
+  assert.equal(
+    toText(h('p', [h('span', 'A\n'), '\nb'])),
+    'A b',
+    'should support line endings around element breaks (10)'
+  )
+
+  assert.equal(
+    toText(h('div', [h('p', [h('span', 'A\n'), '\nb'])])),
+    'A b',
+    'should support line endings around element breaks (11)'
+  )
+
+  assert.equal(
+    toText(h('pre', ['A\n', h('span', 'b')])),
+    'A\nb',
+    'should support line endings around element breaks (12)'
+  )
+})