Merge branch 'main' of github.com:taoqf/node-html-parser; branch 'fix-134' of https://github.com/nonara/node-html-parser into nonara-fix-134
taoqf · Jul 5, 2021 · 4f13096 · 4f13096
2 parents cb11eab + e728280
commit 4f13096
Show file tree

Hide file tree

Showing 4 changed files with 95 additions and 76 deletions.
diff --git a/src/nodes/html.ts b/src/nodes/html.ts
@@ -458,7 +458,7 @@ export default class HTMLElement extends Node {
 				if ((node as TextNode).isWhitespace) {
 					return;
 				}
-				node.rawText = (<TextNode>node).trimmedText;
+				node.rawText = (<TextNode>node).trimmedRawText;
 			} else if (node.nodeType === NodeType.ELEMENT_NODE) {
 				(node as HTMLElement).removeWhitespace();
 			}

diff --git a/src/nodes/text.ts b/src/nodes/text.ts
@@ -1,14 +1,16 @@
-import NodeType from './type';
-import Node from './node';
+import { decode } from 'he';
 import HTMLElement from './html';
+import Node from './node';
+import NodeType from './type';
 
 /**
  * TextNode to contain a text element in DOM tree.
  * @param {string} value [description]
  */
 export default class TextNode extends Node {
-	public constructor(public rawText: string, parentNode: HTMLElement) {
+	public constructor(rawText: string, parentNode: HTMLElement) {
 		super(parentNode);
+		this._rawText = rawText;
 	}
 
 	/**
@@ -17,42 +19,38 @@ export default class TextNode extends Node {
 	 */
 	public nodeType = NodeType.TEXT_NODE;
 
+	private _rawText: string;
+	private _trimmedRawText?: string;
 	private _trimmedText?: string;
 
+	public get rawText() {
+		return this._rawText;
+	}
+
+	/**
+	 * Set rawText and invalidate trimmed caches
+	 */
+	public set rawText(text: string) {
+		this._rawText = text;
+		this._trimmedRawText = void 0;
+		this._trimmedText = void 0;
+	}
+
+	/**
+	 * Returns raw text with all whitespace trimmed except single leading/trailing non-breaking space
+	 */
+	public get trimmedRawText() {
+		if (this._trimmedRawText !== undefined) return this._trimmedRawText;
+		this._trimmedRawText = trimText(this.rawText);
+		return this._trimmedRawText;
+	}
+
 	/**
 	 * Returns text with all whitespace trimmed except single leading/trailing non-breaking space
 	 */
 	public get trimmedText() {
 		if (this._trimmedText !== undefined) return this._trimmedText;
-
-		const text = this.rawText;
-		let i = 0;
-		let startPos;
-		let endPos;
-
-		while (i >= 0 && i < text.length) {
-			if (/\S/.test(text[i])) {
-				if (startPos === undefined) {
-					startPos = i;
-					i = text.length;
-				} else {
-					endPos = i;
-					i = void 0;
-				}
-			}
-
-			if (startPos === undefined) i++;
-			else i--;
-		}
-
-		if (startPos === undefined) startPos = 0;
-		if (endPos === undefined) endPos = text.length - 1;
-
-		const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos - 1]);
-		const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos + 1]);
-
-		this._trimmedText = (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
-
+		this._trimmedText = trimText(this.text);
 		return this._trimmedText;
 	}
 
@@ -61,18 +59,50 @@ export default class TextNode extends Node {
 	 * @return {string} text content
 	 */
 	public get text() {
-		return this.rawText;
+		return decode(this.rawText);
 	}
 
 	/**
 	 * Detect if the node contains only white space.
-	 * @return {bool}
+	 * @return {boolean}
 	 */
 	public get isWhitespace() {
 		return /^(\s|&nbsp;)*$/.test(this.rawText);
 	}
 
 	public toString() {
-		return this.text;
+		return this.rawText;
+	}
+}
+
+/**
+ * Trim whitespace except single leading/trailing non-breaking space
+ */
+function trimText(text: string): string {
+	let i = 0;
+	let startPos;
+	let endPos;
+
+	while (i >= 0 && i < text.length) {
+		if (/\S/.test(text[i])) {
+			if (startPos === undefined) {
+				startPos = i;
+				i = text.length;
+			} else {
+				endPos = i;
+				i = void 0;
+			}
+		}
+
+		if (startPos === undefined) i++;
+		else i--;
 	}
+
+	if (startPos === undefined) startPos = 0;
+	if (endPos === undefined) endPos = text.length - 1;
+
+	const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos - 1]);
+	const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos + 1]);
+
+	return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
 }
diff --git a/test/135.js b/test/135.js
@@ -38,21 +38,4 @@ describe('pr 135', function () {
 		const textNode = pNode.firstChild;
 		textNode.textContent.should.eql(decodedText);
 	});
-
-	it('should remove whitespaces while preserving nodes with content', function () {
-		const root = parse('<p> \r \n  \t <h5>  123&nbsp;  </h5></p>');
-
-		const textNode = new TextNode('  123&nbsp;  ');
-		textNode.rawText = textNode.trimmedText;
-		textNode.rawText.should.eql(' 123&nbsp; ');
-
-		const p = new HTMLElement('p', {}, '', root);
-		p
-			.appendChild(new HTMLElement('h5', {}, ''))
-			.appendChild(textNode);
-
-		p.toString().should.eql('<p><h5> 123&nbsp; </h5></p>');
-		root.firstChild.removeWhitespace().toString().should.eql('<p><h5> 123&nbsp; </h5></p>');
-		root.firstChild.removeWhitespace().should.eql(p);
-	})
 });
diff --git a/test/html.js b/test/html.js
@@ -197,11 +197,11 @@ describe('HTML Parser', function () {
 
 		describe('#removeWhitespace()', function () {
 			it('should remove whitespaces while preserving nodes with content', function () {
-				const root = parseHTML('<p> \r \n  \t <h5>  123  </h5></p>');
+				const root = parseHTML('<p> \r \n  \t <h5>  123&nbsp;  </h5></p>');
 
-				const textNode = new TextNode('  123  ');
-				textNode.rawText = textNode.trimmedText;
-				textNode.rawText.should.eql(' 123 ');
+				const textNode = new TextNode('  123&nbsp;  ');
+				textNode.rawText = textNode.trimmedRawText;
+				textNode.rawText.should.eql(' 123&nbsp; ');
 
 				const p = new HTMLElement('p', {}, '', root);
 				p
@@ -414,26 +414,32 @@ describe('HTML Parser', function () {
 				root.firstChild.getAttribute('alt').should.eql('«Sogno');
 				root.firstChild.rawAttributes.alt.should.eql('&laquo;Sogno');
 			});
-			it('shoud not decode text', function () {
+
+			it('should not decode text from parseHTML()', function () {
 				// https://github.com/taoqf/node-html-parser/issues/33
-				const root = parseHTML(`<html>
-<body>
-<div id='source'>
-&lt;p&gt;
-This content should be enclosed within an escaped p tag&lt;br /&gt;
-&lt;/p&gt;
-</div>
-</body>
-</html>`)
-				root.toString().should.eql(`<html>
-<body>
-<div id='source'>
-&lt;p&gt;
-This content should be enclosed within an escaped p tag&lt;br /&gt;
-&lt;/p&gt;
-</div>
-</body>
-</html>`);
+				const content = `&lt;p&gt; Not a p tag &lt;br /&gt; at all`;
+				const root = parseHTML(`<div>${content}</div>`);
+				root.childNodes.should.have.length(1);
+
+				const divNode = root.firstChild;
+				divNode.childNodes.should.have.length(1);
+
+				const textNode = divNode.firstChild;
+				textNode.rawText.should.eql(content);
+			});
+
+			it(`should decode for node text property`, function () {
+				const encodedText = `My&gt;text`;
+				const decodedText = `My>text`;
+				const root = parseHTML(`<p>${encodedText}</p>`);
+
+				const pNode = root.firstChild;
+				pNode.text.should.eql(decodedText);
+				pNode.rawText.should.eql(encodedText);
+
+				const textNode = pNode.firstChild;
+				textNode.text.should.eql(decodedText);
+				textNode.rawText.should.eql(encodedText);
 			});
 		});