decode textContent #135

taoqf · Jul 2, 2021 · 840ffda · nonara · Jul 2, 2021 · nonara
1 parent e35e4ff
commit 840ffda
Show file tree

Hide file tree

Showing 6 changed files with 176 additions and 7 deletions.
diff --git a/src/nodes/html.ts b/src/nodes/html.ts
@@ -276,7 +276,7 @@ export default class HTMLElement extends Node {
 		}, '');
 	}
 	public get textContent() {
-		return this.rawText;
+		return decode(this.rawText); // presumably `decode` from `he` (import not visible in this hunk); returns the raw text with entities decoded
 	}
 	public set textContent(val: string) {
 		const content = [new TextNode(val, this)];

diff --git a/src/nodes/node.ts b/src/nodes/node.ts
@@ -1,3 +1,4 @@
+import { decode } from 'he';
 import NodeType from './type';
 import HTMLElement from './html';
 
@@ -17,9 +18,10 @@ export default abstract class Node {
 		return this.rawText;
 	}
 	public get textContent() {
-		return this.rawText;
+		return decode(this.rawText); // `decode` is imported from `he`; returns the raw text with entities decoded
 	}
 	public set textContent(val: string) {
+		// Stored verbatim as raw text; unlike the getter, no entity decoding or encoding happens here.
 		this.rawText = val;
 	}
 }
diff --git a/src/nodes/text.1.ts b/src/nodes/text.1.ts
@@ -0,0 +1,110 @@
+import { decode } from 'he';
+import NodeType from './type';
+import Node from './node';
+import HTMLElement from './html';
+
+/**
+ * TextNode to contain a text element in DOM tree.
+ * Keeps the raw text and lazily caches its trimmed variants.
+ */
+export default class TextNode extends Node {
+	/** Node Type declaration. */
+	public nodeType = NodeType.TEXT_NODE;
+
+	private _rawText: string;
+	private _trimmedRawText?: string;
+	private _trimmedText?: string;
+
+	/**
+	 * @param rawText text stored verbatim as this node's raw text
+	 * @param parentNode forwarded to the `Node` base constructor
+	 */
+	public constructor(rawText: string, parentNode: HTMLElement) {
+		super(parentNode);
+		this._rawText = rawText;
+	}
+
+	public get rawText() {
+		return this._rawText;
+	}
+
+	/** Replace the raw text and drop both cached trimmed values. */
+	public set rawText(text: string) {
+		this._trimmedText = undefined;
+		this._trimmedRawText = undefined;
+		this._rawText = text;
+	}
+
+	/** Raw text run through `trimText`; computed on first access, then cached. */
+	public get trimmedRawText() {
+		if (this._trimmedRawText === undefined) {
+			this._trimmedRawText = trimText(this.rawText);
+		}
+		return this._trimmedRawText;
+	}
+
+	/**
+	 * Decoded text run through `trimText`; computed on first access, then cached.
+	 */
+	public get trimmedText() {
+		if (this._trimmedText === undefined) {
+			this._trimmedText = trimText(this.text);
+		}
+		return this._trimmedText;
+	}
+
+	/**
+	 * Raw text passed through `decode` from `he`.
+	 */
+	public get text() {
+		const raw = this.rawText;
+		return decode(raw);
+	}
+
+	/**
+	 * Detect if the node contains only white space (`&nbsp;` entities count as white space).
+	 */
+	public get isWhitespace() {
+		const blankPattern = /^(\s|&nbsp;)*$/;
+		return blankPattern.test(this.rawText);
+	}
+
+	/**
+	 * Serialize the node as its raw text, unchanged.
+	 */
+	public toString() {
+		return this.rawText;
+	}
+}
+
+/**
+ * Trim surrounding whitespace from `text`, keeping at most one plain space per side.
+ *
+ * A single ' ' is kept before (after) the content only when the character directly
+ * next to the first (last) non-whitespace character is whitespace other than CR or LF.
+ * Empty or whitespace-only input is returned unchanged.
+ *
+ * @param text string to trim
+ * @returns the trimmed string
+ */
+function trimText(text: string): string {
+	const nonWhitespace = /\S/;
+	// Any whitespace character except carriage return and line feed.
+	const inlineWhitespace = /[^\S\r\n]/;
+
+	// Index of the first non-whitespace character; -1 when there is none.
+	const startPos = text.search(nonWhitespace);
+
+	// No content to trim around: hand the input back untouched.
+	if (startPos === -1) return text;
+
+	// Scan backwards for the last non-whitespace character. The loop cannot run past
+	// the start of the string because text[startPos] is known to be non-whitespace.
+	let endPos = text.length - 1;
+	while (!nonWhitespace.test(text[endPos])) endPos--;
+
+	const hasLeadingSpace = startPos > 0 && inlineWhitespace.test(text[startPos - 1]);
+	const hasTrailingSpace = endPos < text.length - 1 && inlineWhitespace.test(text[endPos + 1]);
+
+	return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
+}
diff --git a/src/nodes/text.ts b/src/nodes/text.ts
@@ -48,8 +48,8 @@ export default class TextNode extends Node {
 		if (startPos === undefined) startPos = 0;
 		if (endPos === undefined) endPos = text.length - 1;
 
-		const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos-1]);
-		const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos+1]);
+		const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos - 1]);
+		const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos + 1]);
 
 		this._trimmedText = (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
 

diff --git a/test/135.js b/test/135.js
@@ -0,0 +1,58 @@
+const { parse, TextNode, HTMLElement } = require('../dist');
+
+describe('pr 135', function () {
+	it('should decode textContent but keep innerHTML encoded', function () {
+		const content = `&lt;p&gt; Not a p tag &lt;br /&gt; at all`;
+		const root = parse(`<div>${content}</div>`);
+		const div = root.firstChild;
+
+		// innerHTML reproduces the source markup, entities intact.
+		div.innerHTML.should.eql(content);
+
+		// textContent exposes the entity-decoded text.
+		div.textContent.should.eql('<p> Not a p tag <br /> at all');
+		// TODO: div.innerText is not asserted yet; decide whether it should decode as well.
+	});
+
+	it('should not decode text from parseHTML()', function () {
+		const content = `&lt;p&gt; Not a p tag &lt;br /&gt; at all`;
+		const root = parse(`<div>${content}</div>`);
+		root.childNodes.should.have.length(1);
+
+		const divNode = root.firstChild;
+		divNode.childNodes.should.have.length(1);
+
+		const textNode = divNode.firstChild;
+		textNode.rawText.should.eql(content);
+	});
+
+	it(`should decode for node text property`, function () {
+		const encodedText = `My&gt;text`;
+		const decodedText = `My>text`;
+		const root = parse(`<p>${encodedText}</p>`);
+
+		const pNode = root.firstChild;
+		pNode.innerHTML.should.eql(encodedText);
+		pNode.textContent.should.eql(decodedText);
+
+		const textNode = pNode.firstChild;
+		textNode.textContent.should.eql(decodedText);
+	});
+
+	it('should remove whitespaces while preserving nodes with content', function () {
+		const root = parse('<p> \r \n  \t <h5>  123&nbsp;  </h5></p>');
+		// Build the expected tree by hand, starting from an already-trimmed text node.
+		const textNode = new TextNode('  123&nbsp;  ');
+		textNode.rawText = textNode.trimmedText;
+		textNode.rawText.should.eql(' 123&nbsp; ');
+
+		const p = new HTMLElement('p', {}, '', root);
+		p
+			.appendChild(new HTMLElement('h5', {}, ''))
+			.appendChild(textNode);
+		// The parsed tree, once whitespace is removed, must match the hand-built one.
+		p.toString().should.eql('<p><h5> 123&nbsp; </h5></p>');
+		root.firstChild.removeWhitespace().toString().should.eql('<p><h5> 123&nbsp; </h5></p>');
+		root.firstChild.removeWhitespace().should.eql(p);
+	});
+});
diff --git a/test/html.js b/test/html.js
@@ -2,7 +2,6 @@ const should = require('should');
 const fs = require('fs');
 
 const HTMLParser = require('../dist');
-const Matcher = require('../dist/matcher').default;
 const HTMLElement = require('../dist/nodes/html').default;
 const TextNode = require('../dist/nodes/text').default;
 const CommentNode = require('../dist/nodes/comment').default;
@@ -126,10 +125,10 @@ describe('HTML Parser', function () {
 			const script = root.firstChild;
 			const style = root.lastChild;
 			script.childNodes.should.not.be.empty;
-			script.childNodes.should.eql([ new TextNode('1', script) ]);
+			script.childNodes.should.eql([new TextNode('1', script)]);
 			script.text.should.eql('1');
 			style.childNodes.should.not.be.empty;
-			style.childNodes.should.eql([ new TextNode('2&amp;', style) ]);
+			style.childNodes.should.eql([new TextNode('2&amp;', style)]);
 			style.text.should.eql('2&');
 			style.rawText.should.eql('2&amp;');
 		});