Skip to content

Commit

Permalink
Merge branch 'main' of github.com:taoqf/node-html-parser; branch 'fix…
Browse files Browse the repository at this point in the history
…-134' of https://github.com/nonara/node-html-parser into nonara-fix-134
  • Loading branch information
taoqf committed Jul 5, 2021
2 parents cb11eab + e728280 commit 4f13096
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 76 deletions.
2 changes: 1 addition & 1 deletion src/nodes/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,7 @@ export default class HTMLElement extends Node {
if ((node as TextNode).isWhitespace) {
return;
}
node.rawText = (<TextNode>node).trimmedText;
node.rawText = (<TextNode>node).trimmedRawText;
} else if (node.nodeType === NodeType.ELEMENT_NODE) {
(node as HTMLElement).removeWhitespace();
}
Expand Down
100 changes: 65 additions & 35 deletions src/nodes/text.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import NodeType from './type';
import Node from './node';
import { decode } from 'he';
import HTMLElement from './html';
import Node from './node';
import NodeType from './type';

/**
* TextNode to contain a text element in DOM tree.
* @param {string} value [description]
*/
export default class TextNode extends Node {
public constructor(public rawText: string, parentNode: HTMLElement) {
public constructor(rawText: string, parentNode: HTMLElement) {
super(parentNode);
this._rawText = rawText;
}

/**
Expand All @@ -17,42 +19,38 @@ export default class TextNode extends Node {
*/
public nodeType = NodeType.TEXT_NODE;

private _rawText: string;
private _trimmedRawText?: string;
private _trimmedText?: string;

public get rawText() {
return this._rawText;
}

/**
* Set rawText and invalidate trimmed caches
*/
public set rawText(text: string) {
this._rawText = text;
this._trimmedRawText = void 0;
this._trimmedText = void 0;
}

/**
* Returns raw text with surrounding whitespace trimmed, keeping at most one leading/trailing space when the whitespace next to the content was not a line break
*/
public get trimmedRawText() {
if (this._trimmedRawText !== undefined) return this._trimmedRawText;
this._trimmedRawText = trimText(this.rawText);
return this._trimmedRawText;
}

/**
* Returns text with surrounding whitespace trimmed, keeping at most one leading/trailing space when the whitespace next to the content was not a line break
*/
public get trimmedText() {
if (this._trimmedText !== undefined) return this._trimmedText;

const text = this.rawText;
let i = 0;
let startPos;
let endPos;

while (i >= 0 && i < text.length) {
if (/\S/.test(text[i])) {
if (startPos === undefined) {
startPos = i;
i = text.length;
} else {
endPos = i;
i = void 0;
}
}

if (startPos === undefined) i++;
else i--;
}

if (startPos === undefined) startPos = 0;
if (endPos === undefined) endPos = text.length - 1;

const hasLeadingSpace = startPos > 0 && /[^\S\r\n]/.test(text[startPos - 1]);
const hasTrailingSpace = endPos < (text.length - 1) && /[^\S\r\n]/.test(text[endPos + 1]);

this._trimmedText = (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');

this._trimmedText = trimText(this.text);
return this._trimmedText;
}

Expand All @@ -61,18 +59,50 @@ export default class TextNode extends Node {
* @return {string} text content
*/
public get text() {
return this.rawText;
return decode(this.rawText);
}

/**
* Detect if the node contains only white space.
* @return {bool}
* @return {boolean}
*/
public get isWhitespace() {
return /^(\s|&nbsp;)*$/.test(this.rawText);
}

public toString() {
return this.text;
return this.rawText;
}
}

/**
 * Trims the whitespace surrounding the content of `text`, keeping at most one
 * space on each side.
 *
 * A single `' '` is kept before (or after) the content only when the character
 * directly adjacent to the first (or last) non-whitespace character is
 * whitespace other than `\r` or `\n`. Text that contains no non-whitespace
 * character at all, including the empty string, is returned unchanged.
 *
 * @param text - The text to trim.
 * @returns The trimmed text.
 */
function trimText(text: string): string {
  const startPos = text.search(/\S/);

  // No visible content: there is nothing to trim around, so return the text as is.
  if (startPos === -1) return text;

  // At least one non-whitespace character exists, so this backward scan is
  // guaranteed to stop at an index >= startPos.
  let endPos = text.length - 1;
  while (!/\S/.test(text.charAt(endPos))) endPos--;

  // Whitespace that is not a line break; only this kind is collapsed to a kept space.
  const inlineWhitespace = /[^\S\r\n]/;
  const hasLeadingSpace = startPos > 0 && inlineWhitespace.test(text.charAt(startPos - 1));
  const hasTrailingSpace = endPos < text.length - 1 && inlineWhitespace.test(text.charAt(endPos + 1));

  return (hasLeadingSpace ? ' ' : '') + text.slice(startPos, endPos + 1) + (hasTrailingSpace ? ' ' : '');
}
17 changes: 0 additions & 17 deletions test/135.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,21 +38,4 @@ describe('pr 135', function () {
const textNode = pNode.firstChild;
textNode.textContent.should.eql(decodedText);
});

it('should remove whitespaces while preserving nodes with content', function () {
const root = parse('<p> \r \n \t <h5> 123&nbsp; </h5></p>');

const textNode = new TextNode(' 123&nbsp; ');
textNode.rawText = textNode.trimmedText;
textNode.rawText.should.eql(' 123&nbsp; ');

const p = new HTMLElement('p', {}, '', root);
p
.appendChild(new HTMLElement('h5', {}, ''))
.appendChild(textNode);

p.toString().should.eql('<p><h5> 123&nbsp; </h5></p>');
root.firstChild.removeWhitespace().toString().should.eql('<p><h5> 123&nbsp; </h5></p>');
root.firstChild.removeWhitespace().should.eql(p);
})
});
52 changes: 29 additions & 23 deletions test/html.js
Original file line number Diff line number Diff line change
Expand Up @@ -197,11 +197,11 @@ describe('HTML Parser', function () {

describe('#removeWhitespace()', function () {
it('should remove whitespaces while preserving nodes with content', function () {
const root = parseHTML('<p> \r \n \t <h5> 123 </h5></p>');
const root = parseHTML('<p> \r \n \t <h5> 123&nbsp; </h5></p>');

const textNode = new TextNode(' 123 ');
textNode.rawText = textNode.trimmedText;
textNode.rawText.should.eql(' 123 ');
const textNode = new TextNode(' 123&nbsp; ');
textNode.rawText = textNode.trimmedRawText;
textNode.rawText.should.eql(' 123&nbsp; ');

const p = new HTMLElement('p', {}, '', root);
p
Expand Down Expand Up @@ -414,26 +414,32 @@ describe('HTML Parser', function () {
root.firstChild.getAttribute('alt').should.eql('«Sogno');
root.firstChild.rawAttributes.alt.should.eql('&laquo;Sogno');
});
it('shoud not decode text', function () {

it('should not decode text from parseHTML()', function () {
// https://github.com/taoqf/node-html-parser/issues/33
const root = parseHTML(`<html>
<body>
<div id='source'>
&lt;p&gt;
This content should be enclosed within an escaped p tag&lt;br /&gt;
&lt;/p&gt;
</div>
</body>
</html>`)
root.toString().should.eql(`<html>
<body>
<div id='source'>
&lt;p&gt;
This content should be enclosed within an escaped p tag&lt;br /&gt;
&lt;/p&gt;
</div>
</body>
</html>`);
const content = `&lt;p&gt; Not a p tag &lt;br /&gt; at all`;
const root = parseHTML(`<div>${content}</div>`);
root.childNodes.should.have.length(1);

const divNode = root.firstChild;
divNode.childNodes.should.have.length(1);

const textNode = divNode.firstChild;
textNode.rawText.should.eql(content);
});

// Checks that `text` yields the entity-decoded content while `rawText` keeps the
// markup exactly as written, both on the <p> element and on its child text node.
it(`should decode for node text property`, function () {
const encodedText = `My&gt;text`;
const decodedText = `My>text`;
const root = parseHTML(`<p>${encodedText}</p>`);

const pNode = root.firstChild;
pNode.text.should.eql(decodedText);
pNode.rawText.should.eql(encodedText);

const textNode = pNode.firstChild;
textNode.text.should.eql(decodedText);
textNode.rawText.should.eql(encodedText);
});
});

Expand Down

0 comments on commit 4f13096

Please sign in to comment.