Skip to content

Commit

Permalink
feat: Add range to nodes and fix whitespace issue (fixes #137)
Browse files Browse the repository at this point in the history
  • Loading branch information
nonara committed Jul 12, 2021
1 parent c13ba94 commit a64f336
Show file tree
Hide file tree
Showing 6 changed files with 171 additions and 44 deletions.
4 changes: 2 additions & 2 deletions src/nodes/comment.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ import NodeType from './type';
import HTMLElement from './html';

export default class CommentNode extends Node {
public constructor(public rawText: string, parentNode: HTMLElement) {
super(parentNode);
public constructor(public rawText: string, parentNode: HTMLElement, range?: [ number, number ]) {
super(parentNode, range);
}

/**
Expand Down
105 changes: 66 additions & 39 deletions src/nodes/html.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,8 +197,14 @@ export default class HTMLElement extends Node {
*
* @memberof HTMLElement
*/
public constructor(tagName: string, keyAttrs: KeyAttributes, private rawAttrs = '', parentNode: HTMLElement | null) {
super(parentNode);
public constructor(
tagName: string,
keyAttrs: KeyAttributes,
private rawAttrs = '',
parentNode: HTMLElement | null,
range?: [ number, number ]
) {
super(parentNode, range);
this.rawTagName = tagName;
this.rawAttrs = rawAttrs || '';
this.id = keyAttrs.id || '';
Expand Down Expand Up @@ -1012,88 +1018,109 @@ export function base_parse(data: string, options = { lowerCaseTagName: false, co
return it.test(tag);
});
}
const root = new HTMLElement(null, {}, '', null);
const createRange = (startPos: number, endPos: number): [ number, number ] =>
[ startPos - frameFlagOffset, endPos - frameFlagOffset ];

const root = new HTMLElement(null, {}, '', null, [ 0, data.length ]);
let currentParent = root;
const stack = [root];
let lastTextPos = -1;
let match: RegExpExecArray;
// https://github.com/taoqf/node-html-parser/issues/38
data = `<${frameflag}>${data}</${frameflag}>`;

const dataEndPos = data.length - (frameflag.length + 2);
const frameFlagOffset = frameflag.length + 2;

while ((match = kMarkupPattern.exec(data))) {
const tagStartPos = kMarkupPattern.lastIndex - match[0].length;
const tagEndPos = kMarkupPattern.lastIndex;

// Add TextNode if content
if (lastTextPos > -1) {
if (lastTextPos + match[0].length < kMarkupPattern.lastIndex) {
// if has content
const text = data.substring(lastTextPos, kMarkupPattern.lastIndex - match[0].length);
currentParent.appendChild(new TextNode(text, currentParent));
if (lastTextPos + match[0].length < tagEndPos) {
const text = data.substring(lastTextPos, tagStartPos);
currentParent.appendChild(new TextNode(text, currentParent, createRange(lastTextPos, tagStartPos)));
}
}

lastTextPos = kMarkupPattern.lastIndex;
if (match[2] === frameflag) {
continue;
}

// https://github.com/taoqf/node-html-parser/issues/38
// Skip frameflag node
if (match[2] === frameflag) continue;

// Handle comments
if (match[0][1] === '!') {
// this is a comment
if (options.comment) {
// Only keep what is in between <!-- and -->
const text = data.substring(lastTextPos - 3, lastTextPos - match[0].length + 4);
currentParent.appendChild(new CommentNode(text, currentParent));
const text = data.substring(tagStartPos + 4, tagEndPos - 3);
currentParent.appendChild(new CommentNode(text, currentParent, createRange(tagStartPos, tagEndPos)));
}
continue;
}
if (options.lowerCaseTagName) {
match[2] = match[2].toLowerCase();
}

/* -- Handle tag matching -- */
// Fix tag casing if necessary
if (options.lowerCaseTagName) match[2] = match[2].toLowerCase();

// Handle opening tags (ie. <this> not </that>)
if (!match[1]) {
// not </ tags
/* Populate attributes */
const attrs = {};
for (let attMatch; (attMatch = kAttributePattern.exec(match[3]));) {
attrs[attMatch[2].toLowerCase()] = attMatch[4] || attMatch[5] || attMatch[6];
}

const tagName = currentParent.rawTagName as 'LI' | 'P' | 'B' | 'TD' | 'TH' | 'H1' | 'H2' | 'H3' | 'H4' | 'H5' | 'H6' | 'li' | 'p' | 'b' | 'td' | 'th' | 'h1' | 'h2' | 'h3' | 'h4' | 'h5' | 'h6';

if (!match[4] && kElementsClosedByOpening[tagName]) {
if (kElementsClosedByOpening[tagName][match[2]]) {
stack.pop();
currentParent = arr_back(stack);
}
}
// ignore container tag we add above
// https://github.com/taoqf/node-html-parser/issues/38
currentParent = currentParent.appendChild(new HTMLElement(match[2], attrs, match[3], null));

const tagEndPos = kMarkupPattern.lastIndex;
const tagStartPos = tagEndPos - match[0].length;

currentParent = currentParent.appendChild(
// Initialize range (end position updated later for closed tags)
new HTMLElement(match[2], attrs, match[3], null, createRange(tagStartPos, tagEndPos))
);
stack.push(currentParent);

if (is_block_text_element(match[2])) {
// a little test to find next </script> or </style> ...
// Find closing tag
const closeMarkup = `</${match[2]}>`;
const index = (() => {
if (options.lowerCaseTagName) {
return data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex);
}
return data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
})();
const closeIndex = options.lowerCaseTagName
? data.toLocaleLowerCase().indexOf(closeMarkup, kMarkupPattern.lastIndex)
: data.indexOf(closeMarkup, kMarkupPattern.lastIndex);
const textEndPos = closeIndex === -1 ? dataEndPos : closeIndex;

if (element_should_be_ignore(match[2])) {
let text: string;
if (index === -1) {
// there is no matching ending for the text element.
text = data.substr(kMarkupPattern.lastIndex);
} else {
text = data.substring(kMarkupPattern.lastIndex, index);
}
if (text.length > 0) {
currentParent.appendChild(new TextNode(text, currentParent));
const text = data.substring(tagEndPos, textEndPos);
if (text.length > 0 && /\S/.test(text)) {
currentParent.appendChild(new TextNode(text, currentParent, createRange(tagEndPos, textEndPos)));
}
}
if (index === -1) {

if (closeIndex === -1) {
lastTextPos = kMarkupPattern.lastIndex = data.length + 1;
} else {
lastTextPos = kMarkupPattern.lastIndex = index + closeMarkup.length;
lastTextPos = kMarkupPattern.lastIndex = closeIndex + closeMarkup.length;
// Cause to be treated as self-closing, because no close found
match[1] = 'true';
}
}
}

// Handle closing tags or self-closed elements (ie </tag> or <br>)
if (match[1] || match[4] || kSelfClosingElements[match[2]]) {
// </ or /> or <br> etc.
while (true) {
if (currentParent.rawTagName === match[2]) {
// Update range end for closed tag
(<[ number, number ]>currentParent.range)[1] = createRange(-1, Math.max(lastTextPos, tagEndPos))[1];
stack.pop();
currentParent = arr_back(stack);
break;
Expand Down
12 changes: 11 additions & 1 deletion src/nodes/node.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,21 @@ import HTMLElement from './html';
export default abstract class Node {
abstract nodeType: NodeType;
public childNodes = [] as Node[];
public range: readonly [ number, number ];
abstract text: string;
abstract rawText: string;
// abstract get rawText(): string;
abstract toString(): string;
public constructor(public parentNode = null as HTMLElement | null) {
public constructor(
public parentNode = null as HTMLElement | null,
range?: [ number, number ]
) {
Object.defineProperty(this, 'range', {
enumerable: false,
writable: true,
configurable: true,
value: range ?? [ -1, -1 ]
});
}
public get innerText() {
return this.rawText;
Expand Down
4 changes: 2 additions & 2 deletions src/nodes/text.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ import NodeType from './type';
* @param {string} value [description]
*/
export default class TextNode extends Node {
public constructor(rawText: string, parentNode: HTMLElement) {
super(parentNode);
public constructor(rawText: string, parentNode: HTMLElement, range?: [ number, number ]) {
super(parentNode, range);
this._rawText = rawText;
}

Expand Down
85 changes: 85 additions & 0 deletions test/node-ranges.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
const { parse, HTMLElement, TextNode, CommentNode } = require('../dist');
const hp2 = require('htmlparser2')
const mochaEach = require('mocha-each');

// HTML fixture shared by this file: prepare() parses this exact string with both
// node-html-parser and htmlparser2 so their node positions can be compared.
// The literal's content (including every newline) is significant, since the
// assertions below compare character offsets into it.
// Use https://astexplorer.net/ to compare
const html = `
Leading text
<div>
<p>Text Content</p>
Goes Here
</div>
<input name="hello">
<!-- comment -->
<style>
.abc {
display: none
}
</style>
<pre>
block Text
</pre>
<span>The space between us</span> <span>is vast</span>
Closing text
`;

/**
 * Parses the shared `html` fixture with both htmlparser2 and node-html-parser and
 * pairs up the resulting nodes position-by-position.
 *
 * @returns {{ nodeMeta: Array, ast: *, hp2ast: * }}
 *   `nodeMeta` is a flat list of `[ label, node, mirrorNode ]` triples in depth-first
 *   order; `ast` is the node-html-parser root and `hp2ast` the htmlparser2 document.
 */
function prepare() {
  const nodeMeta = [];

  // Shorten a string to at most `maxLen` characters (plus '...') and then make
  // line breaks visible so the label stays on a single line.
  const abbreviate = (s, maxLen = 8) => {
    const clipped = s.length > maxLen ? s.slice(0, maxLen) + '...' : s;
    return clipped.replace(/(\r?\n)/g, '\\n');
  };

  // Record the node at `idx` together with the node at the same index in the
  // mirror (htmlparser2) array, then recurse into its children.
  const walk = (nodeArr, idx, mirrorArr) => {
    const node = nodeArr[idx];
    const mirrorNode = mirrorArr[idx];

    let label;
    if (mirrorNode.type !== 'tag') {
      label = `<${mirrorNode.type}: '${abbreviate(node.text)}'>`;
    } else {
      label = node.tagName;
    }
    nodeMeta.push([ label, node, mirrorNode ]);

    const children = node.childNodes;
    for (let i = 0; i < children.length; i++) {
      walk(children, i, mirrorNode.childNodes);
    }
  };

  // Parse AST
  const hp2ast = hp2.parseDocument(html, { withEndIndices: true, withStartIndices: true });
  const ast = parse(html, { comment: true });

  // Flatten both trees in lockstep, starting from the top-level nodes
  const topLevel = ast.childNodes;
  for (let i = 0; i < topLevel.length; i++) {
    walk(topLevel, i, hp2ast.childNodes);
  }

  return { nodeMeta, ast, hp2ast };
}

// See: https://github.com/taoqf/node-html-parser/issues/137
describe(`Elements ranges`, function () {
  // prepare() runs while the suite is being defined because its nodeMeta table
  // is consumed by mochaEach(...) below, outside of any hook.
  const prepared = prepare();
  const nodeMeta = prepared.nodeMeta;
  const ast = prepared.ast;

  before(() => {
    // Pre-check to make sure configured html is not altered
    ast.childNodes.length.should.be.greaterThan(2);
  });

  describe(`parsed elements created with proper ranges`, () => {
    mochaEach(nodeMeta).it(`%s`, (label, node, hp2Node) => {
      /* Ensure we have the right node mapping */
      let expectedProto = HTMLElement;
      if (hp2Node.type === 'comment') {
        expectedProto = CommentNode;
      } else if (hp2Node.type === 'text') {
        expectedProto = TextNode;
      }
      Object.getPrototypeOf(node).constructor.should.eql(expectedProto);

      if (expectedProto === HTMLElement) {
        const actualName = node.tagName.toLocaleLowerCase();
        actualName.should.eql(hp2Node.name.toLocaleLowerCase());
      }

      // Check range: the expected end is one past the mirror node's endIndex
      const expectedRange = [ hp2Node.startIndex, hp2Node.endIndex + 1 ];
      node.range.should.eql(expectedRange);
    });
  });

  it(`new nodes are created with [ -1, -1 ] range by default`, () => {
    // One node of each kind, constructed without passing a range
    const element = new HTMLElement('B', {}, '', null);
    const textNode = new TextNode('text', null);
    const commentNode = new CommentNode('text', null);

    [ element, textNode, commentNode ].forEach((node) => {
      node.range.should.eql([ -1, -1 ]);
    });
  });
});
5 changes: 5 additions & 0 deletions test/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,9 @@ describe('HTML Parser', function () {
const root = parse('<!DOCTYPE html><html lang="en"><head></head><body><script id="storeFinder" type="application/json">{"key":true}</script></body></html>');
root.toString().should.eql('<!DOCTYPE html><html lang="en"><head></head><body><script id="storeFinder" type="application/json">{"key":true}</script></body></html>');
});
// See: https://github.com/taoqf/node-html-parser/issues/137
it(`parses all whitespace`, () => {
const root = parse(`<span>test1</span> <span>test2</span>\n<span>test3</span>\r\n<span>test4</span>`);
root.text.should.eql('test1 test2\ntest3\r\ntest4');
});
});

0 comments on commit a64f336

Please sign in to comment.