From ee028ab152db7baf0261977052fbb37b7cd511fb Mon Sep 17 00:00:00 2001
From: Chris Winberry
Date: Thu, 22 Apr 2010 11:17:56 -0400
Subject: [PATCH] Added options parameter to DefaultHandler

---
 node-htmlparser.js            |   23 ++++++++++--
 runtests.html                 |   12 ++++---
 runtests.js                   |   10 +++---
 tests/15-non-verbose.js       |   43 ++++++++++++++++++++++
 tests/16-ignore_whitespace.js |   68 +++++++++++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 13 deletions(-)
 create mode 100644 tests/15-non-verbose.js
 create mode 100644 tests/16-ignore_whitespace.js

diff --git a/node-htmlparser.js b/node-htmlparser.js
index a6eaa3e..43fe13b 100644
--- a/node-htmlparser.js
+++ b/node-htmlparser.js
@@ -418,8 +418,14 @@ function Parser (handler) {
 	}
 	Parser.prototype.HandleError = Parser.prototype.handleError; //TODO: remove next version
 
-function DefaultHandler (callback) {
+//Options: ignoreWhitespace (default false; true drops whitespace-only text nodes), verbose (default true; false drops data for tags and raw for all)
+function DefaultHandler (callback, options) {
 	this.reset();
+	this._options = options ? options : { };
+	if (this._options.ignoreWhitespace == undefined)
+		this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
+	if (this._options.verbose == undefined)
+		this._options.verbose = true; //Keep data property for tags and raw property for all
 	if ((typeof callback) == "function")
 		this._callback = callback;
 }
@@ -442,6 +448,8 @@ function DefaultHandler (callback) {
 		, param: 1
 		, embed: 1
 	}
+	//Regex to detect whitespace only text nodes
+	DefaultHandler.reWhitespace = /^\s*$/;
 
 	//**Public**//
 	//Properties//
@@ -465,6 +473,9 @@ function DefaultHandler (callback) {
 		this.handleElement(element);
 	}
 	DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
+		if (this._options.ignoreWhitespace)
+			if (DefaultHandler.reWhitespace.test(element.data))
+				return;
 		this.handleElement(element);
 	}
 	DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
@@ -479,6 +490,7 @@ function DefaultHandler (callback) {
 
 	//**Private**//
 	//Properties//
+	DefaultHandler.prototype._options = null; //Handler options for how to behave
 	DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
 	DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed
 	DefaultHandler.prototype._tagStack = null; //List of parents to the currently element being processed
@@ -494,8 +506,13 @@ function DefaultHandler (callback) {
 	DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
 		if (this._done)
 			this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
-//		delete element.raw; //FIXME: Serious performance problem here
-//		element.raw = null; //FIXME: Not clean
+		if (!this._options.verbose) {
+//			element.raw = null; //FIXME: Not clean
+			//FIXME: Serious performance problem using delete
+			delete element.raw;
+			if (element.type == "tag" || element.type == "script") //Tags only: a comment's data holds its text, so it must survive
+				delete element.data;
+		}
 		if (!this._tagStack.last()) { //There are no parent elements
 			//If the element can be a container, add it to the tag stack and the top level list
 			if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
diff --git a/runtests.html b/runtests.html
index 37e4707..580ae47 100644
--- a/runtests.html
+++ b/runtests.html
@@ -27,23 +27,25 @@
+
+