Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Updated lib filename to exclude 'node-' prefix and included backwards…

… compatible shim for those still expecting 'node-htmlparser.*'
  • Loading branch information...
commit 76e98562af8100349807ec48d3872b723c536aa0 1 parent 60d64db
@tautologistics authored
View
822 lib/htmlparser.js
@@ -0,0 +1,822 @@
+/***********************************************
+Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+***********************************************/
+/* v1.7.2 */
+
+(function () {
+
+function runningInNode () {
+ return(
+ (typeof require) == "function"
+ &&
+ (typeof exports) == "object"
+ &&
+ (typeof module) == "object"
+ &&
+ (typeof __filename) == "string"
+ &&
+ (typeof __dirname) == "string"
+ );
+}
+
+if (!runningInNode()) { //Not loaded as a Node module: publish the API under this.Tautologistics.NodeHtmlParser instead
+ if (!this.Tautologistics)
+ this.Tautologistics = {};
+ else if (this.Tautologistics.NodeHtmlParser)
+ return; //NodeHtmlParser already defined! Leave the existing namespace untouched
+ this.Tautologistics.NodeHtmlParser = {};
+ exports = this.Tautologistics.NodeHtmlParser; //The "exports.X = ..." assignments at the end of the file populate this object
+}
+
+//Types of elements found in the DOM
+var ElementType = {
+ Text: "text" //Plain text
+ , Directive: "directive" //Special tag <!...>
+ , Comment: "comment" //Special tag <!--...-->
+ , Script: "script" //Special tag <script>...</script>
+ , Style: "style" //Special tag <style>...</style>
+ , Tag: "tag" //Any tag that isn't special
+}
+
+function Parser (handler, options) {
+ this._options = options ? options : { };
+ if (this._options.includeLocation == undefined) {
+ this._options.includeLocation = false; //Do not track element position in document by default
+ }
+
+ this.validateHandler(handler);
+ this._handler = handler;
+ this.reset();
+}
+
+ //**"Static"**//
+ //Regular expressions used for cleaning up and parsing (used via replace/split/exec without relying on lastIndex)
+ Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
+ Parser._reTrimComment = /(^\!--|--$)/g; //Remove the leading "!--" and trailing "--" markup from comment contents
+ Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
+ Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element: group 1 is the optional "/", group 2 the name
+
+ //Regular expressions used for parsing (stateful: callers depend on lastIndex between calls)
+ Parser._reAttrib = //Find attributes in a tag: groups 1-2 double-quoted, 3-4 single-quoted, 5-6 unquoted, 7 name without a value
+ /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
+ Parser._reTags = /[\<\>]/g; //Find tag markers; parseTags reads lastIndex after each successful test()
+
+ //**Public**//
+ //Methods//
+ //Parses a complete HTML document in one call: resets the parser, parses the data and then signals completion
+ Parser.prototype.parseComplete = function Parser$parseComplete (data) {
+ this.reset(); //Drop any state left from a previous parse
+ this.parseChunk(data);
+ this.done(); //Queue whatever is left in the buffer and call the handler's done()
+ }
+
+ //Parses a piece of an HTML document
+ Parser.prototype.parseChunk = function Parser$parseChunk (data) {
+ if (this._done)
+ this.handleError(new Error("Attempted to parse chunk after parsing already done"));
+ this._buffer += data; //FIXME: this can be a bottleneck
+ this.parseTags();
+ }
+
+ //Tells the parser that the HTML being parsed is complete
+ Parser.prototype.done = function Parser$done () {
+ if (this._done)
+ return;
+ this._done = true;
+
+ //Push any unparsed text into a final element in the element list
+ if (this._buffer.length) {
+ var rawData = this._buffer;
+ this._buffer = "";
+ var element = {
+ raw: rawData
+ , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
+ , type: this._parseState
+ };
+ if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
+ element.name = this.parseTagName(element.data);
+ this.parseAttribs(element);
+ this._elements.push(element);
+ }
+
+ this.writeHandler();
+ this._handler.done();
+ }
+
+ //Resets the parser to a blank state, ready to parse a new HTML document
+ Parser.prototype.reset = function Parser$reset () {
+ this._buffer = "";
+ this._done = false;
+ this._elements = [];
+ this._elementsCurrent = 0;
+ this._current = 0;
+ this._next = 0;
+ this._location = {
+ row: 0
+ , col: 0
+ , charOffset: 0
+ , inBuffer: 0
+ };
+ this._parseState = ElementType.Text;
+ this._prevTagSep = '';
+ this._tagStack = [];
+ this._handler.reset();
+ }
+
+ //**Private**//
+ //Properties//
+ Parser.prototype._options = null; //Parser options for how to behave (set by the constructor)
+ Parser.prototype._handler = null; //Handler that receives parsed elements
+ Parser.prototype._buffer = null; //Buffer of unparsed data
+ Parser.prototype._done = false; //Flag indicating whether parsing is done
+ Parser.prototype._elements = null; //Queue of parsed elements waiting to be written to the handler
+ Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed (only ever reset in the code shown here)
+ Parser.prototype._current = 0; //Position in the buffer up to which data has already been parsed
+ Parser.prototype._next = 0; //Position in the buffer of the next tag marker (< or >)
+ Parser.prototype._location = null; //Row/column bookkeeping used by getLocation
+ Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
+ Parser.prototype._prevTagSep = ''; //Previous tag marker found
+ //Stack of special element types (script/style/comment) that are currently open;
+ //non-empty while parsing occurs inside a script/comment/style tag
+ Parser.prototype._tagStack = null;
+
+ //Methods//
+ //Takes an array of elements and parses any found attributes
+ Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
+ var idxEnd = elements.length;
+ var idx = 0;
+
+ while (idx < idxEnd) {
+ var element = elements[idx++];
+ if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.style)
+ this.parseAttribs(element);
+ }
+
+ return(elements);
+ }
+
+ //Takes an element and adds an "attribs" property for any element attributes found
+ Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
+ //Only parse attributes for tags
+ if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag)
+ return;
+
+ var tagName = element.data.split(Parser._reWhitespace, 1)[0];
+ var attribRaw = element.data.substring(tagName.length);
+ if (attribRaw.length < 1)
+ return;
+
+ var match;
+ Parser._reAttrib.lastIndex = 0;
+ while (match = Parser._reAttrib.exec(attribRaw)) {
+ if (element.attribs == undefined)
+ element.attribs = {};
+
+ if (typeof match[1] == "string" && match[1].length) {
+ element.attribs[match[1]] = match[2];
+ } else if (typeof match[3] == "string" && match[3].length) {
+ element.attribs[match[3].toString()] = match[4].toString();
+ } else if (typeof match[5] == "string" && match[5].length) {
+ element.attribs[match[5]] = match[6];
+ } else if (typeof match[7] == "string" && match[7].length) {
+ element.attribs[match[7]] = match[7];
+ }
+ }
+ }
+
+ //Extracts the base tag name from the data value of an element
+ Parser.prototype.parseTagName = function Parser$parseTagName (data) {
+ if (data == null || data == "")
+ return("");
+ var match = Parser._reTagName.exec(data);
+ if (!match)
+ return("");
+ return((match[1] ? "/" : "") + match[2]);
+ }
+
+ //Scans the buffer for tag markers (< and >), queues the elements found in this._elements and flushes them through writeHandler
+ //I admit, this function is rather large but splitting up had a noticeable impact on speed
+ Parser.prototype.parseTags = function Parser$parseTags () {
+ var bufferEnd = this._buffer.length - 1;
+ while (Parser._reTags.test(this._buffer)) { //_reTags is global, so every successful test() moves lastIndex past the next marker
+ this._next = Parser._reTags.lastIndex - 1;
+ var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
+ var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
+
+ //A new element to eventually be appended to the element list
+ var element = {
+ raw: rawData
+ , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
+ , type: this._parseState
+ };
+
+ var elementName = this.parseTagName(element.data);
+
+ //This section inspects the current tag stack and modifies the current
+ //element if we're actually parsing a special area (script/comment/style tag)
+ if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
+ if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
+ if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
+ this._tagStack.pop();
+ else { //Not a closing script tag
+ if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
+ //All data from here to script close is now a text element
+ element.type = ElementType.Text;
+ //If the previous element is text, append the current text to it
+ if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
+ var prevElement = this._elements[this._elements.length - 1];
+ prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
+ element.raw = element.data = ""; //This causes the current element to not be added to the element list
+ }
+ }
+ }
+ }
+ else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
+ if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
+ this._tagStack.pop();
+ else {
+ if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
+ //All data from here to style close is now a text element
+ element.type = ElementType.Text;
+ //If the previous element is text, append the current text to it
+ if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
+ var prevElement = this._elements[this._elements.length - 1];
+ if (element.raw != "") {
+ prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
+ element.raw = element.data = ""; //This causes the current element to not be added to the element list
+ } else { //Element is empty, so just append the last tag marker found
+ prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
+ }
+ } else { //The previous element was not text
+ if (element.raw != "") {
+ element.raw = element.data = element.raw; //Text inside a style tag keeps its raw form as its data
+ }
+ }
+ }
+ }
+ }
+ else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
+ var rawLen = element.raw.length;
+ if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
+ //Actually, we're no longer in a comment tag, so pop it off the stack
+ this._tagStack.pop();
+ //If the previous element is a comment, append the current text to it
+ if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
+ var prevElement = this._elements[this._elements.length - 1];
+ prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
+ element.raw = element.data = ""; //This causes the current element to not be added to the element list
+ element.type = ElementType.Text;
+ }
+ else //Previous element not a comment
+ element.type = ElementType.Comment; //Change the current element's type to a comment
+ }
+ else { //Still in a comment tag
+ element.type = ElementType.Comment;
+ //If the previous element is a comment, append the current text to it
+ if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
+ var prevElement = this._elements[this._elements.length - 1];
+ prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
+ element.raw = element.data = ""; //This causes the current element to not be added to the element list
+ element.type = ElementType.Text;
+ }
+ else
+ element.raw = element.data = element.raw + tagSep; //Keep the marker: it is part of the comment text
+ }
+ }
+ }
+
+ //Processing of non-special tags
+ if (element.type == ElementType.Tag) {
+ element.name = elementName;
+
+ if (element.raw.indexOf("!--") == 0) { //This tag is really comment
+ element.type = ElementType.Comment;
+ delete element["name"];
+ var rawLen = element.raw.length;
+ //Check if the comment is terminated in the current element
+ if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
+ element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
+ else { //It's not so push the comment onto the tag stack
+ element.raw += tagSep;
+ this._tagStack.push(ElementType.Comment);
+ }
+ }
+ else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
+ element.type = ElementType.Directive;
+ //TODO: what about CDATA?
+ }
+ else if (element.name == "script") {
+ element.type = ElementType.Script;
+ //Special tag, push onto the tag stack if not terminated
+ if (element.data.charAt(element.data.length - 1) != "/")
+ this._tagStack.push(ElementType.Script);
+ }
+ else if (element.name == "/script")
+ element.type = ElementType.Script;
+ else if (element.name == "style") {
+ element.type = ElementType.Style;
+ //Special tag, push onto the tag stack if not terminated
+ if (element.data.charAt(element.data.length - 1) != "/")
+ this._tagStack.push(ElementType.Style);
+ }
+ else if (element.name == "/style")
+ element.type = ElementType.Style;
+ if (element.name && element.name.charAt(0) == "/")
+ element.data = element.name; //Closing tags carry only their name as data
+ }
+
+ //Add all tags and non-empty text elements to the element list
+ if (element.raw != "" || element.type != ElementType.Text) {
+ if (this._options.includeLocation && !element.location) {
+ element.location = this.getLocation(element.type == ElementType.Tag);
+ }
+ this.parseAttribs(element);
+ this._elements.push(element);
+ //If tag self-terminates, add an explicit, separate closing tag
+ if (
+ element.type != ElementType.Text
+ &&
+ element.type != ElementType.Comment
+ &&
+ element.type != ElementType.Directive
+ &&
+ element.data.charAt(element.data.length - 1) == "/"
+ )
+ this._elements.push({
+ raw: "/" + element.name
+ , data: "/" + element.name
+ , name: "/" + element.name
+ , type: element.type
+ });
+ }
+ this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text; //After "<" a tag follows, after ">" text follows
+ this._current = this._next + 1;
+ this._prevTagSep = tagSep;
+ }
+
+ if (this._options.includeLocation) {
+ this.getLocation(); //Called for its side effect of advancing the location counters over the parsed part of the buffer
+ this._location.row += this._location.inBuffer;
+ this._location.inBuffer = 0;
+ this._location.charOffset = 0;
+ }
+ this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; //Keep only the unparsed tail for the next chunk
+ this._current = 0;
+
+ this.writeHandler();
+ }
+
+ Parser.prototype.getLocation = function Parser$getLocation (startTag) {
+ var c,
+ l = this._location,
+ end = this._current - (startTag ? 1 : 0),
+ chunk = startTag && l.charOffset == 0 && this._current == 0;
+
+ for (; l.charOffset < end; l.charOffset++) {
+ c = this._buffer.charAt(l.charOffset);
+ if (c == '\n') {
+ l.inBuffer++;
+ l.col = 0;
+ } else if (c != '\r') {
+ l.col++;
+ }
+ }
+ return {
+ line: l.row + l.inBuffer + 1
+ , col: l.col + (chunk ? 0: 1)
+ };
+ }
+
+ //Checks the handler to make it is an object with the right "interface"
+ Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
+ if ((typeof handler) != "object")
+ throw new Error("Handler is not an object");
+ if ((typeof handler.reset) != "function")
+ throw new Error("Handler method 'reset' is invalid");
+ if ((typeof handler.done) != "function")
+ throw new Error("Handler method 'done' is invalid");
+ if ((typeof handler.writeTag) != "function")
+ throw new Error("Handler method 'writeTag' is invalid");
+ if ((typeof handler.writeText) != "function")
+ throw new Error("Handler method 'writeText' is invalid");
+ if ((typeof handler.writeComment) != "function")
+ throw new Error("Handler method 'writeComment' is invalid");
+ if ((typeof handler.writeDirective) != "function")
+ throw new Error("Handler method 'writeDirective' is invalid");
+ }
+
+ //Writes parsed elements out to the handler
+ Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
+ forceFlush = !!forceFlush;
+ if (this._tagStack.length && !forceFlush)
+ return;
+ while (this._elements.length) {
+ var element = this._elements.shift();
+ switch (element.type) {
+ case ElementType.Comment:
+ this._handler.writeComment(element);
+ break;
+ case ElementType.Directive:
+ this._handler.writeDirective(element);
+ break;
+ case ElementType.Text:
+ this._handler.writeText(element);
+ break;
+ default:
+ this._handler.writeTag(element);
+ break;
+ }
+ }
+ }
+
+ Parser.prototype.handleError = function Parser$handleError (error) {
+ if ((typeof this._handler.error) == "function")
+ this._handler.error(error);
+ else
+ throw error;
+ }
+
+//TODO: make this a truly streamable handler
+function RssHandler (callback) {
+ RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); //Run the DefaultHandler constructor with fixed feed-parsing options
+}
+inherits(RssHandler, DefaultHandler); //Sets RssHandler.super_ and chains RssHandler.prototype to DefaultHandler.prototype
+
+ RssHandler.prototype.done = function RssHandler$done () {
+ var feed = { };
+ var feedRoot;
+
+ var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
+ if (found.length) {
+ feedRoot = found[0];
+ }
+ if (feedRoot) {
+ if (feedRoot.name == "rss") {
+ feed.type = "rss";
+ feedRoot = feedRoot.children[0]; //<channel/>
+ feed.id = "";
+ try {
+ feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data);
+ } catch (ex) { }
+ try {
+ feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ feed.items = [];
+ DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) {
+ var entry = {};
+ try {
+ entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data);
+ } catch (ex) { }
+ feed.items.push(entry);
+ });
+ } else {
+ feed.type = "atom";
+ try {
+ feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href;
+ } catch (ex) { }
+ try {
+ feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data);
+ } catch (ex) { }
+ try {
+ feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data;
+ } catch (ex) { }
+ feed.items = [];
+ DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) {
+ var entry = {};
+ try {
+ entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href;
+ } catch (ex) { }
+ try {
+ entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data);
+ } catch (ex) { }
+ feed.items.push(entry);
+ });
+ }
+
+ this.dom = feed;
+ }
+ RssHandler.super_.prototype.done.call(this);
+ }
+
+///////////////////////////////////////////////////
+
+function DefaultHandler (callback, options) {
+ this.reset();
+ this._options = options ? options : { };
+ if (this._options.ignoreWhitespace == undefined)
+ this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
+ if (this._options.verbose == undefined)
+ this._options.verbose = true; //Keep data property for tags and raw property for all
+ if (this._options.enforceEmptyTags == undefined)
+ this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
+ if ((typeof callback) == "function")
+ this._callback = callback;
+}
+
+ //**"Static"**//
+ //HTML Tags that shouldn't contain child nodes; keys are lowercase tag names (isEmptyTag lowercases before looking up)
+ DefaultHandler._emptyTags = {
+ area: 1
+ , base: 1
+ , basefont: 1
+ , br: 1
+ , col: 1
+ , frame: 1
+ , hr: 1
+ , img: 1
+ , input: 1
+ , isindex: 1
+ , link: 1
+ , meta: 1
+ , param: 1
+ , embed: 1
+ }
+ //Regex to detect whitespace only (or empty) text nodes; used by writeText when ignoreWhitespace is set
+ DefaultHandler.reWhitespace = /^\s*$/;
+
+ //**Public**//
+ //Properties//
+ DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML; reset() replaces it with a fresh array
+ //Methods//
+ //Resets the handler back to starting state
+ DefaultHandler.prototype.reset = function DefaultHandler$reset() {
+ this.dom = [];
+ this._done = false;
+ this._tagStack = [];
+ this._tagStack.last = function DefaultHandler$_tagStack$last () {
+ return(this.length ? this[this.length - 1] : null);
+ }
+ }
+ //Signals the handler that parsing is done and reports the finished dom through handleCallback
+ DefaultHandler.prototype.done = function DefaultHandler$done () {
+ this._done = true; //From now on handleElement reports an error until reset() is called
+ this.handleCallback(null); //null error: normal completion
+ }
+ DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) { //Called by the parser for tag, script and style elements
+ this.handleElement(element);
+ }
+ DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
+ if (this._options.ignoreWhitespace)
+ if (DefaultHandler.reWhitespace.test(element.data))
+ return;
+ this.handleElement(element);
+ }
+ DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) { //Called by the parser for comment elements
+ this.handleElement(element);
+ }
+ DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) { //Called by the parser for directive elements
+ this.handleElement(element);
+ }
+ DefaultHandler.prototype.error = function DefaultHandler$error (error) { //Optional handler hook used by Parser.handleError
+ this.handleCallback(error); //Forwarded to the user callback, or thrown when there is none
+ }
+
+ //**Private**//
+ //Properties//
+ DefaultHandler.prototype._options = null; //Handler options for how to behave
+ DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
+ DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed
+ DefaultHandler.prototype._tagStack = null; //List of parents of the element currently being processed
+ //Methods//
+ DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
+ if ((typeof this._callback) != "function")
+ if (error)
+ throw error;
+ else
+ return;
+ this._callback(error, this.dom);
+ }
+
+ DefaultHandler.prototype.isEmptyTag = function(element) {
+ var name = element.name.toLowerCase();
+ if (name.charAt(0) == '/') {
+ name = name.substring(1);
+ }
+ return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name];
+ };
+
+ DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) { //Places one parsed element into the dom tree, using _tagStack to track the open ancestors
+ if (this._done)
+ this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
+ if (!this._options.verbose) {
+// element.raw = null; //FIXME: Not clean
+ //FIXME: Serious performance problem using delete
+ delete element.raw;
+ if (element.type == "tag" || element.type == "script" || element.type == "style")
+ delete element.data;
+ }
+ if (!this._tagStack.last()) { //There are no parent elements
+ //If the element can be a container, add it to the tag stack and the top level list
+ if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
+ if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
+ this.dom.push(element);
+ if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children
+ this._tagStack.push(element);
+ }
+ }
+ }
+ else //Otherwise just add to the top level list
+ this.dom.push(element);
+ }
+ else { //There are parent elements
+ //If the element can be a container, add it as a child of the element
+ //on top of the tag stack and then add it to the tag stack
+ if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
+ if (element.name.charAt(0) == "/") {
+ //This is a closing tag, scan the tagStack to find the matching opening tag
+ //and pop the stack up to the opening tag's parent
+ var baseName = element.name.substring(1);
+ if (!this.isEmptyTag(element)) {
+ var pos = this._tagStack.length - 1;
+ while (pos > -1 && this._tagStack[pos--].name != baseName) { } //Post-decrement: on a match pos ends one below the matching index
+ if (pos > -1 || this._tagStack[0].name == baseName) //pos is -1 both for "matched at index 0" and "no match", hence the second test
+ while (pos < this._tagStack.length - 1)
+ this._tagStack.pop();
+ }
+ }
+ else { //This is not a closing tag
+ if (!this._tagStack.last().children)
+ this._tagStack.last().children = [];
+ this._tagStack.last().children.push(element);
+ if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
+ this._tagStack.push(element);
+ }
+ }
+ else { //This is not a container element
+ if (!this._tagStack.last().children)
+ this._tagStack.last().children = [];
+ this._tagStack.last().children.push(element);
+ }
+ }
+ }
+
+ var DomUtils = {
+ testElement: function DomUtils$testElement (options, element) {
+ if (!element) {
+ return false;
+ }
+
+ for (var key in options) {
+ if (key == "tag_name") {
+ if (element.type != "tag" && element.type != "script" && element.type != "style") {
+ return false;
+ }
+ if (!options["tag_name"](element.name)) {
+ return false;
+ }
+ } else if (key == "tag_type") {
+ if (!options["tag_type"](element.type)) {
+ return false;
+ }
+ } else if (key == "tag_contains") {
+ if (element.type != "text" && element.type != "comment" && element.type != "directive") {
+ return false;
+ }
+ if (!options["tag_contains"](element.data)) {
+ return false;
+ }
+ } else {
+ if (!element.attribs || !options[key](element.attribs[key])) {
+ return false;
+ }
+ }
+ }
+
+ return true;
+ }
+
+ , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
+ recurse = (recurse === undefined || recurse === null) || !!recurse;
+ limit = isNaN(parseInt(limit)) ? -1 : parseInt(limit);
+
+ if (!currentElement) {
+ return([]);
+ }
+
+ var found = [];
+ var elementList;
+
+ function getTest (checkVal) {
+ return(function (value) { return(value == checkVal); });
+ }
+ for (var key in options) {
+ if ((typeof options[key]) != "function") {
+ options[key] = getTest(options[key]);
+ }
+ }
+
+ if (DomUtils.testElement(options, currentElement)) {
+ found.push(currentElement);
+ }
+
+ if (limit >= 0 && found.length >= limit) {
+ return(found);
+ }
+
+ if (recurse && currentElement.children) {
+ elementList = currentElement.children;
+ } else if (currentElement instanceof Array) {
+ elementList = currentElement;
+ } else {
+ return(found);
+ }
+
+ for (var i = 0; i < elementList.length; i++) {
+ found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit));
+ if (limit >= 0 && found.length >= limit) {
+ break;
+ }
+ }
+
+ return(found);
+ }
+
+ , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
+ var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
+ return(result.length ? result[0] : null);
+ }
+
+ , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
+ return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
+ }
+
+ , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
+ return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
+ }
+ }
+
+ function inherits (ctor, superCtor) {
+ var tempCtor = function(){};
+ tempCtor.prototype = superCtor.prototype;
+ ctor.super_ = superCtor;
+ ctor.prototype = new tempCtor();
+ ctor.prototype.constructor = ctor;
+ }
+
+exports.Parser = Parser; //"exports" is Node's module exports, or the Tautologistics.NodeHtmlParser namespace set up at the top of the file
+
+exports.DefaultHandler = DefaultHandler;
+
+exports.RssHandler = RssHandler;
+
+exports.ElementType = ElementType;
+
+exports.DomUtils = DomUtils;
+
+})();
View
22 lib/htmlparser.min.js
@@ -0,0 +1,22 @@
+/***********************************************
+Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to
+deal in the Software without restriction, including without limitation the
+rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+sell copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+IN THE SOFTWARE.
+***********************************************/
+/* v1.7.2 */
+(function(){function e(a,c){this._options=c?c:{};if(this._options.includeLocation==undefined)this._options.includeLocation=false;this.validateHandler(a);this._handler=a;this.reset()}function n(a){n.super_.call(this,a,{ignoreWhitespace:true,verbose:false,enforceEmptyTags:false})}function i(a,c){this.reset();this._options=c?c:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags== undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require=="function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&&typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"}; e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()}; e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var 
a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current= this._elementsCurrent=0;this._location={row:0,col:0,charOffset:0,inBuffer:0};this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._options=null;e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._location=null;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs= function(a){for(var c=a.length,b=0;b<c;){var h=a[b++];if(h.type==d.Tag||h.type==d.Script||h.type==d.style)this.parseAttribs(h)}return a};e.prototype.parseAttribs=function(a){if(!(a.type!=d.Script&&a.type!=d.Style&&a.type!=d.Tag)){var c=a.data.split(e._reWhitespace,1)[0];c=a.data.substring(c.length);if(!(c.length<1)){var b;for(e._reAttrib.lastIndex=0;b=e._reAttrib.exec(c);){if(a.attribs==undefined)a.attribs={};if(typeof b[1]=="string"&&b[1].length)a.attribs[b[1]]=b[2];else if(typeof b[3]=="string"&& b[3].length)a.attribs[b[3].toString()]=b[4].toString();else if(typeof b[5]=="string"&&b[5].length)a.attribs[b[5]]=b[6];else if(typeof b[7]=="string"&&b[7].length)a.attribs[b[7]]=b[7]}}}};e.prototype.parseTagName=function(a){if(a==null||a=="")return"";a=e._reTagName.exec(a);if(!a)return"";return(a[1]?"/":"")+a[2]};e.prototype.parseTags=function(){for(var a=this._buffer.length-1;e._reTags.test(this._buffer);){this._next=e._reTags.lastIndex-1;var c=this._buffer.charAt(this._next),b=this._buffer.substring(this._current, 
this._next);b={raw:b,data:this._parseState==d.Text?b:b.replace(e._reTrim,""),type:this._parseState};var h=this.parseTagName(b.data);if(this._tagStack.length)if(this._tagStack[this._tagStack.length-1]==d.Script)if(h=="/script")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=d.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Text){var g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+this._prevTagSep+b.raw;b.raw=b.data=""}}}else if(this._tagStack[this._tagStack.length- 1]==d.Style)if(h=="/style")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=d.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Text){g=this._elements[this._elements.length-1];if(b.raw!=""){g.raw=g.data=g.raw+this._prevTagSep+b.raw;b.raw=b.data=""}else g.raw=g.data=g.raw+this._prevTagSep}else if(b.raw!="")b.raw=b.data=b.raw}}else if(this._tagStack[this._tagStack.length-1]==d.Comment){g=b.raw.length;if(b.raw.charAt(g-2)=="-"&&b.raw.charAt(g-1)=="-"&&c==">"){this._tagStack.pop(); if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=(g.raw+b.raw).replace(e._reTrimComment,"");b.raw=b.data="";b.type=d.Text}else b.type=d.Comment}else{b.type=d.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+b.raw+c;b.raw=b.data="";b.type=d.Text}else b.raw=b.data=b.raw+c}}if(b.type==d.Tag){b.name=h;if(b.raw.indexOf("!--")== 0){b.type=d.Comment;delete b.name;g=b.raw.length;if(b.raw.charAt(g-1)=="-"&&b.raw.charAt(g-2)=="-"&&c==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+=c;this._tagStack.push(d.Comment)}}else if(b.raw.indexOf("!")==0||b.raw.indexOf("?")==0)b.type=d.Directive;else if(b.name=="script"){b.type=d.Script;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Script)}else if(b.name=="/script")b.type=d.Script;else 
if(b.name=="style"){b.type=d.Style;b.data.charAt(b.data.length-1)!="/"&& this._tagStack.push(d.Style)}else if(b.name=="/style")b.type=d.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=d.Text){if(this._options.includeLocation&&!b.location)b.location=this.getLocation(b.type==d.Tag);this.parseAttribs(b);this._elements.push(b);b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type})}this._parseState=c=="<"?d.Tag:d.Text;this._current= this._next+1;this._prevTagSep=c}if(this._options.includeLocation){this.getLocation();this._location.row+=this._location.inBuffer;this._location.inBuffer=0;this._location.charOffset=0}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.getLocation=function(a){for(var c=this._location,b=this._current-(a?1:0),h=a&&c.charOffset==0&&this._current==0;c.charOffset<b;c.charOffset++){a=this._buffer.charAt(c.charOffset);if(a=="\n"){c.inBuffer++; c.col=0}else a!="\r"&&c.col++}return{line:c.row+c.inBuffer+1,col:c.col+(h?0:1)}};e.prototype.validateHandler=function(a){if(typeof a!="object")throw Error("Handler is not an object");if(typeof a.reset!="function")throw Error("Handler method 'reset' is invalid");if(typeof a.done!="function")throw Error("Handler method 'done' is invalid");if(typeof a.writeTag!="function")throw Error("Handler method 'writeTag' is invalid");if(typeof a.writeText!="function")throw Error("Handler method 'writeText' is invalid"); if(typeof a.writeComment!="function")throw Error("Handler method 'writeComment' is invalid");if(typeof a.writeDirective!="function")throw Error("Handler method 'writeDirective' is invalid");};e.prototype.writeHandler=function(a){a=!!a;if(!(this._tagStack.length&&!a))for(;this._elements.length;){a=this._elements.shift();switch(a.type){case d.Comment:this._handler.writeComment(a);break;case 
d.Directive:this._handler.writeDirective(a);break;case d.Text:this._handler.writeText(a);break;default:this._handler.writeTag(a)}}}; e.prototype.handleError=function(a){if(typeof this._handler.error=="function")this._handler.error(a);else throw a;};(function(a,c){var b=function(){};b.prototype=c.prototype;a.super_=c;a.prototype=new b;a.prototype.constructor=a})(n,i);n.prototype.done=function(){var a={},c,b=f.getElementsByTagName(function(j){return j=="rss"||j=="feed"},this.dom,false);if(b.length)c=b[0];if(c){if(c.name=="rss"){a.type="rss";c=c.children[0];a.id="";try{a.title=f.getElementsByTagName("title",c.children,false)[0].children[0].data}catch(h){}try{a.link= f.getElementsByTagName("link",c.children,false)[0].children[0].data}catch(g){}try{a.description=f.getElementsByTagName("description",c.children,false)[0].children[0].data}catch(l){}try{a.updated=new Date(f.getElementsByTagName("lastBuildDate",c.children,false)[0].children[0].data)}catch(m){}try{a.author=f.getElementsByTagName("managingEditor",c.children,false)[0].children[0].data}catch(o){}a.items=[];f.getElementsByTagName("item",c.children).forEach(function(j){var k={};try{k.id=f.getElementsByTagName("guid", j.children,false)[0].children[0].data}catch(q){}try{k.title=f.getElementsByTagName("title",j.children,false)[0].children[0].data}catch(r){}try{k.link=f.getElementsByTagName("link",j.children,false)[0].children[0].data}catch(s){}try{k.description=f.getElementsByTagName("description",j.children,false)[0].children[0].data}catch(t){}try{k.pubDate=new Date(f.getElementsByTagName("pubDate",j.children,false)[0].children[0].data)}catch(u){}a.items.push(k)})}else{a.type="atom";try{a.id=f.getElementsByTagName("id", 
c.children,false)[0].children[0].data}catch(p){}try{a.title=f.getElementsByTagName("title",c.children,false)[0].children[0].data}catch(v){}try{a.link=f.getElementsByTagName("link",c.children,false)[0].attribs.href}catch(w){}try{a.description=f.getElementsByTagName("subtitle",c.children,false)[0].children[0].data}catch(x){}try{a.updated=new Date(f.getElementsByTagName("updated",c.children,false)[0].children[0].data)}catch(y){}try{a.author=f.getElementsByTagName("email",c.children,true)[0].children[0].data}catch(z){}a.items= [];f.getElementsByTagName("entry",c.children).forEach(function(j){var k={};try{k.id=f.getElementsByTagName("id",j.children,false)[0].children[0].data}catch(q){}try{k.title=f.getElementsByTagName("title",j.children,false)[0].children[0].data}catch(r){}try{k.link=f.getElementsByTagName("link",j.children,false)[0].attribs.href}catch(s){}try{k.description=f.getElementsByTagName("summary",j.children,false)[0].children[0].data}catch(t){}try{k.pubDate=new Date(f.getElementsByTagName("updated",j.children, false)[0].children[0].data)}catch(u){}a.items.push(k)})}this.dom=a}n.super_.prototype.done.call(this)};i._emptyTags={area:1,base:1,basefont:1,br:1,col:1,frame:1,hr:1,img:1,input:1,isindex:1,link:1,meta:1,param:1,embed:1};i.reWhitespace=/^\s*$/;i.prototype.dom=null;i.prototype.reset=function(){this.dom=[];this._done=false;this._tagStack=[];this._tagStack.last=function(){return this.length?this[this.length-1]:null}};i.prototype.done=function(){this._done=true;this.handleCallback(null)};i.prototype.writeTag= 
function(a){this.handleElement(a)};i.prototype.writeText=function(a){if(this._options.ignoreWhitespace)if(i.reWhitespace.test(a.data))return;this.handleElement(a)};i.prototype.writeComment=function(a){this.handleElement(a)};i.prototype.writeDirective=function(a){this.handleElement(a)};i.prototype.error=function(a){this.handleCallback(a)};i.prototype._options=null;i.prototype._callback=null;i.prototype._done=false;i.prototype._tagStack=null;i.prototype.handleCallback=function(a){if(typeof this._callback!= "function")if(a)throw a;else return;this._callback(a,this.dom)};i.prototype.isEmptyTag=function(a){a=a.name.toLowerCase();if(a.charAt(0)=="/")a=a.substring(1);return this._options.enforceEmptyTags&&!!i._emptyTags[a]};i.prototype.handleElement=function(a){this._done&&this.handleCallback(Error("Writing to the handler after done() called is not allowed without a reset()"));if(!this._options.verbose){delete a.raw;if(a.type=="tag"||a.type=="script"||a.type=="style")delete a.data}if(this._tagStack.last())if(a.type!= d.Text&&a.type!=d.Comment&&a.type!=d.Directive)if(a.name.charAt(0)=="/"){var c=a.name.substring(1);if(!this.isEmptyTag(a)){for(a=this._tagStack.length-1;a>-1&&this._tagStack[a--].name!=c;);if(a>-1||this._tagStack[0].name==c)for(;a<this._tagStack.length-1;)this._tagStack.pop()}}else{if(!this._tagStack.last().children)this._tagStack.last().children=[];this._tagStack.last().children.push(a);this.isEmptyTag(a)||this._tagStack.push(a)}else{if(!this._tagStack.last().children)this._tagStack.last().children= [];this._tagStack.last().children.push(a)}else if(a.type!=d.Text&&a.type!=d.Comment&&a.type!=d.Directive){if(a.name.charAt(0)!="/"){this.dom.push(a);this.isEmptyTag(a)||this._tagStack.push(a)}}else this.dom.push(a)};var f={testElement:function(a,c){if(!c)return false;for(var b in a)if(b=="tag_name"){if(c.type!="tag"&&c.type!="script"&&c.type!="style")return false;if(!a.tag_name(c.name))return false}else if(b=="tag_type"){if(!a.tag_type(c.type))return false}else 
if(b=="tag_contains"){if(c.type!="text"&& c.type!="comment"&&c.type!="directive")return false;if(!a.tag_contains(c.data))return false}else if(!c.attribs||!a[b](c.attribs[b]))return false;return true},getElements:function(a,c,b,h){function g(o){return function(p){return p==o}}b=b===undefined||b===null||!!b;h=isNaN(parseInt(h))?-1:parseInt(h);if(!c)return[];var l=[],m;for(m in a)if(typeof a[m]!="function")a[m]=g(a[m]);f.testElement(a,c)&&l.push(c);if(h>=0&&l.length>=h)return l;if(b&&c.children)c=c.children;else if(c instanceof Array)c=c;else return l; for(m=0;m<c.length;m++){l=l.concat(f.getElements(a,c[m],b,h));if(h>=0&&l.length>=h)break}return l},getElementById:function(a,c,b){a=f.getElements({id:a},c,b,1);return a.length?a[0]:null},getElementsByTagName:function(a,c,b,h){return f.getElements({tag_name:a},c,b,h)},getElementsByTagType:function(a,c,b,h){return f.getElements({tag_type:a},c,b,h)}};exports.Parser=e;exports.DefaultHandler=i;exports.RssHandler=n;exports.ElementType=d;exports.DomUtils=f})();
View
828 lib/node-htmlparser.js
@@ -1,822 +1,6 @@
-/***********************************************
-Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to
-deal in the Software without restriction, including without limitation the
-rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-sell copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-IN THE SOFTWARE.
-***********************************************/
-/* v1.7.2 */
-
-(function () {
-
-function runningInNode () { //Heuristic Node.js check: true only when require, exports, module, __filename and __dirname all exist with the types tested below
- return(
- (typeof require) == "function" //typeof is used throughout so an undeclared name yields "undefined" instead of throwing
- &&
- (typeof exports) == "object"
- &&
- (typeof module) == "object"
- &&
- (typeof __filename) == "string"
- &&
- (typeof __dirname) == "string"
- );
-}
-
-if (!runningInNode()) { //Not in Node: publish the API as this.Tautologistics.NodeHtmlParser instead of relying on a module's exports
- if (!this.Tautologistics)
- this.Tautologistics = {}; //Create the shared namespace object on first use
- else if (this.Tautologistics.NodeHtmlParser)
- return; //NodeHtmlParser already defined! Leaves the enclosing wrapper function without redefining anything
- this.Tautologistics.NodeHtmlParser = {};
- exports = this.Tautologistics.NodeHtmlParser; //Assigned without var: "exports" becomes an alias for the namespace object so writes to exports.* fill it
-}
-
-//Types of elements found in the DOM; the same string values serve as the parser's _parseState and as each element's "type" property
-var ElementType = {
- Text: "text" //Plain text
- , Directive: "directive" //Special tag <!...>
- , Comment: "comment" //Special tag <!--...-->
- , Script: "script" //Special tag <script>...</script>
- , Style: "style" //Special tag <style>...</style>
- , Tag: "tag" //Any tag that isn't special
-}
-
-function Parser (handler, options) {
- this._options = options ? options : { };
- if (this._options.includeLocation == undefined) {
- this._options.includeLocation = false; //Do not track element position in document by default
- }
-
- this.validateHandler(handler);
- this._handler = handler;
- this.reset();
-}
-
- //**"Static"**// (properties of the Parser constructor, shared by every instance)
- //Regular expressions used for cleaning up and parsing (stateless: used via replace(), split() or a non-global exec(), so lastIndex never matters)
- Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
- Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents
- Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
- Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element: group 1 = optional leading "/", group 2 = the name
-
- //Regular expressions used for parsing (stateful: global regexes driven by exec()/test(), so lastIndex carries over between calls)
- Parser._reAttrib = //Find attributes in a tag: groups 1/2 = name="value", 3/4 = name='value', 5/6 = name=value (unquoted), 7 = bare name
- /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
- Parser._reTags = /[\<\>]/g; //Find tag markers ("<" or ">")
-
- //**Public**//
- //Methods//
- //Parses a complete HTML document in one call and pushes it to the handler: resets the parser, feeds data as a single chunk, then signals completion
- Parser.prototype.parseComplete = function Parser$parseComplete (data) {
- this.reset(); //Discard any state left over from a previous document
- this.parseChunk(data);
- this.done(); //Flushes any trailing unparsed text and calls the handler's done()
- }
-
- //Parses a piece of an HTML document
- Parser.prototype.parseChunk = function Parser$parseChunk (data) {
- if (this._done)
- this.handleError(new Error("Attempted to parse chunk after parsing already done"));
- this._buffer += data; //FIXME: this can be a bottleneck
- this.parseTags();
- }
-
- //Tells the parser that the HTML being parsed is complete
- Parser.prototype.done = function Parser$done () {
- if (this._done)
- return;
- this._done = true;
-
- //Push any unparsed text into a final element in the element list
- if (this._buffer.length) {
- var rawData = this._buffer;
- this._buffer = "";
- var element = {
- raw: rawData
- , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
- , type: this._parseState
- };
- if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
- element.name = this.parseTagName(element.data);
- this.parseAttribs(element);
- this._elements.push(element);
- }
-
- this.writeHandler();
- this._handler.done();
- }
-
- //Resets the parser to a blank state, ready to parse a new HTML document
- Parser.prototype.reset = function Parser$reset () {
- this._buffer = "";
- this._done = false;
- this._elements = [];
- this._elementsCurrent = 0;
- this._current = 0;
- this._next = 0;
- this._location = {
- row: 0
- , col: 0
- , charOffset: 0
- , inBuffer: 0
- };
- this._parseState = ElementType.Text;
- this._prevTagSep = '';
- this._tagStack = [];
- this._handler.reset();
- }
-
- //**Private**//
- //Properties//
- Parser.prototype._options = null; //Parser options for how to behave (set by the constructor)
- Parser.prototype._handler = null; //Handler for parsed elements (set by the constructor after validateHandler)
- Parser.prototype._buffer = null; //Buffer of unparsed data
- Parser.prototype._done = false; //Flag indicating whether parsing is done
- Parser.prototype._elements = null; //Array of parsed elements waiting to be written to the handler
- Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed (set by reset(); not read by any Parser method visible here)
- Parser.prototype._current = 0; //Position in data that has already been parsed
- Parser.prototype._next = 0; //Position in data of the next tag marker (<>)
- Parser.prototype._location = null; //Position tracking for elements in a stream ({row, col, charOffset, inBuffer}, see reset())
- Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
- Parser.prototype._prevTagSep = ''; //Previous tag marker found
- //Stack of element types previously encountered; keeps track of when
- //parsing occurs inside a script/comment/style tag
- Parser.prototype._tagStack = null;
-
- //Methods//
- //Takes an array of elements and parses any found attributes
- Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
- var idxEnd = elements.length;
- var idx = 0;
-
- while (idx < idxEnd) {
- var element = elements[idx++];
- if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.style)
- this.parseAttribs(element);
- }
-
- return(elements);
- }
-
- //Takes an element and adds an "attribs" property for any element attributes found
- Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
- //Only parse attributes for tags
- if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag)
- return;
-
- var tagName = element.data.split(Parser._reWhitespace, 1)[0];
- var attribRaw = element.data.substring(tagName.length);
- if (attribRaw.length < 1)
- return;
-
- var match;
- Parser._reAttrib.lastIndex = 0;
- while (match = Parser._reAttrib.exec(attribRaw)) {
- if (element.attribs == undefined)
- element.attribs = {};
-
- if (typeof match[1] == "string" && match[1].length) {
- element.attribs[match[1]] = match[2];
- } else if (typeof match[3] == "string" && match[3].length) {
- element.attribs[match[3].toString()] = match[4].toString();
- } else if (typeof match[5] == "string" && match[5].length) {
- element.attribs[match[5]] = match[6];
- } else if (typeof match[7] == "string" && match[7].length) {
- element.attribs[match[7]] = match[7];
- }
- }
- }
-
- //Extracts the base tag name from the data value of an element
- Parser.prototype.parseTagName = function Parser$parseTagName (data) {
- if (data == null || data == "")
- return("");
- var match = Parser._reTagName.exec(data);
- if (!match)
- return("");
- return((match[1] ? "/" : "") + match[2]);
- }
-
- //Scans the buffered HTML for tag markers ("<" / ">"), appends each element found to this._elements and flushes them via writeHandler(); returns nothing
- //I admit, this function is rather large but splitting up had a noticeable impact on speed
- Parser.prototype.parseTags = function Parser$parseTags () {
- var bufferEnd = this._buffer.length - 1; //Index of the last buffered character
- while (Parser._reTags.test(this._buffer)) { //test() on the global regex leaves lastIndex just past each marker found
- this._next = Parser._reTags.lastIndex - 1;
- var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
- var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
-
- //A new element to eventually be appended to the element list
- var element = {
- raw: rawData
- , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
- , type: this._parseState
- };
-
- var elementName = this.parseTagName(element.data);
-
- //This section inspects the current tag stack and modifies the current
- //element if we're actually parsing a special area (script/comment/style tag)
- if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
- if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
- if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
- this._tagStack.pop();
- else { //Not a closing script tag
- if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
- //All data from here to script close is now a text element
- element.type = ElementType.Text;
- //If the previous element is text, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- }
- }
- }
- }
- else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
- if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
- this._tagStack.pop();
- else {
- if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
- //All data from here to style close is now a text element
- element.type = ElementType.Text;
- //If the previous element is text, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
- var prevElement = this._elements[this._elements.length - 1];
- if (element.raw != "") {
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- } else { //Element is empty, so just append the last tag marker found
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
- }
- } else { //The previous element was not text
- if (element.raw != "") {
- element.raw = element.data = element.raw;
- }
- }
- }
- }
- }
- else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
- var rawLen = element.raw.length;
- if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
- //Actually, we're no longer in a comment, so pop it off the stack
- this._tagStack.pop();
- //If the previous element is a comment, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- element.type = ElementType.Text;
- }
- else //Previous element not a comment
- element.type = ElementType.Comment; //Change the current element's type to a comment
- }
- else { //Still in a comment tag
- element.type = ElementType.Comment;
- //If the previous element is a comment, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- element.type = ElementType.Text;
- }
- else
- element.raw = element.data = element.raw + tagSep;
- }
- }
- }
-
- //Processing of non-special tags
- if (element.type == ElementType.Tag) {
- element.name = elementName;
-
- if (element.raw.indexOf("!--") == 0) { //This tag is really comment
- element.type = ElementType.Comment;
- delete element["name"];
- var rawLen = element.raw.length;
- //Check if the comment is terminated in the current element
- if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
- element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
- else { //It's not so push the comment onto the tag stack
- element.raw += tagSep;
- this._tagStack.push(ElementType.Comment);
- }
- }
- else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
- element.type = ElementType.Directive;
- //TODO: what about CDATA?
- }
- else if (element.name == "script") {
- element.type = ElementType.Script;
- //Special tag, push onto the tag stack if not terminated
- if (element.data.charAt(element.data.length - 1) != "/")
- this._tagStack.push(ElementType.Script);
- }
- else if (element.name == "/script")
- element.type = ElementType.Script;
- else if (element.name == "style") {
- element.type = ElementType.Style;
- //Special tag, push onto the tag stack if not terminated
- if (element.data.charAt(element.data.length - 1) != "/")
- this._tagStack.push(ElementType.Style);
- }
- else if (element.name == "/style")
- element.type = ElementType.Style;
- if (element.name && element.name.charAt(0) == "/") //Closing tags keep only their name as data
- element.data = element.name;
- }
-
- //Add all tags and non-empty text elements to the element list
- if (element.raw != "" || element.type != ElementType.Text) {
- if (this._options.includeLocation && !element.location) {
- element.location = this.getLocation(element.type == ElementType.Tag);
- }
- this.parseAttribs(element);
- this._elements.push(element);
- //If tag self-terminates, add an explicit, separate closing tag
- if (
- element.type != ElementType.Text
- &&
- element.type != ElementType.Comment
- &&
- element.type != ElementType.Directive
- &&
- element.data.charAt(element.data.length - 1) == "/"
- )
- this._elements.push({
- raw: "/" + element.name
- , data: "/" + element.name
- , name: "/" + element.name
- , type: element.type
- });
- }
- this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text; //"<" starts a tag, ">" switches back to text
- this._current = this._next + 1;
- this._prevTagSep = tagSep;
- }
-
- if (this._options.includeLocation) { //Fold the lines counted in this buffer into the running row total before the buffer is trimmed
- this.getLocation();
- this._location.row += this._location.inBuffer;
- this._location.inBuffer = 0;
- this._location.charOffset = 0;
- }
- this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; //Keep only the unparsed tail for the next chunk
- this._current = 0;
-
- this.writeHandler();
- }
-
- Parser.prototype.getLocation = function Parser$getLocation (startTag) { //Advances the location tracker through the buffer and returns {line, col}; startTag truthy means stop one character before _current
- var c,
- l = this._location,
- end = this._current - (startTag ? 1 : 0), //For a start tag, stop before the character just ahead of _current (presumably the "<" - confirm against parseTags)
- chunk = startTag && l.charOffset == 0 && this._current == 0; //Start tag at the very beginning of a fresh buffer; end is -1 here, so the loop below scans nothing
-
- for (; l.charOffset < end; l.charOffset++) { //Resumes from where the previous call stopped
- c = this._buffer.charAt(l.charOffset);
- if (c == '\n') {
- l.inBuffer++; //Newlines seen in the current buffer; parseTags adds these to row afterwards
- l.col = 0;
- } else if (c != '\r') { //Carriage returns do not advance the column
- l.col++;
- }
- }
- return {
- line: l.row + l.inBuffer + 1
- , col: l.col + (chunk ? 0: 1)
- };
- }
-
- //Checks the handler to make it is an object with the right "interface"
- Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
- if ((typeof handler) != "object")
- throw new Error("Handler is not an object");
- if ((typeof handler.reset) != "function")
- throw new Error("Handler method 'reset' is invalid");
- if ((typeof handler.done) != "function")
- throw new Error("Handler method 'done' is invalid");
- if ((typeof handler.writeTag) != "function")
- throw new Error("Handler method 'writeTag' is invalid");
- if ((typeof handler.writeText) != "function")
- throw new Error("Handler method 'writeText' is invalid");
- if ((typeof handler.writeComment) != "function")
- throw new Error("Handler method 'writeComment' is invalid");
- if ((typeof handler.writeDirective) != "function")
- throw new Error("Handler method 'writeDirective' is invalid");
- }
-
- //Writes parsed elements out to the handler
- Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
- forceFlush = !!forceFlush;
- if (this._tagStack.length && !forceFlush)
- return;
- while (this._elements.length) {
- var element = this._elements.shift();
- switch (element.type) {
- case ElementType.Comment:
- this._handler.writeComment(element);
- break;
- case ElementType.Directive:
- this._handler.writeDirective(element);
- break;
- case ElementType.Text:
- this._handler.writeText(element);
- break;
- default:
- this._handler.writeTag(element);
- break;
- }
- }
- }
-
- Parser.prototype.handleError = function Parser$handleError (error) {
- if ((typeof this._handler.error) == "function")
- this._handler.error(error);
- else
- throw error;
- }
-
-//TODO: make this a trully streamable handler
-function RssHandler (callback) {
- RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
-}
-inherits(RssHandler, DefaultHandler);
-
- RssHandler.prototype.done = function RssHandler$done () { //Replaces this.dom with a feed summary object when an <rss> or <feed> root is found, then runs the parent handler's done()
- var feed = { };
- var feedRoot;
-
- var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false); //Third argument false: matches are not searched for inside child elements
- if (found.length) {
- feedRoot = found[0];
- }
- if (feedRoot) {
- if (feedRoot.name == "rss") {
- feed.type = "rss";
- feedRoot = feedRoot.children[0]; //<channel/> - NOTE(review): assumes <rss> has children and that the first one is the channel; this line is not inside a try block
- feed.id = "";
- try {
- feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { } //Element missing or empty: the property is simply left unset (same pattern for every try block below)
- try {
- feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data);
- } catch (ex) { }
- try {
- feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- feed.items = [];
- DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { //No third argument, so the search uses the default (recursive) mode
- var entry = {};
- try {
- entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data);
- } catch (ex) { }
- feed.items.push(entry);
- });
- } else {
- feed.type = "atom"; //The root matched "feed" rather than "rss"
- try {
- feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; //Here the link is read from the href attribute, not from the element's text
- } catch (ex) { }
- try {
- feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data);
- } catch (ex) { }
- try {
- feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data; //Third argument true: the email element is looked up at any depth
- } catch (ex) { }
- feed.items = [];
- DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) {
- var entry = {};
- try {
- entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href;
- } catch (ex) { }
- try {
- entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data);
- } catch (ex) { }
- feed.items.push(entry);
- });
- }
-
- this.dom = feed; //The element tree is replaced by the feed summary
- }
- RssHandler.super_.prototype.done.call(this); //Runs even when no feed root was found, in which case this.dom is left as it was
- }
-
-///////////////////////////////////////////////////
-
-//Handler that assembles the elements it is given into a tree stored on this.dom.
-// callback: optional function (error, dom); anything that is not a function is ignored
-// options: optional object; it is kept by reference and missing settings are filled in on it
-function DefaultHandler (callback, options) {
- this.reset();
- this._options = options ? options : { };
- //The loose "== undefined" checks below also match null, so null counts as "not set"
- if (this._options.ignoreWhitespace == undefined)
- this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
- if (this._options.verbose == undefined)
- this._options.verbose = true; //Keep data property for tags and raw property for all
- if (this._options.enforceEmptyTags == undefined)
- this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
- if ((typeof callback) == "function")
- this._callback = callback;
-}
-
- //**"Static"**//
- //HTML Tags that shouldn't contain child nodes
- //Used as a lookup table keyed by lower-cased tag name (see isEmptyTag); the value 1 is only a truthy marker
- DefaultHandler._emptyTags = {
- area: 1
- , base: 1
- , basefont: 1
- , br: 1
- , col: 1
- , frame: 1
- , hr: 1
- , img: 1
- , input: 1
- , isindex: 1
- , link: 1
- , meta: 1
- , param: 1
- , embed: 1
- }
- //Regex to detect whitespace only text nodes
- //Also matches the empty string, since \s* allows zero characters
- DefaultHandler.reWhitespace = /^\s*$/;
-
- //**Public**//
- //Properties//
- //Prototype-level default only; reset() assigns each instance its own empty array
- DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML
- //Methods//
- //Resets the handler back to starting state
- //Replaces dom and the open-tag stack with fresh arrays and clears the done flag;
- //_options and _callback are left as they are
- DefaultHandler.prototype.reset = function DefaultHandler$reset() {
- this.dom = [];
- this._done = false;
- this._tagStack = [];
- //Helper attached to this particular array: returns the top of the stack, or null when the stack is empty
- this._tagStack.last = function DefaultHandler$_tagStack$last () {
- return(this.length ? this[this.length - 1] : null);
- }
- }
- //Signals the handler that parsing is done
- //Sets the done flag and reports the result through handleCallback with a null error
- DefaultHandler.prototype.done = function DefaultHandler$done () {
- this._done = true;
- this.handleCallback(null);
- }
- //Accepts a tag element and passes it unchanged to handleElement
- DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
- this.handleElement(element);
- }
- //Accepts a text element and passes it to handleElement; when the ignoreWhitespace option is set,
- //elements whose data is empty or whitespace-only are dropped instead
- DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
- if (this._options.ignoreWhitespace)
- if (DefaultHandler.reWhitespace.test(element.data))
- return;
- this.handleElement(element);
- }
- //Accepts a comment element and passes it unchanged to handleElement
- DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
- this.handleElement(element);
- }
- //Accepts a directive element and passes it unchanged to handleElement
- DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
- this.handleElement(element);
- }
- //Reports an error through handleCallback (which throws it when no callback was supplied)
- DefaultHandler.prototype.error = function DefaultHandler$error (error) {
- this.handleCallback(error);
- }
-
- //**Private**//
- //Properties//
- DefaultHandler.prototype._options = null; //Handler options for how to behave
- DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
- DefaultHandler.prototype._done = false; //Flag indicating whether the handler has been notified that parsing has completed
- DefaultHandler.prototype._tagStack = null; //List of parents of the element currently being processed
- //Methods//
- //Delivers the outcome to the stored callback as (error, this.dom).
- //Without a callback, a truthy error is thrown and a non-error outcome is silently dropped.
- DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
- if ((typeof this._callback) != "function")
- if (error)
- throw error;
- else //Pairs with the inner "if (error)", not with the outer typeof check
- return;
- this._callback(error, this.dom);
- }
-
- //Returns true when the enforceEmptyTags option is on and the element's name is listed in
- //DefaultHandler._emptyTags. The name is lower-cased and a leading "/" is stripped first,
- //so opening and closing forms of a tag give the same answer.
- //Returns the (falsy) option value itself, not necessarily false, when the option is off.
- DefaultHandler.prototype.isEmptyTag = function(element) {
- var name = element.name.toLowerCase();
- if (name.charAt(0) == '/') {
- name = name.substring(1);
- }
- return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name];
- };
-
- //Places one element into the tree: at the top level of this.dom when no tag is open, otherwise as a
- //child of the innermost open tag. Opening tags that may have children are pushed on _tagStack;
- //closing tags (name starting with "/") are never stored and only unwind _tagStack.
- DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
- //NOTE: if a callback is set, handleCallback returns normally and the element is still processed below;
- //without a callback the error is thrown here
- if (this._done)
- this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
- //Non-verbose mode strips raw from every element and data from tag/script/style elements
- if (!this._options.verbose) {
-// element.raw = null; //FIXME: Not clean
- //FIXME: Serious performance problem using delete
- delete element.raw;
- if (element.type == "tag" || element.type == "script" || element.type == "style")
- delete element.data;
- }
- if (!this._tagStack.last()) { //There are no parent elements
- //If the element can be a container, add it to the tag stack and the top level list
- if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
- if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
- this.dom.push(element);
- if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children
- this._tagStack.push(element);
- }
- }
- }
- else //Otherwise just add to the top level list
- this.dom.push(element);
- }
- else { //There are parent elements
- //If the element can be a container, add it as a child of the element
- //on top of the tag stack and then add it to the tag stack
- if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
- if (element.name.charAt(0) == "/") {
- //This is a closing tag, scan the tagStack to find the matching opening tag
- //and pop the stack up to the opening tag's parent
- //The name comparison below is case-sensitive (baseName is not lower-cased, unlike in isEmptyTag)
- var baseName = element.name.substring(1);
- if (!this.isEmptyTag(element)) {
- var pos = this._tagStack.length - 1;
- //Because of the post-decrement, pos ends one below the matching index (the parent's index);
- //it is -1 both when the match is at index 0 and when nothing matched
- while (pos > -1 && this._tagStack[pos--].name != baseName) { }
- //The check against _tagStack[0] tells "matched at index 0" apart from "no match"
- if (pos > -1 || this._tagStack[0].name == baseName)
- //Pop the matched tag and everything opened after it
- while (pos < this._tagStack.length - 1)
- this._tagStack.pop();
- }
- }
- else { //This is not a closing tag
- if (!this._tagStack.last().children)
- this._tagStack.last().children = [];
- this._tagStack.last().children.push(element);
- if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
- this._tagStack.push(element);
- }
- }
- else { //This is not a container element
- if (!this._tagStack.last().children)
- this._tagStack.last().children = [];
- this._tagStack.last().children.push(element);
- }
- }
- }
-
- //Helpers for searching the tree built by a handler
- var DomUtils = {
- //Returns true when element passes every test in options (all keys are ANDed; no keys means true).
- //Each option value must already be a predicate function; getElements does that conversion.
- // tag_name: predicate on element.name, only for tag/script/style elements
- // tag_type: predicate on element.type
- // tag_contains: predicate on element.data, only for text/comment/directive elements
- // any other key: predicate on element.attribs[key]; fails when the element has no attribs
- testElement: function DomUtils$testElement (options, element) {
- if (!element) {
- return false;
- }
-
- for (var key in options) {
- if (key == "tag_name") {
- if (element.type != "tag" && element.type != "script" && element.type != "style") {
- return false;
- }
- if (!options["tag_name"](element.name)) {
- return false;
- }
- } else if (key == "tag_type") {
- if (!options["tag_type"](element.type)) {
- return false;
- }
- } else if (key == "tag_contains") {
- if (element.type != "text" && element.type != "comment" && element.type != "directive") {
- return false;
- }
- if (!options["tag_contains"](element.data)) {
- return false;
- }
- } else {
- if (!element.attribs || !options[key](element.attribs[key])) {
- return false;
- }
- }
- }
-
- return true;
- }
-
- //Collects, in document order, the elements at or below currentElement that pass testElement.
- // options: map of test name to a predicate function or a plain value (compared with ==)
- // currentElement: an element or an array of elements; falsy gives []
- // recurse: descend into children; defaults to true when undefined or null.
- // When false, only currentElement itself (or each item of an array) is tested
- // limit: intended maximum number of results; a value parseInt cannot read means unlimited (-1)
- //NOTE(review): options is modified in place - plain values are replaced by predicate functions
- //NOTE(review): recursive calls receive the full limit rather than what is left, so with limit >= 2
- // the result can hold more than limit elements; limit 1 (getElementById) is not affected
- , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
- recurse = (recurse === undefined || recurse === null) || !!recurse;
- //NOTE(review): parseInt is called without a radix
- limit = isNaN(parseInt(limit)) ? -1 : parseInt(limit);
-
- if (!currentElement) {
- return([]);
- }
-
- var found = [];
- var elementList;
-
- //Wraps a plain value in a loose-equality predicate
- function getTest (checkVal) {
- return(function (value) { return(value == checkVal); });
- }
- for (var key in options) {
- if ((typeof options[key]) != "function") {
- options[key] = getTest(options[key]);
- }
- }
-
- if (DomUtils.testElement(options, currentElement)) {
- found.push(currentElement);
- }
-
- if (limit >= 0 && found.length >= limit) {
- return(found);
- }
-
- //Pick what to iterate: an element's children (only when recursing) or the items of an array
- if (recurse && currentElement.children) {
- elementList = currentElement.children;
- } else if (currentElement instanceof Array) {
- elementList = currentElement;
- } else {
- return(found);
- }
-
- for (var i = 0; i < elementList.length; i++) {
- found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit));
- if (limit >= 0 && found.length >= limit) {
- break;
- }
- }
-
- return(found);
- }
-
- //Returns the first element whose id attribute equals id (== comparison), or null when none is found
- , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
- var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
- return(result.length ? result[0] : null);
- }
-
- //Returns tag/script/style elements whose name matches; name may be a value or a predicate function
- , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
- return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
- }
-
- //Returns elements whose type matches; type may be a value or a predicate function
- , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
- return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
- }
- }
-
- //Makes ctor inherit from superCtor without invoking superCtor: ctor.prototype is replaced by a new
- //object whose prototype is superCtor.prototype, and superCtor is recorded as ctor.super_.
- //Because ctor.prototype is replaced, anything assigned to it before this call is lost.
- function inherits (ctor, superCtor) {
- var tempCtor = function(){};
- tempCtor.prototype = superCtor.prototype;
- ctor.super_ = superCtor;
- ctor.prototype = new tempCtor();
- ctor.prototype.constructor = ctor;
- }
-
-//Public API of the module, attached to whatever "exports" refers to in this scope
-exports.Parser = Parser;
-
-exports.DefaultHandler = DefaultHandler;
-
-exports.RssHandler = RssHandler;
-
-exports.ElementType = ElementType;
-
-exports.DomUtils = DomUtils;
-
-})();
+//Backwards-compatible shim: the library now lives in ./htmlparser; this file only re-exports
+//its public API for code that still requires the old "node-htmlparser" name
+var htmlparser = require("./htmlparser");
+exports.Parser = htmlparser.Parser;
+exports.DefaultHandler = htmlparser.DefaultHandler;
+exports.RssHandler = htmlparser.RssHandler;
+exports.ElementType = htmlparser.ElementType;
+exports.DomUtils = htmlparser.DomUtils;
View
28 lib/node-htmlparser.min.js
@@ -1,22 +1,6 @@
-/***********************************************
-Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to
-deal in the Software without restriction, including without limitation the
-rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-sell copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-IN THE SOFTWARE.
-***********************************************/
-/* v1.7.2 */
-(function(){function e(a,c){this._options=c?c:{};if(this._options.includeLocation==undefined)this._options.includeLocation=false;this.validateHandler(a);this._handler=a;this.reset()}function n(a){n.super_.call(this,a,{ignoreWhitespace:true,verbose:false,enforceEmptyTags:false})}function i(a,c){this.reset();this._options=c?c:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags== undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require=="function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&&typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"}; e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()}; e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var 
a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current= this._elementsCurrent=0;this._location={row:0,col:0,charOffset:0,inBuffer:0};this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._options=null;e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._location=null;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs= function(a){for(var c=a.length,b=0;b<c;){var h=a[b++];if(h.type==d.Tag||h.type==d.Script||h.type==d.style)this.parseAttribs(h)}return a};e.prototype.parseAttribs=function(a){if(!(a.type!=d.Script&&a.type!=d.Style&&a.type!=d.Tag)){var c=a.data.split(e._reWhitespace,1)[0];c=a.data.substring(c.length);if(!(c.length<1)){var b;for(e._reAttrib.lastIndex=0;b=e._reAttrib.exec(c);){if(a.attribs==undefined)a.attribs={};if(typeof b[1]=="string"&&b[1].length)a.attribs[b[1]]=b[2];else if(typeof b[3]=="string"&& b[3].length)a.attribs[b[3].toString()]=b[4].toString();else if(typeof b[5]=="string"&&b[5].length)a.attribs[b[5]]=b[6];else if(typeof b[7]=="string"&&b[7].length)a.attribs[b[7]]=b[7]}}}};e.prototype.parseTagName=function(a){if(a==null||a=="")return"";a=e._reTagName.exec(a);if(!a)return"";return(a[1]?"/":"")+a[2]};e.prototype.parseTags=function(){for(var a=this._buffer.length-1;e._reTags.test(this._buffer);){this._next=e._reTags.lastIndex-1;var c=this._buffer.charAt(this._next),b=this._buffer.substring(this._current, 
this._next);b={raw:b,data:this._parseState==d.Text?b:b.replace(e._reTrim,""),type:this._parseState};var h=this.parseTagName(b.data);if(this._tagStack.length)if(this._tagStack[this._tagStack.length-1]==d.Script)if(h=="/script")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=d.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Text){var g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+this._prevTagSep+b.raw;b.raw=b.data=""}}}else if(this._tagStack[this._tagStack.length- 1]==d.Style)if(h=="/style")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=d.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Text){g=this._elements[this._elements.length-1];if(b.raw!=""){g.raw=g.data=g.raw+this._prevTagSep+b.raw;b.raw=b.data=""}else g.raw=g.data=g.raw+this._prevTagSep}else if(b.raw!="")b.raw=b.data=b.raw}}else if(this._tagStack[this._tagStack.length-1]==d.Comment){g=b.raw.length;if(b.raw.charAt(g-2)=="-"&&b.raw.charAt(g-1)=="-"&&c==">"){this._tagStack.pop(); if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=(g.raw+b.raw).replace(e._reTrimComment,"");b.raw=b.data="";b.type=d.Text}else b.type=d.Comment}else{b.type=d.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+b.raw+c;b.raw=b.data="";b.type=d.Text}else b.raw=b.data=b.raw+c}}if(b.type==d.Tag){b.name=h;if(b.raw.indexOf("!--")== 0){b.type=d.Comment;delete b.name;g=b.raw.length;if(b.raw.charAt(g-1)=="-"&&b.raw.charAt(g-2)=="-"&&c==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+=c;this._tagStack.push(d.Comment)}}else if(b.raw.indexOf("!")==0||b.raw.indexOf("?")==0)b.type=d.Directive;else if(b.name=="script"){b.type=d.Script;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Script)}else if(b.name=="/script")b.type=d.Script;else 
if(b.name=="style"){b.type=d.Style;b.data.charAt(b.data.length-1)!="/"&& this._tagStack.push(d.Style)}else if(b.name=="/style")b.type=d.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=d.Text){if(this._options.includeLocation&&!b.location)b.location=this.getLocation(b.type==d.Tag);this.parseAttribs(b);this._elements.push(b);b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type})}this._parseState=c=="<"?d.Tag:d.Text;this._current= this._next+1;this._prevTagSep=c}if(this._options.includeLocation){this.getLocation();this._location.row+=this._location.inBuffer;this._location.inBuffer=0;this._location.charOffset=0}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.getLocation=function(a){for(var c=this._location,b=this._current-(a?1:0),h=a&&c.charOffset==0&&this._current==0;c.charOffset<b;c.charOffset++){a=this._buffer.charAt(c.charOffset);if(a=="\n"){c.inBuffer++; c.col=0}else a!="\r"&&c.col++}return{line:c.row+c.inBuffer+1,col:c.col+(h?0:1)}};e.prototype.validateHandler=function(a){if(typeof a!="object")throw Error("Handler is not an object");if(typeof a.reset!="function")throw Error("Handler method 'reset' is invalid");if(typeof a.done!="function")throw Error("Handler method 'done' is invalid");if(typeof a.writeTag!="function")throw Error("Handler method 'writeTag' is invalid");if(typeof a.writeText!="function")throw Error("Handler method 'writeText' is invalid"); if(typeof a.writeComment!="function")throw Error("Handler method 'writeComment' is invalid");if(typeof a.writeDirective!="function")throw Error("Handler method 'writeDirective' is invalid");};e.prototype.writeHandler=function(a){a=!!a;if(!(this._tagStack.length&&!a))for(;this._elements.length;){a=this._elements.shift();switch(a.type){case d.Comment:this._handler.writeComment(a);break;case 
d.Directive:this._handler.writeDirective(a);break;case d.Text:this._handler.writeText(a);break;default:this._handler.writeTag(a)}}}; e.prototype.handleError=function(a){if(typeof this._handler.error=="function")this._handler.error(a);else throw a;};(function(a,c){var b=function(){};b.prototype=c.prototype;a.super_=c;a.prototype=new b;a.prototype.constructor=a})(n,i);n.prototype.done=function(){var a={},c,b=f.getElementsByTagName(function(j){return j=="rss"||j=="feed"},this.dom,false);if(b.length)c=b[0];if(c){if(c.name=="rss"){a.type="rss";c=c.children[0];a.id="";try{a.title=f.getElementsByTagName("title",c.children,false)[0].children[0].data}catch(h){}try{a.link= f.getElementsByTagName("link",c.children,false)[0].children[0].data}catch(g){}try{a.description=f.getElementsByTagName("description",c.children,false)[0].children[0].data}catch(l){}try{a.updated=new Date(f.getElementsByTagName("lastBuildDate",c.children,false)[0].children[0].data)}catch(m){}try{a.author=f.getElementsByTagName("managingEditor",c.children,false)[0].children[0].data}catch(o){}a.items=[];f.getElementsByTagName("item",c.children).forEach(function(j){var k={};try{k.id=f.getElementsByTagName("guid", j.children,false)[0].children[0].data}catch(q){}try{k.title=f.getElementsByTagName("title",j.children,false)[0].children[0].data}catch(r){}try{k.link=f.getElementsByTagName("link",j.children,false)[0].children[0].data}catch(s){}try{k.description=f.getElementsByTagName("description",j.children,false)[0].children[0].data}catch(t){}try{k.pubDate=new Date(f.getElementsByTagName("pubDate",j.children,false)[0].children[0].data)}catch(u){}a.items.push(k)})}else{a.type="atom";try{a.id=f.getElementsByTagName("id", 
c.children,false)[0].children[0].data}catch(p){}try{a.title=f.getElementsByTagName("title",c.children,false)[0].children[0].data}catch(v){}try{a.link=f.getElementsByTagName("link",c.children,false)[0].attribs.href}catch(w){}try{a.description=f.getElementsByTagName("subtitle",c.children,false)[0].children[0].data}catch(x){}try{a.updated=new Date(f.getElementsByTagName("updated",c.children,false)[0].children[0].data)}catch(y){}try{a.author=f.getElementsByTagName("email",c.children,true)[0].children[0].data}catch(z){}a.items= [];f.getElementsByTagName("entry",c.children).forEach(function(j){var k={};try{k.id=f.getElementsByTagName("id",j.children,false)[0].children[0].data}catch(q){}try{k.title=f.getElementsByTagName("title",j.children,false)[0].children[0].data}catch(r){}try{k.link=f.getElementsByTagName("link",j.children,false)[0].attribs.href}catch(s){}try{k.description=f.getElementsByTagName("summary",j.children,false)[0].children[0].data}catch(t){}try{k.pubDate=new Date(f.getElementsByTagName("updated",j.children, false)[0].children[0].data)}catch(u){}a.items.push(k)})}this.dom=a}n.super_.prototype.done.call(this)};i._emptyTags={area:1,base:1,basefont:1,br:1,col:1,frame:1,hr:1,img:1,input:1,isindex:1,link:1,meta:1,param:1,embed:1};i.reWhitespace=/^\s*$/;i.prototype.dom=null;i.prototype.reset=function(){this.dom=[];this._done=false;this._tagStack=[];this._tagStack.last=function(){return this.length?this[this.length-1]:null}};i.prototype.done=function(){this._done=true;this.handleCallback(null)};i.prototype.writeTag= 
function(a){this.handleElement(a)};i.prototype.writeText=function(a){if(this._options.ignoreWhitespace)if(i.reWhitespace.test(a.data))return;this.handleElement(a)};i.prototype.writeComment=function(a){this.handleElement(a)};i.prototype.writeDirective=function(a){this.handleElement(a)};i.prototype.error=function(a){this.handleCallback(a)};i.prototype._options=null;i.prototype._callback=null;i.prototype._done=false;i.prototype._tagStack=null;i.prototype.handleCallback=function(a){if(typeof this._callback!= "function")if(a)throw a;else return;this._callback(a,this.dom)};i.prototype.isEmptyTag=function(a){a=a.name.toLowerCase();if(a.charAt(0)=="/")a=a.substring(1);return this._options.enforceEmptyTags&&!!i._emptyTags[a]};i.prototype.handleElement=function(a){this._done&&this.handleCallback(Error("Writing to the handler after done() called is not allowed without a reset()"));if(!this._options.verbose){delete a.raw;if(a.type=="tag"||a.type=="script"||a.type=="style")delete a.data}if(this._tagStack.last())if(a.type!= d.Text&&a.type!=d.Comment&&a.type!=d.Directive)if(a.name.charAt(0)=="/"){var c=a.name.substring(1);if(!this.isEmptyTag(a)){for(a=this._tagStack.length-1;a>-1&&this._tagStack[a--].name!=c;);if(a>-1||this._tagStack[0].name==c)for(;a<this._tagStack.length-1;)this._tagStack.pop()}}else{if(!this._tagStack.last().children)this._tagStack.last().children=[];this._tagStack.last().children.push(a);this.isEmptyTag(a)||this._tagStack.push(a)}else{if(!this._tagStack.last().children)this._tagStack.last().children= [];this._tagStack.last().children.push(a)}else if(a.type!=d.Text&&a.type!=d.Comment&&a.type!=d.Directive){if(a.name.charAt(0)!="/"){this.dom.push(a);this.isEmptyTag(a)||this._tagStack.push(a)}}else this.dom.push(a)};var f={testElement:function(a,c){if(!c)return false;for(var b in a)if(b=="tag_name"){if(c.type!="tag"&&c.type!="script"&&c.type!="style")return false;if(!a.tag_name(c.name))return false}else if(b=="tag_type"){if(!a.tag_type(c.type))return false}else 
if(b=="tag_contains"){if(c.type!="text"&& c.type!="comment"&&c.type!="directive")return false;if(!a.tag_contains(c.data))return false}else if(!c.attribs||!a[b](c.attribs[b]))return false;return true},getElements:function(a,c,b,h){function g(o){return function(p){return p==o}}b=b===undefined||b===null||!!b;h=isNaN(parseInt(h))?-1:parseInt(h);if(!c)return[];var l=[],m;for(m in a)if(typeof a[m]!="function")a[m]=g(a[m]);f.testElement(a,c)&&l.push(c);if(h>=0&&l.length>=h)return l;if(b&&c.children)c=c.children;else if(c instanceof Array)c=c;else return l; for(m=0;m<c.length;m++){l=l.concat(f.getElements(a,c[m],b,h));if(h>=0&&l.length>=h)break}return l},getElementById:function(a,c,b){a=f.getElements({id:a},c,b,1);return a.length?a[0]:null},getElementsByTagName:function(a,c,b,h){return f.getElements({tag_name:a},c,b,h)},getElementsByTagType:function(a,c,b,h){return f.getElements({tag_type:a},c,b,h)}};exports.Parser=e;exports.DefaultHandler=i;exports.RssHandler=n;exports.ElementType=d;exports.DomUtils=f})();
+//Backwards-compatible shim: re-exports the public API of ./htmlparser.min for code that still
+//requires the old "node-htmlparser.min" name
+var htmlparser = require("./htmlparser.min");
+exports.Parser = htmlparser.Parser;
+exports.DefaultHandler = htmlparser.DefaultHandler;
+exports.RssHandler = htmlparser.RssHandler;
+exports.ElementType = htmlparser.ElementType;
+exports.DomUtils = htmlparser.DomUtils;
View
6 package.json
@@ -1,7 +1,7 @@
{
"name": "htmlparser"
, "description": "Forgiving HTML/XML/RSS Parser in JS for *both* Node and Browsers"
- , "version": "1.7.2"
+ , "version": "1.7.3"
, "author": "Chris Winberry <chris@winberry.net>"
, "contributors": []
, "repository": {
@@ -12,9 +12,9 @@
"mail": "chris@winberry.net"
, "web": "http://github.com/tautologistics/node-htmlparser/issues"
}
- , "os": [ "linux", "darwin", "freebsd" ]
+ , "os": [ "linux", "darwin", "freebsd", "win32" ]
, "directories": { "lib": "./lib/" }
- , "main": "./lib/node-htmlparser"
+ , "main": "./lib/htmlparser"
, "engines": { "node": ">=0.1.33" }
, "licenses": [{
"type": "MIT"
View
12 profile.js
@@ -3,16 +3,16 @@
var sys = require("sys");
var fs = require("fs");
var http = require("http");
-var htmlparser = require("./node-htmlparser");
-var libxml = require('./libxmljs');
+var htmlparser = require("./lib/htmlparser");
+//var libxml = require('./libxmljs');
var testNHP = true; //Should node-htmlparser be exercised?
-var testLXJS = true; //Should libxmljs be exercised?
+var testLXJS = false; //Should libxmljs be exercised?
var testIterations = 100; //Number of test loops to run
-var testHost = "nodejs.org"; //Host to fetch test HTML from
+var testHost = "localhost"; //Host to fetch test HTML from
var testPort = 80; //Port on host to fetch test HTML from
-var testPath = "/api.html"; //Path on host to fetch HTML from
+var testPath = "/~chris/feed.xml"; //Path on host to fetch HTML from
function getMillisecs () {
return((new Date()).getTime());
@@ -41,7 +41,7 @@ http.createClient(testPort, testHost)
if (err)
sys.debug("Error: " + err);
});
- var parser = new htmlparser.Parser(handler);
+ var parser = new htmlparser.Parser(handler, { includeLocation: true });
parser.parseComplete(html);
})
View
2  runtests.html
@@ -21,7 +21,7 @@
head.insertBefore(script, head.firstChild)
}
</script>
- <script language="JavaScript" src="lib/node-htmlparser.js"></script>
+ <script language="JavaScript" src="lib/htmlparser.js"></script>
<script language="JavaScript" src="tests/01-basic.js"></script>
<script language="JavaScript" src="tests/02-single_tag_1.js"></script>
<script language="JavaScript" src="tests/03-single_tag_2.js"></script>
View
2  runtests.js
@@ -21,7 +21,7 @@ IN THE SOFTWARE.
var sys = require("sys");
var fs = require("fs");
-var htmlparser = require("./lib/node-htmlparser");
+var htmlparser = require("./lib/htmlparser");
var testFolder = "./tests";
var chunkSize = 5;
View
2  runtests.min.html
@@ -21,7 +21,7 @@
head.insertBefore(script, head.firstChild)
}
</script>
- <script language="JavaScript" src="lib/node-htmlparser.min.js"></script>
+ <script language="JavaScript" src="lib/htmlparser.min.js"></script>
<script language="JavaScript" src="tests/01-basic.js"></script>
<script language="JavaScript" src="tests/02-single_tag_1.js"></script>
<script language="JavaScript" src="tests/03-single_tag_2.js"></script>
View
2  runtests.min.js
@@ -21,7 +21,7 @@ IN THE SOFTWARE.
var sys = require("sys");
var fs = require("fs");
-var htmlparser = require("./lib/node-htmlparser.min");
+var htmlparser = require("./lib/htmlparser.min");
var testFolder = "./tests";
var chunkSize = 5;
View
2  snippet.js
@@ -1,7 +1,7 @@
//node --prof --prof_auto profile.js
//deps/v8/tools/mac-tick-processor v8.log
var sys = require("sys");
-var htmlparser = require("./node-htmlparser");
+var htmlparser = require("./htmlparser");
var html = "<link>text</link>";
View
2  utils_example.js
@@ -1,7 +1,7 @@
//node --prof --prof_auto profile.js
//deps/v8/tools/mac-tick-processor v8.log
var sys = require("sys");
-var htmlparser = require("./lib/node-htmlparser");
+var htmlparser = require("./lib/htmlparser");
var html = "<a>text a</a><b id='x'>text b</b><c class='y'>text c</c><d id='z' class='w'><e>text e</e></d><g class='g h i'>hhh</g><yy>hellow</yy><yy id='secondyy'>world</yy>";
Please sign in to comment.
Something went wrong with that request. Please try again.