diff --git a/lib/htmlparser.js b/lib/htmlparser.js new file mode 100644 index 0000000..ac3625d --- /dev/null +++ b/lib/htmlparser.js @@ -0,0 +1,822 @@ +/*********************************************** +Copyright 2010, Chris Winberry . All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +***********************************************/ +/* v1.7.2 */ + +(function () { + +function runningInNode () { + return( + (typeof require) == "function" + && + (typeof exports) == "object" + && + (typeof module) == "object" + && + (typeof __filename) == "string" + && + (typeof __dirname) == "string" + ); +} + +if (!runningInNode()) { + if (!this.Tautologistics) + this.Tautologistics = {}; + else if (this.Tautologistics.NodeHtmlParser) + return; //NodeHtmlParser already defined! + this.Tautologistics.NodeHtmlParser = {}; + exports = this.Tautologistics.NodeHtmlParser; +} + +//Types of elements found in the DOM +var ElementType = { + Text: "text" //Plain text + , Directive: "directive" //Special tag + , Comment: "comment" //Special tag + , Script: "script" //Special tag + , Style: "style" //Special tag + , Tag: "tag" //Any tag that isn't special +} + +function Parser (handler, options) { + this._options = options ? options : { }; + if (this._options.includeLocation == undefined) { + this._options.includeLocation = false; //Do not track element position in document by default + } + + this.validateHandler(handler); + this._handler = handler; + this.reset(); +} + + //**"Static"**// + //Regular expressions used for cleaning up and parsing (stateless) + Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace + Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents + Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on + Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element + + //Regular expressions used for parsing (stateful) + Parser._reAttrib = //Find attributes in a tag + /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; + Parser._reTags = /[\<\>]/g; //Find tag markers + + //**Public**// + //Methods// + //Parses a complete HTML and pushes it to the handler + Parser.prototype.parseComplete = function Parser$parseComplete (data) { + this.reset(); + this.parseChunk(data); + this.done(); + } + + //Parses a piece of an HTML document + Parser.prototype.parseChunk = function Parser$parseChunk (data) { + if (this._done) + this.handleError(new Error("Attempted to parse chunk after parsing already done")); + this._buffer += data; //FIXME: this can be a bottleneck + this.parseTags(); + } + + //Tells the parser that the HTML being parsed is complete + Parser.prototype.done = function Parser$done () { + if (this._done) + return; + this._done = true; + + //Push any unparsed text into a final element in the element list + if (this._buffer.length) { + var rawData = this._buffer; + this._buffer = ""; + var element = { + raw: rawData + , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") + , type: this._parseState + }; + if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style) + element.name = this.parseTagName(element.data); + this.parseAttribs(element); + this._elements.push(element); + } + + this.writeHandler(); + this._handler.done(); + } + + //Resets the parser to a blank state, ready to parse a new HTML document + Parser.prototype.reset = function Parser$reset () { + this._buffer = ""; + this._done = false; + this._elements = []; + this._elementsCurrent = 0; + this._current = 0; + this._next = 0; + this._location = { + row: 0 + , col: 0 + , charOffset: 0 + , inBuffer: 0 + }; + this._parseState = ElementType.Text; + this._prevTagSep = ''; + this._tagStack = []; + this._handler.reset(); + } + + //**Private**// + //Properties// + Parser.prototype._options = null; //Parser options for how to behave + Parser.prototype._handler = null; //Handler for parsed elements + Parser.prototype._buffer = null; //Buffer of unparsed data + Parser.prototype._done = false; //Flag indicating whether parsing is done + Parser.prototype._elements = null; //Array of parsed elements + Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed + Parser.prototype._current = 0; //Position in data that has already been parsed + Parser.prototype._next = 0; //Position in data of the next tag marker (<>) + Parser.prototype._location = null; //Position tracking for elements in a stream + Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed + Parser.prototype._prevTagSep = ''; //Previous tag marker found + //Stack of element types previously encountered; keeps track of when + //parsing occurs inside a script/comment/style tag + Parser.prototype._tagStack = null; + + //Methods// + //Takes an array of elements and parses any found attributes + Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) { + var idxEnd = elements.length; + var idx = 0; + + while (idx < idxEnd) { + var element = elements[idx++]; + if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.style) + this.parseAttribs(element); + } + + return(elements); + } + + //Takes an element and adds an "attribs" property for any element attributes found + Parser.prototype.parseAttribs = function Parser$parseAttribs (element) { + //Only parse attributes for tags + if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag) + return; + + var tagName = element.data.split(Parser._reWhitespace, 1)[0]; + var attribRaw = element.data.substring(tagName.length); + if (attribRaw.length < 1) + return; + + var match; + Parser._reAttrib.lastIndex = 0; + while (match = Parser._reAttrib.exec(attribRaw)) { + if (element.attribs == undefined) + element.attribs = {}; + + if (typeof match[1] == "string" && match[1].length) { + element.attribs[match[1]] = match[2]; + } else if (typeof match[3] == "string" && match[3].length) { + element.attribs[match[3].toString()] = match[4].toString(); + } else if (typeof match[5] == "string" && match[5].length) { + element.attribs[match[5]] = match[6]; + } else if (typeof match[7] == "string" && match[7].length) { + element.attribs[match[7]] = match[7]; + } + } + } + + //Extracts the base tag name from the data value of an element + Parser.prototype.parseTagName = function Parser$parseTagName (data) { + if (data == null || data == "") + return(""); + var match = Parser._reTagName.exec(data); + if (!match) + return(""); + return((match[1] ? "/" : "") + match[2]); + } + + //Parses through HTML text and returns an array of found elements + //I admit, this function is rather large but splitting up had an noticeable impact on speed + Parser.prototype.parseTags = function Parser$parseTags () { + var bufferEnd = this._buffer.length - 1; + while (Parser._reTags.test(this._buffer)) { + this._next = Parser._reTags.lastIndex - 1; + var tagSep = this._buffer.charAt(this._next); //The currently found tag marker + var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse + + //A new element to eventually be appended to the element list + var element = { + raw: rawData + , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") + , type: this._parseState + }; + + var elementName = this.parseTagName(element.data); + + //This section inspects the current tag stack and modifies the current + //element if we're actually parsing a special area (script/comment/style tag) + if (this._tagStack.length) { //We're parsing inside a script/comment/style tag + if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag + if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack + this._tagStack.pop(); + else { //Not a closing script tag + if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment + //All data from here to script close is now a text element + element.type = ElementType.Text; + //If the previous element is text, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) { + var prevElement = this._elements[this._elements.length - 1]; + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; + element.raw = element.data = ""; //This causes the current element to not be added to the element list + } + } + } + } + else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag + if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack + this._tagStack.pop(); + else { + if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment + //All data from here to style close is now a text element + element.type = ElementType.Text; + //If the previous element is text, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) { + var prevElement = this._elements[this._elements.length - 1]; + if (element.raw != "") { + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; + element.raw = element.data = ""; //This causes the current element to not be added to the element list + } else { //Element is empty, so just append the last tag marker found + prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; + } + } else { //The previous element was not text + if (element.raw != "") { + element.raw = element.data = element.raw; + } + } + } + } + } + else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag + var rawLen = element.raw.length; + if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") { + //Actually, we're no longer in a style tag, so pop it off the stack + this._tagStack.pop(); + //If the previous element is a comment, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) { + var prevElement = this._elements[this._elements.length - 1]; + prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, ""); + element.raw = element.data = ""; //This causes the current element to not be added to the element list + element.type = ElementType.Text; + } + else //Previous element not a comment + element.type = ElementType.Comment; //Change the current element's type to a comment + } + else { //Still in a comment tag + element.type = ElementType.Comment; + //If the previous element is a comment, append the current text to it + if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) { + var prevElement = this._elements[this._elements.length - 1]; + prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep; + element.raw = element.data = ""; //This causes the current element to not be added to the element list + element.type = ElementType.Text; + } + else + element.raw = element.data = element.raw + tagSep; + } + } + } + + //Processing of non-special tags + if (element.type == ElementType.Tag) { + element.name = elementName; + + if (element.raw.indexOf("!--") == 0) { //This tag is really comment + element.type = ElementType.Comment; + delete element["name"]; + var rawLen = element.raw.length; + //Check if the comment is terminated in the current element + if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">") + element.raw = element.data = element.raw.replace(Parser._reTrimComment, ""); + else { //It's not so push the comment onto the tag stack + element.raw += tagSep; + this._tagStack.push(ElementType.Comment); + } + } + else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) { + element.type = ElementType.Directive; + //TODO: what about CDATA? + } + else if (element.name == "script") { + element.type = ElementType.Script; + //Special tag, push onto the tag stack if not terminated + if (element.data.charAt(element.data.length - 1) != "/") + this._tagStack.push(ElementType.Script); + } + else if (element.name == "/script") + element.type = ElementType.Script; + else if (element.name == "style") { + element.type = ElementType.Style; + //Special tag, push onto the tag stack if not terminated + if (element.data.charAt(element.data.length - 1) != "/") + this._tagStack.push(ElementType.Style); + } + else if (element.name == "/style") + element.type = ElementType.Style; + if (element.name && element.name.charAt(0) == "/") + element.data = element.name; + } + + //Add all tags and non-empty text elements to the element list + if (element.raw != "" || element.type != ElementType.Text) { + if (this._options.includeLocation && !element.location) { + element.location = this.getLocation(element.type == ElementType.Tag); + } + this.parseAttribs(element); + this._elements.push(element); + //If tag self-terminates, add an explicit, separate closing tag + if ( + element.type != ElementType.Text + && + element.type != ElementType.Comment + && + element.type != ElementType.Directive + && + element.data.charAt(element.data.length - 1) == "/" + ) + this._elements.push({ + raw: "/" + element.name + , data: "/" + element.name + , name: "/" + element.name + , type: element.type + }); + } + this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text; + this._current = this._next + 1; + this._prevTagSep = tagSep; + } + + if (this._options.includeLocation) { + this.getLocation(); + this._location.row += this._location.inBuffer; + this._location.inBuffer = 0; + this._location.charOffset = 0; + } + this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; + this._current = 0; + + this.writeHandler(); + } + + Parser.prototype.getLocation = function Parser$getLocation (startTag) { + var c, + l = this._location, + end = this._current - (startTag ? 1 : 0), + chunk = startTag && l.charOffset == 0 && this._current == 0; + + for (; l.charOffset < end; l.charOffset++) { + c = this._buffer.charAt(l.charOffset); + if (c == '\n') { + l.inBuffer++; + l.col = 0; + } else if (c != '\r') { + l.col++; + } + } + return { + line: l.row + l.inBuffer + 1 + , col: l.col + (chunk ? 0: 1) + }; + } + + //Checks the handler to make it is an object with the right "interface" + Parser.prototype.validateHandler = function Parser$validateHandler (handler) { + if ((typeof handler) != "object") + throw new Error("Handler is not an object"); + if ((typeof handler.reset) != "function") + throw new Error("Handler method 'reset' is invalid"); + if ((typeof handler.done) != "function") + throw new Error("Handler method 'done' is invalid"); + if ((typeof handler.writeTag) != "function") + throw new Error("Handler method 'writeTag' is invalid"); + if ((typeof handler.writeText) != "function") + throw new Error("Handler method 'writeText' is invalid"); + if ((typeof handler.writeComment) != "function") + throw new Error("Handler method 'writeComment' is invalid"); + if ((typeof handler.writeDirective) != "function") + throw new Error("Handler method 'writeDirective' is invalid"); + } + + //Writes parsed elements out to the handler + Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) { + forceFlush = !!forceFlush; + if (this._tagStack.length && !forceFlush) + return; + while (this._elements.length) { + var element = this._elements.shift(); + switch (element.type) { + case ElementType.Comment: + this._handler.writeComment(element); + break; + case ElementType.Directive: + this._handler.writeDirective(element); + break; + case ElementType.Text: + this._handler.writeText(element); + break; + default: + this._handler.writeTag(element); + break; + } + } + } + + Parser.prototype.handleError = function Parser$handleError (error) { + if ((typeof this._handler.error) == "function") + this._handler.error(error); + else + throw error; + } + +//TODO: make this a trully streamable handler +function RssHandler (callback) { + RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); +} +inherits(RssHandler, DefaultHandler); + + RssHandler.prototype.done = function RssHandler$done () { + var feed = { }; + var feedRoot; + + var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false); + if (found.length) { + feedRoot = found[0]; + } + if (feedRoot) { + if (feedRoot.name == "rss") { + feed.type = "rss"; + feedRoot = feedRoot.children[0]; // + feed.id = ""; + try { + feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data); + } catch (ex) { } + try { + feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + feed.items = []; + DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { + var entry = {}; + try { + entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data); + } catch (ex) { } + feed.items.push(entry); + }); + } else { + feed.type = "atom"; + try { + feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; + } catch (ex) { } + try { + feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data; + } catch (ex) { } + try { + feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data); + } catch (ex) { } + try { + feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data; + } catch (ex) { } + feed.items = []; + DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) { + var entry = {}; + try { + entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href; + } catch (ex) { } + try { + entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data; + } catch (ex) { } + try { + entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data); + } catch (ex) { } + feed.items.push(entry); + }); + } + + this.dom = feed; + } + RssHandler.super_.prototype.done.call(this); + } + +/////////////////////////////////////////////////// + +function DefaultHandler (callback, options) { + this.reset(); + this._options = options ? options : { }; + if (this._options.ignoreWhitespace == undefined) + this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes + if (this._options.verbose == undefined) + this._options.verbose = true; //Keep data property for tags and raw property for all + if (this._options.enforceEmptyTags == undefined) + this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec + if ((typeof callback) == "function") + this._callback = callback; +} + + //**"Static"**// + //HTML Tags that shouldn't contain child nodes + DefaultHandler._emptyTags = { + area: 1 + , base: 1 + , basefont: 1 + , br: 1 + , col: 1 + , frame: 1 + , hr: 1 + , img: 1 + , input: 1 + , isindex: 1 + , link: 1 + , meta: 1 + , param: 1 + , embed: 1 + } + //Regex to detect whitespace only text nodes + DefaultHandler.reWhitespace = /^\s*$/; + + //**Public**// + //Properties// + DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML + //Methods// + //Resets the handler back to starting state + DefaultHandler.prototype.reset = function DefaultHandler$reset() { + this.dom = []; + this._done = false; + this._tagStack = []; + this._tagStack.last = function DefaultHandler$_tagStack$last () { + return(this.length ? this[this.length - 1] : null); + } + } + //Signals the handler that parsing is done + DefaultHandler.prototype.done = function DefaultHandler$done () { + this._done = true; + this.handleCallback(null); + } + DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) { + this.handleElement(element); + } + DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) { + if (this._options.ignoreWhitespace) + if (DefaultHandler.reWhitespace.test(element.data)) + return; + this.handleElement(element); + } + DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) { + this.handleElement(element); + } + DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) { + this.handleElement(element); + } + DefaultHandler.prototype.error = function DefaultHandler$error (error) { + this.handleCallback(error); + } + + //**Private**// + //Properties// + DefaultHandler.prototype._options = null; //Handler options for how to behave + DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done + DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed + DefaultHandler.prototype._tagStack = null; //List of parents to the currently element being processed + //Methods// + DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) { + if ((typeof this._callback) != "function") + if (error) + throw error; + else + return; + this._callback(error, this.dom); + } + + DefaultHandler.prototype.isEmptyTag = function(element) { + var name = element.name.toLowerCase(); + if (name.charAt(0) == '/') { + name = name.substring(1); + } + return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name]; + }; + + DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) { + if (this._done) + this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); + if (!this._options.verbose) { +// element.raw = null; //FIXME: Not clean + //FIXME: Serious performance problem using delete + delete element.raw; + if (element.type == "tag" || element.type == "script" || element.type == "style") + delete element.data; + } + if (!this._tagStack.last()) { //There are no parent elements + //If the element can be a container, add it to the tag stack and the top level list + if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) { + if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag + this.dom.push(element); + if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children + this._tagStack.push(element); + } + } + } + else //Otherwise just add to the top level list + this.dom.push(element); + } + else { //There are parent elements + //If the element can be a container, add it as a child of the element + //on top of the tag stack and then add it to the tag stack + if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) { + if (element.name.charAt(0) == "/") { + //This is a closing tag, scan the tagStack to find the matching opening tag + //and pop the stack up to the opening tag's parent + var baseName = element.name.substring(1); + if (!this.isEmptyTag(element)) { + var pos = this._tagStack.length - 1; + while (pos > -1 && this._tagStack[pos--].name != baseName) { } + if (pos > -1 || this._tagStack[0].name == baseName) + while (pos < this._tagStack.length - 1) + this._tagStack.pop(); + } + } + else { //This is not a closing tag + if (!this._tagStack.last().children) + this._tagStack.last().children = []; + this._tagStack.last().children.push(element); + if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children + this._tagStack.push(element); + } + } + else { //This is not a container element + if (!this._tagStack.last().children) + this._tagStack.last().children = []; + this._tagStack.last().children.push(element); + } + } + } + + var DomUtils = { + testElement: function DomUtils$testElement (options, element) { + if (!element) { + return false; + } + + for (var key in options) { + if (key == "tag_name") { + if (element.type != "tag" && element.type != "script" && element.type != "style") { + return false; + } + if (!options["tag_name"](element.name)) { + return false; + } + } else if (key == "tag_type") { + if (!options["tag_type"](element.type)) { + return false; + } + } else if (key == "tag_contains") { + if (element.type != "text" && element.type != "comment" && element.type != "directive") { + return false; + } + if (!options["tag_contains"](element.data)) { + return false; + } + } else { + if (!element.attribs || !options[key](element.attribs[key])) { + return false; + } + } + } + + return true; + } + + , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) { + recurse = (recurse === undefined || recurse === null) || !!recurse; + limit = isNaN(parseInt(limit)) ? -1 : parseInt(limit); + + if (!currentElement) { + return([]); + } + + var found = []; + var elementList; + + function getTest (checkVal) { + return(function (value) { return(value == checkVal); }); + } + for (var key in options) { + if ((typeof options[key]) != "function") { + options[key] = getTest(options[key]); + } + } + + if (DomUtils.testElement(options, currentElement)) { + found.push(currentElement); + } + + if (limit >= 0 && found.length >= limit) { + return(found); + } + + if (recurse && currentElement.children) { + elementList = currentElement.children; + } else if (currentElement instanceof Array) { + elementList = currentElement; + } else { + return(found); + } + + for (var i = 0; i < elementList.length; i++) { + found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit)); + if (limit >= 0 && found.length >= limit) { + break; + } + } + + return(found); + } + + , getElementById: function DomUtils$getElementById (id, currentElement, recurse) { + var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1); + return(result.length ? result[0] : null); + } + + , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) { + return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit)); + } + + , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) { + return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit)); + } + } + + function inherits (ctor, superCtor) { + var tempCtor = function(){}; + tempCtor.prototype = superCtor.prototype; + ctor.super_ = superCtor; + ctor.prototype = new tempCtor(); + ctor.prototype.constructor = ctor; + } + +exports.Parser = Parser; + +exports.DefaultHandler = DefaultHandler; + +exports.RssHandler = RssHandler; + +exports.ElementType = ElementType; + +exports.DomUtils = DomUtils; + +})(); diff --git a/lib/htmlparser.min.js b/lib/htmlparser.min.js new file mode 100644 index 0000000..2f029f7 --- /dev/null +++ b/lib/htmlparser.min.js @@ -0,0 +1,22 @@ +/*********************************************** +Copyright 2010, Chris Winberry . All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to +deal in the Software without restriction, including without limitation the +rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +***********************************************/ +/* v1.7.2 */ +(function(){function e(a,c){this._options=c?c:{};if(this._options.includeLocation==undefined)this._options.includeLocation=false;this.validateHandler(a);this._handler=a;this.reset()}function n(a){n.super_.call(this,a,{ignoreWhitespace:true,verbose:false,enforceEmptyTags:false})}function i(a,c){this.reset();this._options=c?c:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags== undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require=="function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&&typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"}; e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()}; e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current= this._elementsCurrent=0;this._location={row:0,col:0,charOffset:0,inBuffer:0};this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._options=null;e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._location=null;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs= function(a){for(var c=a.length,b=0;b"){this._tagStack.pop(); if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=(g.raw+b.raw).replace(e._reTrimComment,"");b.raw=b.data="";b.type=d.Text}else b.type=d.Comment}else{b.type=d.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+b.raw+c;b.raw=b.data="";b.type=d.Text}else b.raw=b.data=b.raw+c}}if(b.type==d.Tag){b.name=h;if(b.raw.indexOf("!--")== 0){b.type=d.Comment;delete b.name;g=b.raw.length;if(b.raw.charAt(g-1)=="-"&&b.raw.charAt(g-2)=="-"&&c==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+=c;this._tagStack.push(d.Comment)}}else if(b.raw.indexOf("!")==0||b.raw.indexOf("?")==0)b.type=d.Directive;else if(b.name=="script"){b.type=d.Script;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Script)}else if(b.name=="/script")b.type=d.Script;else if(b.name=="style"){b.type=d.Style;b.data.charAt(b.data.length-1)!="/"&& this._tagStack.push(d.Style)}else if(b.name=="/style")b.type=d.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=d.Text){if(this._options.includeLocation&&!b.location)b.location=this.getLocation(b.type==d.Tag);this.parseAttribs(b);this._elements.push(b);b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type})}this._parseState=c=="<"?d.Tag:d.Text;this._current= this._next+1;this._prevTagSep=c}if(this._options.includeLocation){this.getLocation();this._location.row+=this._location.inBuffer;this._location.inBuffer=0;this._location.charOffset=0}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.getLocation=function(a){for(var c=this._location,b=this._current-(a?1:0),h=a&&c.charOffset==0&&this._current==0;c.charOffset-1&&this._tagStack[a--].name!=c;);if(a>-1||this._tagStack[0].name==c)for(;a=0&&l.length>=h)return l;if(b&&c.children)c=c.children;else if(c instanceof Array)c=c;else return l; for(m=0;m=0&&l.length>=h)break}return l},getElementById:function(a,c,b){a=f.getElements({id:a},c,b,1);return a.length?a[0]:null},getElementsByTagName:function(a,c,b,h){return f.getElements({tag_name:a},c,b,h)},getElementsByTagType:function(a,c,b,h){return f.getElements({tag_type:a},c,b,h)}};exports.Parser=e;exports.DefaultHandler=i;exports.RssHandler=n;exports.ElementType=d;exports.DomUtils=f})(); \ No newline at end of file diff --git a/lib/node-htmlparser.js b/lib/node-htmlparser.js index ac3625d..1fc03ea 100644 --- a/lib/node-htmlparser.js +++ b/lib/node-htmlparser.js @@ -1,822 +1,6 @@ -/*********************************************** -Copyright 2010, Chris Winberry . All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal in the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. -***********************************************/ -/* v1.7.2 */ - -(function () { - -function runningInNode () { - return( - (typeof require) == "function" - && - (typeof exports) == "object" - && - (typeof module) == "object" - && - (typeof __filename) == "string" - && - (typeof __dirname) == "string" - ); -} - -if (!runningInNode()) { - if (!this.Tautologistics) - this.Tautologistics = {}; - else if (this.Tautologistics.NodeHtmlParser) - return; //NodeHtmlParser already defined! - this.Tautologistics.NodeHtmlParser = {}; - exports = this.Tautologistics.NodeHtmlParser; -} - -//Types of elements found in the DOM -var ElementType = { - Text: "text" //Plain text - , Directive: "directive" //Special tag - , Comment: "comment" //Special tag - , Script: "script" //Special tag - , Style: "style" //Special tag - , Tag: "tag" //Any tag that isn't special -} - -function Parser (handler, options) { - this._options = options ? options : { }; - if (this._options.includeLocation == undefined) { - this._options.includeLocation = false; //Do not track element position in document by default - } - - this.validateHandler(handler); - this._handler = handler; - this.reset(); -} - - //**"Static"**// - //Regular expressions used for cleaning up and parsing (stateless) - Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace - Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents - Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on - Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element - - //Regular expressions used for parsing (stateful) - Parser._reAttrib = //Find attributes in a tag - /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; - Parser._reTags = /[\<\>]/g; //Find tag markers - - //**Public**// - //Methods// - //Parses a complete HTML and pushes it to the handler - Parser.prototype.parseComplete = function Parser$parseComplete (data) { - this.reset(); - this.parseChunk(data); - this.done(); - } - - //Parses a piece of an HTML document - Parser.prototype.parseChunk = function Parser$parseChunk (data) { - if (this._done) - this.handleError(new Error("Attempted to parse chunk after parsing already done")); - this._buffer += data; //FIXME: this can be a bottleneck - this.parseTags(); - } - - //Tells the parser that the HTML being parsed is complete - Parser.prototype.done = function Parser$done () { - if (this._done) - return; - this._done = true; - - //Push any unparsed text into a final element in the element list - if (this._buffer.length) { - var rawData = this._buffer; - this._buffer = ""; - var element = { - raw: rawData - , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") - , type: this._parseState - }; - if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style) - element.name = this.parseTagName(element.data); - this.parseAttribs(element); - this._elements.push(element); - } - - this.writeHandler(); - this._handler.done(); - } - - //Resets the parser to a blank state, ready to parse a new HTML document - Parser.prototype.reset = function Parser$reset () { - this._buffer = ""; - this._done = false; - this._elements = []; - this._elementsCurrent = 0; - this._current = 0; - this._next = 0; - this._location = { - row: 0 - , col: 0 - , charOffset: 0 - , inBuffer: 0 - }; - this._parseState = ElementType.Text; - this._prevTagSep = ''; - this._tagStack = []; - this._handler.reset(); - } - - //**Private**// - //Properties// - Parser.prototype._options = null; //Parser options for how to behave - Parser.prototype._handler = null; //Handler for parsed elements - Parser.prototype._buffer = null; //Buffer of unparsed data - Parser.prototype._done = false; //Flag indicating whether parsing is done - Parser.prototype._elements = null; //Array of parsed elements - Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed - Parser.prototype._current = 0; //Position in data that has already been parsed - Parser.prototype._next = 0; //Position in data of the next tag marker (<>) - Parser.prototype._location = null; //Position tracking for elements in a stream - Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed - Parser.prototype._prevTagSep = ''; //Previous tag marker found - //Stack of element types previously encountered; keeps track of when - //parsing occurs inside a script/comment/style tag - Parser.prototype._tagStack = null; - - //Methods// - //Takes an array of elements and parses any found attributes - Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) { - var idxEnd = elements.length; - var idx = 0; - - while (idx < idxEnd) { - var element = elements[idx++]; - if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.style) - this.parseAttribs(element); - } - - return(elements); - } - - //Takes an element and adds an "attribs" property for any element attributes found - Parser.prototype.parseAttribs = function Parser$parseAttribs (element) { - //Only parse attributes for tags - if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag) - return; - - var tagName = element.data.split(Parser._reWhitespace, 1)[0]; - var attribRaw = element.data.substring(tagName.length); - if (attribRaw.length < 1) - return; - - var match; - Parser._reAttrib.lastIndex = 0; - while (match = Parser._reAttrib.exec(attribRaw)) { - if (element.attribs == undefined) - element.attribs = {}; - - if (typeof match[1] == "string" && match[1].length) { - element.attribs[match[1]] = match[2]; - } else if (typeof match[3] == "string" && match[3].length) { - element.attribs[match[3].toString()] = match[4].toString(); - } else if (typeof match[5] == "string" && match[5].length) { - element.attribs[match[5]] = match[6]; - } else if (typeof match[7] == "string" && match[7].length) { - element.attribs[match[7]] = match[7]; - } - } - } - - //Extracts the base tag name from the data value of an element - Parser.prototype.parseTagName = function Parser$parseTagName (data) { - if (data == null || data == "") - return(""); - var match = Parser._reTagName.exec(data); - if (!match) - return(""); - return((match[1] ? "/" : "") + match[2]); - } - - //Parses through HTML text and returns an array of found elements - //I admit, this function is rather large but splitting up had an noticeable impact on speed - Parser.prototype.parseTags = function Parser$parseTags () { - var bufferEnd = this._buffer.length - 1; - while (Parser._reTags.test(this._buffer)) { - this._next = Parser._reTags.lastIndex - 1; - var tagSep = this._buffer.charAt(this._next); //The currently found tag marker - var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse - - //A new element to eventually be appended to the element list - var element = { - raw: rawData - , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") - , type: this._parseState - }; - - var elementName = this.parseTagName(element.data); - - //This section inspects the current tag stack and modifies the current - //element if we're actually parsing a special area (script/comment/style tag) - if (this._tagStack.length) { //We're parsing inside a script/comment/style tag - if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag - if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack - this._tagStack.pop(); - else { //Not a closing script tag - if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment - //All data from here to script close is now a text element - element.type = ElementType.Text; - //If the previous element is text, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) { - var prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - } - } - } - } - else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag - if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack - this._tagStack.pop(); - else { - if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment - //All data from here to style close is now a text element - element.type = ElementType.Text; - //If the previous element is text, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) { - var prevElement = this._elements[this._elements.length - 1]; - if (element.raw != "") { - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - } else { //Element is empty, so just append the last tag marker found - prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep; - } - } else { //The previous element was not text - if (element.raw != "") { - element.raw = element.data = element.raw; - } - } - } - } - } - else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag - var rawLen = element.raw.length; - if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") { - //Actually, we're no longer in a style tag, so pop it off the stack - this._tagStack.pop(); - //If the previous element is a comment, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) { - var prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, ""); - element.raw = element.data = ""; //This causes the current element to not be added to the element list - element.type = ElementType.Text; - } - else //Previous element not a comment - element.type = ElementType.Comment; //Change the current element's type to a comment - } - else { //Still in a comment tag - element.type = ElementType.Comment; - //If the previous element is a comment, append the current text to it - if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) { - var prevElement = this._elements[this._elements.length - 1]; - prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep; - element.raw = element.data = ""; //This causes the current element to not be added to the element list - element.type = ElementType.Text; - } - else - element.raw = element.data = element.raw + tagSep; - } - } - } - - //Processing of non-special tags - if (element.type == ElementType.Tag) { - element.name = elementName; - - if (element.raw.indexOf("!--") == 0) { //This tag is really comment - element.type = ElementType.Comment; - delete element["name"]; - var rawLen = element.raw.length; - //Check if the comment is terminated in the current element - if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">") - element.raw = element.data = element.raw.replace(Parser._reTrimComment, ""); - else { //It's not so push the comment onto the tag stack - element.raw += tagSep; - this._tagStack.push(ElementType.Comment); - } - } - else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) { - element.type = ElementType.Directive; - //TODO: what about CDATA? - } - else if (element.name == "script") { - element.type = ElementType.Script; - //Special tag, push onto the tag stack if not terminated - if (element.data.charAt(element.data.length - 1) != "/") - this._tagStack.push(ElementType.Script); - } - else if (element.name == "/script") - element.type = ElementType.Script; - else if (element.name == "style") { - element.type = ElementType.Style; - //Special tag, push onto the tag stack if not terminated - if (element.data.charAt(element.data.length - 1) != "/") - this._tagStack.push(ElementType.Style); - } - else if (element.name == "/style") - element.type = ElementType.Style; - if (element.name && element.name.charAt(0) == "/") - element.data = element.name; - } - - //Add all tags and non-empty text elements to the element list - if (element.raw != "" || element.type != ElementType.Text) { - if (this._options.includeLocation && !element.location) { - element.location = this.getLocation(element.type == ElementType.Tag); - } - this.parseAttribs(element); - this._elements.push(element); - //If tag self-terminates, add an explicit, separate closing tag - if ( - element.type != ElementType.Text - && - element.type != ElementType.Comment - && - element.type != ElementType.Directive - && - element.data.charAt(element.data.length - 1) == "/" - ) - this._elements.push({ - raw: "/" + element.name - , data: "/" + element.name - , name: "/" + element.name - , type: element.type - }); - } - this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text; - this._current = this._next + 1; - this._prevTagSep = tagSep; - } - - if (this._options.includeLocation) { - this.getLocation(); - this._location.row += this._location.inBuffer; - this._location.inBuffer = 0; - this._location.charOffset = 0; - } - this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; - this._current = 0; - - this.writeHandler(); - } - - Parser.prototype.getLocation = function Parser$getLocation (startTag) { - var c, - l = this._location, - end = this._current - (startTag ? 1 : 0), - chunk = startTag && l.charOffset == 0 && this._current == 0; - - for (; l.charOffset < end; l.charOffset++) { - c = this._buffer.charAt(l.charOffset); - if (c == '\n') { - l.inBuffer++; - l.col = 0; - } else if (c != '\r') { - l.col++; - } - } - return { - line: l.row + l.inBuffer + 1 - , col: l.col + (chunk ? 0: 1) - }; - } - - //Checks the handler to make it is an object with the right "interface" - Parser.prototype.validateHandler = function Parser$validateHandler (handler) { - if ((typeof handler) != "object") - throw new Error("Handler is not an object"); - if ((typeof handler.reset) != "function") - throw new Error("Handler method 'reset' is invalid"); - if ((typeof handler.done) != "function") - throw new Error("Handler method 'done' is invalid"); - if ((typeof handler.writeTag) != "function") - throw new Error("Handler method 'writeTag' is invalid"); - if ((typeof handler.writeText) != "function") - throw new Error("Handler method 'writeText' is invalid"); - if ((typeof handler.writeComment) != "function") - throw new Error("Handler method 'writeComment' is invalid"); - if ((typeof handler.writeDirective) != "function") - throw new Error("Handler method 'writeDirective' is invalid"); - } - - //Writes parsed elements out to the handler - Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) { - forceFlush = !!forceFlush; - if (this._tagStack.length && !forceFlush) - return; - while (this._elements.length) { - var element = this._elements.shift(); - switch (element.type) { - case ElementType.Comment: - this._handler.writeComment(element); - break; - case ElementType.Directive: - this._handler.writeDirective(element); - break; - case ElementType.Text: - this._handler.writeText(element); - break; - default: - this._handler.writeTag(element); - break; - } - } - } - - Parser.prototype.handleError = function Parser$handleError (error) { - if ((typeof this._handler.error) == "function") - this._handler.error(error); - else - throw error; - } - -//TODO: make this a trully streamable handler -function RssHandler (callback) { - RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); -} -inherits(RssHandler, DefaultHandler); - - RssHandler.prototype.done = function RssHandler$done () { - var feed = { }; - var feedRoot; - - var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false); - if (found.length) { - feedRoot = found[0]; - } - if (feedRoot) { - if (feedRoot.name == "rss") { - feed.type = "rss"; - feedRoot = feedRoot.children[0]; // - feed.id = ""; - try { - feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data); - } catch (ex) { } - try { - feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - feed.items = []; - DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) { - var entry = {}; - try { - entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data); - } catch (ex) { } - feed.items.push(entry); - }); - } else { - feed.type = "atom"; - try { - feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href; - } catch (ex) { } - try { - feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data; - } catch (ex) { } - try { - feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data); - } catch (ex) { } - try { - feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data; - } catch (ex) { } - feed.items = []; - DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) { - var entry = {}; - try { - entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href; - } catch (ex) { } - try { - entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data; - } catch (ex) { } - try { - entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data); - } catch (ex) { } - feed.items.push(entry); - }); - } - - this.dom = feed; - } - RssHandler.super_.prototype.done.call(this); - } - -/////////////////////////////////////////////////// - -function DefaultHandler (callback, options) { - this.reset(); - this._options = options ? options : { }; - if (this._options.ignoreWhitespace == undefined) - this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes - if (this._options.verbose == undefined) - this._options.verbose = true; //Keep data property for tags and raw property for all - if (this._options.enforceEmptyTags == undefined) - this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec - if ((typeof callback) == "function") - this._callback = callback; -} - - //**"Static"**// - //HTML Tags that shouldn't contain child nodes - DefaultHandler._emptyTags = { - area: 1 - , base: 1 - , basefont: 1 - , br: 1 - , col: 1 - , frame: 1 - , hr: 1 - , img: 1 - , input: 1 - , isindex: 1 - , link: 1 - , meta: 1 - , param: 1 - , embed: 1 - } - //Regex to detect whitespace only text nodes - DefaultHandler.reWhitespace = /^\s*$/; - - //**Public**// - //Properties// - DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML - //Methods// - //Resets the handler back to starting state - DefaultHandler.prototype.reset = function DefaultHandler$reset() { - this.dom = []; - this._done = false; - this._tagStack = []; - this._tagStack.last = function DefaultHandler$_tagStack$last () { - return(this.length ? this[this.length - 1] : null); - } - } - //Signals the handler that parsing is done - DefaultHandler.prototype.done = function DefaultHandler$done () { - this._done = true; - this.handleCallback(null); - } - DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) { - this.handleElement(element); - } - DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) { - if (this._options.ignoreWhitespace) - if (DefaultHandler.reWhitespace.test(element.data)) - return; - this.handleElement(element); - } - DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) { - this.handleElement(element); - } - DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) { - this.handleElement(element); - } - DefaultHandler.prototype.error = function DefaultHandler$error (error) { - this.handleCallback(error); - } - - //**Private**// - //Properties// - DefaultHandler.prototype._options = null; //Handler options for how to behave - DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done - DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed - DefaultHandler.prototype._tagStack = null; //List of parents to the currently element being processed - //Methods// - DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) { - if ((typeof this._callback) != "function") - if (error) - throw error; - else - return; - this._callback(error, this.dom); - } - - DefaultHandler.prototype.isEmptyTag = function(element) { - var name = element.name.toLowerCase(); - if (name.charAt(0) == '/') { - name = name.substring(1); - } - return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name]; - }; - - DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) { - if (this._done) - this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); - if (!this._options.verbose) { -// element.raw = null; //FIXME: Not clean - //FIXME: Serious performance problem using delete - delete element.raw; - if (element.type == "tag" || element.type == "script" || element.type == "style") - delete element.data; - } - if (!this._tagStack.last()) { //There are no parent elements - //If the element can be a container, add it to the tag stack and the top level list - if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) { - if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag - this.dom.push(element); - if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children - this._tagStack.push(element); - } - } - } - else //Otherwise just add to the top level list - this.dom.push(element); - } - else { //There are parent elements - //If the element can be a container, add it as a child of the element - //on top of the tag stack and then add it to the tag stack - if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) { - if (element.name.charAt(0) == "/") { - //This is a closing tag, scan the tagStack to find the matching opening tag - //and pop the stack up to the opening tag's parent - var baseName = element.name.substring(1); - if (!this.isEmptyTag(element)) { - var pos = this._tagStack.length - 1; - while (pos > -1 && this._tagStack[pos--].name != baseName) { } - if (pos > -1 || this._tagStack[0].name == baseName) - while (pos < this._tagStack.length - 1) - this._tagStack.pop(); - } - } - else { //This is not a closing tag - if (!this._tagStack.last().children) - this._tagStack.last().children = []; - this._tagStack.last().children.push(element); - if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children - this._tagStack.push(element); - } - } - else { //This is not a container element - if (!this._tagStack.last().children) - this._tagStack.last().children = []; - this._tagStack.last().children.push(element); - } - } - } - - var DomUtils = { - testElement: function DomUtils$testElement (options, element) { - if (!element) { - return false; - } - - for (var key in options) { - if (key == "tag_name") { - if (element.type != "tag" && element.type != "script" && element.type != "style") { - return false; - } - if (!options["tag_name"](element.name)) { - return false; - } - } else if (key == "tag_type") { - if (!options["tag_type"](element.type)) { - return false; - } - } else if (key == "tag_contains") { - if (element.type != "text" && element.type != "comment" && element.type != "directive") { - return false; - } - if (!options["tag_contains"](element.data)) { - return false; - } - } else { - if (!element.attribs || !options[key](element.attribs[key])) { - return false; - } - } - } - - return true; - } - - , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) { - recurse = (recurse === undefined || recurse === null) || !!recurse; - limit = isNaN(parseInt(limit)) ? -1 : parseInt(limit); - - if (!currentElement) { - return([]); - } - - var found = []; - var elementList; - - function getTest (checkVal) { - return(function (value) { return(value == checkVal); }); - } - for (var key in options) { - if ((typeof options[key]) != "function") { - options[key] = getTest(options[key]); - } - } - - if (DomUtils.testElement(options, currentElement)) { - found.push(currentElement); - } - - if (limit >= 0 && found.length >= limit) { - return(found); - } - - if (recurse && currentElement.children) { - elementList = currentElement.children; - } else if (currentElement instanceof Array) { - elementList = currentElement; - } else { - return(found); - } - - for (var i = 0; i < elementList.length; i++) { - found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit)); - if (limit >= 0 && found.length >= limit) { - break; - } - } - - return(found); - } - - , getElementById: function DomUtils$getElementById (id, currentElement, recurse) { - var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1); - return(result.length ? result[0] : null); - } - - , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) { - return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit)); - } - - , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) { - return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit)); - } - } - - function inherits (ctor, superCtor) { - var tempCtor = function(){}; - tempCtor.prototype = superCtor.prototype; - ctor.super_ = superCtor; - ctor.prototype = new tempCtor(); - ctor.prototype.constructor = ctor; - } - -exports.Parser = Parser; - -exports.DefaultHandler = DefaultHandler; - -exports.RssHandler = RssHandler; - -exports.ElementType = ElementType; - -exports.DomUtils = DomUtils; - -})(); +var htmlparser = require("./htmlparser"); +exports.Parser = htmlparser.Parser; +exports.DefaultHandler = htmlparser.DefaultHandler; +exports.RssHandler = htmlparser.RssHandler; +exports.ElementType = htmlparser.ElementType; +exports.DomUtils = htmlparser.DomUtils; diff --git a/lib/node-htmlparser.min.js b/lib/node-htmlparser.min.js index 2f029f7..27d5eea 100644 --- a/lib/node-htmlparser.min.js +++ b/lib/node-htmlparser.min.js @@ -1,22 +1,6 @@ -/*********************************************** -Copyright 2010, Chris Winberry . All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal in the Software without restriction, including without limitation the -rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -sell copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -IN THE SOFTWARE. -***********************************************/ -/* v1.7.2 */ -(function(){function e(a,c){this._options=c?c:{};if(this._options.includeLocation==undefined)this._options.includeLocation=false;this.validateHandler(a);this._handler=a;this.reset()}function n(a){n.super_.call(this,a,{ignoreWhitespace:true,verbose:false,enforceEmptyTags:false})}function i(a,c){this.reset();this._options=c?c:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags== undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require=="function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&&typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"}; e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()}; e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current= this._elementsCurrent=0;this._location={row:0,col:0,charOffset:0,inBuffer:0};this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._options=null;e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._location=null;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs= function(a){for(var c=a.length,b=0;b"){this._tagStack.pop(); if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=(g.raw+b.raw).replace(e._reTrimComment,"");b.raw=b.data="";b.type=d.Text}else b.type=d.Comment}else{b.type=d.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+b.raw+c;b.raw=b.data="";b.type=d.Text}else b.raw=b.data=b.raw+c}}if(b.type==d.Tag){b.name=h;if(b.raw.indexOf("!--")== 0){b.type=d.Comment;delete b.name;g=b.raw.length;if(b.raw.charAt(g-1)=="-"&&b.raw.charAt(g-2)=="-"&&c==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+=c;this._tagStack.push(d.Comment)}}else if(b.raw.indexOf("!")==0||b.raw.indexOf("?")==0)b.type=d.Directive;else if(b.name=="script"){b.type=d.Script;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Script)}else if(b.name=="/script")b.type=d.Script;else if(b.name=="style"){b.type=d.Style;b.data.charAt(b.data.length-1)!="/"&& this._tagStack.push(d.Style)}else if(b.name=="/style")b.type=d.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=d.Text){if(this._options.includeLocation&&!b.location)b.location=this.getLocation(b.type==d.Tag);this.parseAttribs(b);this._elements.push(b);b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type})}this._parseState=c=="<"?d.Tag:d.Text;this._current= this._next+1;this._prevTagSep=c}if(this._options.includeLocation){this.getLocation();this._location.row+=this._location.inBuffer;this._location.inBuffer=0;this._location.charOffset=0}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.getLocation=function(a){for(var c=this._location,b=this._current-(a?1:0),h=a&&c.charOffset==0&&this._current==0;c.charOffset-1&&this._tagStack[a--].name!=c;);if(a>-1||this._tagStack[0].name==c)for(;a=0&&l.length>=h)return l;if(b&&c.children)c=c.children;else if(c instanceof Array)c=c;else return l; for(m=0;m=0&&l.length>=h)break}return l},getElementById:function(a,c,b){a=f.getElements({id:a},c,b,1);return a.length?a[0]:null},getElementsByTagName:function(a,c,b,h){return f.getElements({tag_name:a},c,b,h)},getElementsByTagType:function(a,c,b,h){return f.getElements({tag_type:a},c,b,h)}};exports.Parser=e;exports.DefaultHandler=i;exports.RssHandler=n;exports.ElementType=d;exports.DomUtils=f})(); \ No newline at end of file +var htmlparser = require("./htmlparser.min"); +exports.Parser = htmlparser.Parser; +exports.DefaultHandler = htmlparser.DefaultHandler; +exports.RssHandler = htmlparser.RssHandler; +exports.ElementType = htmlparser.ElementType; +exports.DomUtils = htmlparser.DomUtils; diff --git a/package.json b/package.json index 198d1dd..b395c90 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "htmlparser" , "description": "Forgiving HTML/XML/RSS Parser in JS for *both* Node and Browsers" - , "version": "1.7.2" + , "version": "1.7.3" , "author": "Chris Winberry " , "contributors": [] , "repository": { @@ -12,9 +12,9 @@ "mail": "chris@winberry.net" , "web": "http://github.com/tautologistics/node-htmlparser/issues" } - , "os": [ "linux", "darwin", "freebsd" ] + , "os": [ "linux", "darwin", "freebsd", "win32" ] , "directories": { "lib": "./lib/" } - , "main": "./lib/node-htmlparser" + , "main": "./lib/htmlparser" , "engines": { "node": ">=0.1.33" } , "licenses": [{ "type": "MIT" diff --git a/profile.js b/profile.js index c5a474e..f9d0ef2 100644 --- a/profile.js +++ b/profile.js @@ -3,16 +3,16 @@ var sys = require("sys"); var fs = require("fs"); var http = require("http"); -var htmlparser = require("./node-htmlparser"); -var libxml = require('./libxmljs'); +var htmlparser = require("./lib/htmlparser"); +//var libxml = require('./libxmljs'); var testNHP = true; //Should node-htmlparser be exercised? -var testLXJS = true; //Should libxmljs be exercised? +var testLXJS = false; //Should libxmljs be exercised? var testIterations = 100; //Number of test loops to run -var testHost = "nodejs.org"; //Host to fetch test HTML from +var testHost = "localhost"; //Host to fetch test HTML from var testPort = 80; //Port on host to fetch test HTML from -var testPath = "/api.html"; //Path on host to fetch HTML from +var testPath = "/~chris/feed.xml"; //Path on host to fetch HTML from function getMillisecs () { return((new Date()).getTime()); @@ -41,7 +41,7 @@ http.createClient(testPort, testHost) if (err) sys.debug("Error: " + err); }); - var parser = new htmlparser.Parser(handler); + var parser = new htmlparser.Parser(handler, { includeLocation: true }); parser.parseComplete(html); }) diff --git a/runtests.html b/runtests.html index 13a0ebc..e89702d 100644 --- a/runtests.html +++ b/runtests.html @@ -21,7 +21,7 @@ head.insertBefore(script, head.firstChild) } - + diff --git a/runtests.js b/runtests.js index d1c87b8..e906fe4 100644 --- a/runtests.js +++ b/runtests.js @@ -21,7 +21,7 @@ IN THE SOFTWARE. var sys = require("sys"); var fs = require("fs"); -var htmlparser = require("./lib/node-htmlparser"); +var htmlparser = require("./lib/htmlparser"); var testFolder = "./tests"; var chunkSize = 5; diff --git a/runtests.min.html b/runtests.min.html index 8be48ac..73ea4c7 100644 --- a/runtests.min.html +++ b/runtests.min.html @@ -21,7 +21,7 @@ head.insertBefore(script, head.firstChild) } - + diff --git a/runtests.min.js b/runtests.min.js index fc8ff3d..df33736 100644 --- a/runtests.min.js +++ b/runtests.min.js @@ -21,7 +21,7 @@ IN THE SOFTWARE. var sys = require("sys"); var fs = require("fs"); -var htmlparser = require("./lib/node-htmlparser.min"); +var htmlparser = require("./lib/htmlparser.min"); var testFolder = "./tests"; var chunkSize = 5; diff --git a/snippet.js b/snippet.js index 2f54b36..9448ea3 100644 --- a/snippet.js +++ b/snippet.js @@ -1,7 +1,7 @@ //node --prof --prof_auto profile.js //deps/v8/tools/mac-tick-processor v8.log var sys = require("sys"); -var htmlparser = require("./node-htmlparser"); +var htmlparser = require("./htmlparser"); var html = "text"; diff --git a/utils_example.js b/utils_example.js index 9f6deca..d219de5 100644 --- a/utils_example.js +++ b/utils_example.js @@ -1,7 +1,7 @@ //node --prof --prof_auto profile.js //deps/v8/tools/mac-tick-processor v8.log var sys = require("sys"); -var htmlparser = require("./lib/node-htmlparser"); +var htmlparser = require("./lib/htmlparser"); var html = "text atext btext ctext ehhhhellowworld";