Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Fully working 2.0 but still need browser testing page

  • Loading branch information...
commit 3d236116a4aeb680aaa05865e8b4db369c3f836f 1 parent 479f26a
@tautologistics authored
Showing with 2,766 additions and 2,254 deletions.
  1. +13 −2 CHANGELOG
  2. +896 −780 lib/htmlparser.js
  3. +0 −22 lib/htmlparser.min.js
  4. +244 −52 runtests.js
  5. +0 −108 runtests.min.html
  6. +0 −75 runtests.min.js
  7. +0 −61 tests/01-basic.js
  8. +0 −39 tests/02-single_tag_1.js
  9. +0 −40 tests/03-single_tag_2.js
  10. +0 −56 tests/04-unescaped_in_script.js
  11. +0 −48 tests/05-tags_in_comment.js
  12. +0 −48 tests/06-comment_in_script.js
  13. +0 −49 tests/07-unescaped_in_style.js
  14. +0 −49 tests/08-extra_spaces_in_tag.js
  15. +0 −49 tests/09-unquoted_attrib.js
  16. +0 −43 tests/10-singular_attribute.js
  17. +0 −50 tests/11-text_outside_tags.js
  18. +0 −41 tests/12-text_only.js
  19. +0 −49 tests/13-comment_in_text.js
  20. +0 −57 tests/14-comment_in_text_in_script.js
  21. +0 −46 tests/15-non-verbose.js
  22. +0 −71 tests/16-ignore_whitespace.js
  23. +0 −38 tests/17-xml_namespace.js
  24. +0 −40 tests/18-enforce_empty_tags.js
  25. +0 −41 tests/19-ignore_empty_tags.js
  26. +0 −120 tests/20-rss.js
  27. +0 −80 tests/21-atom.js
  28. +0 −100 tests/22-position_data.js
  29. +518 −0 tests/html.js
  30. +922 −0 tests/parser.js
  31. +173 −0 tests/rss.js
View
15 CHANGELOG
@@ -1,5 +1,16 @@
-v1.8.0
- *
+v2.0.0
+ * Brand new parser, handles edge cases old parser did not
+ * Parser handlers renamed to builders
+ * Builder method signature simplified
+ * Moved element position calculation to builders for efficiency
+ * Added case-sensitivity options for tag and attribute names
+ * Parser output minimized (unnecessary values removed)
+ * Element attribute list renamed from attribs to attributes
+ * Node types consolidated; "script" and "style" moved to "tag"
+ * An order of magnitude more tests, with many targeting the parser rather than just the builders
+ * Tests consolidated into single files per test type (e.g. parser tests, html tests, rss tests)
+ * Testing code rewritten (e.g. direct object comparator instead of comparison of an object's JSON)
+ * Brand new bugs! (not sure what they are yet but I am sure there are at least a few)
v1.7.6
* Removed "os" entry from package.json
View
1,676 lib/htmlparser.js
@@ -1,5 +1,5 @@
/***********************************************
-Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
+Copyright 2010 - 2012 Chris Winberry <chris@winberry.net>. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
@@ -18,805 +18,921 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***********************************************/
-/* v1.7.6 */
+/* v2.0.0 */
(function () {
function runningInNode () {
- return(
- (typeof require) == "function"
- &&
- (typeof exports) == "object"
- &&
- (typeof module) == "object"
- &&
- (typeof __filename) == "string"
- &&
- (typeof __dirname) == "string"
- );
+ return(
+ (typeof require) == "function"
+ &&
+ (typeof exports) == "object"
+ &&
+ (typeof module) == "object"
+ &&
+ (typeof __filename) == "string"
+ &&
+ (typeof __dirname) == "string"
+ );
}
if (!runningInNode()) {
- if (!this.Tautologistics)
- this.Tautologistics = {};
- else if (this.Tautologistics.NodeHtmlParser)
- return; //NodeHtmlParser already defined!
- this.Tautologistics.NodeHtmlParser = {};
- exports = this.Tautologistics.NodeHtmlParser;
+ if (!this.Tautologistics) {
+ this.Tautologistics = {};
+ } else if (this.Tautologistics.NodeHtmlParser) {
+ return; //NodeHtmlParser already defined!
+ }
+ this.Tautologistics.NodeHtmlParser = {};
+ exports = this.Tautologistics.NodeHtmlParser;
}
-//Types of elements found in the DOM
-var ElementType = {
- Text: "text" //Plain text
- , Directive: "directive" //Special tag <!...>
- , Comment: "comment" //Special tag <!--...-->
- , Script: "script" //Special tag <script>...</script>
- , Style: "style" //Special tag <style>...</style>
- , Tag: "tag" //Any tag that isn't special
+function inherits (ctor, superCtor) {
+ var tempCtor = function(){};
+ tempCtor.prototype = superCtor.prototype;
+ ctor.super_ = superCtor;
+ ctor.prototype = new tempCtor();
+ ctor.prototype.constructor = ctor;
}
-function Parser (handler, options) {
- this._options = options ? options : { };
- if (this._options.includeLocation == undefined) {
- this._options.includeLocation = false; //Do not track element position in document by default
- }
-
- this.validateHandler(handler);
- this._handler = handler;
- this.reset();
+var Mode = {
+ Text: 'text',
+ Tag: 'tag',
+ Attr: 'attr',
+ CData: 'cdata',
+ Comment: 'comment'
+};
+
+function Parser (builder, options) {
+ this._options = options ? options : { };
+ // if (this._options.includeLocation === undefined) {
+ // this._options.includeLocation = false; //Include position of element (row, col) on nodes
+ // }
+ this._validateBuilder(builder);
+ var self = this;
+ this._builder = builder;
+ this.reset();
}
- //**"Static"**//
- //Regular expressions used for cleaning up and parsing (stateless)
- Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
- Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents
- Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
- Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element
-
- //Regular expressions used for parsing (stateful)
- Parser._reAttrib = //Find attributes in a tag
- /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
- Parser._reTags = /[\<\>]/g; //Find tag markers
-
- //**Public**//
- //Methods//
- //Parses a complete HTML and pushes it to the handler
- Parser.prototype.parseComplete = function Parser$parseComplete (data) {
- this.reset();
- this.parseChunk(data);
- this.done();
- }
-
- //Parses a piece of an HTML document
- Parser.prototype.parseChunk = function Parser$parseChunk (data) {
- if (this._done)
- this.handleError(new Error("Attempted to parse chunk after parsing already done"));
- this._buffer += data; //FIXME: this can be a bottleneck
- this.parseTags();
- }
-
- //Tells the parser that the HTML being parsed is complete
- Parser.prototype.done = function Parser$done () {
- if (this._done)
- return;
- this._done = true;
-
- //Push any unparsed text into a final element in the element list
- if (this._buffer.length) {
- var rawData = this._buffer;
- this._buffer = "";
- var element = {
- raw: rawData
- , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
- , type: this._parseState
- };
- if (this._parseState == ElementType.Tag || this._parseState == ElementType.Script || this._parseState == ElementType.Style)
- element.name = this.parseTagName(element.data);
- this.parseAttribs(element);
- this._elements.push(element);
- }
-
- this.writeHandler();
- this._handler.done();
- }
-
- //Resets the parser to a blank state, ready to parse a new HTML document
- Parser.prototype.reset = function Parser$reset () {
- this._buffer = "";
- this._done = false;
- this._elements = [];
- this._elementsCurrent = 0;
- this._current = 0;
- this._next = 0;
- this._location = {
- row: 0
- , col: 0
- , charOffset: 0
- , inBuffer: 0
- };
- this._parseState = ElementType.Text;
- this._prevTagSep = '';
- this._tagStack = [];
- this._handler.reset();
- }
-
- //**Private**//
- //Properties//
- Parser.prototype._options = null; //Parser options for how to behave
- Parser.prototype._handler = null; //Handler for parsed elements
- Parser.prototype._buffer = null; //Buffer of unparsed data
- Parser.prototype._done = false; //Flag indicating whether parsing is done
- Parser.prototype._elements = null; //Array of parsed elements
- Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed
- Parser.prototype._current = 0; //Position in data that has already been parsed
- Parser.prototype._next = 0; //Position in data of the next tag marker (<>)
- Parser.prototype._location = null; //Position tracking for elements in a stream
- Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
- Parser.prototype._prevTagSep = ''; //Previous tag marker found
- //Stack of element types previously encountered; keeps track of when
- //parsing occurs inside a script/comment/style tag
- Parser.prototype._tagStack = null;
-
- //Methods//
- //Takes an array of elements and parses any found attributes
- Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
- var idxEnd = elements.length;
- var idx = 0;
-
- while (idx < idxEnd) {
- var element = elements[idx++];
- if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.style)
- this.parseAttribs(element);
- }
-
- return(elements);
- }
-
- //Takes an element and adds an "attribs" property for any element attributes found
- Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
- //Only parse attributes for tags
- if (element.type != ElementType.Script && element.type != ElementType.Style && element.type != ElementType.Tag)
- return;
-
- var tagName = element.data.split(Parser._reWhitespace, 1)[0];
- var attribRaw = element.data.substring(tagName.length);
- if (attribRaw.length < 1)
- return;
-
- var match;
- Parser._reAttrib.lastIndex = 0;
- while (match = Parser._reAttrib.exec(attribRaw)) {
- if (element.attribs == undefined)
- element.attribs = {};
-
- if (typeof match[1] == "string" && match[1].length) {
- element.attribs[match[1]] = match[2];
- } else if (typeof match[3] == "string" && match[3].length) {
- element.attribs[match[3].toString()] = match[4].toString();
- } else if (typeof match[5] == "string" && match[5].length) {
- element.attribs[match[5]] = match[6];
- } else if (typeof match[7] == "string" && match[7].length) {
- element.attribs[match[7]] = match[7];
- }
- }
- }
-
- //Extracts the base tag name from the data value of an element
- Parser.prototype.parseTagName = function Parser$parseTagName (data) {
- if (data == null || data == "")
- return("");
- var match = Parser._reTagName.exec(data);
- if (!match)
- return("");
- return((match[1] ? "/" : "") + match[2]);
- }
-
- //Parses through HTML text and returns an array of found elements
- //I admit, this function is rather large but splitting up had an noticeable impact on speed
- Parser.prototype.parseTags = function Parser$parseTags () {
- var bufferEnd = this._buffer.length - 1;
- while (Parser._reTags.test(this._buffer)) {
- this._next = Parser._reTags.lastIndex - 1;
- var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
- var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
-
- //A new element to eventually be appended to the element list
- var element = {
- raw: rawData
- , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
- , type: this._parseState
- };
-
- var elementName = this.parseTagName(element.data);
-
- //This section inspects the current tag stack and modifies the current
- //element if we're actually parsing a special area (script/comment/style tag)
- if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
- if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
- if (elementName.toLowerCase() == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
- this._tagStack.pop();
- else { //Not a closing script tag
- if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
- //All data from here to script close is now a text element
- element.type = ElementType.Text;
- //If the previous element is text, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- }
- }
- }
- }
- else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
- if (elementName.toLowerCase() == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
- this._tagStack.pop();
- else {
- if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
- //All data from here to style close is now a text element
- element.type = ElementType.Text;
- //If the previous element is text, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
- var prevElement = this._elements[this._elements.length - 1];
- if (element.raw != "") {
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- } else { //Element is empty, so just append the last tag marker found
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
- }
- } else { //The previous element was not text
- if (element.raw != "") {
- element.raw = element.data = element.raw;
- }
- }
- }
- }
- }
- else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
- var rawLen = element.raw.length;
- if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
- //Actually, we're no longer in a style tag, so pop it off the stack
- this._tagStack.pop();
- //If the previous element is a comment, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- element.type = ElementType.Text;
- }
- else //Previous element not a comment
- element.type = ElementType.Comment; //Change the current element's type to a comment
- }
- else { //Still in a comment tag
- element.type = ElementType.Comment;
- //If the previous element is a comment, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- element.type = ElementType.Text;
- }
- else
- element.raw = element.data = element.raw + tagSep;
- }
- }
- }
-
- //Processing of non-special tags
- if (element.type == ElementType.Tag) {
- element.name = elementName;
- var elementNameCI = elementName.toLowerCase();
-
- if (element.raw.indexOf("!--") == 0) { //This tag is really comment
- element.type = ElementType.Comment;
- delete element["name"];
- var rawLen = element.raw.length;
- //Check if the comment is terminated in the current element
- if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
- element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
- else { //It's not so push the comment onto the tag stack
- element.raw += tagSep;
- this._tagStack.push(ElementType.Comment);
- }
- }
- else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
- element.type = ElementType.Directive;
- //TODO: what about CDATA?
- }
- else if (elementNameCI == "script") {
- element.type = ElementType.Script;
- //Special tag, push onto the tag stack if not terminated
- if (element.data.charAt(element.data.length - 1) != "/")
- this._tagStack.push(ElementType.Script);
- }
- else if (elementNameCI == "/script")
- element.type = ElementType.Script;
- else if (elementNameCI == "style") {
- element.type = ElementType.Style;
- //Special tag, push onto the tag stack if not terminated
- if (element.data.charAt(element.data.length - 1) != "/")
- this._tagStack.push(ElementType.Style);
- }
- else if (elementNameCI == "/style")
- element.type = ElementType.Style;
- if (element.name && element.name.charAt(0) == "/")
- element.data = element.name;
- }
-
- //Add all tags and non-empty text elements to the element list
- if (element.raw != "" || element.type != ElementType.Text) {
- if (this._options.includeLocation && !element.location) {
- element.location = this.getLocation(element.type == ElementType.Tag);
- }
- this.parseAttribs(element);
- this._elements.push(element);
- //If tag self-terminates, add an explicit, separate closing tag
- if (
- element.type != ElementType.Text
- &&
- element.type != ElementType.Comment
- &&
- element.type != ElementType.Directive
- &&
- element.data.charAt(element.data.length - 1) == "/"
- )
- this._elements.push({
- raw: "/" + element.name
- , data: "/" + element.name
- , name: "/" + element.name
- , type: element.type
- });
- }
- this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text;
- this._current = this._next + 1;
- this._prevTagSep = tagSep;
- }
-
- if (this._options.includeLocation) {
- this.getLocation();
- this._location.row += this._location.inBuffer;
- this._location.inBuffer = 0;
- this._location.charOffset = 0;
- }
- this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : "";
- this._current = 0;
-
- this.writeHandler();
- }
-
- Parser.prototype.getLocation = function Parser$getLocation (startTag) {
- var c,
- l = this._location,
- end = this._current - (startTag ? 1 : 0),
- chunk = startTag && l.charOffset == 0 && this._current == 0;
-
- for (; l.charOffset < end; l.charOffset++) {
- c = this._buffer.charAt(l.charOffset);
- if (c == '\n') {
- l.inBuffer++;
- l.col = 0;
- } else if (c != '\r') {
- l.col++;
- }
- }
- return {
- line: l.row + l.inBuffer + 1
- , col: l.col + (chunk ? 0: 1)
- };
- }
-
- //Checks the handler to make it is an object with the right "interface"
- Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
- if ((typeof handler) != "object")
- throw new Error("Handler is not an object");
- if ((typeof handler.reset) != "function")
- throw new Error("Handler method 'reset' is invalid");
- if ((typeof handler.done) != "function")
- throw new Error("Handler method 'done' is invalid");
- if ((typeof handler.writeTag) != "function")
- throw new Error("Handler method 'writeTag' is invalid");
- if ((typeof handler.writeText) != "function")
- throw new Error("Handler method 'writeText' is invalid");
- if ((typeof handler.writeComment) != "function")
- throw new Error("Handler method 'writeComment' is invalid");
- if ((typeof handler.writeDirective) != "function")
- throw new Error("Handler method 'writeDirective' is invalid");
- }
-
- //Writes parsed elements out to the handler
- Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
- forceFlush = !!forceFlush;
- if (this._tagStack.length && !forceFlush)
- return;
- while (this._elements.length) {
- var element = this._elements.shift();
- switch (element.type) {
- case ElementType.Comment:
- this._handler.writeComment(element);
- break;
- case ElementType.Directive:
- this._handler.writeDirective(element);
- break;
- case ElementType.Text:
- this._handler.writeText(element);
- break;
- default:
- this._handler.writeTag(element);
- break;
- }
- }
- }
-
- Parser.prototype.handleError = function Parser$handleError (error) {
- if ((typeof this._handler.error) == "function")
- this._handler.error(error);
- else
- throw error;
- }
-
-//TODO: make this a trully streamable handler
-function RssHandler (callback) {
- RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
-}
-inherits(RssHandler, DefaultHandler);
-
- RssHandler.prototype.done = function RssHandler$done () {
- var feed = { };
- var feedRoot;
-
- var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
- if (found.length) {
- feedRoot = found[0];
- }
- if (feedRoot) {
- if (feedRoot.name == "rss") {
- feed.type = "rss";
- feedRoot = feedRoot.children[0]; //<channel/>
- feed.id = "";
- try {
- feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data);
- } catch (ex) { }
- try {
- feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- feed.items = [];
- DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) {
- var entry = {};
- try {
- entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data);
- } catch (ex) { }
- feed.items.push(entry);
- });
- } else {
- feed.type = "atom";
- try {
- feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href;
- } catch (ex) { }
- try {
- feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data);
- } catch (ex) { }
- try {
- feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data;
- } catch (ex) { }
- feed.items = [];
- DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) {
- var entry = {};
- try {
- entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href;
- } catch (ex) { }
- try {
- entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data;
- } catch (ex) { }
- try {
- entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data);
- } catch (ex) { }
- feed.items.push(entry);
- });
- }
-
- this.dom = feed;
- }
- RssHandler.super_.prototype.done.call(this);
- }
-
-///////////////////////////////////////////////////
-
-function DefaultHandler (callback, options) {
- this.reset();
- this._options = options ? options : { };
- if (this._options.ignoreWhitespace == undefined)
- this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
- if (this._options.verbose == undefined)
- this._options.verbose = true; //Keep data property for tags and raw property for all
- if (this._options.enforceEmptyTags == undefined)
- this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
- if ((typeof callback) == "function")
- this._callback = callback;
+ //**Public**//
+ Parser.prototype.reset = function Parser$reset () {
+ this._state = {
+ mode: Mode.Text,
+ pos: 0,
+ data: null,
+ pendingText: null,
+ pendingWrite: null,
+ lastTag: null,
+ isScript: false,
+ needData: false,
+ output: [],
+ done: false//,
+ // line: 1,
+ // col: 1
+ };
+ this._builder.reset();
+ };
+
+ Parser.prototype.parseChunk = function Parser$parseChunk (chunk) {
+ this._state.needData = false;
+ this._state.data = (this._state.data !== null) ?
+ this._state.data.substr(this.pos) + chunk
+ :
+ chunk
+ ;
+ while (this._state.pos < this._state.data.length && !this._state.needData) {
+ this._parse(this._state);
+ }
+ };
+
+ Parser.prototype.parseComplete = function Parser$parseComplete (data) {
+ this.reset();
+ this.parseChunk(data);
+ this.done();
+ };
+
+ Parser.prototype.done = function Parser$done () {
+ this._state.done = true;
+ this._parse(this._state);
+ this._flushWrite();
+ this._builder.done();
+ };
+
+ //**Private**//
+ Parser.prototype._validateBuilder = function Parser$_validateBuilder (builder) {
+ if ((typeof builder) != "object") {
+ throw new Error("Builder is not an object");
+ }
+ if ((typeof builder.reset) != "function") {
+ throw new Error("Builder method 'reset' is invalid");
+ }
+ if ((typeof builder.done) != "function") {
+ throw new Error("Builder method 'done' is invalid");
+ }
+ if ((typeof builder.write) != "function") {
+ throw new Error("Builder method 'write' is invalid");
+ }
+ if ((typeof builder.error) != "function") {
+ throw new Error("Builder method 'error' is invalid");
+ }
+ };
+
+ Parser.prototype._parse = function Parser$_parse () {
+ switch (this._state.mode) {
+ case Mode.Text:
+ return this._parseText(this._state);
+ case Mode.Tag:
+ return this._parseTag(this._state);
+ case Mode.Attr:
+ return this._parseAttr(this._state);
+ case Mode.CData:
+ return this._parseCData(this._state);
+ case Mode.Comment:
+ return this._parseComment(this._state);
+ }
+ };
+
+ Parser.prototype._writePending = function Parser$_writePending (node) {
+ if (!this._state.pendingWrite) {
+ this._state.pendingWrite = [];
+ }
+ this._state.pendingWrite.push(node);
+ };
+
+ Parser.prototype._flushWrite = function Parser$_flushWrite () {
+ if (this._state.pendingWrite) {
+ for (var i = 0, len = this._state.pendingWrite.length; i < len; i++) {
+ var node = this._state.pendingWrite[i];
+ this._builder.write(node);
+ }
+ this._state.pendingWrite = null;
+ }
+ };
+
+ Parser.prototype._write = function Parser$_write (node) {
+ this._flushWrite();
+ this._builder.write(node);
+ };
+
+ Parser._re_parseText_scriptClose = /<\s*\/\s*script/ig;
+ Parser.prototype._parseText = function Parser$_parseText () {
+ var state = this._state;
+ var foundPos;
+ if (state.isScript) {
+ Parser._re_parseText_scriptClose.lastIndex = state.pos;
+ foundPos = Parser._re_parseText_scriptClose.exec(state.data);
+ foundPos = (foundPos) ?
+ foundPos.index
+ :
+ -1
+ ;
+ } else {
+ foundPos = state.data.indexOf('<', state.pos);
+ }
+ var text = (foundPos === -1) ? state.data.substring(state.pos, state.data.length) : state.data.substring(state.pos, foundPos);
+ if (foundPos < 0 && state.done) {
+ foundPos = state.data.length;
+ }
+ if (foundPos < 0) {
+ if (state.isScript) {
+ state.needData = true;
+ return;
+ }
+ if (!state.pendingText) {
+ state.pendingText = [];
+ }
+ state.pendingText.push(state.data.substring(state.pos, state.data.length));
+ state.pos = state.data.length;
+ } else {
+ if (state.pendingText) {
+ state.pendingText.push(state.data.substring(state.pos, foundPos));
+ text = state.pendingText.join('');
+ state.pendingText = null;
+ } else {
+ text = state.data.substring(state.pos, foundPos);
+ }
+ if (text !== '') {
+ this._write({ type: Mode.Text, data: text });
+ }
+ state.pos = foundPos + 1;
+ state.mode = Mode.Tag;
+ }
+ };
+
+ Parser.re_parseTag = /\s*(\/?)\s*([^\s>\/]+)(\s*)\??(>?)/g;
+ Parser.prototype._parseTag = function Parser$_parseTag () {
+ var state = this._state;
+ Parser.re_parseTag.lastIndex = state.pos;
+ var match = Parser.re_parseTag.exec(state.data);
+ if (match) {
+ if (!match[1] && match[2].substr(0, 3) === '!--') {
+ state.mode = Mode.Comment;
+ state.pos += 3;
+ return;
+ }
+ if (!match[1] && match[2].substr(0, 8) === '![CDATA[') {
+ state.mode = Mode.CData;
+ state.pos += 8;
+ return;
+ }
+ if (!state.done && (state.pos + match[0].length) === state.data.length) {
+ //We're at the end of the data, might be incomplete
+ state.needData = true;
+ return;
+ }
+ var raw;
+ if (match[4] === '>') {
+ state.mode = Mode.Text;
+ raw = match[0].substr(0, match[0].length - 1);
+ } else {
+ state.mode = Mode.Attr;
+ raw = match[0];
+ }
+ state.pos += match[0].length;
+ var tag = { type: Mode.Tag, name: match[1] + match[2], raw: raw };
+ if (state.mode === Mode.Attr) {
+ state.lastTag = tag;
+ }
+ if (tag.name.toLowerCase() === 'script') {
+ state.isScript = true;
+ } else if (tag.name.toLowerCase() === '/script') {
+ state.isScript = false;
+ }
+ if (state.mode === Mode.Attr) {
+ this._writePending(tag);
+ } else {
+ this._write(tag);
+ }
+ } else {
+ //TODO: end of tag?
+ //TODO: push to pending?
+ state.needData = true;
+ }
+ };
+
+ Parser.re_parseAttr_findName = /\s*([^=<>\s'"\/]+)\s*/g;
+ Parser.prototype._parseAttr_findName = function Parser$_parseAttr_findName () {
+ Parser.re_parseAttr_findName.lastIndex = this._state.pos;
+ var match = Parser.re_parseAttr_findName.exec(this._state.data);
+ if (!match) {
+ return null;
+ }
+ if (this._state.pos + match[0].length !== Parser.re_parseAttr_findName.lastIndex) {
+ return null;
+ }
+ return {
+ match: match[0]
+ , name: match[1]
+ };
+ };
+ Parser.re_parseAttr_findValue = /\s*=\s*(?:'([^']*)'|"([^"]*)"|([^'"\s\/>]+))\s*/g;
+ Parser.re_parseAttr_findValue_last = /\s*=\s*['"]?(.*)$/g;
+ Parser.prototype._parseAttr_findValue = function Parser$_parseAttr_findValue () {
+ var state = this._state;
+ Parser.re_parseAttr_findValue.lastIndex = state.pos;
+ var match = Parser.re_parseAttr_findValue.exec(state.data);
+ if (!match) {
+ if (!state.done) {
+ return null;
+ }
+ Parser.re_parseAttr_findValue_last.lastIndex = state.pos;
+ match = Parser.re_parseAttr_findValue_last.exec(state.data);
+ if (!match) {
+ return null;
+ }
+ return {
+ match: match[0]
+ , value: (match[1] !== '') ? match[1] : null
+ };
+ }
+ if (state.pos + match[0].length !== Parser.re_parseAttr_findValue.lastIndex) {
+ return null;
+ }
+ return {
+ match: match[0]
+ , value: match[1] || match[2] || match[3]
+ };
+ };
+ //Matches the start of a value: "=" with optional whitespace and an optional opening quote
+ Parser.re_parseAttr_splitValue = /\s*=\s*['"]?/g;
+ //Group 1 is a "/" with its surrounding whitespace, group 2 the ">" directly after it (if any)
+ Parser.re_parseAttr_selfClose = /(\s*\/\s*)(>?)/g;
+ //Parses the next attribute of the tag held in state.lastTag, or finishes the
+ //tag when no attribute name starts at the current position.
+ //Sets state.needData (undoing any advance of state.pos) whenever the buffered
+ //data may be the beginning of a longer token.
+ Parser.prototype._parseAttr = function Parser$_parseAttr () {
+     var state = this._state;
+     var name_data = this._parseAttr_findName();
+     if (!name_data || name_data.name === '?') {
+         //No attribute here: handle an optional self-closing "/" and then look for ">"
+         Parser.re_parseAttr_selfClose.lastIndex = state.pos;
+         var matchTrailingSlash = Parser.re_parseAttr_selfClose.exec(state.data);
+         if (matchTrailingSlash && matchTrailingSlash.index === state.pos) {
+             if (!state.done && !matchTrailingSlash[2] && state.pos + matchTrailingSlash[0].length === state.data.length) {
+                 //The "/" reaches the end of the buffered data without a ">"; ask for more
+                 state.needData = true;
+                 return;
+             }
+             state.lastTag.raw += matchTrailingSlash[1];
+             //A self-closing tag is reported as an immediate closing tag
+             this._write({ type: Mode.Tag, name: '/' + state.lastTag.name, raw: null });
+             state.pos += matchTrailingSlash[1].length;
+         }
+         var foundPos = state.data.indexOf('>', state.pos);
+         if (foundPos < 0) {
+             if (state.done) { //TODO: is this needed?
+                 state.lastTag.raw += state.data.substr(state.pos);
+                 state.pos = state.data.length;
+                 return;
+             }
+             state.needData = true;
+         } else {
+             state.pos = foundPos + 1;
+             state.mode = Mode.Text;
+         }
+         return;
+     }
+     if (!state.done && state.pos + name_data.match.length === state.data.length) {
+         //The name reaches the end of the buffered data; it may continue in the next chunk
+         state.needData = true;
+         return null;
+     }
+     state.pos += name_data.match.length;
+     var value_data = this._parseAttr_findValue();
+     if (value_data) {
+         if (!state.done && state.pos + value_data.match.length === state.data.length) {
+             //The value reaches the end of the buffered data; retry from the name later
+             state.needData = true;
+             state.pos -= name_data.match.length;
+             return;
+         }
+         state.pos += value_data.match.length;
+     } else {
+         Parser.re_parseAttr_splitValue.lastIndex = state.pos;
+         var matchSplit = Parser.re_parseAttr_splitValue.exec(state.data);
+         //Only an "=" directly at the current position means this attribute has a
+         //value that is still incomplete. The global regex would otherwise also hit
+         //an "=" belonging to a later attribute or tag and stall the parser.
+         if (matchSplit && matchSplit.index === state.pos) {
+             state.needData = true;
+             state.pos -= name_data.match.length;
+             return;
+         }
+         //Attribute without a value
+         value_data = {
+               match: ''
+             , value: null
+             };
+     }
+     state.lastTag.raw += name_data.match + value_data.match;
+
+     this._writePending({ type: Mode.Attr, name: name_data.name, data: value_data.value });
+ };
+
+ //Matches one or two "]" at the very end of the data (a possibly split "]]>")
+ Parser.re_parseCData_findEnding = /\]{1,2}$/;
+ //Consumes CDATA content from the current position up to "]]>". Text seen
+ //before the terminator arrives is collected in state.pendingText; once all
+ //data has arrived the end of the data acts as the terminator.
+ Parser.prototype._parseCData = function Parser$_parseCData () {
+     var state = this._state;
+     var data = state.data;
+     var endPos = data.indexOf(']]>', state.pos);
+     if (endPos < 0 && state.done) {
+         endPos = data.length;
+     }
+     if (endPos >= 0) {
+         //Terminator known: emit the node, including any text gathered earlier
+         var text = data.substring(state.pos, endPos);
+         if (state.pendingText) {
+             state.pendingText.push(text);
+             text = state.pendingText.join('');
+             state.pendingText = null;
+         }
+         this._write({ type: Mode.CData, data: text });
+         state.mode = Mode.Text;
+         state.pos = endPos + 3;
+         return;
+     }
+     Parser.re_parseCData_findEnding.lastIndex = state.pos;
+     if (Parser.re_parseCData_findEnding.exec(data)) {
+         //The data ends in "]" or "]]"; wait for more before consuming anything
+         state.needData = true;
+         return;
+     }
+     if (!state.pendingText) {
+         state.pendingText = [];
+     }
+     state.pendingText.push(data.substring(state.pos));
+     state.pos = data.length;
+     state.needData = true;
+ };
+
+ //Matches one or two "-" at the very end of the data (a possibly split "-->")
+ Parser.re_parseComment_findEnding = /\-{1,2}$/;
+ //Consumes comment content from the current position up to "-->". Text seen
+ //before the terminator arrives is collected in state.pendingText; once all
+ //data has arrived the end of the data acts as the terminator.
+ Parser.prototype._parseComment = function Parser$_parseComment () {
+     var state = this._state;
+     var data = state.data;
+     var endPos = data.indexOf('-->', state.pos);
+     if (endPos < 0 && state.done) {
+         endPos = data.length;
+     }
+     if (endPos >= 0) {
+         //Terminator known: emit the node, including any text gathered earlier
+         var text = data.substring(state.pos, endPos);
+         if (state.pendingText) {
+             state.pendingText.push(text);
+             text = state.pendingText.join('');
+             state.pendingText = null;
+         }
+         this._write({ type: Mode.Comment, data: text });
+         state.mode = Mode.Text;
+         state.pos = endPos + 3;
+         return;
+     }
+     Parser.re_parseComment_findEnding.lastIndex = state.pos;
+     if (Parser.re_parseComment_findEnding.exec(data)) {
+         //The data ends in "-" or "--"; wait for more before consuming anything
+         state.needData = true;
+         return;
+     }
+     if (!state.pendingText) {
+         state.pendingText = [];
+     }
+     state.pendingText.push(data.substring(state.pos));
+     state.pos = data.length;
+     state.needData = true;
+ };
+
+
+//Builds a DOM-like tree from the nodes a parser writes to it.
+//callback: optional function called as callback(error, dom) through handleCallback()
+//options: optional settings object; options left undefined receive the defaults below
+function HtmlBuilder (callback, options) {
+    this.reset();
+    this._options = options ? options : { };
+    var defaults = [
+          [ 'ignoreWhitespace', false ] //Keep whitespace-only text nodes
+        , [ 'includeLocation', false ] //Include position of element (row, col) on nodes
+        , [ 'verbose', true ] //Keep data property for tags and raw property for all
+        , [ 'enforceEmptyTags', true ] //Don't allow children for HTML tags defined as empty in spec
+        , [ 'caseSensitiveTags', false ] //Lowercase all tag names
+        , [ 'caseSensitiveAttr', false ] //Lowercase all attribute names
+        ];
+    for (var i = 0; i < defaults.length; i++) {
+        var optionName = defaults[i][0];
+        if (this._options[optionName] === undefined) {
+            this._options[optionName] = defaults[i][1];
+        }
+    }
+    if ((typeof callback) == "function") {
+        this._callback = callback;
+    }
}
- //**"Static"**//
- //HTML Tags that shouldn't contain child nodes
- DefaultHandler._emptyTags = {
- area: 1
- , base: 1
- , basefont: 1
- , br: 1
- , col: 1
- , frame: 1
- , hr: 1
- , img: 1
- , input: 1
- , isindex: 1
- , link: 1
- , meta: 1
- , param: 1
- , embed: 1
- }
- //Regex to detect whitespace only text nodes
- DefaultHandler.reWhitespace = /^\s*$/;
-
- //**Public**//
- //Properties//
- DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML
- //Methods//
- //Resets the handler back to starting state
- DefaultHandler.prototype.reset = function DefaultHandler$reset() {
- this.dom = [];
- this._done = false;
- this._tagStack = [];
- this._tagStack.last = function DefaultHandler$_tagStack$last () {
- return(this.length ? this[this.length - 1] : null);
- }
- }
- //Signals the handler that parsing is done
- DefaultHandler.prototype.done = function DefaultHandler$done () {
- this._done = true;
- this.handleCallback(null);
- }
- DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
- this.handleElement(element);
- }
- DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
- if (this._options.ignoreWhitespace)
- if (DefaultHandler.reWhitespace.test(element.data))
- return;
- this.handleElement(element);
- }
- DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
- this.handleElement(element);
- }
- DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
- this.handleElement(element);
- }
- DefaultHandler.prototype.error = function DefaultHandler$error (error) {
- this.handleCallback(error);
- }
-
- //**Private**//
- //Properties//
- DefaultHandler.prototype._options = null; //Handler options for how to behave
- DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
- DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed
- DefaultHandler.prototype._tagStack = null; //List of parents to the currently element being processed
- //Methods//
- DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
- if ((typeof this._callback) != "function")
- if (error)
- throw error;
- else
- return;
- this._callback(error, this.dom);
- }
-
- DefaultHandler.prototype.isEmptyTag = function(element) {
- var name = element.name.toLowerCase();
- if (name.charAt(0) == '/') {
- name = name.substring(1);
- }
- return this._options.enforceEmptyTags && !!DefaultHandler._emptyTags[name];
- };
-
- DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
- if (this._done)
- this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
- if (!this._options.verbose) {
-// element.raw = null; //FIXME: Not clean
- //FIXME: Serious performance problem using delete
- delete element.raw;
- if (element.type == "tag" || element.type == "script" || element.type == "style")
- delete element.data;
- }
- if (!this._tagStack.last()) { //There are no parent elements
- //If the element can be a container, add it to the tag stack and the top level list
- if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
- if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
- this.dom.push(element);
- if (!this.isEmptyTag(element)) { //Don't add tags to the tag stack that can't have children
- this._tagStack.push(element);
- }
- }
- }
- else //Otherwise just add to the top level list
- this.dom.push(element);
- }
- else { //There are parent elements
- //If the element can be a container, add it as a child of the element
- //on top of the tag stack and then add it to the tag stack
- if (element.type != ElementType.Text && element.type != ElementType.Comment && element.type != ElementType.Directive) {
- if (element.name.charAt(0) == "/") {
- //This is a closing tag, scan the tagStack to find the matching opening tag
- //and pop the stack up to the opening tag's parent
- var baseName = element.name.substring(1);
- if (!this.isEmptyTag(element)) {
- var pos = this._tagStack.length - 1;
- while (pos > -1 && this._tagStack[pos--].name != baseName) { }
- if (pos > -1 || this._tagStack[0].name == baseName)
- while (pos < this._tagStack.length - 1)
- this._tagStack.pop();
- }
- }
- else { //This is not a closing tag
- if (!this._tagStack.last().children)
- this._tagStack.last().children = [];
- this._tagStack.last().children.push(element);
- if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
- this._tagStack.push(element);
- }
- }
- else { //This is not a container element
- if (!this._tagStack.last().children)
- this._tagStack.last().children = [];
- this._tagStack.last().children.push(element);
- }
- }
- }
-
- var DomUtils = {
- testElement: function DomUtils$testElement (options, element) {
- if (!element) {
- return false;
- }
-
- for (var key in options) {
- if (key == "tag_name") {
- if (element.type != "tag" && element.type != "script" && element.type != "style") {
- return false;
- }
- if (!options["tag_name"](element.name)) {
- return false;
- }
- } else if (key == "tag_type") {
- if (!options["tag_type"](element.type)) {
- return false;
- }
- } else if (key == "tag_contains") {
- if (element.type != "text" && element.type != "comment" && element.type != "directive") {
- return false;
- }
- if (!options["tag_contains"](element.data)) {
- return false;
- }
- } else {
- if (!element.attribs || !options[key](element.attribs[key])) {
- return false;
- }
- }
- }
-
- return true;
- }
-
- , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
- recurse = (recurse === undefined || recurse === null) || !!recurse;
- limit = isNaN(parseInt(limit)) ? -1 : parseInt(limit);
-
- if (!currentElement) {
- return([]);
- }
-
- var found = [];
- var elementList;
-
- function getTest (checkVal) {
- return(function (value) { return(value == checkVal); });
- }
- for (var key in options) {
- if ((typeof options[key]) != "function") {
- options[key] = getTest(options[key]);
- }
- }
-
- if (DomUtils.testElement(options, currentElement)) {
- found.push(currentElement);
- }
-
- if (limit >= 0 && found.length >= limit) {
- return(found);
- }
-
- if (recurse && currentElement.children) {
- elementList = currentElement.children;
- } else if (currentElement instanceof Array) {
- elementList = currentElement;
- } else {
- return(found);
- }
-
- for (var i = 0; i < elementList.length; i++) {
- found = found.concat(DomUtils.getElements(options, elementList[i], recurse, limit));
- if (limit >= 0 && found.length >= limit) {
- break;
- }
- }
-
- return(found);
- }
-
- , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
- var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
- return(result.length ? result[0] : null);
- }
-
- , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
- return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
- }
-
- , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
- return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
- }
- }
-
- function inherits (ctor, superCtor) {
- var tempCtor = function(){};
- tempCtor.prototype = superCtor.prototype;
- ctor.super_ = superCtor;
- ctor.prototype = new tempCtor();
- ctor.prototype.constructor = ctor;
- }
+ //**"Static"**//
+ //Lowercase names of tags that must not receive child nodes; isEmptyTag()
+ //consults this map when the enforceEmptyTags option is set
+ HtmlBuilder._emptyTags = {
+ area: 1
+ , base: 1
+ , basefont: 1
+ , br: 1
+ , col: 1
+ , frame: 1
+ , hr: 1
+ , img: 1
+ , input: 1
+ , isindex: 1
+ , link: 1
+ , meta: 1
+ , param: 1
+ , embed: 1
+ , '?xml': 1
+ };
+ //Regex matching text that is empty or whitespace only; write() uses it to
+ //drop such text nodes when the ignoreWhitespace option is set
+ HtmlBuilder.reWhitespace = /^\s*$/;
+
+ //**Public**//
+ //Properties//
+ HtmlBuilder.prototype.dom = null; //The hierarchical object containing the parsed HTML
+ //Methods//
+ //Puts the builder back into its initial state: empty DOM, empty tag stack,
+ //no last tag, not done, and the location counter at line 1, column 1
+ HtmlBuilder.prototype.reset = function HtmlBuilder$reset() {
+     var stack = [];
+     //Convenience accessor: the innermost open tag, or null when none is open
+     stack.last = function HtmlBuilder$_tagStack$last () {
+         return(this.length ? this[this.length - 1] : null);
+     };
+     this.dom = [];
+     this._done = false;
+     this._tagStack = stack;
+     this._lastTag = null;
+     this._line = 1;
+     this._col = 1;
+ };
+ //Signals the builder that parsing is done: marks the builder as finished and
+ //reports the result through handleCallback() with no error
+ HtmlBuilder.prototype.done = function HtmlBuilder$done () {
+ this._done = true;
+ this.handleCallback(null);
+ };
+
+ //Reports an error through handleCallback(), which passes it to the callback
+ //or throws it when no callback function is set
+ HtmlBuilder.prototype.error = function HtmlBuilder$error (error) {
+ this.handleCallback(error);
+ };
+
+ //Delivers the outcome to the callback as callback(error, dom). Without a
+ //callback function an error is thrown instead and a success is dropped.
+ HtmlBuilder.prototype.handleCallback = function HtmlBuilder$handleCallback (error) {
+     if ((typeof this._callback) == "function") {
+         this._callback(error, this.dom);
+         return;
+     }
+     if (error) {
+         throw error;
+     }
+ };
+
+ //Tells whether a tag may not hold children. Names starting with "?" always
+ //count as empty; other names (a leading "/" is ignored) are looked up in
+ //HtmlBuilder._emptyTags, but only when the enforceEmptyTags option is set.
+ HtmlBuilder.prototype.isEmptyTag = function HtmlBuilder$isEmptyTag (element) {
+     var tagName = element.name.toLowerCase();
+     var firstChar = tagName.charAt(0);
+     if (firstChar == '?') {
+         return true;
+     }
+     if (firstChar == '/') {
+         tagName = tagName.substring(1);
+     }
+     return this._options.enforceEmptyTags && !!HtmlBuilder._emptyTags[tagName];
+ };
+
+ //Returns a new { line, col } object holding the builder's current position counters
+ HtmlBuilder.prototype._getLocation = function HtmlBuilder$_getLocation () {
+ return { line: this._line, col: this._col };
+ };
+
+ //Advances the line/column counters past a node. Tags are measured by their
+ //raw text, all other nodes by their data; a null text leaves the counters
+ //untouched. Only "\n" is treated as a line break.
+ HtmlBuilder.prototype._updateLocation = function HtmlBuilder$_updateLocation (node) {
+     var source = (node.type === Mode.Tag) ? node.raw : node.data;
+     if (source === null) {
+         return;
+     }
+     var segments = source.split("\n");
+     var lineBreaks = segments.length - 1;
+     if (lineBreaks > 0) {
+         this._line += lineBreaks;
+         this._col = 1;
+     }
+     this._col += segments[lineBreaks].length;
+     //Extra columns per node type; presumably the markup around the text
+     //("<" and ">", "<!--" and "-->", "<![CDATA[" and "]]>") - TODO confirm
+     switch (node.type) {
+         case Mode.Tag:
+             this._col += 2;
+             break;
+         case Mode.Comment:
+             this._col += 7;
+             break;
+         case Mode.CData:
+             this._col += 12;
+             break;
+     }
+ };
+
+ //Creates the node stored in the DOM from a node written by the parser.
+ //Copies type, raw (verbose mode only), name, data and location when they are
+ //present. Names are lowercased unless the matching case-sensitivity option is
+ //set: caseSensitiveAttr for attribute nodes, caseSensitiveTags for all others.
+ HtmlBuilder.prototype._copyElement = function HtmlBuilder$_copyElement (element) {
+     var copy = { type: element.type };
+
+     if (this._options.verbose && element['raw'] !== undefined) {
+         copy.raw = element.raw;
+     }
+     if (element['name'] !== undefined) {
+         var keepCase = (element.type === Mode.Attr) ?
+             this._options.caseSensitiveAttr
+             :
+             this._options.caseSensitiveTags
+             ;
+         copy.name = keepCase ? element.name : element.name.toLowerCase();
+     }
+     if (element['data'] !== undefined) {
+         copy.data = element.data;
+     }
+     if (element.location) {
+         copy.location = { line: element.location.line, col: element.location.col };
+     }
+
+     return copy;
+ };
+
+ //Adds one node written by the parser to the DOM being built.
+ // - Attribute nodes are stored in the attributes map of the most recently
+ //   opened tag.
+ // - Opening tags become children of the innermost open tag (or top level
+ //   nodes) and are pushed on the tag stack unless isEmptyTag() says they
+ //   cannot hold children.
+ // - Closing tags pop the tag stack back to the parent of the matching open
+ //   tag; closing tags without a matching open tag are ignored.
+ // - All other nodes become children of the innermost open tag, or top level
+ //   nodes when no tag is open.
+ HtmlBuilder.prototype.write = function HtmlBuilder$write (element) {
+     if (this._done) {
+         //The error is reported, but the node is still processed afterwards
+         this.handleCallback(new Error("Writing to the builder after done() called is not allowed without a reset()"));
+     }
+     if (this._options.includeLocation) {
+         if (element.type !== Mode.Attr) {
+             element.location = this._getLocation();
+             this._updateLocation(element);
+         }
+     }
+     if (element.type === Mode.Text && this._options.ignoreWhitespace) {
+         if (HtmlBuilder.reWhitespace.test(element.data)) {
+             return;
+         }
+     }
+     var parent = this._tagStack.last();
+     var node;
+
+     if (element.type === Mode.Attr && this._lastTag) {
+         //Attach to the last opened tag rather than the top of the tag stack:
+         //tags that cannot have children are never pushed on the stack, so the
+         //stack top would be their parent
+         if (!this._lastTag.attributes) {
+             this._lastTag.attributes = {};
+         }
+         this._lastTag.attributes[this._options.caseSensitiveAttr ? element.name : element.name.toLowerCase()] =
+             element.data;
+         return;
+     }
+
+     if (element.type !== Mode.Tag) {
+         //This is not a container element
+         node = this._copyElement(element);
+         if (!parent) {
+             this.dom.push(node);
+             return;
+         }
+         if (!parent.children) {
+             parent.children = [];
+         }
+         parent.children.push(node);
+         return;
+     }
+
+     if (element.name.charAt(0) == "/") {
+         //This is a closing tag; without open tags there is nothing to close
+         if (!parent || this.isEmptyTag(element)) {
+             return;
+         }
+         var baseName = this._options.caseSensitiveTags ?
+             element.name.substring(1)
+             :
+             element.name.substring(1).toLowerCase()
+             ;
+         //Scan the tag stack from the top for the matching opening tag
+         var pos = this._tagStack.length - 1;
+         while (pos > -1 && this._tagStack[pos].name != baseName) {
+             pos--;
+         }
+         //Pop the stack up to the opening tag's parent
+         if (pos > -1) {
+             while (this._tagStack.length > pos) {
+                 this._tagStack.pop();
+             }
+         }
+         return;
+     }
+
+     //This is an opening tag
+     node = this._copyElement(element);
+     if (parent) {
+         if (!parent.children) {
+             parent.children = [];
+         }
+         parent.children.push(node);
+     } else {
+         this.dom.push(node);
+     }
+     if (!this.isEmptyTag(node)) { //Don't add tags to the tag stack that can't have children
+         this._tagStack.push(node);
+     }
+     this._lastTag = node;
+ };
+
+
+ //**Private**//
+ //Properties//
+ HtmlBuilder.prototype._options = null; //Options controlling how the builder behaves
+ HtmlBuilder.prototype._callback = null; //Callback to invoke when parsing is done or fails
+ HtmlBuilder.prototype._done = false; //Flag indicating whether the builder has been told that parsing is complete
+ HtmlBuilder.prototype._tagStack = null; //List of parents of the element currently being processed
+ //Methods//
+
+
+//Builder for RSS and Atom feeds. Runs the HtmlBuilder constructor with fixed
+//options: whitespace-only text dropped, non-verbose output, no empty-tag
+//enforcement and case-sensitive tag names.
+function RssBuilder (callback) {
+ RssBuilder.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false, caseSensitiveTags: true });
+}
+//Sets up RssBuilder as a subtype of HtmlBuilder; the super_ reference used above
+//is presumably assigned by inherits() - TODO confirm
+inherits(RssBuilder, HtmlBuilder);
+
+ //Finishes parsing. When the top level of the DOM holds an <rss> or <feed>
+ //tag, this.dom is replaced by a feed object (type, id, title, link,
+ //description, updated, author, items) before the inherited done() runs.
+ //Every field is read on a best-effort basis: a field whose tag is missing
+ //is simply not set.
+ RssBuilder.prototype.done = function RssBuilder$done () {
+     //Copies one value from the first tagName element under source.children to
+     //target[key]: the named attribute when attrName is given, otherwise the
+     //data of the element's first child; wrapped in a Date when asDate is set.
+     //Any lookup failure leaves target[key] unset.
+     function pick (target, key, tagName, source, recurse, attrName, asDate) {
+         try {
+             var hit = DomUtils.getElementsByTagName(tagName, source.children, recurse)[0];
+             var value = attrName ? hit.attributes[attrName] : hit.children[0].data;
+             target[key] = asDate ? new Date(value) : value;
+         } catch (ex) { }
+     }
+
+     var feed = {};
+     var feedRoot;
+
+     var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
+     if (found.length) {
+         feedRoot = found[0];
+     }
+     if (feedRoot) {
+         if (feedRoot.name == "rss") {
+             feed.type = "rss";
+             feedRoot = feedRoot.children[0]; //<channel/>
+             feed.id = "";
+             pick(feed, "title", "title", feedRoot, false);
+             pick(feed, "link", "link", feedRoot, false);
+             pick(feed, "description", "description", feedRoot, false);
+             pick(feed, "updated", "lastBuildDate", feedRoot, false, null, true);
+             pick(feed, "author", "managingEditor", feedRoot, false);
+             feed.items = [];
+             DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item) {
+                 var entry = {};
+                 pick(entry, "id", "guid", item, false);
+                 pick(entry, "title", "title", item, false);
+                 pick(entry, "link", "link", item, false);
+                 pick(entry, "description", "description", item, false);
+                 pick(entry, "pubDate", "pubDate", item, false, null, true);
+                 feed.items.push(entry);
+             });
+         } else {
+             feed.type = "atom";
+             pick(feed, "id", "id", feedRoot, false);
+             pick(feed, "title", "title", feedRoot, false);
+             pick(feed, "link", "link", feedRoot, false, "href");
+             pick(feed, "description", "subtitle", feedRoot, false);
+             pick(feed, "updated", "updated", feedRoot, false, null, true);
+             pick(feed, "author", "email", feedRoot, true);
+             feed.items = [];
+             DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item) {
+                 var entry = {};
+                 pick(entry, "id", "id", item, false);
+                 pick(entry, "title", "title", item, false);
+                 pick(entry, "link", "link", item, false, "href");
+                 pick(entry, "description", "summary", item, false);
+                 pick(entry, "pubDate", "updated", item, false, null, true);
+                 feed.items.push(entry);
+             });
+         }
+
+         this.dom = feed;
+     }
+     RssBuilder.super_.prototype.done.call(this);
+ };
+
+ //Helpers for searching the DOM produced by the builders
+ var DomUtils = {
+     //Checks one element against a map of test functions. Every own key of
+     //options must pass:
+     //  tag_name     - element must be a tag; tested with the tag name
+     //  tag_type     - tested with the element type
+     //  tag_contains - element must be text, comment or CDATA; tested with its data
+     //  other keys   - element must have attributes; tested with the attribute of that name
+     //Returns false for a missing element.
+       testElement: function DomUtils$testElement (options, element) {
+         if (!element) {
+             return false;
+         }
+
+         for (var key in options) {
+             if (!options.hasOwnProperty(key)) {
+                 continue;
+             }
+             if (key == "tag_name") {
+                 if (element.type !== Mode.Tag) {
+                     return false;
+                 }
+                 if (!options["tag_name"](element.name)) {
+                     return false;
+                 }
+             } else if (key == "tag_type") {
+                 if (!options["tag_type"](element.type)) {
+                     return false;
+                 }
+             } else if (key == "tag_contains") {
+                 if (element.type !== Mode.Text && element.type !== Mode.Comment && element.type !== Mode.CData) {
+                     return false;
+                 }
+                 if (!options["tag_contains"](element.data)) {
+                     return false;
+                 }
+             } else {
+                 if (!element.attributes || !options[key](element.attributes[key])) {
+                     return false;
+                 }
+             }
+         }
+
+         return true;
+     }
+
+     //Collects the elements that pass the tests in options, starting at
+     //currentElement (a node or an array of nodes).
+     //  options - map of tests as for testElement; values that are not functions
+     //            are compared with ==
+     //  recurse - descend into children; defaults to true
+     //  limit   - maximum number of results; a negative or non-numeric value means no limit
+     //Returns an array of matching elements.
+     , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
+         recurse = (recurse === undefined || recurse === null) || !!recurse;
+         limit = parseInt(limit, 10);
+         if (isNaN(limit)) {
+             limit = -1;
+         }
+
+         if (!currentElement) {
+             return([]);
+         }
+
+         var found = [];
+         var elementList;
+
+         function getTest (checkVal) {
+             return function (value) {
+                 return(value == checkVal);
+             };
+         }
+         //Normalize into a private copy so the caller's options object is left untouched
+         var tests = {};
+         for (var key in options) {
+             if (!options.hasOwnProperty(key)) {
+                 continue;
+             }
+             tests[key] = ((typeof options[key]) != "function") ? getTest(options[key]) : options[key];
+         }
+
+         if (DomUtils.testElement(tests, currentElement)) {
+             found.push(currentElement);
+         }
+
+         if (limit >= 0 && found.length >= limit) {
+             return(found);
+         }
+
+         if (recurse && currentElement.children) {
+             elementList = currentElement.children;
+         } else if (currentElement instanceof Array) {
+             elementList = currentElement;
+         } else {
+             return(found);
+         }
+
+         for (var i = 0; i < elementList.length; i++) {
+             //Hand down only what is left of the limit so the combined result cannot exceed it
+             var remaining = (limit >= 0) ? limit - found.length : limit;
+             found = found.concat(DomUtils.getElements(tests, elementList[i], recurse, remaining));
+             if (limit >= 0 && found.length >= limit) {
+                 break;
+             }
+         }
+
+         return(found);
+     }
+
+     //Returns the first element whose id attribute equals id, or null
+     , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
+         var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
+         return(result.length ? result[0] : null);
+     }
+
+     //Returns the tags whose name equals name (or passes name when it is a function)
+     , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
+         return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
+     }
+
+     //Returns the elements whose type equals type (or passes type when it is a function)
+     , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
+         return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
+     }
+ };
exports.Parser = Parser;
-exports.DefaultHandler = DefaultHandler;
+exports.HtmlBuilder = HtmlBuilder;
-exports.RssHandler = RssHandler;
+exports.RssBuilder = RssBuilder;
-exports.ElementType = ElementType;
+//The node type constants (Mode) stay exported under the ElementType name
+exports.ElementType = Mode;
exports.DomUtils = DomUtils;
View
22 lib/htmlparser.min.js
@@ -1,22 +0,0 @@
-/***********************************************
-Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to
-deal in the Software without restriction, including without limitation the
-rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
-sell copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-IN THE SOFTWARE.
-***********************************************/
-/* v1.7.6 */
-(function(){function e(a,c){this._options=c?c:{};void 0==this._options.includeLocation&&(this._options.includeLocation=!1);this.validateHandler(a);this._handler=a;this.reset()}function j(a){j.super_.call(this,a,{ignoreWhitespace:!0,verbose:!1,enforceEmptyTags:!1})}function g(a,c){this.reset();this._options=c?c:{};void 0==this._options.ignoreWhitespace&&(this._options.ignoreWhitespace=!1);void 0==this._options.verbose&&(this._options.verbose=!0);void 0==this._options.enforceEmptyTags&&(this._options.enforceEmptyTags= !0);"function"==typeof a&&(this._callback=a)}if(!("function"==typeof require&&"object"==typeof exports&&"object"==typeof module&&"string"==typeof __filename&&"string"==typeof __dirname)){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"};e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment= /(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()};e.prototype.done=function(){if(!this._done){this._done= !0;if(this._buffer.length){var 
a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=!1;this._elements=[];this._next=this._current=this._elementsCurrent=0;this._location={row:0,col:0, charOffset:0,inBuffer:0};this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._options=null;e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=!1;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._location=null;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs=function(a){for(var c=a.length,b=0;b<c;){var e=a[b++]; (e.type==d.Tag||e.type==d.Script||e.type==d.style)&&this.parseAttribs(e)}return a};e.prototype.parseAttribs=function(a){if(!(a.type!=d.Script&&a.type!=d.Style&&a.type!=d.Tag)){var c=a.data.split(e._reWhitespace,1)[0],c=a.data.substring(c.length);if(!(1>c.length)){var b;for(e._reAttrib.lastIndex=0;b=e._reAttrib.exec(c);)void 0==a.attribs&&(a.attribs={}),"string"==typeof b[1]&&b[1].length?a.attribs[b[1]]=b[2]:"string"==typeof b[3]&&b[3].length?a.attribs[b[3].toString()]=b[4].toString():"string"==typeof b[5]&& b[5].length?a.attribs[b[5]]=b[6]:"string"==typeof b[7]&&b[7].length&&(a.attribs[b[7]]=b[7])}}};e.prototype.parseTagName=function(a){if(null==a||""==a)return"";a=e._reTagName.exec(a);return!a?"":(a[1]?"/":"")+a[2]};e.prototype.parseTags=function(){for(var a=this._buffer.length-1;e._reTags.test(this._buffer);){this._next=e._reTags.lastIndex-1;var 
c=this._buffer.charAt(this._next),b=this._buffer.substring(this._current,this._next),b={raw:b,data:this._parseState==d.Text?b:b.replace(e._reTrim,""),type:this._parseState}, f=this.parseTagName(b.data);if(this._tagStack.length)if(this._tagStack[this._tagStack.length-1]==d.Script)if("/script"==f.toLowerCase())this._tagStack.pop();else{if(0!=b.raw.indexOf("!--")&&(b.type=d.Text,this._elements.length&&this._elements[this._elements.length-1].type==d.Text)){var h=this._elements[this._elements.length-1];h.raw=h.data=h.raw+this._prevTagSep+b.raw;b.raw=b.data=""}}else this._tagStack[this._tagStack.length-1]==d.Style?"/style"==f.toLowerCase()?this._tagStack.pop():0!=b.raw.indexOf("!--")&& (b.type=d.Text,this._elements.length&&this._elements[this._elements.length-1].type==d.Text?(h=this._elements[this._elements.length-1],""!=b.raw?(h.raw=h.data=h.raw+this._prevTagSep+b.raw,b.raw=b.data=""):h.raw=h.data=h.raw+this._prevTagSep):""!=b.raw&&(b.raw=b.data=b.raw)):this._tagStack[this._tagStack.length-1]==d.Comment&&(h=b.raw.length,"-"==b.raw.charAt(h-2)&&"-"==b.raw.charAt(h-1)&&">"==c?(this._tagStack.pop(),this._elements.length&&this._elements[this._elements.length-1].type==d.Comment?(h= this._elements[this._elements.length-1],h.raw=h.data=(h.raw+b.raw).replace(e._reTrimComment,""),b.raw=b.data="",b.type=d.Text):b.type=d.Comment):(b.type=d.Comment,this._elements.length&&this._elements[this._elements.length-1].type==d.Comment?(h=this._elements[this._elements.length-1],h.raw=h.data=h.raw+b.raw+c,b.raw=b.data="",b.type=d.Text):b.raw=b.data=b.raw+c));if(b.type==d.Tag&&(b.name=f,f=f.toLowerCase(),0==b.raw.indexOf("!--")?(b.type=d.Comment,delete b.name,h=b.raw.length,"-"==b.raw.charAt(h- 
1)&&"-"==b.raw.charAt(h-2)&&">"==c?b.raw=b.data=b.raw.replace(e._reTrimComment,""):(b.raw+=c,this._tagStack.push(d.Comment))):0==b.raw.indexOf("!")||0==b.raw.indexOf("?")?b.type=d.Directive:"script"==f?(b.type=d.Script,"/"!=b.data.charAt(b.data.length-1)&&this._tagStack.push(d.Script)):"/script"==f?b.type=d.Script:"style"==f?(b.type=d.Style,"/"!=b.data.charAt(b.data.length-1)&&this._tagStack.push(d.Style)):"/style"==f&&(b.type=d.Style),b.name&&"/"==b.name.charAt(0)))b.data=b.name;if(""!=b.raw||b.type!= d.Text)this._options.includeLocation&&!b.location&&(b.location=this.getLocation(b.type==d.Tag)),this.parseAttribs(b),this._elements.push(b),b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&"/"==b.data.charAt(b.data.length-1)&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type});this._parseState="<"==c?d.Tag:d.Text;this._current=this._next+1;this._prevTagSep=c}this._options.includeLocation&&(this.getLocation(),this._location.row+=this._location.inBuffer,this._location.inBuffer= 0,this._location.charOffset=0);this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.getLocation=function(a){for(var c=this._location,b=this._current-(a?1:0),d=a&&0==c.charOffset&&0==this._current;c.charOffset<b;c.charOffset++)a=this._buffer.charAt(c.charOffset),"\n"==a?(c.inBuffer++,c.col=0):"\r"!=a&&c.col++;return{line:c.row+c.inBuffer+1,col:c.col+(d?0:1)}};e.prototype.validateHandler=function(a){if("object"!=typeof a)throw Error("Handler is not an object"); if("function"!=typeof a.reset)throw Error("Handler method 'reset' is invalid");if("function"!=typeof a.done)throw Error("Handler method 'done' is invalid");if("function"!=typeof a.writeTag)throw Error("Handler method 'writeTag' is invalid");if("function"!=typeof a.writeText)throw Error("Handler method 'writeText' is invalid");if("function"!=typeof a.writeComment)throw Error("Handler method 'writeComment' is invalid");if("function"!=typeof 
a.writeDirective)throw Error("Handler method 'writeDirective' is invalid"); };e.prototype.writeHandler=function(a){if(!this._tagStack.length||a)for(;this._elements.length;)switch(a=this._elements.shift(),a.type){case d.Comment:this._handler.writeComment(a);break;case d.Directive:this._handler.writeDirective(a);break;case d.Text:this._handler.writeText(a);break;default:this._handler.writeTag(a)}};e.prototype.handleError=function(a){if("function"==typeof this._handler.error)this._handler.error(a);else throw a;};(function(a,c){var b=function(){};b.prototype=c.prototype;a.super_= c;a.prototype=new b;a.prototype.constructor=a})(j,g);j.prototype.done=function(){var a={},c,b=f.getElementsByTagName(function(a){return"rss"==a||"feed"==a},this.dom,!1);b.length&&(c=b[0]);if(c){if("rss"==c.name){a.type="rss";c=c.children[0];a.id="";try{a.title=f.getElementsByTagName("title",c.children,!1)[0].children[0].data}catch(d){}try{a.link=f.getElementsByTagName("link",c.children,!1)[0].children[0].data}catch(e){}try{a.description=f.getElementsByTagName("description",c.children,!1)[0].children[0].data}catch(g){}try{a.updated= new Date(f.getElementsByTagName("lastBuildDate",c.children,!1)[0].children[0].data)}catch(i){}try{a.author=f.getElementsByTagName("managingEditor",c.children,!1)[0].children[0].data}catch(l){}a.items=[];f.getElementsByTagName("item",c.children).forEach(function(b){var c={};try{c.id=f.getElementsByTagName("guid",b.children,!1)[0].children[0].data}catch(d){}try{c.title=f.getElementsByTagName("title",b.children,!1)[0].children[0].data}catch(e){}try{c.link=f.getElementsByTagName("link",b.children,!1)[0].children[0].data}catch(h){}try{c.description= f.getElementsByTagName("description",b.children,!1)[0].children[0].data}catch(g){}try{c.pubDate=new 
Date(f.getElementsByTagName("pubDate",b.children,!1)[0].children[0].data)}catch(k){}a.items.push(c)})}else{a.type="atom";try{a.id=f.getElementsByTagName("id",c.children,!1)[0].children[0].data}catch(m){}try{a.title=f.getElementsByTagName("title",c.children,!1)[0].children[0].data}catch(n){}try{a.link=f.getElementsByTagName("link",c.children,!1)[0].attribs.href}catch(o){}try{a.description=f.getElementsByTagName("subtitle", c.children,!1)[0].children[0].data}catch(p){}try{a.updated=new Date(f.getElementsByTagName("updated",c.children,!1)[0].children[0].data)}catch(q){}try{a.author=f.getElementsByTagName("email",c.children,!0)[0].children[0].data}catch(r){}a.items=[];f.getElementsByTagName("entry",c.children).forEach(function(b){var c={};try{c.id=f.getElementsByTagName("id",b.children,!1)[0].children[0].data}catch(d){}try{c.title=f.getElementsByTagName("title",b.children,!1)[0].children[0].data}catch(e){}try{c.link=f.getElementsByTagName("link", b.children,!1)[0].attribs.href}catch(h){}try{c.description=f.getElementsByTagName("summary",b.children,!1)[0].children[0].data}catch(g){}try{c.pubDate=new Date(f.getElementsByTagName("updated",b.children,!1)[0].children[0].data)}catch(k){}a.items.push(c)})}this.dom=a}j.super_.prototype.done.call(this)};g._emptyTags={area:1,base:1,basefont:1,br:1,col:1,frame:1,hr:1,img:1,input:1,isindex:1,link:1,meta:1,param:1,embed:1};g.reWhitespace=/^\s*$/;g.prototype.dom=null;g.prototype.reset=function(){this.dom= [];this._done=!1;this._tagStack=[];this._tagStack.last=function(){return this.length?this[this.length-1]:null}};g.prototype.done=function(){this._done=!0;this.handleCallback(null)};g.prototype.writeTag=function(a){this.handleElement(a)};g.prototype.writeText=function(a){(!this._options.ignoreWhitespace||!g.reWhitespace.test(a.data))&&this.handleElement(a)};g.prototype.writeComment=function(a){this.handleElement(a)};g.prototype.writeDirective=function(a){this.handleElement(a)};g.prototype.error=function(a){this.handleCallback(a)}; 
g.prototype._options=null;g.prototype._callback=null;g.prototype._done=!1;g.prototype._tagStack=null;g.prototype.handleCallback=function(a){if("function"!=typeof this._callback){if(a)throw a;}else this._callback(a,this.dom)};g.prototype.isEmptyTag=function(a){a=a.name.toLowerCase();"/"==a.charAt(0)&&(a=a.substring(1));return this._options.enforceEmptyTags&&!!g._emptyTags[a]};g.prototype.handleElement=function(a){this._done&&this.handleCallback(Error("Writing to the handler after done() called is not allowed without a reset()")); this._options.verbose||(delete a.raw,("tag"==a.type||"script"==a.type||"style"==a.type)&&delete a.data);if(this._tagStack.last())if(a.type!=d.Text&&a.type!=d.Comment&&a.type!=d.Directive)if("/"==a.name.charAt(0)){var c=a.name.substring(1);if(!this.isEmptyTag(a)){for(a=this._tagStack.length-1;-1<a&&this._tagStack[a--].name!=c;);if(-1<a||this._tagStack[0].name==c)for(;a<this._tagStack.length-1;)this._tagStack.pop()}}else this._tagStack.last().children||(this._tagStack.last().children=[]),this._tagStack.last().children.push(a), this.isEmptyTag(a)||this._tagStack.push(a);else this._tagStack.last().children||(this._tagStack.last().children=[]),this._tagStack.last().children.push(a);else a.type!=d.Text&&a.type!=d.Comment&&a.type!=d.Directive?"/"!=a.name.charAt(0)&&(this.dom.push(a),this.isEmptyTag(a)||this._tagStack.push(a)):this.dom.push(a)};var f={testElement:function(a,c){if(!c)return!1;for(var b in a)if("tag_name"==b){if("tag"!=c.type&&"script"!=c.type&&"style"!=c.type||!a.tag_name(c.name))return!1}else if("tag_type"== b){if(!a.tag_type(c.type))return!1}else if("tag_contains"==b){if("text"!=c.type&&"comment"!=c.type&&"directive"!=c.type||!a.tag_contains(c.data))return!1}else if(!c.attribs||!a[b](c.attribs[b]))return!1;return!0},getElements:function(a,c,b,d){function e(a){return function(b){return b==a}}b=void 0===b||null===b||!!b;d=isNaN(parseInt(d))?-1:parseInt(d);if(!c)return[];var g=[],i;for(i in a)"function"!=typeof 
a[i]&&(a[i]=e(a[i]));f.testElement(a,c)&&g.push(c);if(0<=d&&g.length>=d)return g;if(b&&c.children)c= c.children;else if(!(c instanceof Array))return g;for(i=0;i<c.length&&!(g=g.concat(f.getElements(a,c[i],b,d)),0<=d&&g.length>=d);i++);return g},getElementById:function(a,c,b){a=f.getElements({id:a},c,b,1);return a.length?a[0]:null},getElementsByTagName:function(a,c,b,d){return f.getElements({tag_name:a},c,b,d)},getElementsByTagType:function(a,c,b,d){return f.getElements({tag_type:a},c,b,d)}};exports.Parser=e;exports.DefaultHandler=g;exports.RssHandler=j;exports.ElementType=d;exports.DomUtils=f})();
View
296 runtests.js
@@ -6,10 +6,10 @@ deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
-
+
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
-
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -19,57 +19,249 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***********************************************/
-var sys = require("sys");
/**
 * Deep structural comparison used by the test runner to check builder output
 * against the expected fixture.
 *
 * Semantics (kept from the original helper so existing fixtures still match):
 *  - both sides must expose the same set of enumerable properties, where a
 *    property whose value is `undefined` counts as missing;
 *  - nested objects/arrays are compared recursively;
 *  - functions are compared by source text;
 *  - other truthy values are compared with loose equality;
 *  - any two falsy values (0, "", null, false, NaN) are considered equal.
 *
 * The method is installed with Object.defineProperty as a NON-enumerable
 * property. A plain assignment to Object.prototype would make "equals" show up
 * in every `for...in` loop in the process, including loops in unrelated code.
 *
 * @param {*} x Value to compare `this` against.
 * @returns {boolean} true when `x` is structurally equal to `this`.
 */
Object.defineProperty(Object.prototype, 'equals', {
    enumerable: false,
    configurable: true,
    writable: true,
    value: function equals (x) {
        //http://stackoverflow.com/questions/1068834/object-comparison-in-javascript
        var p;

        // Nothing is structurally equal to null/undefined; bail out before
        // the property lookups below would throw a TypeError.
        if (x === null || typeof x === 'undefined') {
            return false;
        }

        // Every property of `this` must exist on `x` ...
        for (p in this) {
            if (typeof x[p] === 'undefined') {
                return false;
            }
        }

        // ... and `x` must not carry any extra properties.
        for (p in x) {
            if (typeof this[p] === 'undefined') {
                return false;
            }
        }

        for (p in this) {
            if (this[p]) {
                switch (typeof this[p]) {
                    case 'object':
                        // The counterpart must itself be a non-null object;
                        // otherwise an empty object would "equal" a primitive.
                        if (x[p] === null || typeof x[p] !== 'object' || !this[p].equals(x[p])) {
                            return false;
                        }
                        break;
                    case 'function':
                        // A null counterpart passes the "missing" check above
                        // but has no toString(), so reject it explicitly.
                        if (x[p] === null || typeof x[p] === 'undefined') {
                            return false;
                        }
                        if (p != 'equals' && this[p].toString() != x[p].toString()) {
                            return false;
                        }
                        break;
                    default:
                        // Loose comparison is intentional (see header).
                        if (this[p] != x[p]) {
                            return false;
                        }
                }
            } else if (x[p]) {
                // Falsy on this side, truthy on the other side.
                return false;
            }
        }

        return true;
    }
});
+
+var util = require("util");
var fs = require("fs");
var htmlparser = require("./lib/htmlparser");
-var testFolder = "./tests";
-var chunkSize = 5;
-
-var testFiles = fs.readdirSync(testFolder);
-var testCount = 0;
-var failedCount = 0;
-for (var i in testFiles) {
- testCount++;
- var fileParts = testFiles[i].split(".");
- fileParts.pop();
- var moduleName = fileParts.join(".");
- var test = require(testFolder + "/" + moduleName);
- var handlerCallback = function handlerCallback (error) {
- if (error)
- sys.puts("Handler error: " + error);
- }
- var handler = (test.type == "rss") ?
- new htmlparser.RssHandler(handlerCallback, test.options.handler)
- :
- new htmlparser.DefaultHandler(handlerCallback, test.options.handler)
- ;
- var parser = new htmlparser.Parser(handler, test.options.parser);
- parser.parseComplete(test.html);
- var resultComplete = handler.dom;
- var chunkPos = 0;
- parser.reset();
- while (chunkPos < test.html.length) {
- parser.parseChunk(test.html.substring(chunkPos, chunkPos + chunkSize));
- chunkPos += chunkSize;
- }
- parser.done();
- var resultChunk = handler.dom;
- var testResult =
- sys.inspect(resultComplete, false, null) === sys.inspect(test.expected, false, null)
- &&
- sys.inspect(resultChunk, false, null) === sys.inspect(test.expected, false, null)
- ;
- sys.puts("[" + test.name + "\]: " + (testResult ? "passed" : "FAILED"));
- if (!testResult) {
- failedCount++;
- sys.puts("== Complete ==");
- sys.puts(sys.inspect(resultComplete, false, null));
- sys.puts("== Chunked ==");
- sys.puts(sys.inspect(resultChunk, false, null));
- sys.puts("== Expected ==");
- sys.puts(sys.inspect(test.expected, false, null));
- }
+var htmlTests = require('./tests/html');
+var rssTests = require('./tests/rss');
+var parserTests = require('./tests/parser');
+var testResults = {};
+
/**
 * Runs a set of builder-level test cases and reports each outcome on stdout.
 *
 * For every own property of `tests` a fresh builder and parser are created.
 * A case with a single data entry is fed through parseComplete(); a case with
 * several entries is fed chunk by chunk through parseChunk() followed by
 * done(). The resulting `builder.dom` is compared to `test.expected` with the
 * `equals` helper installed on Object.prototype by this file.
 *
 * @param {Object} tests Map of test name -> { data: string[], expected: *, options?: { builder?, parser? } }.
 * @param {Function} builderCtor Builder constructor, called as new builderCtor(callback, builderOptions).
 * @param {Function} [permutator] Optional transform applied to each test case before it is run.
 * @returns {{elapsed: number, passed: number, failed: number}} Elapsed milliseconds and pass/fail counts.
 */
function runBuilderTests (tests, builderCtor, permutator) {
    var passed = 0;
    var failed = 0;

    // util.puts() has been removed from Node.js; console.log() writes the
    // same newline-terminated line to stdout.
    var builderCallback = function builderCallback (error) {
        if (error) {
            console.log("Builder error: " + error);
        }
    };

    var startTime = Date.now();
    // Object.keys() yields own enumerable properties only, so no
    // hasOwnProperty() filtering is needed.
    Object.keys(tests).forEach(function (testName) {
        var test = permutator ? permutator(tests[testName]) : tests[testName];
        // Tolerate test cases that do not declare any options.
        var options = test.options || {};

        var builder = new builderCtor(builderCallback, options.builder);
        var parser = new htmlparser.Parser(builder, options.parser);

        parser.reset();
        if (test.data.length === 1) {
            parser.parseComplete(test.data[0]);
        } else {
            for (var i = 0, len = test.data.length; i < len; i++) {
                parser.parseChunk(test.data[i]);
            }
            parser.done();
        }
        var testResult = builder.dom.equals(test.expected);

        console.log("[" + testName + "]: " + (testResult ? "passed" : "FAILED"));
        if (!testResult) {
            failed++;
            console.log("== Result ==");
            console.log(util.inspect(builder.dom, false, null));
            console.log("== Expected ==");
            console.log(util.inspect(test.expected, false, null));
        } else {
            passed++;
        }
    });
    var endTime = Date.now();

    return {
        elapsed: endTime - startTime
        , passed: passed
        , failed: failed
    };
}
+
/**
 * Minimal builder used by the parser tests: it records every element the
 * parser writes, in order, and hands the list to a callback on completion.
 *
 * @param {Function} [callback] Invoked as callback(null, output) by done() and
 *        as callback(error) by error(). Optional.
 */
function TestBuilder (callback) {
    this.cb = callback;
    this.reset();
}

/** Discards everything recorded so far. */
TestBuilder.prototype.reset = function () {
    this.output = [];
};

/**
 * Records one element emitted by the parser.
 * @param {*} element Element to append to the output list.
 */
TestBuilder.prototype.write = function (element) {
    this.output.push(element);
};

/** Signals end of input and reports the recorded elements to the callback. */
TestBuilder.prototype.done = function () {
    // With no callback there is nobody to notify; mirrors
    // DefaultHandler.handleCallback in lib/htmlparser.
    if (typeof this.cb === 'function') {
        this.cb(null, this.output);
    }
};

/**
 * Reports a parser error to the callback.
 * @param {Error} error The error raised while parsing.
 * @throws {Error} Rethrows `error` when no callback was supplied, so the
 *         failure is not hidden behind a "cb is not a function" TypeError.
 */
TestBuilder.prototype.error = function (error) {
    if (typeof this.cb === 'function') {
        this.cb(error);
    } else {
        throw error;
    }
};
+
+function runParserTests (tests, permutator) {
+ var callback = function builderCallback (err) {
+ if (err) {
+ console.log('Builder error', err);
+ }
+ };
+ var builder = new TestBuilder(callback);
+ var parser = new htmlparser.Parser(builder);
+
+ var passed = 0;
+ var failed = 0;
+
+ var startTime = Date.now();
+ for (var testName in tests) {
+ if (!tests.hasOwnProperty(testName)) {
+ continue;
+ }
+ var test = permutator ? permutator(tests[testName]) : tests[testName];
+ process.stdout.write('[TEST] ' + testName + ' : ');
+ parser.reset();
+ if (test.data.length === 1) {
+ parser.parseComplete(test.data[0]);
+ } else {
+ for (var i = 0, len = test.data.length; i < len; i++) {
+ parser.parseChunk(test.data[i]);
+ }
+ parser.done();
+ }
+
<