Skip to content
Browse files

Added support for RSS/Atom feeds

  • Loading branch information...
1 parent b954e7f commit 3b30e87ee4d8e7af6c0b27e53eacf1b7a4773b4d @tautologistics committed
Showing with 185 additions and 23 deletions.
  1. +19 −2 README.md
  2. +130 −11 node-htmlparser.js
  3. +2 −2 node-htmlparser.min.js
  4. +11 −2 runtests.html
  5. +7 −2 runtests.js
  6. +9 −2 runtests.min.html
  7. +7 −2 runtests.min.js
View
21 README.md
@@ -1,5 +1,5 @@
#NodeHtmlParser
-A forgiving HTML/XML parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
+A forgiving HTML/XML/RSS parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
##Running Tests
@@ -60,7 +60,24 @@ View runtests.html in any browser
}
parser.done();
-##Handler Options
+##Parsing RSS/Atom Feeds
+
+ new htmlparser.RssHandler(function (error, dom) {
+ ...
+ });
+
+###Usage In Browser
+ var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error, dom) {
+ if (error)
+ [...do something for errors...]
+ else
+ [...parsing done, do something...]
+ });
+ var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
+ parser.parseComplete(document.body.innerHTML);
+ alert(JSON.stringify(handler.dom, null, 2));
+
+##DefaultHandler Options
###Usage
var handler = new htmlparser.DefaultHandler(
View
141 node-htmlparser.js
@@ -304,7 +304,7 @@ function Parser (handler) {
this._tagStack.push(ElementType.Comment);
}
}
- else if (element.raw.indexOf("!") == 0) {
+ else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
element.type = ElementType.Directive;
//TODO: what about CDATA?
}
@@ -409,7 +409,111 @@ function Parser (handler) {
throw error;
}
-//TODO: add support for options: ignoreWhitespace, verbose (keep data for tags and raw for all)
+//TODO: make this a truly streamable handler
+/**
+ * Handler for RSS and Atom documents. It collects the DOM exactly like
+ * DefaultHandler, but always with the fixed options below; RssHandler's done()
+ * later turns that DOM into a feed object.
+ *
+ * @param callback passed through unchanged to the DefaultHandler constructor
+ */
+function RssHandler (callback) {
+	var feedOptions = {
+		// whitespace-only text nodes are dropped so element children can be addressed by position
+		  ignoreWhitespace: true
+		, verbose: false
+		// presumably disabled because DefaultHandler's empty-tag list contains "link",
+		// while an RSS <link> carries its URL as text content
+		, enforceEmptyTags: false
+	};
+	RssHandler.super_.call(this, callback, feedOptions);
+}
+inherits(RssHandler, DefaultHandler);
+
+ /**
+  * Completes parsing. When the collected DOM has a top-level <rss> or <feed>
+  * element, this.dom is replaced by a plain feed object:
+  *
+  *   { type: "rss" | "atom", id, title, link, description, updated, author,
+  *     items: [ { id, title, link, description, pubDate }, ... ] }
+  *
+  * A field whose source element is missing or empty is left unset (RSS feeds
+  * always get id ""). "updated" and "pubDate" are Date objects built from the
+  * element text; unparsable text yields an Invalid Date rather than an error.
+  * When no feed root is found, this.dom is left as the parsed DOM.
+  * In every case the inherited done() is invoked last.
+  */
+ RssHandler.prototype.done = function RssHandler$done () {
+ 	// First element called `name` among the children of `parent` (at any depth
+ 	// when `recurse` is true), or null. A missing parent or child list gives null.
+ 	function findChild (name, parent, recurse) {
+ 		var matches = DomUtils.getElementsByTagName(name, parent ? parent.children : null, !!recurse);
+ 		return(matches.length ? matches[0] : null);
+ 	}
+
+ 	// Copies the data of `element`'s first child node to target[key], passing it
+ 	// through `convert` when given. Does nothing if the element or its content is absent.
+ 	function copyText (target, key, element, convert) {
+ 		var node = (element && element.children) ? element.children[0] : null;
+ 		if (!node) {
+ 			return;
+ 		}
+ 		target[key] = convert ? convert(node.data) : node.data;
+ 	}
+
+ 	// Copies attribute `attribName` of `element` to target[key]; does nothing if
+ 	// the element is absent or carries no attributes.
+ 	function copyAttrib (target, key, element, attribName) {
+ 		if (element && element.attribs) {
+ 			target[key] = element.attribs[attribName];
+ 		}
+ 	}
+
+ 	function toDate (value) {
+ 		return(new Date(value));
+ 	}
+
+ 	// Only top-level elements are considered as feed roots.
+ 	var roots = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
+ 	var feedRoot = roots.length ? roots[0] : null;
+
+ 	if (feedRoot) {
+ 		var feed = { };
+
+ 		if (feedRoot.name == "rss") {
+ 			// RSS keeps its metadata and items inside <channel>. Look it up by name so a
+ 			// leading comment does not hide it; fall back to the first child as before.
+ 			// An <rss> element without any children simply produces an empty feed.
+ 			var channel = findChild("channel", feedRoot) || (feedRoot.children ? feedRoot.children[0] : null);
+
+ 			feed.type = "rss";
+ 			feed.id = "";
+ 			copyText(feed, "title", findChild("title", channel));
+ 			copyText(feed, "link", findChild("link", channel));
+ 			copyText(feed, "description", findChild("description", channel));
+ 			copyText(feed, "updated", findChild("lastBuildDate", channel), toDate);
+ 			copyText(feed, "author", findChild("managingEditor", channel));
+ 			feed.items = [];
+ 			DomUtils.getElementsByTagName("item", channel ? channel.children : null, false).forEach(function (item) {
+ 				var entry = { };
+ 				copyText(entry, "id", findChild("guid", item));
+ 				copyText(entry, "title", findChild("title", item));
+ 				copyText(entry, "link", findChild("link", item));
+ 				copyText(entry, "description", findChild("description", item));
+ 				copyText(entry, "pubDate", findChild("pubDate", item), toDate);
+ 				feed.items.push(entry);
+ 			});
+ 		} else {
+ 			// Atom keeps metadata and entries directly under <feed>.
+ 			feed.type = "atom";
+ 			copyText(feed, "id", findChild("id", feedRoot));
+ 			copyText(feed, "title", findChild("title", feedRoot));
+ 			copyAttrib(feed, "link", findChild("link", feedRoot), "href");
+ 			copyText(feed, "description", findChild("subtitle", feedRoot));
+ 			copyText(feed, "updated", findChild("updated", feedRoot), toDate);
+ 			// <email> is nested inside another element, so this is the one deep search
+ 			copyText(feed, "author", findChild("email", feedRoot, true));
+ 			feed.items = [];
+ 			DomUtils.getElementsByTagName("entry", feedRoot.children, false).forEach(function (item) {
+ 				var entry = { };
+ 				copyText(entry, "id", findChild("id", item));
+ 				copyText(entry, "title", findChild("title", item));
+ 				copyAttrib(entry, "link", findChild("link", item), "href");
+ 				copyText(entry, "description", findChild("summary", item));
+ 				copyText(entry, "pubDate", findChild("updated", item), toDate);
+ 				feed.items.push(entry);
+ 			});
+ 		}
+
+ 		this.dom = feed;
+ 	}
+ 	RssHandler.super_.prototype.done.call(this);
+ }
+
+///////////////////////////////////////////////////
+
function DefaultHandler (callback, options) {
this.reset();
this._options = options ? options : { };
@@ -578,7 +682,9 @@ function DefaultHandler (callback, options) {
return(true);
}
- , getElements: function DomUtils$getElements (options, currentElement) {
+ , getElements: function DomUtils$getElements (options, currentElement, recurse) {
+ recurse = !!recurse;
+
if (!currentElement) {
return([]);
}
@@ -597,7 +703,7 @@ function DefaultHandler (callback, options) {
found.push(currentElement);
}
- if (currentElement.children)
+ if (recurse && currentElement.children)
elementList = currentElement.children;
else if (currentElement instanceof Array)
elementList = currentElement;
@@ -605,29 +711,42 @@ function DefaultHandler (callback, options) {
return(found);
for (var i = 0; i < elementList.length; i++)
- found = found.concat(DomUtils.getElements(options, elementList[i]));
+ found = found.concat(DomUtils.getElements(options, elementList[i], recurse));
return(found);
}
- , getElementById: function DomUtils$getElementById (id, currentElement) {
- var result = DomUtils.getElements({ id: id }, currentElement);
+ , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
+ 	// Returns the first element whose "id" attribute matches `id`, or null when
+ 	// nothing matches. `recurse` defaults to true when omitted, so two-argument
+ 	// callers keep searching all descendants as they did before the parameter
+ 	// existed; pass false to test only `currentElement` and, if it is an array,
+ 	// its members.
+ 	recurse = (recurse === undefined || recurse === null) ? true : !!recurse;
+ 	var result = DomUtils.getElements({ id: id }, currentElement, recurse);
return(result.length ? result[0] : null);
}
- , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement) {
- return(DomUtils.getElements({ tag_name: name }, currentElement));
+ , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse) {
+ 	// Returns every element whose tag name matches `name` (a value to compare
+ 	// against or a predicate function). `recurse` defaults to true when omitted,
+ 	// so two-argument callers keep searching all descendants as they did before
+ 	// the parameter existed; pass false to test only `currentElement` and, if it
+ 	// is an array, its members.
+ 	recurse = (recurse === undefined || recurse === null) ? true : !!recurse;
+ 	return(DomUtils.getElements({ tag_name: name }, currentElement, recurse));
}
- , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement) {
- return(DomUtils.getElements({ tag_type: type }, currentElement));
+ , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse) {
+ 	// Returns every element whose type matches `type` (a value to compare
+ 	// against or a predicate function). `recurse` defaults to true when omitted,
+ 	// so two-argument callers keep searching all descendants as they did before
+ 	// the parameter existed; pass false to test only `currentElement` and, if it
+ 	// is an array, its members.
+ 	recurse = (recurse === undefined || recurse === null) ? true : !!recurse;
+ 	return(DomUtils.getElements({ tag_type: type }, currentElement, recurse));
}
}
+ /**
+  * Makes `ctor` inherit from `superCtor` without running `superCtor` itself.
+  * Afterwards ctor.prototype is a fresh object whose prototype is
+  * superCtor.prototype, ctor.prototype.constructor points back at `ctor`, and
+  * the parent constructor is reachable as ctor.super_.
+  */
+ function inherits (ctor, superCtor) {
+ 	// Empty stand-in constructor: instantiating it yields an object linked to
+ 	// superCtor.prototype without executing any of superCtor's own code.
+ 	function Surrogate () { }
+ 	Surrogate.prototype = superCtor.prototype;
+
+ 	var proto = new Surrogate();
+ 	proto.constructor = ctor;
+
+ 	ctor.super_ = superCtor;
+ 	ctor.prototype = proto;
+ }
+
exports.Parser = Parser;
exports.DefaultHandler = DefaultHandler;
+exports.RssHandler = RssHandler;
+
exports.ElementType = ElementType;
exports.DomUtils = DomUtils;
View
4 node-htmlparser.min.js
@@ -18,5 +18,5 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***********************************************/
-/* v1.5.0 */
-(function(){function e(a){this.validateHandler(a);this._handler=a;this.reset()}function g(a,d){this.reset();this._options=d?d:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags==undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require=="function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&& typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var c={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"};e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/;e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g; e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(new Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()};e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var a=this._buffer;this._buffer="";a={raw:a,data:this._parseState==c.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==c.Tag|| 
this._parseState==c.Script||this._parseState==c.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current=this._elementsCurrent=0;this._parseState=c.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()};e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent= 0;e.prototype._current=0;e.prototype._next=0;e.prototype._parseState=c.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs=function(a){for(var d=a.length,b=0;b<d;){var h=a[b++];if(h.type==c.Tag||h.type==c.Script||h.type==c.style)this.parseAttribs(h)}return a};e.prototype.parseAttribs=function(a){if(!(a.type!=c.Script&&a.type!=c.Style&&a.type!=c.Tag)){var d=a.data.split(e._reWhitespace,1)[0];d=a.data.substring(d.length);if(!(d.length<1)){var b;for(e._reAttrib.lastIndex= 0;b=e._reAttrib.exec(d);){if(a.attribs==undefined)a.attribs={};if(typeof b[1]=="string"&&b[1].length)a.attribs[b[1]]=b[2];else if(typeof b[3]=="string"&&b[3].length)a.attribs[b[3].toString()]=b[4].toString();else if(typeof b[5]=="string"&&b[5].length)a.attribs[b[5]]=b[6];else if(typeof b[7]=="string"&&b[7].length)a.attribs[b[7]]=b[7]}}}};e.prototype.parseTagName=function(a){if(a==null||a=="")return"";a=e._reTagName.exec(a);if(!a)return"";return(a[1]?"/":"")+a[2]};e.prototype.parseTags=function(){for(var a= this._buffer.length-1;e._reTags.test(this._buffer);){this._next=e._reTags.lastIndex-1;var d=this._buffer.charAt(this._next),b=this._buffer.substring(this._current,this._next);b={raw:b,data:this._parseState==c.Text?b:b.replace(e._reTrim,""),type:this._parseState};var 
h=this.parseTagName(b.data);if(this._tagStack.length)if(this._tagStack[this._tagStack.length-1]==c.Script)if(h=="/script")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=c.Text;if(this._elements.length&&this._elements[this._elements.length- 1].type==c.Text){var f=this._elements[this._elements.length-1];f.raw=f.data=f.raw+this._prevTagSep+b.raw;b.raw=b.data=""}}}else if(this._tagStack[this._tagStack.length-1]==c.Style)if(h=="/style")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=c.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==c.Text)if(b.raw!=""){f=this._elements[this._elements.length-1];f.raw=f.data=f.raw+this._prevTagSep+b.raw;b.raw=b.data=""}else f.raw=f.data=f.raw+this._prevTagSep;else if(b.raw!= "")b.raw=b.data=b.raw}}else if(this._tagStack[this._tagStack.length-1]==c.Comment){var i=b.raw.length;if(b.raw.charAt(i-2)=="-"&&b.raw.charAt(i-1)=="-"&&d==">"){this._tagStack.pop();if(this._elements.length&&this._elements[this._elements.length-1].type==c.Comment){f=this._elements[this._elements.length-1];f.raw=f.data=(f.raw+b.raw).replace(e._reTrimComment,"");b.raw=b.data="";b.type=c.Text}else b.type=c.Comment}else{b.type=c.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type== c.Comment){f=this._elements[this._elements.length-1];f.raw=f.data=f.raw+b.raw+d;b.raw=b.data="";b.type=c.Text}else b.raw=b.data=b.raw+d}}if(b.type==c.Tag){b.name=h;if(b.raw.indexOf("!--")==0){b.type=c.Comment;delete b.name;i=b.raw.length;if(b.raw.charAt(i-1)=="-"&&b.raw.charAt(i-2)=="-"&&d==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+=d;this._tagStack.push(c.Comment)}}else if(b.raw.indexOf("!")==0)b.type=c.Directive;else if(b.name=="script"){b.type=c.Script;b.data.charAt(b.data.length- 1)!="/"&&this._tagStack.push(c.Script)}else if(b.name=="/script")b.type=c.Script;else if(b.name=="style"){b.type=c.Style;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(c.Style)}else 
if(b.name=="/style")b.type=c.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=c.Text){this.parseAttribs(b);this._elements.push(b);b.type!=c.Text&&b.type!=c.Comment&&b.type!=c.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+ b.name,type:b.type})}this._parseState=d=="<"?c.Tag:c.Text;this._current=this._next+1;this._prevTagSep=d}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.validateHandler=function(a){if(typeof a!="object")throw new Error("Handler is not an object");if(typeof a.reset!="function")throw new Error("Handler method 'reset' is invalid");if(typeof a.done!="function")throw new Error("Handler method 'done' is invalid");if(typeof a.writeTag!= "function")throw new Error("Handler method 'writeTag' is invalid");if(typeof a.writeText!="function")throw new Error("Handler method 'writeText' is invalid");if(typeof a.writeComment!="function")throw new Error("Handler method 'writeComment' is invalid");if(typeof a.writeDirective!="function")throw new Error("Handler method 'writeDirective' is invalid");};e.prototype.writeHandler=function(a){if(!(this._tagStack.length&&!!!a))for(;this._elements.length;){a=this._elements.shift();switch(a.type){case c.Comment:this._handler.writeComment(a); break;case c.Directive:this._handler.writeDirective(a);break;case c.Text:this._handler.writeText(a);break;default:this._handler.writeTag(a);break}}};e.prototype.handleError=function(a){if(typeof this._handler.error=="function")this._handler.error(a);else throw a;};g._emptyTags={area:1,base:1,basefont:1,br:1,col:1,frame:1,hr:1,img:1,input:1,isindex:1,link:1,meta:1,param:1,embed:1};g.reWhitespace=/^\s*$/;g.prototype.dom=null;g.prototype.reset=function(){this.dom=[];this._done=false;this._tagStack=[]; this._tagStack.last=function(){return 
this.length?this[this.length-1]:null}};g.prototype.done=function(){this._done=true;this.handleCallback(null)};g.prototype.writeTag=function(a){this.handleElement(a)};g.prototype.writeText=function(a){if(this._options.ignoreWhitespace)if(g.reWhitespace.test(a.data))return;this.handleElement(a)};g.prototype.writeComment=function(a){this.handleElement(a)};g.prototype.writeDirective=function(a){this.handleElement(a)};g.prototype.error=function(a){this.handleCallback(a)}; g.prototype._options=null;g.prototype._callback=null;g.prototype._done=false;g.prototype._tagStack=null;g.prototype.handleCallback=function(a){if(typeof this._callback!="function")if(a)throw a;else return;this._callback(a,this.dom)};g.prototype.handleElement=function(a){this._done&&this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));if(!this._options.verbose){delete a.raw;if(a.type=="tag"||a.type=="script"||a.type=="style")delete a.data}if(this._tagStack.last())if(a.type!= c.Text&&a.type!=c.Comment&&a.type!=c.Directive)if(a.name.charAt(0)=="/"){a=a.name.substring(1);if(!this._options.enforceEmptyTags||!g._emptyTags[a]){for(var d=this._tagStack.length-1;d>-1&&this._tagStack[d--].name!=a;);if(d>-1||this._tagStack[0].name==a)for(;d<this._tagStack.length-1;)this._tagStack.pop()}}else{if(!this._tagStack.last().children)this._tagStack.last().children=[];this._tagStack.last().children.push(a);if(!this._options.enforceEmptyTags||!g._emptyTags[a.name])this._tagStack.push(a)}else{if(!this._tagStack.last().children)this._tagStack.last().children= [];this._tagStack.last().children.push(a)}else if(a.type!=c.Text&&a.type!=c.Comment&&a.type!=c.Directive){if(a.name.charAt(0)!="/"){this.dom.push(a);if(!this._options.enforceEmptyTags||!g._emptyTags[a.name])this._tagStack.push(a)}}else this.dom.push(a)};var j={testElement:function(a,d){if(!d)return false;for(var b in a)if(b=="tag_name"){if(d.type!="tag"&&d.type!="script"&&d.type!="style")return false;return 
a.tag_name(d.name)}else if(b=="tag_type")return a.tag_type(d.type);else if(b=="tag_contains"){if(d.type!= "text"&&d.type!="comment"&&d.type!="directive")return false;return a.tag_contains(d.data)}else return d.attribs&&a[b](d.attribs[b]);return true},getElements:function(a,d){function b(l){return typeof a[i]=="function"?l:function(m){return m==l}}if(!d)return[];var h=[],f;for(var i in a)a[i]=b(a[i]);j.testElement(a,d)&&h.push(d);if(d.children)f=d.children;else if(d instanceof Array)f=d;else return h;for(var k=0;k<f.length;k++)h=h.concat(j.getElements(a,f[k]));return h},getElementById:function(a,d){var b= j.getElements({id:a},d);return b.length?b[0]:null},getElementsByTagName:function(a,d){return j.getElements({tag_name:a},d)},getElementsByTagType:function(a,d){return j.getElements({tag_type:a},d)}};exports.Parser=e;exports.DefaultHandler=g;exports.ElementType=c;exports.DomUtils=j})();
+/* v1.6.0 */
+(function(){function e(a){this.validateHandler(a);this._handler=a;this.reset()}function m(a){m.super_.call(this,a,{ignoreWhitespace:true,verbose:false,enforceEmptyTags:false})}function h(a,c){this.reset();this._options=c?c:{};if(this._options.ignoreWhitespace==undefined)this._options.ignoreWhitespace=false;if(this._options.verbose==undefined)this._options.verbose=true;if(this._options.enforceEmptyTags==undefined)this._options.enforceEmptyTags=true;if(typeof a=="function")this._callback=a}if(!(typeof require== "function"&&typeof exports=="object"&&typeof module=="object"&&typeof __filename=="string"&&typeof __dirname=="string")){if(this.Tautologistics){if(this.Tautologistics.NodeHtmlParser)return}else this.Tautologistics={};this.Tautologistics.NodeHtmlParser={};exports=this.Tautologistics.NodeHtmlParser}var d={Text:"text",Directive:"directive",Comment:"comment",Script:"script",Style:"style",Tag:"tag"};e._reTrim=/(^\s+|\s+$)/g;e._reTrimComment=/(^\!--|--$)/g;e._reWhitespace=/\s/g;e._reTagName=/^\s*(\/?)\s*([^\s\/]+)/; e._reAttrib=/([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;e._reTags=/[\<\>]/g;e.prototype.parseComplete=function(a){this.reset();this.parseChunk(a);this.done()};e.prototype.parseChunk=function(a){this._done&&this.handleError(new Error("Attempted to parse chunk after parsing already done"));this._buffer+=a;this.parseTags()};e.prototype.done=function(){if(!this._done){this._done=true;if(this._buffer.length){var a=this._buffer;this._buffer= 
"";a={raw:a,data:this._parseState==d.Text?a:a.replace(e._reTrim,""),type:this._parseState};if(this._parseState==d.Tag||this._parseState==d.Script||this._parseState==d.Style)a.name=this.parseTagName(a.data);this.parseAttribs(a);this._elements.push(a)}this.writeHandler();this._handler.done()}};e.prototype.reset=function(){this._buffer="";this._done=false;this._elements=[];this._next=this._current=this._elementsCurrent=0;this._parseState=d.Text;this._prevTagSep="";this._tagStack=[];this._handler.reset()}; e.prototype._handler=null;e.prototype._buffer=null;e.prototype._done=false;e.prototype._elements=null;e.prototype._elementsCurrent=0;e.prototype._current=0;e.prototype._next=0;e.prototype._parseState=d.Text;e.prototype._prevTagSep="";e.prototype._tagStack=null;e.prototype.parseTagAttribs=function(a){for(var c=a.length,b=0;b<c;){var k=a[b++];if(k.type==d.Tag||k.type==d.Script||k.type==d.style)this.parseAttribs(k)}return a};e.prototype.parseAttribs=function(a){if(!(a.type!=d.Script&&a.type!=d.Style&& a.type!=d.Tag)){var c=a.data.split(e._reWhitespace,1)[0];c=a.data.substring(c.length);if(!(c.length<1)){var b;for(e._reAttrib.lastIndex=0;b=e._reAttrib.exec(c);){if(a.attribs==undefined)a.attribs={};if(typeof b[1]=="string"&&b[1].length)a.attribs[b[1]]=b[2];else if(typeof b[3]=="string"&&b[3].length)a.attribs[b[3].toString()]=b[4].toString();else if(typeof b[5]=="string"&&b[5].length)a.attribs[b[5]]=b[6];else if(typeof b[7]=="string"&&b[7].length)a.attribs[b[7]]=b[7]}}}};e.prototype.parseTagName= function(a){if(a==null||a=="")return"";a=e._reTagName.exec(a);if(!a)return"";return(a[1]?"/":"")+a[2]};e.prototype.parseTags=function(){for(var a=this._buffer.length-1;e._reTags.test(this._buffer);){this._next=e._reTags.lastIndex-1;var c=this._buffer.charAt(this._next),b=this._buffer.substring(this._current,this._next);b={raw:b,data:this._parseState==d.Text?b:b.replace(e._reTrim,""),type:this._parseState};var 
k=this.parseTagName(b.data);if(this._tagStack.length)if(this._tagStack[this._tagStack.length- 1]==d.Script)if(k=="/script")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=d.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Text){var g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+this._prevTagSep+b.raw;b.raw=b.data=""}}}else if(this._tagStack[this._tagStack.length-1]==d.Style)if(k=="/style")this._tagStack.pop();else{if(b.raw.indexOf("!--")!=0){b.type=d.Text;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Text)if(b.raw!= ""){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+this._prevTagSep+b.raw;b.raw=b.data=""}else g.raw=g.data=g.raw+this._prevTagSep;else if(b.raw!="")b.raw=b.data=b.raw}}else if(this._tagStack[this._tagStack.length-1]==d.Comment){var l=b.raw.length;if(b.raw.charAt(l-2)=="-"&&b.raw.charAt(l-1)=="-"&&c==">"){this._tagStack.pop();if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=(g.raw+b.raw).replace(e._reTrimComment, "");b.raw=b.data="";b.type=d.Text}else b.type=d.Comment}else{b.type=d.Comment;if(this._elements.length&&this._elements[this._elements.length-1].type==d.Comment){g=this._elements[this._elements.length-1];g.raw=g.data=g.raw+b.raw+c;b.raw=b.data="";b.type=d.Text}else b.raw=b.data=b.raw+c}}if(b.type==d.Tag){b.name=k;if(b.raw.indexOf("!--")==0){b.type=d.Comment;delete b.name;l=b.raw.length;if(b.raw.charAt(l-1)=="-"&&b.raw.charAt(l-2)=="-"&&c==">")b.raw=b.data=b.raw.replace(e._reTrimComment,"");else{b.raw+= c;this._tagStack.push(d.Comment)}}else if(b.raw.indexOf("!")==0||b.raw.indexOf("?")==0)b.type=d.Directive;else if(b.name=="script"){b.type=d.Script;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Script)}else if(b.name=="/script")b.type=d.Script;else 
if(b.name=="style"){b.type=d.Style;b.data.charAt(b.data.length-1)!="/"&&this._tagStack.push(d.Style)}else if(b.name=="/style")b.type=d.Style;if(b.name&&b.name.charAt(0)=="/")b.data=b.name}if(b.raw!=""||b.type!=d.Text){this.parseAttribs(b); this._elements.push(b);b.type!=d.Text&&b.type!=d.Comment&&b.type!=d.Directive&&b.data.charAt(b.data.length-1)=="/"&&this._elements.push({raw:"/"+b.name,data:"/"+b.name,name:"/"+b.name,type:b.type})}this._parseState=c=="<"?d.Tag:d.Text;this._current=this._next+1;this._prevTagSep=c}this._buffer=this._current<=a?this._buffer.substring(this._current):"";this._current=0;this.writeHandler()};e.prototype.validateHandler=function(a){if(typeof a!="object")throw new Error("Handler is not an object");if(typeof a.reset!= "function")throw new Error("Handler method 'reset' is invalid");if(typeof a.done!="function")throw new Error("Handler method 'done' is invalid");if(typeof a.writeTag!="function")throw new Error("Handler method 'writeTag' is invalid");if(typeof a.writeText!="function")throw new Error("Handler method 'writeText' is invalid");if(typeof a.writeComment!="function")throw new Error("Handler method 'writeComment' is invalid");if(typeof a.writeDirective!="function")throw new Error("Handler method 'writeDirective' is invalid"); };e.prototype.writeHandler=function(a){if(!(this._tagStack.length&&!!!a))for(;this._elements.length;){a=this._elements.shift();switch(a.type){case d.Comment:this._handler.writeComment(a);break;case d.Directive:this._handler.writeDirective(a);break;case d.Text:this._handler.writeText(a);break;default:this._handler.writeTag(a);break}}};e.prototype.handleError=function(a){if(typeof this._handler.error=="function")this._handler.error(a);else throw a;};(function(a,c){var b=function(){};b.prototype=c.prototype; a.super_=c;a.prototype=new b;a.prototype.constructor=a})(m,h);m.prototype.done=function(){var a={},c,b=f.getElementsByTagName(function(i){return 
i=="rss"||i=="feed"},this.dom,false);if(b.length)c=b[0];if(c){if(c.name=="rss"){a.type="rss";c=c.children[0];a.id="";try{a.title=f.getElementsByTagName("title",c.children,false)[0].children[0].data}catch(k){}try{a.link=f.getElementsByTagName("link",c.children,false)[0].children[0].data}catch(g){}try{a.description=f.getElementsByTagName("description",c.children, false)[0].children[0].data}catch(l){}try{a.updated=new Date(f.getElementsByTagName("lastBuildDate",c.children,false)[0].children[0].data)}catch(n){}try{a.author=f.getElementsByTagName("managingEditor",c.children,false)[0].children[0].data}catch(o){}a.items=[];f.getElementsByTagName("item",c.children).forEach(function(i){var j={};try{j.id=f.getElementsByTagName("guid",i.children,false)[0].children[0].data}catch(q){}try{j.title=f.getElementsByTagName("title",i.children,false)[0].children[0].data}catch(r){}try{j.link= f.getElementsByTagName("link",i.children,false)[0].children[0].data}catch(s){}try{j.description=f.getElementsByTagName("description",i.children,false)[0].children[0].data}catch(t){}try{j.pubDate=new Date(f.getElementsByTagName("pubDate",i.children,false)[0].children[0].data)}catch(u){}a.items.push(j)})}else{a.type="atom";try{a.id=f.getElementsByTagName("id",c.children,false)[0].children[0].data}catch(p){}try{a.title=f.getElementsByTagName("title",c.children,false)[0].children[0].data}catch(v){}try{a.link= f.getElementsByTagName("link",c.children,false)[0].attribs.href}catch(w){}try{a.description=f.getElementsByTagName("subtitle",c.children,false)[0].children[0].data}catch(x){}try{a.updated=new Date(f.getElementsByTagName("updated",c.children,false)[0].children[0].data)}catch(y){}try{a.author=f.getElementsByTagName("email",c.children,true)[0].children[0].data}catch(z){}a.items=[];f.getElementsByTagName("entry",c.children).forEach(function(i){var j={};try{j.id=f.getElementsByTagName("id",i.children, 
false)[0].children[0].data}catch(q){}try{j.title=f.getElementsByTagName("title",i.children,false)[0].children[0].data}catch(r){}try{j.link=f.getElementsByTagName("link",i.children,false)[0].attribs.href}catch(s){}try{j.description=f.getElementsByTagName("summary",i.children,false)[0].children[0].data}catch(t){}try{j.pubDate=new Date(f.getElementsByTagName("updated",i.children,false)[0].children[0].data)}catch(u){}a.items.push(j)})}this.dom=a}m.super_.prototype.done.call(this)};h._emptyTags={area:1, base:1,basefont:1,br:1,col:1,frame:1,hr:1,img:1,input:1,isindex:1,link:1,meta:1,param:1,embed:1};h.reWhitespace=/^\s*$/;h.prototype.dom=null;h.prototype.reset=function(){this.dom=[];this._done=false;this._tagStack=[];this._tagStack.last=function(){return this.length?this[this.length-1]:null}};h.prototype.done=function(){this._done=true;this.handleCallback(null)};h.prototype.writeTag=function(a){this.handleElement(a)};h.prototype.writeText=function(a){if(this._options.ignoreWhitespace)if(h.reWhitespace.test(a.data))return; this.handleElement(a)};h.prototype.writeComment=function(a){this.handleElement(a)};h.prototype.writeDirective=function(a){this.handleElement(a)};h.prototype.error=function(a){this.handleCallback(a)};h.prototype._options=null;h.prototype._callback=null;h.prototype._done=false;h.prototype._tagStack=null;h.prototype.handleCallback=function(a){if(typeof this._callback!="function")if(a)throw a;else return;this._callback(a,this.dom)};h.prototype.handleElement=function(a){this._done&&this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()")); if(!this._options.verbose){delete a.raw;if(a.type=="tag"||a.type=="script"||a.type=="style")delete a.data}if(this._tagStack.last())if(a.type!=d.Text&&a.type!=d.Comment&&a.type!=d.Directive)if(a.name.charAt(0)=="/"){a=a.name.substring(1);if(!this._options.enforceEmptyTags||!h._emptyTags[a]){for(var 
c=this._tagStack.length-1;c>-1&&this._tagStack[c--].name!=a;);if(c>-1||this._tagStack[0].name==a)for(;c<this._tagStack.length-1;)this._tagStack.pop()}}else{if(!this._tagStack.last().children)this._tagStack.last().children= [];this._tagStack.last().children.push(a);if(!this._options.enforceEmptyTags||!h._emptyTags[a.name])this._tagStack.push(a)}else{if(!this._tagStack.last().children)this._tagStack.last().children=[];this._tagStack.last().children.push(a)}else if(a.type!=d.Text&&a.type!=d.Comment&&a.type!=d.Directive){if(a.name.charAt(0)!="/"){this.dom.push(a);if(!this._options.enforceEmptyTags||!h._emptyTags[a.name])this._tagStack.push(a)}}else this.dom.push(a)};var f={testElement:function(a,c){if(!c)return false;for(var b in a)if(b== "tag_name"){if(c.type!="tag"&&c.type!="script"&&c.type!="style")return false;return a.tag_name(c.name)}else if(b=="tag_type")return a.tag_type(c.type);else if(b=="tag_contains"){if(c.type!="text"&&c.type!="comment"&&c.type!="directive")return false;return a.tag_contains(c.data)}else return c.attribs&&a[b](c.attribs[b]);return true},getElements:function(a,c,b){function k(o){return typeof a[l]=="function"?o:function(p){return p==o}}b=!!b;if(!c)return[];var g=[];for(var l in a)a[l]=k(a[l]);f.testElement(a, c)&&g.push(c);if(b&&c.children)c=c.children;else if(c instanceof Array)c=c;else return g;for(var n=0;n<c.length;n++)g=g.concat(f.getElements(a,c[n],b));return g},getElementById:function(a,c,b){a=f.getElements({id:a},c,!!b);return a.length?a[0]:null},getElementsByTagName:function(a,c,b){return f.getElements({tag_name:a},c,!!b)},getElementsByTagType:function(a,c,b){return f.getElements({tag_type:a},c,!!b)}};exports.Parser=e;exports.DefaultHandler=h;exports.RssHandler=m;exports.ElementType=d;exports.DomUtils= f})();
View
13 runtests.html
@@ -39,6 +39,10 @@
<script language="JavaScript" src="tests/15-non-verbose.js"></script>
<script language="JavaScript" src="tests/16-ignore_whitespace.js"></script>
<script language="JavaScript" src="tests/17-xml_namespace.js"></script>
+ <script language="JavaScript" src="tests/18-enforce_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/19-ignore_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/20-rss.js"></script>
+ <script language="JavaScript" src="tests/21-atom.js"></script>
<!-- //TODO: dynamic loading of test files -->
</head>
<body style="font-size: small; font-family:Arial, Helvetica, sans-serif;">
@@ -51,10 +55,15 @@
testCount++;
var test = Tautologistics.NodeHtmlParser.Tests.shift();
try {
- var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
document.write("<hr>Handler error: " + error + "<hr>");
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new Tautologistics.NodeHtmlParser.RssHandler(handlerCallback, test.options)
+ :
+ new Tautologistics.NodeHtmlParser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
document.write("<b>" + test.name + "</b>: ");
parser.parseComplete(test.html);
View
9 runtests.js
@@ -35,10 +35,15 @@ for (var i in testFiles) {
fileParts.pop();
var moduleName = fileParts.join(".");
var test = require(testFolder + "/" + moduleName);
- var handler = new htmlparser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
sys.puts("Handler error: " + error);
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new htmlparser.RssHandler(handlerCallback, test.options)
+ :
+ new htmlparser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new htmlparser.Parser(handler);
parser.parseComplete(test.html);
var resultComplete = handler.dom;
View
11 runtests.min.html
@@ -41,6 +41,8 @@
<script language="JavaScript" src="tests/17-xml_namespace.js"></script>
<script language="JavaScript" src="tests/18-enforce_empty_tags.js"></script>
<script language="JavaScript" src="tests/19-ignore_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/20-rss.js"></script>
+ <script language="JavaScript" src="tests/21-atom.js"></script>
<!-- //TODO: dynamic loading of test files -->
</head>
<body style="font-size: small; font-family:Arial, Helvetica, sans-serif;">
@@ -53,10 +55,15 @@
testCount++;
var test = Tautologistics.NodeHtmlParser.Tests.shift();
try {
- var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
document.write("<hr>Handler error: " + error + "<hr>");
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new Tautologistics.NodeHtmlParser.RssHandler(handlerCallback, test.options)
+ :
+ new Tautologistics.NodeHtmlParser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
document.write("<b>" + test.name + "</b>: ");
parser.parseComplete(test.html);
View
9 runtests.min.js
@@ -35,10 +35,15 @@ for (var i in testFiles) {
fileParts.pop();
var moduleName = fileParts.join(".");
var test = require(testFolder + "/" + moduleName);
- var handler = new htmlparser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
sys.puts("Handler error: " + error);
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new htmlparser.RssHandler(handlerCallback, test.options)
+ :
+ new htmlparser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new htmlparser.Parser(handler);
parser.parseComplete(test.html);
var resultComplete = handler.dom;

0 comments on commit 3b30e87

Please sign in to comment.
Something went wrong with that request. Please try again.