Permalink
Browse files

Added support for RSS/Atom feeds

  • Loading branch information...
tautologistics committed May 31, 2010
1 parent b954e7f commit 3b30e87ee4d8e7af6c0b27e53eacf1b7a4773b4d
Showing with 185 additions and 23 deletions.
  1. +19 −2 README.md
  2. +130 −11 node-htmlparser.js
  3. +2 −2 node-htmlparser.min.js
  4. +11 −2 runtests.html
  5. +7 −2 runtests.js
  6. +9 −2 runtests.min.html
  7. +7 −2 runtests.min.js
View
@@ -1,5 +1,5 @@
#NodeHtmlParser
-A forgiving HTML/XML parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
+A forgiving HTML/XML/RSS parser written in JS for both the browser and NodeJS (yes, despite the name it works just fine in any modern browser). The parser can handle streams (chunked data) and supports custom handlers for writing custom DOMs/output.
##Running Tests
@@ -60,7 +60,24 @@ View runtests.html in any browser
}
parser.done();
-##Handler Options
+##Parsing RSS/Atom Feeds
+
+ new htmlparser.RssHandler(function (error, dom) {
+ ...
+ });
+
+###Usage In Browser
+ var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error, dom) {
+ if (error)
+ [...do something for errors...]
+ else
+ [...parsing done, do something...]
+ });
+ var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
+ parser.parseComplete(document.body.innerHTML);
+ alert(JSON.stringify(handler.dom, null, 2));
+
+##DefaultHandler Options
###Usage
var handler = new htmlparser.DefaultHandler(
View
@@ -304,7 +304,7 @@ function Parser (handler) {
this._tagStack.push(ElementType.Comment);
}
}
- else if (element.raw.indexOf("!") == 0) {
+ else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
element.type = ElementType.Directive;
//TODO: what about CDATA?
}
@@ -409,7 +409,111 @@ function Parser (handler) {
throw error;
}
-//TODO: add support for options: ignoreWhitespace, verbose (keep data for tags and raw for all)
+//TODO: make this a truly streamable handler
+function RssHandler (callback) { // Handler for RSS/Atom input; `callback` is handed unchanged to the parent constructor.
+ RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false }); // Always runs the parent constructor with this fixed option set. NOTE(review): done() reads children[0] of <rss> as the channel, which presumably relies on ignoreWhitespace dropping whitespace-only text nodes - confirm.
+}
+inherits(RssHandler, DefaultHandler); // Chains RssHandler.prototype to DefaultHandler.prototype and sets RssHandler.super_, which the constructor and done() use.
+
+ RssHandler.prototype.done = function RssHandler$done () {
+ var feed = { };
+ var feedRoot;
+
+ var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
+ if (found.length) {
+ feedRoot = found[0];
+ }
+ if (feedRoot) {
+ if (feedRoot.name == "rss") {
+ feed.type = "rss";
+ feedRoot = feedRoot.children[0]; //<channel/>
+ feed.id = "";
+// require("sys").debug(require("sys").inspect(feedRoot, false, null));
+// require("sys").debug(require("sys").inspect(DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data, false, null));
+ try {
+ feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.description = DomUtils.getElementsByTagName("description", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.updated = new Date(DomUtils.getElementsByTagName("lastBuildDate", feedRoot.children, false)[0].children[0].data);
+ } catch (ex) { }
+ try {
+ feed.author = DomUtils.getElementsByTagName("managingEditor", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ feed.items = [];
+ DomUtils.getElementsByTagName("item", feedRoot.children).forEach(function (item, index, list) {
+ var entry = {};
+ try {
+ entry.id = DomUtils.getElementsByTagName("guid", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.description = DomUtils.getElementsByTagName("description", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.pubDate = new Date(DomUtils.getElementsByTagName("pubDate", item.children, false)[0].children[0].data);
+ } catch (ex) { }
+ feed.items.push(entry);
+ });
+ } else {
+ feed.type = "atom";
+ try {
+ feed.id = DomUtils.getElementsByTagName("id", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.title = DomUtils.getElementsByTagName("title", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.link = DomUtils.getElementsByTagName("link", feedRoot.children, false)[0].attribs.href;
+ } catch (ex) { }
+ try {
+ feed.description = DomUtils.getElementsByTagName("subtitle", feedRoot.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ feed.updated = new Date(DomUtils.getElementsByTagName("updated", feedRoot.children, false)[0].children[0].data);
+ } catch (ex) { }
+ try {
+ feed.author = DomUtils.getElementsByTagName("email", feedRoot.children, true)[0].children[0].data;
+ } catch (ex) { }
+ feed.items = [];
+ DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item, index, list) {
+ var entry = {};
+ try {
+ entry.id = DomUtils.getElementsByTagName("id", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.title = DomUtils.getElementsByTagName("title", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.link = DomUtils.getElementsByTagName("link", item.children, false)[0].attribs.href;
+ } catch (ex) { }
+ try {
+ entry.description = DomUtils.getElementsByTagName("summary", item.children, false)[0].children[0].data;
+ } catch (ex) { }
+ try {
+ entry.pubDate = new Date(DomUtils.getElementsByTagName("updated", item.children, false)[0].children[0].data);
+ } catch (ex) { }
+ feed.items.push(entry);
+ });
+ }
+
+ this.dom = feed;
+ }
+ RssHandler.super_.prototype.done.call(this);
+ }
+
+///////////////////////////////////////////////////
+
function DefaultHandler (callback, options) {
this.reset();
this._options = options ? options : { };
@@ -578,7 +682,9 @@ function DefaultHandler (callback, options) {
return(true);
}
- , getElements: function DomUtils$getElements (options, currentElement) {
+ , getElements: function DomUtils$getElements (options, currentElement, recurse) {
+ recurse = !!recurse;
+
if (!currentElement) {
return([]);
}
@@ -597,37 +703,50 @@ function DefaultHandler (callback, options) {
found.push(currentElement);
}
- if (currentElement.children)
+ if (recurse && currentElement.children)
elementList = currentElement.children;
else if (currentElement instanceof Array)
elementList = currentElement;
else
return(found);
for (var i = 0; i < elementList.length; i++)
- found = found.concat(DomUtils.getElements(options, elementList[i]));
+ found = found.concat(DomUtils.getElements(options, elementList[i], recurse));
return(found);
}
- , getElementById: function DomUtils$getElementById (id, currentElement) {
- var result = DomUtils.getElements({ id: id }, currentElement);
+ , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
+ recurse = !!recurse;
+ var result = DomUtils.getElements({ id: id }, currentElement, recurse);
return(result.length ? result[0] : null);
}
- , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement) {
- return(DomUtils.getElements({ tag_name: name }, currentElement));
+ , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse) {
+ recurse = !!recurse;
+ return(DomUtils.getElements({ tag_name: name }, currentElement, recurse));
}
- , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement) {
- return(DomUtils.getElements({ tag_type: type }, currentElement));
+ , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse) {
+ recurse = !!recurse;
+ return(DomUtils.getElements({ tag_type: type }, currentElement, recurse));
}
}
+ function inherits (ctor, superCtor) { // Makes ctor.prototype inherit from superCtor.prototype without calling superCtor.
+ var tempCtor = function(){}; // Empty stand-in constructor, so creating the prototype object runs none of superCtor's body.
+ tempCtor.prototype = superCtor.prototype;
+ ctor.super_ = superCtor; // Exposes the parent so subclass code can call ctor.super_ and ctor.super_.prototype methods.
+ ctor.prototype = new tempCtor(); // New object whose prototype is superCtor.prototype; replaces ctor's previous prototype.
+ ctor.prototype.constructor = ctor; // Restore `constructor`, which the replacement above would otherwise inherit from superCtor.prototype.
+ }
+
exports.Parser = Parser;
exports.DefaultHandler = DefaultHandler;
+exports.RssHandler = RssHandler;
+
exports.ElementType = ElementType;
exports.DomUtils = DomUtils;
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -39,6 +39,10 @@
<script language="JavaScript" src="tests/15-non-verbose.js"></script>
<script language="JavaScript" src="tests/16-ignore_whitespace.js"></script>
<script language="JavaScript" src="tests/17-xml_namespace.js"></script>
+ <script language="JavaScript" src="tests/18-enforce_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/19-ignore_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/20-rss.js"></script>
+ <script language="JavaScript" src="tests/21-atom.js"></script>
<!-- //TODO: dynamic loading of test files -->
</head>
<body style="font-size: small; font-family:Arial, Helvetica, sans-serif;">
@@ -51,10 +55,15 @@
testCount++;
var test = Tautologistics.NodeHtmlParser.Tests.shift();
try {
- var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
document.write("<hr>Handler error: " + error + "<hr>");
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new Tautologistics.NodeHtmlParser.RssHandler(handlerCallback, test.options)
+ :
+ new Tautologistics.NodeHtmlParser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
document.write("<b>" + test.name + "</b>: ");
parser.parseComplete(test.html);
View
@@ -35,10 +35,15 @@ for (var i in testFiles) {
fileParts.pop();
var moduleName = fileParts.join(".");
var test = require(testFolder + "/" + moduleName);
- var handler = new htmlparser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
sys.puts("Handler error: " + error);
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new htmlparser.RssHandler(handlerCallback, test.options)
+ :
+ new htmlparser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new htmlparser.Parser(handler);
parser.parseComplete(test.html);
var resultComplete = handler.dom;
View
@@ -41,6 +41,8 @@
<script language="JavaScript" src="tests/17-xml_namespace.js"></script>
<script language="JavaScript" src="tests/18-enforce_empty_tags.js"></script>
<script language="JavaScript" src="tests/19-ignore_empty_tags.js"></script>
+ <script language="JavaScript" src="tests/20-rss.js"></script>
+ <script language="JavaScript" src="tests/21-atom.js"></script>
<!-- //TODO: dynamic loading of test files -->
</head>
<body style="font-size: small; font-family:Arial, Helvetica, sans-serif;">
@@ -53,10 +55,15 @@
testCount++;
var test = Tautologistics.NodeHtmlParser.Tests.shift();
try {
- var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
document.write("<hr>Handler error: " + error + "<hr>");
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new Tautologistics.NodeHtmlParser.RssHandler(handlerCallback, test.options)
+ :
+ new Tautologistics.NodeHtmlParser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
document.write("<b>" + test.name + "</b>: ");
parser.parseComplete(test.html);
View
@@ -35,10 +35,15 @@ for (var i in testFiles) {
fileParts.pop();
var moduleName = fileParts.join(".");
var test = require(testFolder + "/" + moduleName);
- var handler = new htmlparser.DefaultHandler(function (error) {
+ var handlerCallback = function handlerCallback (error) {
if (error)
sys.puts("Handler error: " + error);
- }, test.options);
+ }
+ var handler = (test.type == "rss") ?
+ new htmlparser.RssHandler(handlerCallback, test.options)
+ :
+ new htmlparser.DefaultHandler(handlerCallback, test.options)
+ ;
var parser = new htmlparser.Parser(handler);
parser.parseComplete(test.html);
var resultComplete = handler.dom;

0 comments on commit 3b30e87

Please sign in to comment.