Permalink
Browse files

Fixed DomUtils.testElement() and added new, related projects to the README
  • Loading branch information...
tautologistics committed Oct 4, 2010
1 parent 8e538e0 commit 00fbea9eff870ff3b7453c31b9df83d56243e5b9
Showing with 300 additions and 13 deletions.
  1. +7 −0 README.md
  2. +17 −9 lib/node-htmlparser.js
  3. +2 −2 lib/node-htmlparser.min.js
  4. +36 −0 tests/18-enforce_empty_tags.js
  5. +38 −0 tests/19-ignore_empty_tags.js
  6. +117 −0 tests/20-rss.js
  7. +77 −0 tests/21-atom.js
  8. +6 −2 utils_example.js
View
@@ -177,3 +177,10 @@ becomes:
##DomUtils
###TBD (see utils_example.js for now)
+
+##Related Projects
+
+Looking for CSS selectors to search the DOM? Try Node-SoupSelect, a port of SoupSelect to NodeJS: http://github.com/harryf/node-soupselect
+
+There's also a port of hpricot to NodeJS that uses node-HtmlParser for HTML parsing: http://github.com/silentrob/Apricot
+
View
@@ -18,7 +18,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***********************************************/
-/* v1.5.0 */
+/* v1.6.3 */
(function () {
@@ -656,28 +656,36 @@ function DefaultHandler (callback, options) {
var DomUtils = {
testElement: function DomUtils$testElement (options, element) {
if (!element) {
- return(false);
+ return false;
}
for (var key in options) {
if (key == "tag_name") {
if (element.type != "tag" && element.type != "script" && element.type != "style") {
- return(false);
+ return false;
+ }
+ if (!options["tag_name"](element.name)) {
+ return false;
}
- return(options["tag_name"](element.name));
} else if (key == "tag_type") {
- return(options["tag_type"](element.type));
+ if (!options["tag_type"](element.type)) {
+ return false;
+ }
} else if (key == "tag_contains") {
if (element.type != "text" && element.type != "comment" && element.type != "directive") {
- return(false);
+ return false;
+ }
+ if (!options["tag_contains"](element.data)) {
+ return false;
}
- return(options["tag_contains"](element.data));
} else {
- return(element.attribs && options[key](element.attribs[key]));
+ if (!element.attribs || !options[key](element.attribs[key])) {
+ return false;
+ }
}
}
- return(true);
+ return true;
}
, getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -0,0 +1,36 @@
+(function () {
+
+function RunningInNode () {
+ return(
+ (typeof require) == "function"
+ &&
+ (typeof exports) == "object"
+ &&
+ (typeof module) == "object"
+ &&
+ (typeof __filename) == "string"
+ &&
+ (typeof __dirname) == "string"
+ );
+}
+
+if (!RunningInNode()) {
+ if (!this.Tautologistics)
+ this.Tautologistics = {};
+ if (!this.Tautologistics.NodeHtmlParser)
+ this.Tautologistics.NodeHtmlParser = {};
+ if (!this.Tautologistics.NodeHtmlParser.Tests)
+ this.Tautologistics.NodeHtmlParser.Tests = [];
+ exports = {};
+ this.Tautologistics.NodeHtmlParser.Tests.push(exports);
+}
+
+exports.name = "Enforce empty tags";
+exports.html = "<link>text</link>";
+exports.expected =
+ [
+ { raw: 'link', data: 'link', type: 'tag', name: 'link' }
+ , { raw: 'text', data: 'text', type: 'text' }
+ ];
+
+})();
@@ -0,0 +1,38 @@
+(function () {
+
+function RunningInNode () {
+ return(
+ (typeof require) == "function"
+ &&
+ (typeof exports) == "object"
+ &&
+ (typeof module) == "object"
+ &&
+ (typeof __filename) == "string"
+ &&
+ (typeof __dirname) == "string"
+ );
+}
+
+if (!RunningInNode()) {
+ if (!this.Tautologistics)
+ this.Tautologistics = {};
+ if (!this.Tautologistics.NodeHtmlParser)
+ this.Tautologistics.NodeHtmlParser = {};
+ if (!this.Tautologistics.NodeHtmlParser.Tests)
+ this.Tautologistics.NodeHtmlParser.Tests = [];
+ exports = {};
+ this.Tautologistics.NodeHtmlParser.Tests.push(exports);
+}
+
+exports.name = "Ignore empty tags";
+exports.html = "<link>text</link>";
+exports.options = { enforceEmptyTags: false };
+exports.expected =
+ [
+ { raw: 'link', data: 'link', type: 'tag', name: 'link', children: [
+ { raw: 'text', data: 'text', type: 'text' }
+ ] }
+ ];
+
+})();
View
@@ -0,0 +1,117 @@
+(function () {
+
+function RunningInNode () {
+ return(
+ (typeof require) == "function"
+ &&
+ (typeof exports) == "object"
+ &&
+ (typeof module) == "object"
+ &&
+ (typeof __filename) == "string"
+ &&
+ (typeof __dirname) == "string"
+ );
+}
+
+if (!RunningInNode()) {
+ if (!this.Tautologistics)
+ this.Tautologistics = {};
+ if (!this.Tautologistics.NodeHtmlParser)
+ this.Tautologistics.NodeHtmlParser = {};
+ if (!this.Tautologistics.NodeHtmlParser.Tests)
+ this.Tautologistics.NodeHtmlParser.Tests = [];
+ exports = {};
+ this.Tautologistics.NodeHtmlParser.Tests.push(exports);
+}
+
+exports.name = "RSS (2.0)";
+//http://cyber.law.harvard.edu/rss/examples/rss2sample.xml
+exports.html = '<?xml version="1.0"?>\
+<rss version="2.0">\
+ <channel>\
+ <title>Liftoff News</title>\
+ <link>http://liftoff.msfc.nasa.gov/</link>\
+ <description>Liftoff to Space Exploration.</description>\
+ <language>en-us</language>\
+ <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>\
+\
+ <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate>\
+ <docs>http://blogs.law.harvard.edu/tech/rss</docs>\
+ <generator>Weblog Editor 2.0</generator>\
+ <managingEditor>editor@example.com</managingEditor>\
+ <webMaster>webmaster@example.com</webMaster>\
+ <item>\
+\
+ <title>Star City</title>\
+ <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>\
+ <description>How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia\'s &lt;a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm"&gt;Star City&lt;/a&gt;.</description>\
+ <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>\
+ <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>\
+\
+ </item>\
+ <item>\
+ <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>\
+ <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>\
+ <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>\
+\
+ </item>\
+ <item>\
+ <title>The Engine That Does More</title>\
+ <link>http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp</link>\
+ <description>Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that.</description>\
+ <pubDate>Tue, 27 May 2003 08:37:32 GMT</pubDate>\
+ <guid>http://liftoff.msfc.nasa.gov/2003/05/27.html#item571</guid>\
+\
+ </item>\
+ <item>\
+ <title>Astronauts\' Dirty Laundry</title>\
+ <link>http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp</link>\
+ <description>Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options.</description>\
+ <pubDate>Tue, 20 May 2003 08:56:02 GMT</pubDate>\
+ <guid>http://liftoff.msfc.nasa.gov/2003/05/20.html#item570</guid>\
+\
+ </item>\
+ </channel>\
+</rss>';
+exports.options = { };
+exports.type = "rss";
+exports.expected = {
+ type: "rss"
+ , id: ""
+ , title: "Liftoff News"
+ , link: "http://liftoff.msfc.nasa.gov/"
+ , description: "Liftoff to Space Exploration."
+ , updated: new Date("Tue, 10 Jun 2003 09:41:01 GMT")
+ , author: "editor@example.com"
+ , items: [
+ {
+ id: "http://liftoff.msfc.nasa.gov/2003/06/03.html#item573"
+ , title: "Star City"
+ , link: "http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp"
+ , description: "How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's &lt;a href=\"http://howe.iki.rssi.ru/GCTC/gctc_e.htm\"&gt;Star City&lt;/a&gt;."
+ , pubDate: new Date("Tue, 03 Jun 2003 09:39:21 GMT")
+ }
+ , {
+ id: "http://liftoff.msfc.nasa.gov/2003/05/30.html#item572"
+ , description: "Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href=\"http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm\"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st."
+ , pubDate: new Date("Fri, 30 May 2003 11:06:42 GMT")
+ }
+ , {
+ id: "http://liftoff.msfc.nasa.gov/2003/05/27.html#item571"
+ , title: "The Engine That Does More"
+ , link: "http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp"
+ , description: "Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that."
+ , pubDate: new Date("Tue, 27 May 2003 08:37:32 GMT")
+ }
+ , {
+ id: "http://liftoff.msfc.nasa.gov/2003/05/20.html#item570"
+ , title: "Astronauts' Dirty Laundry"
+ , link: "http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp"
+ , description: "Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options."
+ , pubDate: new Date("Tue, 20 May 2003 08:56:02 GMT")
+ }
+ ]
+ };
+
+})();
View
@@ -0,0 +1,77 @@
+(function () {
+
+function RunningInNode () {
+ return(
+ (typeof require) == "function"
+ &&
+ (typeof exports) == "object"
+ &&
+ (typeof module) == "object"
+ &&
+ (typeof __filename) == "string"
+ &&
+ (typeof __dirname) == "string"
+ );
+}
+
+if (!RunningInNode()) {
+ if (!this.Tautologistics)
+ this.Tautologistics = {};
+ if (!this.Tautologistics.NodeHtmlParser)
+ this.Tautologistics.NodeHtmlParser = {};
+ if (!this.Tautologistics.NodeHtmlParser.Tests)
+ this.Tautologistics.NodeHtmlParser.Tests = [];
+ exports = {};
+ this.Tautologistics.NodeHtmlParser.Tests.push(exports);
+}
+
+exports.name = "Atom (1.0)";
+//http://en.wikipedia.org/wiki/Atom_%28standard%29
+exports.html = '<?xml version="1.0" encoding="utf-8"?>\
+\
+<feed xmlns="http://www.w3.org/2005/Atom">\
+\
+ <title>Example Feed</title>\
+ <subtitle>A subtitle.</subtitle>\
+ <link href="http://example.org/feed/" rel="self" />\
+ <link href="http://example.org/" />\
+ <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id>\
+ <updated>2003-12-13T18:30:02Z</updated>\
+ <author>\
+ <name>John Doe</name>\
+ <email>johndoe@example.com</email>\
+ </author>\
+\
+ <entry>\
+ <title>Atom-Powered Robots Run Amok</title>\
+ <link href="http://example.org/2003/12/13/atom03" />\
+ <link rel="alternate" type="text/html" href="http://example.org/2003/12/13/atom03.html"/>\
+ <link rel="edit" href="http://example.org/2003/12/13/atom03/edit"/>\
+ <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>\
+ <updated>2003-12-13T18:30:02Z</updated>\
+ <summary>Some text.</summary>\
+ </entry>\
+\
+</feed>';
+exports.options = { };
+exports.type = "rss";
+exports.expected = {
+ type: "atom"
+ , id: "urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6"
+ , title: "Example Feed"
+ , link: "http://example.org/feed/"
+ , description: "A subtitle."
+ , updated: new Date("2003-12-13T18:30:02Z")
+ , author: "johndoe@example.com"
+ , items: [
+ {
+ id: "urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a"
+ , title: "Atom-Powered Robots Run Amok"
+ , link: "http://example.org/2003/12/13/atom03"
+ , description: "Some text."
+ , pubDate: new Date("2003-12-13T18:30:02Z")
+ }
+ ]
+ };
+
+})();
View
@@ -1,9 +1,9 @@
//node --prof --prof_auto profile.js
//deps/v8/tools/mac-tick-processor v8.log
var sys = require("sys");
-var htmlparser = require("./node-htmlparser");
+var htmlparser = require("./lib/node-htmlparser");
-var html = "<a>text a</a><b id='x'>text b</b><c class='y'>text c</c><d id='z' class='w'><e>text e</e></d><g class='g h i'>hhh</g>";
+var html = "<a>text a</a><b id='x'>text b</b><c class='y'>text c</c><d id='z' class='w'><e>text e</e></d><g class='g h i'>hhh</g><yy>hellow</yy><yy id='secondyy'>world</yy>";
var handler = new htmlparser.DefaultHandler(function(err, dom) {
if (err) {
@@ -25,6 +25,10 @@ var handler = new htmlparser.DefaultHandler(function(err, dom) {
nested = htmlparser.DomUtils.getElementsByTagName("e", nested);
nested = htmlparser.DomUtils.getElementsByTagType("text", nested);
sys.debug("nested: " + sys.inspect(nested, false, null));
+ var double = htmlparser.DomUtils.getElementsByTagName("yy", dom);
+ sys.debug("double: " + sys.inspect(double, false, null));
+ var single = htmlparser.DomUtils.getElements( { tag_name: "yy", id: "secondyy" }, dom);
+ sys.debug("single: " + sys.inspect(single, false, null));
}
}, { verbose: false });
var parser = new htmlparser.Parser(handler);

0 comments on commit 00fbea9

Please sign in to comment.