jQuery fix, Cache support, Retry support, updated dependencies

commit e93a312fa495fb23708769c0428407869d469244 (1 parent: 1565a50), authored by @sylvinus
README.md (8 changes)
@@ -24,7 +24,6 @@ Rough todolist :
* More crawling tests
* Document the API
* Get feedback on featureset for a 1.0 release (option for autofollowing links?)
- * Make sure jQuery is cached / Include latest release in tree or add dependency
* Check how we can support other mimetypes than HTML
* Add+test timeout parameter
* Option to wait for callback to finish before freeing the pool resource (via another callback like next())
@@ -62,4 +61,11 @@ API
}]);
+ChangeLog
+---------
+0.0.3 (dev)
+ - Fixes jQuery being redownloaded at each page + include it in the tree
+ - Cache support
+ - Retries
+ - Updated priority support with new generic-pool>=1.0.4
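
A minimal usage sketch of the new 0.0.3 features, assuming only the defaults visible in lib/crawler.js below; the package name comes from package.json and the target URL is illustrative:

    var Crawler = require("crawler").Crawler;

    var c = new Crawler({
        maxConnections: 10,
        cache: true,       // serve repeated GET/HEAD requests from the in-memory cache
        retries: 3,        // retry a failed request up to 3 times
        retryTimeout: 10,  // seconds to wait before each retry
        callback: function(error, response, $) {
            if (error) {
                return console.log("Failed: " + error);
            }
            // $ is the bundled jQuery 1.4.2 loaded into a jsdom window
            $("a").each(function(i, a) {
                console.log(a.href);
            });
        }
    });

    c.queue(["http://example.com/"]);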
lib/crawler.js (128 changes)
@@ -5,13 +5,17 @@ var http = require('http'),
sys = require('sys'),
request = require('request');
-try {
- //https://github.com/joshfire/node-pool
- var Pool = require('../../node-pool/lib/generic-pool.js').Pool;
-} catch (e) {
- var Pool = require('generic-pool').Pool;
-}
-
+
+var Pool = require('generic-pool').Pool;
+
+/* will be needed for jsdom>0.1.23
+require('jsdom').defaultDocumentFeatures = {
+ FetchExternalResources : [],
+ ProcessExternalResources : false,
+ MutationEvents : false,
+ QuerySelector : false
+};
+*/
var cloneAndExtend = function(obj,ext) {
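
When enabled, this commented-out block would tell jsdom not to fetch or process external resources referenced by crawled pages and to skip mutation events and querySelector support, keeping parsing lightweight; per the comment it only becomes necessary once jsdom moves past 0.1.23.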
@@ -26,24 +30,33 @@ exports.Crawler = function(options) {
//Default options
this.options = cloneAndExtend({
- "timeout":60,
- "jQueryify":true,
- "maxConnections":10,
- "jQuery":'http://code.jquery.com/jquery-1.4.2.min.js',
- "method":"GET",
- "priority":0
+ timeout: 60,
+ jQuery: true,
+ jQueryUrl: path.normalize(__dirname+'/jquery-1.4.2.js'), //http://code.jquery.com/jquery-1.4.2.min.js",
+ maxConnections: 10,
+ priorityRange: 10,
+ priority: 5,
+ retries: 3,
+ retryTimeout: 10,
+ method: "GET",
+ cache: false, //false,true, [ttl?]
+ skipDuplicates: false,
+ priority: 0
},options);
//Do talks one by one
this.pool = Pool({
name : 'crawler',
- max : this.options["maxConnections"],
+ //log : this.options.debug,
+ max : this.options.maxConnections,
+ priorityRange:this.options.priorityRange,
create : function(callback) {
callback(1);
},
destroy : function(client) { }
});
+ this.cache = {};
this.queue = function(item) {
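
The pool calls above and below follow the generic-pool >= 1.0.4 API that this commit upgrades to: borrow/returnToPool become acquire/release, and acquire takes an optional priority bounded by priorityRange. A rough sketch of that pattern on its own, with everything except the generic-pool calls being illustrative:

    var Pool = require("generic-pool").Pool;

    var pool = Pool({
        name: "demo",
        max: 10,            // at most 10 resources checked out at once
        priorityRange: 10,  // acquire() accepts priorities 0..9
        create: function(callback) { callback(1); },  // dummy resource: the pool is used as a semaphore
        destroy: function(client) {}
    });

    // Waiters with lower priority values are served first when the pool is saturated.
    pool.acquire(function(poolRef) {
        // ...do the rate-limited work here, e.g. issue an HTTP request...
        pool.release(poolRef);
    }, 0);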
@@ -67,32 +80,52 @@ exports.Crawler = function(options) {
toQueue = cloneAndExtend(this.options,item);
}
+ var useCache = function() {
+ return ((toQueue.cache || toQueue.skipDuplicates) && (toQueue.method=="GET" || toQueue.method=="HEAD"));
+ };
+
+
var self = this;
- this.pool.borrow(function(poolRef) {
+ this.pool.acquire(function(poolRef) {
+ var makeRequest;
- var onContent = function (error, response, body) {
+ var onContent = function (error, response, body, fromCache) {
- if (toQueue["debug"]) {
+ if (toQueue.debug) {
if (error) {
- console.log("Error "+error+" when fetching "+toQueue["uri"].href);
+ console.log("Error "+error+" when fetching "+toQueue.uri+(toQueue.retries?" ("+toQueue.retries+" retries left)":""));
} else {
- console.log("Got "+toQueue["uri"].href+" ("+body.length+" bytes)...");
+ console.log("Got "+toQueue.uri+" ("+body.length+" bytes)...");
}
}
+ if (error && toQueue.retries) {
+ setTimeout(function() {
+ toQueue.retries--;
+ makeRequest(toQueue);
+ },toQueue.retryTimeout*1000);
+
+ //Don't return the poolRef yet.
+ return;
+ }
+
+ if (useCache() && !fromCache) {
+ self.cache[toQueue.uri] = [error,response,body];
+ }
- if (typeof toQueue["callback"]=="function") {
+ if (typeof toQueue.callback=="function") {
if (error) {
- toQueue["callback"](error);
-
+ //No retries left here
+ toQueue.callback(error);
+
} else {
response.content = body;
response.request = toQueue;
- if (toQueue["jQueryify"] && toQueue["method"]!="HEAD") {
+ if (toQueue.jQuery && toQueue.method!="HEAD") {
var document = require("jsdom").jsdom(),
window = document.createWindow();
@@ -102,34 +135,55 @@ exports.Crawler = function(options) {
response.window = window;
response.document = document;
- require("jsdom").jQueryify(window, toQueue["jQuery"],function() {
- toQueue["callback"](null,response,window.jQuery);
+ require("jsdom").jQueryify(window, toQueue.jQueryUrl,function() {
+ toQueue.callback(null,response,window.jQuery);
});
} else {
- toQueue["callback"](null,response);
+ toQueue.callback(null,response);
}
}
}
- self.pool.returnToPool(poolRef);
+ self.pool.release(poolRef);
};
//Static HTML was given
- if (toQueue["html"]) {
- onContent(null,{},toQueue["html"]);
+ if (toQueue.html) {
+ onContent(null,{},toQueue.html,false);
//Make a HTTP request
} else {
- var makeRequest = function(q) {
- if (q["debug"])
- console.log("Fetching "+q["uri"]+" ...");
- request(q, onContent);
+ makeRequest = function(q) {
+
+ if (useCache()) {
+ if (self.cache[q.uri]) {
+
+ //If a query has already been made to this URL, don't callback again
+ if (!q.skipDuplicates) {
+ onContent.apply(this,self.cache[q.uri].concat(true));
+ }
+ return;
+ }
+ }
+
+ //Clean the object in case of a retry
+ delete q.client;
+ delete q.request;
+
+ if (q.debug) {
+ console.log(q.method+" "+q.uri+" ...");
+ }
+
+ request(q, function(error,response,body) {
+ q.uri=q.uri.href;
+ onContent(error,response,body,false);
+ });
};
- if (typeof toQueue["uri"]=="function") {
- toQueue["uri"](function(uri) {
- toQueue["uri"]=uri;
+ if (typeof toQueue.uri=="function") {
+ toQueue.uri(function(uri) {
+ toQueue.uri=uri;
makeRequest(toQueue);
});
} else {
@@ -140,7 +194,7 @@ exports.Crawler = function(options) {
}
- },toQueue["priority"]);
+ },toQueue.priority);
}
}
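
Since each queued item is merged over the constructor defaults via cloneAndExtend, the new options can also be tuned per request. A sketch assuming a crawler instance c as above; the URL and values are illustrative:

    c.queue([{
        uri: "http://example.com/flaky-page",
        priority: 0,       // lower values are acquired from the pool first
        retries: 5,        // allow this page more attempts than the default
        retryTimeout: 30,  // wait longer between attempts
        callback: function(error, response, $) {
            if (error) {
                return console.log("Gave up: " + error);  // no retries left at this point
            }
            console.log("Fetched " + response.content.length + " bytes");
        }
    }]);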
lib/jquery-1.4.2.js (6,240 additions, 0 deletions; contents not shown)
package.json (6 changes)
@@ -1,6 +1,6 @@
{
"name": "crawler",
- "version": "0.0.2",
+ "version": "0.0.3",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously.",
"keywords": [
"dom",
@@ -33,8 +33,8 @@
,
"dependencies": {
"request": ">= 0.9.5",
- "jsdom": ">= 0.1.20",
- "generic-pool": ">= 1.0.2",
+ "jsdom": "= 0.1.20",
+ "generic-pool": ">= 1.0.4",
"qunit": ">= 0.0.7",
"htmlparser": ">= 1.6.2"
},
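
The generic-pool bump to >= 1.0.4 matches the acquire/release/priorityRange calls introduced in lib/crawler.js above. jsdom, by contrast, is now pinned to exactly 0.1.20 rather than allowed to float, presumably to hold it below the releases for which the commented-out defaultDocumentFeatures block ("will be needed for jsdom>0.1.23") would become necessary.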
test/cache.js (26 changes)
@@ -0,0 +1,26 @@
+var Crawler = require("../lib/crawler").Crawler;
+
+var c = new Crawler({
+ "maxConnections":1,
+ "timeout":60,
+ "debug":true,
+ "cache":true,
+ "callback":function(error,result,$) {
+ $("a").each(function(i,a) {
+ console.log(a.href);
+ })
+ }
+});
+
+c.queue(["http://joshfire.com/","http://joshfire.com/","http://joshfire.com/","http://joshfire.com/"]);
+
+/*
+c.queue([{
+ "uri":"http://parisjs.org/register",
+ "method":"POST",
+ "timeout":120,
+ "callback":function(error,result,$) {
+ $("div:contains(Thank you)").after(" very much");
+ }
+}]);
+*/
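
The same URL is queued four times on purpose: with cache enabled and maxConnections set to 1, only the first request should actually hit the network; the three that follow are expected to be answered from the in-memory cache, and because skipDuplicates stays at its default of false the callback still fires for each of them.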
test/index.html (3 changes)
@@ -4,12 +4,13 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>jQuery Test Suite</title>
<link rel="Stylesheet" media="screen" href="qunit/qunit/qunit.css" />
- <script type="text/javascript" src="jquery.js"></script>
+<!-- <script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="../sizzle.js"></script>
<script type="text/javascript" src="data/sizzle-jquery.js"></script>
<script type="text/javascript" src="data/testinit.js"></script>
<script type="text/javascript" src="qunit/qunit/qunit.js"></script>
<script type="text/javascript" src="unit/selector.js"></script>
+ -->
</head>
<body id="body">
test/simple.js (2 changes)
@@ -3,7 +3,9 @@ var Crawler = require("../lib/crawler").Crawler;
var c = new Crawler({
"maxConnections":10,
"timeout":60,
+ "debug":true,
"callback":function(error,result,$) {
+ console.log("Got page");
$("a").each(function(i,a) {
console.log(a.href);
//c.queue(a.href);