Skip to content

Commit

Permalink
jQuery fix, Cache support, Retry support, updated dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
sylvinus committed Jan 31, 2011
1 parent 1565a50 commit e93a312
Show file tree
Hide file tree
Showing 7 changed files with 6,371 additions and 42 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ Rough todolist :
* More crawling tests
* Document the API
* Get feedback on featureset for a 1.0 release (option for autofollowing links?)
* Make sure jQuery is cached / Include latest release in tree or add dependency
* Check how we can support other mimetypes than HTML
* Add+test timeout parameter
* Option to wait for callback to finish before freeing the pool resource (via another callback like next())
Expand Down Expand Up @@ -62,4 +61,11 @@ API
}]);


ChangeLog
---------

0.0.3 (dev)
 - Fixes jQuery being re-downloaded on each page + includes it in the tree
- Cache support
- Retries
- Updated priority support with new generic-pool>=1.0.4
128 changes: 91 additions & 37 deletions lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ var http = require('http'),
sys = require('sys'),
request = require('request');

try {
//https://github.com/joshfire/node-pool
var Pool = require('../../node-pool/lib/generic-pool.js').Pool;
} catch (e) {
var Pool = require('generic-pool').Pool;
}


var Pool = require('generic-pool').Pool;

/* will be needed for jsdom>0.1.23
require('jsdom').defaultDocumentFeatures = {
FetchExternalResources : [],
ProcessExternalResources : false,
MutationEvents : false,
QuerySelector : false
};
*/


var cloneAndExtend = function(obj,ext) {
Expand All @@ -26,24 +30,33 @@ exports.Crawler = function(options) {

//Default options
this.options = cloneAndExtend({
"timeout":60,
"jQueryify":true,
"maxConnections":10,
"jQuery":'http://code.jquery.com/jquery-1.4.2.min.js',
"method":"GET",
"priority":0
timeout: 60,
jQuery: true,
jQueryUrl: path.normalize(__dirname+'/jquery-1.4.2.js'), //http://code.jquery.com/jquery-1.4.2.min.js",
maxConnections: 10,
priorityRange: 10,
priority: 5,
retries: 3,
retryTimeout: 10,
method: "GET",
cache: false, //false,true, [ttl?]
skipDuplicates: false,
priority: 0
},options);

//Do talks one by one
this.pool = Pool({
name : 'crawler',
max : this.options["maxConnections"],
//log : this.options.debug,
max : this.options.maxConnections,
priorityRange:this.options.priorityRange,
create : function(callback) {
callback(1);
},
destroy : function(client) { }
});

this.cache = {};

this.queue = function(item) {

Expand All @@ -67,32 +80,52 @@ exports.Crawler = function(options) {
toQueue = cloneAndExtend(this.options,item);
}

// True when the response for this queue item may be read from / written to
// the in-memory cache: caching (or duplicate-skipping) must be enabled on the
// item, and only safe, idempotent HTTP methods (GET/HEAD) are cached.
var useCache = function() {
return ((toQueue.cache || toQueue.skipDuplicates) && (toQueue.method=="GET" || toQueue.method=="HEAD"));
};


var self = this;
this.pool.borrow(function(poolRef) {
this.pool.acquire(function(poolRef) {

var makeRequest;

var onContent = function (error, response, body) {
var onContent = function (error, response, body, fromCache) {

if (toQueue["debug"]) {
if (toQueue.debug) {
if (error) {
console.log("Error "+error+" when fetching "+toQueue["uri"].href);
console.log("Error "+error+" when fetching "+toQueue.uri+(toQueue.retries?" ("+toQueue.retries+" retries left)":""));
} else {
console.log("Got "+toQueue["uri"].href+" ("+body.length+" bytes)...");
console.log("Got "+toQueue.uri+" ("+body.length+" bytes)...");
}
}

if (error && toQueue.retries) {
setTimeout(function() {
toQueue.retries--;
makeRequest(toQueue);
},toQueue.retryTimeout*1000);

//Don't return the poolRef yet.
return;
}

if (useCache() && !fromCache) {
self.cache[toQueue.uri] = [error,response,body];
}

if (typeof toQueue["callback"]=="function") {
if (typeof toQueue.callback=="function") {

if (error) {
toQueue["callback"](error);

//No retries left here
toQueue.callback(error);

} else {

response.content = body;
response.request = toQueue;

if (toQueue["jQueryify"] && toQueue["method"]!="HEAD") {
if (toQueue.jQuery && toQueue.method!="HEAD") {

var document = require("jsdom").jsdom(),
window = document.createWindow();
Expand All @@ -102,34 +135,55 @@ exports.Crawler = function(options) {
response.window = window;
response.document = document;

require("jsdom").jQueryify(window, toQueue["jQuery"],function() {
toQueue["callback"](null,response,window.jQuery);
require("jsdom").jQueryify(window, toQueue.jQueryUrl,function() {
toQueue.callback(null,response,window.jQuery);
});
} else {
toQueue["callback"](null,response);
toQueue.callback(null,response);
}
}
}
self.pool.returnToPool(poolRef);
self.pool.release(poolRef);
};


//Static HTML was given
if (toQueue["html"]) {
onContent(null,{},toQueue["html"]);
if (toQueue.html) {
onContent(null,{},toQueue.html,false);

//Make a HTTP request
} else {

var makeRequest = function(q) {
if (q["debug"])
console.log("Fetching "+q["uri"]+" ...");
request(q, onContent);
// Perform the HTTP fetch for queue item `q`, honoring the cache first.
// Declared as an assignment (not `var`) so the retry path in onContent can
// call it again with the same (cleaned) item.
makeRequest = function(q) {

if (useCache()) {
if (self.cache[q.uri]) {

//If a query has already been made to this URL, don't callback again
if (!q.skipDuplicates) {
// Replay the cached [error, response, body] triple; the appended
// `true` flags onContent that this came from cache (so it is not
// re-stored).
onContent.apply(this,self.cache[q.uri].concat(true));
}
return;
}
}

//Clean the object in case of a retry
// NOTE(review): `client`/`request` appear to be fields the `request`
// library attaches to its options object; deleting them lets the same
// object be reused — confirm against the request lib version in use.
delete q.client;
delete q.request;

if (q.debug) {
console.log(q.method+" "+q.uri+" ...");
}

request(q, function(error,response,body) {
// The request lib replaces q.uri with a parsed URL object; restore the
// plain href string so cache keys and log messages stay consistent.
q.uri=q.uri.href;
onContent(error,response,body,false);
});
};

if (typeof toQueue["uri"]=="function") {
toQueue["uri"](function(uri) {
toQueue["uri"]=uri;
if (typeof toQueue.uri=="function") {
toQueue.uri(function(uri) {
toQueue.uri=uri;
makeRequest(toQueue);
});
} else {
Expand All @@ -140,7 +194,7 @@ exports.Crawler = function(options) {
}


},toQueue["priority"]);
},toQueue.priority);
}

}
Loading

0 comments on commit e93a312

Please sign in to comment.