Browse files

Adding gzip & userAgent support - fixes #38 & #37

  • Loading branch information...
1 parent d6b2443 commit bd79072121614047e20a111e72a4bb50af10b4e9 @sylvinus committed Feb 21, 2013
Showing with 103 additions and 10 deletions.
  1. +8 −0 README.md
  2. +26 −3 lib/crawler.js
  3. +9 −0 test/mockfiles/gzipped/test-gzip.html
  4. +12 −4 test/mockserver.js
  5. +3 −3 test/testrunner.js
  6. +45 −0 test/units/simple.js
View
8 README.md
@@ -108,6 +108,10 @@ Cache:
* cache: Boolean, if true stores requests in memory (Default false)
* skipDuplicates: Boolean, if true skips URIs that were already crawled, without even calling callback() (Default false)
+Other:
+
+ * userAgent: String, value of the User-Agent header sent with each request (Default "node-crawler/[version]")
+
Memory leaks
------------
@@ -140,6 +144,10 @@ Rough todolist
ChangeLog
---------
+0.2.3
+ - Added gzip support
+ - Added support for the userAgent option
+
0.2.2
- Fix relative link bug, all a.href should be absolute when crawling a remote URL
- Updated default jQuery to 1.8.3, request to 2.12.0, generic-pool to 2.0.2
View
29 lib/crawler.js
@@ -6,10 +6,13 @@ var http = require('http'),
jschardet = require('jschardet'),
Iconv = require('iconv').Iconv,
jsdom = require('jsdom'),
+ zlib = require("zlib"),
fs = require("fs"),
Pool = require('generic-pool').Pool;
+exports.VERSION = "0.2.3";
+
exports.Crawler = function(options) {
var self = this;
@@ -24,6 +27,7 @@ exports.Crawler = function(options) {
priority: 5,
retries: 3,
forceUTF8: false,
+ userAgent: "node-crawler/"+exports.VERSION,
autoWindowClose:true,
retryTimeout: 10000,
method: "GET",
@@ -111,20 +115,39 @@ exports.Crawler = function(options) {
var ropts = JSON.parse(JSON.stringify(opts));
if (!ropts.headers) ropts.headers={};
- if (opts.forceUTF8) {
+ if (ropts.forceUTF8) {
if (!ropts.headers["Accept-Charset"] && !ropts.headers["accept-charset"]) ropts.headers["Accept-Charset"] = 'utf-8;q=0.7,*;q=0.3';
if (!ropts.encoding) ropts.encoding=null;
}
+ if (!ropts.encoding) {
+ ropts.headers["Accept-Encoding"] = "gzip";
+ ropts.encoding = null;
+ }
+ if (ropts.userAgent) {
+ ropts.headers["User-Agent"] = ropts.userAgent;
+ }
var requestArgs = ["uri","url","qs","method","headers","body","form","json","multipart","followRedirect","followAllRedirects",
"maxRedirects","encoding","pool","timeout","proxy","oauth","strictSSL","jar","aws"];
- request(_.pick.apply(this,[ropts].concat(requestArgs)), function(error,response,body) {
+ var req = request(_.pick.apply(this,[ropts].concat(requestArgs)), function(error,response,body) {
if (error) return self.onContent(error, opts);
response.uri = opts.uri;
- self.onContent(error,opts,response,false);
+
+ // Won't be needed after https://github.com/mikeal/request/pull/303 is merged
+ if (response.headers['content-encoding'] && response.headers['content-encoding'].toLowerCase().indexOf('gzip') >= 0) {
+ zlib.gunzip(response.body, function (error, body) {
+ if (error) return self.onContent(error, opts);
+
+ response.body = body.toString(req.encoding);
+
+ self.onContent(error,opts,response,false);
+ });
+ } else {
+ self.onContent(error,opts,response,false);
+ }
});
};
View
9 test/mockfiles/gzipped/test-gzip.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+ <a href="links2.html">Relative link</a>
+
+ <a href="/mockfiles/links2.html">Absolute link</a>
+
+ gzipped okay.
+</body>
+</html>
View
16 test/mockserver.js
@@ -1,4 +1,5 @@
-var express = require('express');
+var express = require('express'),
+ path = require("path");
var app = express.createServer();
app.get('/timeout', function(req, res){
@@ -15,6 +16,10 @@ app.get('/empty', function(req, res){
res.send("",204);
});
+app.get('/echo_useragent', function(req, res){
+ res.send("<html>Your user agent: "+req.headers["user-agent"]+"</html>");
+});
+
app.get('/close/end', function(req, res){
res.socket.end();
res.end();
@@ -31,9 +36,12 @@ app.get('/bigpage', function(req, res){
res.send("<html><body>"+bigpage+"</body></html>");
});
-app.get('/mockfiles/*', function(req, res){
- res.sendfile("test/mockfiles/"+req.param(0));
-});
+
+app.use("/mockfiles/gzipped/",express.compress());
+
+app.use('/mockfiles/', express["static"](path.resolve(__dirname, 'mockfiles')));
+
+
exports.app = app;
View
6 test/testrunner.js
@@ -16,15 +16,15 @@ testrunner.run([
{
code: path + "/lib/crawler.js",
tests: [
-
+ path + "/test/units/simple.js",
path + "/test/units/links.js",
path + "/test/units/forceutf8.js",
- path + "/test/units/simple.js",
+
path + "/test/units/errors.js",
path + "/test/units/leaks.js"
-
+
]
}
],function() {
View
45 test/units/simple.js
@@ -3,6 +3,8 @@ var Crawler = require("../../lib/crawler").Crawler;
QUnit.module("simple");
var DEBUG = false;
+var MOCKPORT = 30045;
+
test("inline html", function() {
expect( 2 );
@@ -64,4 +66,47 @@ test("two requests", function() {
});
+
+test("one request gzipped", function() {
+ expect( 3 );
+
+ stop();
+
+ var c = new Crawler({
+ "debug":DEBUG,
+ "callback":function(error,result,$) {
+ equal(error,null);
+ ok(result.body.indexOf("gzipped okay.")>0);
+ ok(result.headers["content-encoding"]=="gzip");
+ start();
+ }
+ });
+
+ c.queue(["http://127.0.0.1:"+MOCKPORT+"/mockfiles/gzipped/test-gzip.html"]);
+
+});
+
+
+test("one request + user agent", function() {
+ expect( 2 );
+
+ stop();
+
+ var c = new Crawler({
+ "debug":DEBUG,
+ "userAgent":"test/1.2",
+ "jQuery":false,
+ "callback":function(error,result,$) {
+ equal(error,null);
+ ok(result.body=="<html>Your user agent: test/1.2</html>");
+ start();
+ }
+ });
+
+ c.queue(["http://127.0.0.1:"+MOCKPORT+"/echo_useragent"]);
+
+});
+
+
+
/* */

0 comments on commit bd79072

Please sign in to comment.