diff --git a/README.md b/README.md index cb58a00f..27a8bebc 100644 --- a/README.md +++ b/README.md @@ -49,17 +49,18 @@ Makes requests to `urls` and saves all files found with `sources` to `directory` **options** - object containing next options: - `urls`: array of urls to load and filenames for them *(required, see example below)* - - `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)* - `directory`: path to save loaded files *(required)* - - `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')* - - `defaultFilename`: filename for index page *(optional, default: 'index.html')* - - `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)* - `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see example below)* - - `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)* - - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)* - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)* - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)* + - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)* + - `subdirectories`: array of objects, specifies subdirectories for file extensions. 
If `null` all files will be saved to `directory` *(optional, see example below)* + - `defaultFilename`: filename for index page *(optional, default: 'index.html')* + - `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)* - `ignoreErrors`: boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error *(optional, default: true)* + - `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)* + - `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')* + - `httpResponseHandler`: function which is called on each response, allows you to customize the resource or reject its downloading *(optional, see example below)* Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js). @@ -85,6 +86,14 @@ When the `bySiteStructure` filenameGenerator is used the downloaded files are sa - `/about` => `DIRECTORY/about/index.html` - `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js` +### Http Response Handlers +HttpResponseHandler is used to reject resource downloading or customize resource text based on response data (for example, status code, content type, etc.) +Function takes `response` argument - response object of [request](https://github.com/request/request) module and should return a resolved `Promise` if the resource should be downloaded, or a `Promise` rejected with an `Error` if it should be skipped. 
+Promise should be resolved with: +* `string` which contains response body +* or object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.), scraper will not use this field at all, it is only for the result. + +See [example of using httpResponseHandler](#example-5-rejecting-resources-with-404-status-and-adding-metadata). ## Examples #### Example 1 @@ -176,6 +185,29 @@ scrape({ }).then(console.log).catch(console.log); ``` +#### Example 5. Rejecting resources with 404 status and adding metadata +```javascript +var scrape = require('website-scraper'); +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + httpResponseHandler: (response) => { + if (response.statusCode === 404) { + return Promise.reject(new Error('status is 404')); + } else { + // if you don't need metadata - you can just return Promise.resolve(response.body) + return Promise.resolve({ + body: response.body, + metadata: { + headers: response.headers, + someOtherData: [ 1, 2, 3 ] + } + }); + } + } +}).then(console.log).catch(console.log); +``` + ## Log and debug This module uses [debug](https://github.com/visionmedia/debug) to log events. To enable logs you should use environment variable `DEBUG`. 
Next command will log everything from website-scraper diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 776ced99..1ac79ef5 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -30,12 +30,11 @@ var config = { jar: true, gzip: true }, - urlFilter: function urlFilter () { - return true; - }, + urlFilter: null, recursive: false, maxDepth: null, - ignoreErrors: true + ignoreErrors: true, + httpResponseHandler: null }; module.exports = config; diff --git a/lib/request.js b/lib/request.js index 2fce9b14..56d73d4b 100644 --- a/lib/request.js +++ b/lib/request.js @@ -1,32 +1,79 @@ -var _ = require('lodash'); -var Promise = require('bluebird'); -var request = require('request'); -var get = Promise.promisify(request.get); -var logger = require('./logger'); +'use strict'; + +const _ = require('lodash'); +const Promise = require('bluebird'); +const request = require('request'); +const get = Promise.promisify(request.get); +const logger = require('./logger'); function getMimeType (contentType) { return contentType ? contentType.split(';')[0] : null; } -function makeRequest (options, url, referer) { - var requestOptions = _.clone(options); - requestOptions.url = url; +function defaultResponseHandler (response) { + return Promise.resolve(response.body); +} + +function transformResult (result) { + switch (true) { + case _.isString(result): + return { + body: result, + metadata: null + }; + case _.isPlainObject(result): + return { + body: result.body, + metadata: result.metadata || null + }; + default: + throw new Error('Wrong response handler result. 
Expected string or object, but received ' + typeof result); + } +} - if (referer) { - requestOptions.headers = requestOptions.headers || {}; - requestOptions.headers.referer = referer; +class Request { + /** + * + * @param {Object} options + * @param {function} options.httpResponseHandler - custom response handler + * @param {Object} options.request - custom options for request module + */ + constructor (options) { + this.handleResponse = options && options.httpResponseHandler ? options.httpResponseHandler : defaultResponseHandler; + this.options = options && options.request ? _.clone(options.request) : {}; } - logger.debug(`[request] sending request for url ${url}, referer ${referer}`); + /** + * Performs get request to url and returns data for resource + * @param {string} url - url of resource + * @param {string} referer - url of parent resource + * @return {Promise} + */ + get (url, referer) { + let requestOptions = _.clone(this.options); + requestOptions.url = url; + + if (referer) { + requestOptions.headers = requestOptions.headers || {}; + requestOptions.headers.referer = referer; + } - return get(requestOptions).then(function handleResponse (data) { - logger.debug(`[request] received response for ${data.request.href}, statusCode ${data.statusCode}`); - return { - url: data.request.href, - mimeType: getMimeType(data.headers['content-type']), - body: data.body - }; - }); + logger.debug(`[request] sending request for url ${url}, referer ${referer}`); + + return get(requestOptions).then((response) => { + logger.debug(`[request] received response for ${response.request.href}, statusCode ${response.statusCode}`); + return this.handleResponse(response) + .then(transformResult) + .then((responseHandlerResult) => { + return { + url: response.request.href, + mimeType: getMimeType(response.headers['content-type']), + body: responseHandlerResult.body, + metadata: responseHandlerResult.metadata + }; + }); + }); + } } -module.exports = makeRequest; +module.exports = 
Request; diff --git a/lib/resource.js b/lib/resource.js index 19522b01..197109e1 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -88,4 +88,8 @@ Resource.prototype.setSaved = function setSaved () { this.saved = true; }; +Resource.prototype.setMetadata = function setMetadata (metadata) { + this.metadata = metadata; +}; + module.exports = Resource; diff --git a/lib/scraper.js b/lib/scraper.js index d4b44dab..1ade42a3 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -8,7 +8,7 @@ var recursiveSources = require('./config/recursive-sources'); var Resource = require('./resource'); var FilenameGenerator = require('./filename-generator'); -var makeRequest = require('./request'); +var Request = require('./request'); var ResourceHandler = require('./resource-handler'); var FSAdapter = require('./fs-adaper'); var utils = require('./utils'); @@ -28,7 +28,7 @@ function Scraper (options) { logger.info('init with options', self.options); - self.makeRequest = makeRequest.bind(null, self.options.request); + self.request = new Request(self.options); self.resourceHandler = new ResourceHandler(self.options, self); self.filenameGenerator = new FilenameGenerator(self.options); self.fsAdapter = new FSAdapter(self.options); @@ -80,7 +80,7 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { var requestPromise = Promise.resolve() .then(function makeRequest () { var referer = resource.parent ? 
resource.parent.getUrl() : null; - return self.makeRequest(url, referer); + return self.request.get(url, referer); }).then(function requestCompleted (responseData) { if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects @@ -104,6 +104,10 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { resource.setType(utils.getTypeByFilename(filename)); } + if (responseData.metadata) { + resource.setMetadata(responseData.metadata); + } + resource.setText(responseData.body); self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad return resource; @@ -120,7 +124,7 @@ Scraper.prototype.requestResource = function requestResource (resource) { var self = this; var url = resource.getUrl(); - if (!self.options.urlFilter(url)) { + if (self.options.urlFilter && !self.options.urlFilter(url)) { logger.debug('filtering out ' + resource + ' by url filter'); return Promise.resolve(null); } diff --git a/test/unit/request-test.js b/test/unit/request-test.js index 62f3567a..a08ecdcb 100644 --- a/test/unit/request-test.js +++ b/test/unit/request-test.js @@ -1,93 +1,205 @@ -var should = require('should'); -var nock = require('nock'); -var sinon = require('sinon'); -require('sinon-as-promised'); -var proxyquire = require('proxyquire'); - -describe('Request', function () { - var makeRequest, makeStubbedRequest, requestStub; +'use strict'; - beforeEach(function() { - nock.cleanAll(); - nock.enableNetConnect(); +const should = require('should'); +const nock = require('nock'); +const sinon = require('sinon'); +require('sinon-as-promised'); +const proxyquire = require('proxyquire'); +const Request = require('../../lib/request'); - makeRequest = require('../../lib/request'); +describe('Request', () => { - requestStub = sinon.stub().yields(null, { request: {href: ''}, body: '', headers: {} }); - makeStubbedRequest = proxyquire('../../lib/request', { - 'request': { - 'get': requestStub - } + 
describe('constructor', () => { + it('should set passed responseHandler', () => { + let handler = sinon.stub(); + let r = new Request({ + httpResponseHandler: handler + }); + should(r.handleResponse).be.eql(handler); }); - }); - afterEach(function() { - nock.cleanAll(); - nock.enableNetConnect(); - }); + it('should set correct default handler and options if nothing passed', () => { + let r1 = new Request({}); + should(r1.handleResponse).be.ok(); + should(r1.handleResponse).be.instanceOf(Function); + should(r1.options).be.eql({}); - it('should call request with correct params', function () { - var options = { - headers: { - 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1;' - } - }; - var url = 'http://www.google.com'; - - return makeStubbedRequest(options, url).then(function () { - var expectedOptions = { - headers: { - 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1;' - }, - url: url - }; - - requestStub.calledOnce.should.be.eql(true); - requestStub.calledWith(expectedOptions).should.be.eql(true); + let r2 = new Request(); + should(r2.handleResponse).be.ok(); + should(r2.handleResponse).be.instanceOf(Function); + should(r2.options).be.eql({}); }); - }); - it('should add referer header if referer param was passed', function() { - var options = {}; - var url = 'http://www.google.com'; - var referer = 'http://referer.com'; - - return makeStubbedRequest(options, url, referer).then(function () { - var expectedOptions = { - headers: { - referer: referer - }, - url: url - }; - - requestStub.calledOnce.should.be.eql(true); - requestStub.calledWith(expectedOptions).should.be.eql(true); + it('should set passed request options', () => { + let options = { a: 1 }; + let r = new Request({ + request: options + }); + should(r.options).be.eql(options); }); }); - it('should return object with url, body and mimeType properties', function () { - var url = 'http://www.google.com'; - nock(url).get('/').reply(200, 'Hello from Google!', { - 'content-type': 'text/html; charset=utf-8' - 
}); + describe('get', () => { + + describe('using stubbed request', () => { + let requestStub, Request, responseMock; + + beforeEach(() => { + responseMock = { request: {href: ''}, body: '', headers: {} }; + requestStub = sinon.stub().yields(null, responseMock); + Request = proxyquire('../../lib/request', { + 'request': { + 'get': requestStub + } + }); + }); + + it('should call request with correct params', () => { + let r = new Request({}); + r.options = { + headers: { + 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1;' + } + }; + let url = 'http://www.google.com'; + + return r.get(url).then(() => { + let expectedOptions = { + headers: { + 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1;' + }, + url: url + }; + + requestStub.calledOnce.should.be.eql(true); + requestStub.calledWith(expectedOptions).should.be.eql(true); + }); + }); + + it('should add referer header if referer param was passed', () => { + let r = new Request({}); + + let url = 'http://www.google.com'; + let referer = 'http://referer.com'; + + return r.get(url, referer).then(() => { + let expectedOptions = { + headers: { + referer: referer + }, + url: url + }; - return makeRequest({}, url).then(function (data) { - data.should.have.properties(['url', 'body', 'mimeType']); - data.url.should.be.eql('http://www.google.com/'); - data.body.should.be.eql('Hello from Google!'); - data.mimeType.should.be.eql('text/html'); + requestStub.calledOnce.should.be.eql(true); + requestStub.calledWith(expectedOptions).should.be.eql(true); + }); + }); + + it('should call handleResponse with correct params', () => { + let handlerStub = sinon.stub().resolves(''); + let r = new Request({ + httpResponseHandler: handlerStub + }); + + return r.get('http://example.com').then(() => { + should(r.handleResponse.calledOnce).be.eql(true); + should(r.handleResponse.calledWith(responseMock)).be.eql(true); + }); + }); + + describe('transformResult from handleResponse', () => { + it('should return object with body and metadata 
properties', () => { + let handlerStub = sinon.stub().resolves({ + body: 'a', + metadata: 'b' + }); + let r = new Request({ + httpResponseHandler: handlerStub + }); + + return r.get('http://example.com').then((data) => { + should(data.body).be.eql('a'); + should(data.metadata).be.eql('b'); + }); + }); + + it('should return with metadata == null if metadata is not defined', () => { + let handlerStub = sinon.stub().resolves({ + body: 'a' + }); + let r = new Request({ + httpResponseHandler: handlerStub + }); + + return r.get('http://example.com').then((data) => { + should(data.body).be.eql('a'); + should(data.metadata).be.eql(null); + }); + }); + + it('should transform string result', () => { + let handlerStub = sinon.stub().resolves('test body'); + let r = new Request({ + httpResponseHandler: handlerStub + }); + + return r.get('http://example.com').then((data) => { + should(data.body).be.eql('test body'); + should(data.metadata).be.eql(null); + }); + }); + + it('should be rejected if wrong result (no string nor object) returned', () => { + let handlerStub = sinon.stub().resolves(['1', '2']); + let r = new Request({ + httpResponseHandler: handlerStub + }); + + return r.get('http://example.com').then(() => { + should(true).be.eql(false); + }).catch((e) => { + should(e).be.instanceOf(Error); + }); + }); + }); }); - }); - it('should return mimeType = null if content-type header was not found in response', function () { - var url = 'http://www.google.com'; - nock(url).get('/').reply(200, 'Hello from Google!', {}); + describe('using nock', () => { + beforeEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + }); + + afterEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + }); + + it('should return object with url, body, mimeType properties', () => { + let url = 'http://www.google.com'; + nock(url).get('/').reply(200, 'Hello from Google!', { + 'content-type': 'text/html; charset=utf-8' + }); + + return new Request().get(url).then((data) => { + 
data.should.have.properties(['url', 'body', 'mimeType']); + data.url.should.be.eql('http://www.google.com/'); + data.body.should.be.eql('Hello from Google!'); + data.mimeType.should.be.eql('text/html'); + }); + }); + + it('should return mimeType = null if content-type header was not found in response', () => { + let url = 'http://www.google.com'; + nock(url).get('/').reply(200, 'Hello from Google!', {}); - return makeRequest({}, url).then(function (data) { - data.should.have.properties(['url', 'body', 'mimeType']); - data.url.should.be.eql('http://www.google.com/'); - data.body.should.be.eql('Hello from Google!'); - should(data.mimeType).be.eql(null); + return new Request().get(url).then((data) => { + data.should.have.properties(['url', 'body', 'mimeType']); + data.url.should.be.eql('http://www.google.com/'); + data.body.should.be.eql('Hello from Google!'); + should(data.mimeType).be.eql(null); + }); + }); }); }); }); diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 4998282b..0fa7c0f3 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -179,35 +179,6 @@ describe('Scraper initialization', function () { }); }); - describe('makeRequest', function () { - it('should bind request object to makeRequest method', function() { - var requestStub = sinon.stub().resolves(); - var Scraper = proxyquire('../../lib/scraper', { - './request': requestStub, - './config/defaults': { - request: {} - } - }); - - var reqOpts = { - headers: { - 'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1;' - } - }; - - var s = new Scraper({ - urls: { url: 'http://first-url.com' }, - directory: testDirname, - request: reqOpts - }); - - return s.makeRequest('http://example.com').then(function() { - requestStub.calledOnce.should.be.eql(true); - requestStub.calledWith(reqOpts).should.be.eql(true); - }); - }); - }); - describe('resourceHandler', function () { it('should create resourceHandler with correct params', function() { var 
ResourceHandlerStub = sinon.stub(); diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 9c3420d3..b019087b 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -1,3 +1,5 @@ +'use strict'; + var should = require('should'); var sinon = require('sinon'); require('sinon-as-promised'); @@ -5,10 +7,8 @@ var nock = require('nock'); var proxyquire = require('proxyquire'); var fs = require('fs-extra'); var path = require('path'); -var _ = require('lodash'); var Scraper = require('../../lib/scraper'); var Resource = require('../../lib/resource'); -var Promise = require('bluebird'); var testDirname = __dirname + '/.scraper-test'; var urls = [ 'http://example.com' ]; @@ -386,6 +386,32 @@ describe('Scraper', function () { s.handleError.calledOnce.should.be.eql(true); }); }); + + it('should update resource data with data returned from request', () => { + let metadata = { + solarSystemPlanets: [ 'Mercury', 'Venus', 'Earth', 'Mars', 'Jupiter', 'Saturn', 'Uranus', 'Neptune' ] + }; + + let s = new Scraper({ + urls: 'http://example.com', + directory: testDirname + }); + s.request.get = sinon.stub().resolves({ + url: 'http://google.com', + body: 'test body', + mimeType: 'text/html', + metadata: metadata + }); + + let r = new Resource('http://example.com'); + + return s.requestResource(r).finally(function() { + r.getText().should.be.eql('test body'); + r.getUrl().should.be.eql('http://google.com'); + r.getType().should.be.eql('html'); + r.metadata.should.be.eql(metadata); + }); + }) }); describe('#handleError', function() {