diff --git a/MIGRATION.md b/MIGRATION.md index dc2ee757..697e0aa1 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -107,6 +107,7 @@ scrape({ return Promise.reject(new Error('status is 404')); } else { return Promise.resolve(response.body); + } } }) @@ -115,12 +116,11 @@ class MyAfterResponsePlugin { apply(registerAction) { registerAction('afterResponse', ({response}) => { if (response.statusCode === 404) { - return Promise.reject(new Error('status is 404')); + return null; } else { - return Promise.resolve(response.body); - }); - } - }); + return response.body; + } + }); } } scrape({ diff --git a/README.md b/README.md index 418b7b2c..ef775196 100644 --- a/README.md +++ b/README.md @@ -163,7 +163,7 @@ String, filename for index page. Defaults to `index.html`. Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`. #### ignoreErrors -Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`. +Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `false`. #### urlFilter Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied. @@ -300,16 +300,16 @@ If multiple actions `afterResponse` added - scraper will use result from last on // Do not save resources which responded with 404 not found status code registerAction('afterResponse', ({response}) => { if (response.statusCode === 404) { - return Promise.reject(new Error('status is 404')); - } else { - // if you don't need metadata - you can just return Promise.resolve(response.body) - return Promise.resolve({ - body: response.body, - metadata: { - headers: response.headers, - someOtherData: [ 1, 2, 3 ] + return null; + } else { + // if you don't need metadata - you can just return Promise.resolve(response.body) + return { + body: response.body, + metadata: { + headers: response.headers, + someOtherData: [ 1, 2, 3 ] } - }); + } } }); ``` diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 7a8e8261..c980f5f9 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -52,7 +52,7 @@ const config = { recursive: false, maxRecursiveDepth: null, maxDepth: null, - ignoreErrors: true + ignoreErrors: false }; module.exports = config; diff --git a/lib/request.js b/lib/request.js index 65fd12b4..0666be9c 100644 --- a/lib/request.js +++ b/lib/request.js @@ -24,6 +24,8 @@ function transformResult (result) { body: result.body, metadata: result.metadata || null }; + case result === null: + return null; default: throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result); } @@ -44,6 +46,9 @@ module.exports.get = ({url, referer, options = {}, afterResponse = defaultRespon return afterResponse({response}) .then(transformResult) .then((responseHandlerResult) => { + if (!responseHandlerResult) { + return null; + } return { url: response.request.href, mimeType: getMimeType(response.headers['content-type']), diff --git a/lib/scraper.js b/lib/scraper.js index de5881eb..8ebc37dd 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -157,6 +157,10 @@ class Scraper { afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined })); }).then(async function requestCompleted (responseData) { + if (!responseData) { + logger.debug('no response returned for url ' + url); + return null; + } if (!urlsEqual(responseData.url, url)) { // Url may be changed in redirects logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); diff --git a/test/functional/callbacks/callbacks.test.js b/test/functional/callbacks/callbacks.test.js index 98bf3a0c..a4151831 100644 --- a/test/functional/callbacks/callbacks.test.js +++ b/test/functional/callbacks/callbacks.test.js @@ -20,7 +20,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin', fs.removeSync(testDirname); }); - it('should call onResourceSaved callback and onResourceError callback', function() { + it('should call onResourceSaved callback and onResourceError callback if ignoreErrors = true', function() { nock('http://example.com/').get('/').reply(200, 'OK'); nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!'); @@ -38,6 +38,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin', urls: [ 'http://example.com/', 'http://nodejs.org/' ], directory: testDirname, subdirectories: null, + ignoreErrors: true, plugins: [ new MyPlugin() ] @@ -52,4 +53,38 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin', should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!'); }); }); + + it('should call onResourceError callback if ignoreErrors = false', function() { + // it is not necessary that 1st (successful) resource will be saved before error occurred, so skip onResourceSaved check + nock('http://example.com/').get('/').reply(200, 'OK'); + nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!'); + + const resourceSavedStub = sinon.stub(); + const resourceErrorStub = sinon.stub(); + + class MyPlugin { + apply(addAction) { + addAction('onResourceSaved', resourceSavedStub); + addAction('onResourceError', resourceErrorStub); + } + } + + const options = { + urls: [ 'http://example.com/', 'http://nodejs.org/' ], + directory: testDirname, + subdirectories: null, + ignoreErrors: true, + plugins: [ + new MyPlugin() + ] + }; + + return scrape(options).then(function() { + should(true).eql(false); + }).catch(() => { + should(resourceErrorStub.calledOnce).be.eql(true); + should(resourceErrorStub.args[0][0].resource.url).be.eql('http://nodejs.org/'); + should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!'); + }); + }); }); diff --git a/test/functional/request-response-customizations/after-response-action.test.js b/test/functional/request-response-customizations/after-response-action.test.js new file mode 100644 index 00000000..d9fa7c8d --- /dev/null +++ b/test/functional/request-response-customizations/after-response-action.test.js @@ -0,0 +1,66 @@ +const should = require('should'); +const nock = require('nock'); +const fs = require('fs-extra'); +const scrape = require('../../../index'); + +const testDirname = __dirname + '/.tmp'; +const mockDirname = __dirname + '/mocks'; + +describe('Functional: afterResponse action in plugin', function() { + + beforeEach(function() { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(function() { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + it('should skip downloading resource if afterResponse returns null', function() { + nock('http://example.com/').get('/1.html').reply(200, 'content of 1.html'); + nock('http://example.com/').get('/2.html').reply(404); + + class Skip404ResponseHandler { + apply(add) { + add('afterResponse', ({response}) => { + if (response.statusCode === 404) { + return null; + } else { + return { + body: response.body, + metadata: { + headers: response.headers, + someOtherData: [ 1, 2, 3 ] + } + } + } + }); + } + } + + const options = { + urls: [ + { url: 'http://example.com/1.html', filename: '1.html' }, + { url: 'http://example.com/2.html', filename: '2.html' } + ], + directory: testDirname, + plugins: [ + new Skip404ResponseHandler() + ] + }; + + return scrape(options).then(function(result) { + should(result[0]).have.properties({ url: 'http://example.com/1.html', filename: '1.html', saved: true }); + should(result[1]).have.properties({ url: 'http://example.com/2.html', filename: '2.html', saved: false }); + + fs.existsSync(testDirname + '/1.html').should.be.eql(true); + const indexHtml = fs.readFileSync(testDirname + '/1.html').toString(); + should(indexHtml).containEql('content of 1.html'); + + fs.existsSync(testDirname + '/2.html').should.be.eql(false); + }); + }); +}); diff --git a/test/functional/request-customization/request.test.js b/test/functional/request-response-customizations/request.test.js similarity index 100% rename from test/functional/request-customization/request.test.js rename to test/functional/request-response-customizations/request.test.js diff --git a/test/functional/update-missing-sources/update-missing-sources.test.js b/test/functional/update-missing-sources/update-missing-sources.test.js index 0bf86a29..4f81ed25 100644 --- a/test/functional/update-missing-sources/update-missing-sources.test.js +++ b/test/functional/update-missing-sources/update-missing-sources.test.js @@ -64,7 +64,8 @@ describe('Functional: update missing sources', () => { directory: testDirname, subdirectories: null, sources: [{ selector: 'img', attr: 'src' }], - plugins: [ new UpdateMissingResourceReferencePlugin() ] + plugins: [ new UpdateMissingResourceReferencePlugin() ], + ignoreErrors: true }; nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); @@ -138,7 +139,8 @@ describe('Functional: update missing sources', () => { directory: testDirname, subdirectories: null, sources: [{selector: 'style'}], - plugins: [ new UpdateMissingResourceReferencePlugin() ] + plugins: [ new UpdateMissingResourceReferencePlugin() ], + ignoreErrors: true }; nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/path-containers.html'); diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 087fc2a3..be974b04 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -22,28 +22,6 @@ describe('Scraper', function () { fs.removeSync(testDirname); }); - describe('#load', function() { - it('should return array of objects with url, filename and children', function() { - nock('http://first-url.com').get('/').reply(200, 'OK'); - nock('http://second-url.com').get('/').reply(500); - - var s = new Scraper({ - urls: [ - 'http://first-url.com', - 'http://second-url.com' - ], - directory: testDirname - }); - - return s.load().then(function(res) { - res.should.be.instanceOf(Array); - res.should.have.length(2); - res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']); - res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']); - }); - }); - }); - describe('#errorCleanup', function() { it('should throw error', function() { var s = new Scraper({ @@ -345,6 +323,26 @@ describe('Scraper', function () { err.message.should.be.eql('Awful error'); }); }); + + it('should return array of objects with url, filename and children', function() { + nock('http://first-url.com').get('/').reply(200, 'OK'); + nock('http://second-url.com').get('/').reply(500); + + var s = new Scraper({ + urls: [ + 'http://first-url.com', + 'http://second-url.com' + ], + directory: testDirname + }); + + return s.scrape().then(function(res) { + res.should.be.instanceOf(Array); + res.should.have.length(2); + res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']); + res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']); + }); + }); }); describe('#runActions', () => {