Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions MIGRATION.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ scrape({
return Promise.reject(new Error('status is 404'));
} else {
return Promise.resolve(response.body);
}
}
})

Expand All @@ -115,12 +116,11 @@ class MyAfterResponsePlugin {
apply(registerAction) {
registerAction('afterResponse', ({response}) => {
if (response.statusCode === 404) {
return Promise.reject(new Error('status is 404'));
return null;
} else {
return Promise.resolve(response.body);
});
}
});
return response.body;
}
});
}
}
scrape({
Expand Down
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ String, filename for index page. Defaults to `index.html`.
Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`.

#### ignoreErrors
Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`.
Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `false`.

#### urlFilter
Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
Expand Down Expand Up @@ -300,16 +300,16 @@ If multiple actions `afterResponse` added - scraper will use result from last on
// Do not save resources which responded with 404 not found status code
registerAction('afterResponse', ({response}) => {
if (response.statusCode === 404) {
return Promise.reject(new Error('status is 404'));
} else {
// if you don't need metadata - you can just return Promise.resolve(response.body)
return Promise.resolve({
body: response.body,
metadata: {
headers: response.headers,
someOtherData: [ 1, 2, 3 ]
return null;
} else {
// if you don't need metadata - you can just return response.body
return {
body: response.body,
metadata: {
headers: response.headers,
someOtherData: [ 1, 2, 3 ]
}
});
}
}
});
```
Expand Down
2 changes: 1 addition & 1 deletion lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ const config = {
recursive: false,
maxRecursiveDepth: null,
maxDepth: null,
ignoreErrors: true
ignoreErrors: false
};

module.exports = config;
5 changes: 5 additions & 0 deletions lib/request.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ function transformResult (result) {
body: result.body,
metadata: result.metadata || null
};
case result === null:
return null;
default:
throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
}
Expand All @@ -44,6 +46,9 @@ module.exports.get = ({url, referer, options = {}, afterResponse = defaultRespon
return afterResponse({response})
.then(transformResult)
.then((responseHandlerResult) => {
if (!responseHandlerResult) {
return null;
}
return {
url: response.request.href,
mimeType: getMimeType(response.headers['content-type']),
Expand Down
4 changes: 4 additions & 0 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,10 @@ class Scraper {
afterResponse: this.actions.afterResponse.length ? this.runActions.bind(this, 'afterResponse') : undefined
}));
}).then(async function requestCompleted (responseData) {
if (!responseData) {
logger.debug('no response returned for url ' + url);
return null;
}

if (!urlsEqual(responseData.url, url)) { // Url may be changed in redirects
logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);
Expand Down
37 changes: 36 additions & 1 deletion test/functional/callbacks/callbacks.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
fs.removeSync(testDirname);
});

it('should call onResourceSaved callback and onResourceError callback', function() {
it('should call onResourceSaved callback and onResourceError callback if ignoreErrors = true', function() {
nock('http://example.com/').get('/').reply(200, 'OK');
nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!');

Expand All @@ -38,6 +38,7 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
urls: [ 'http://example.com/', 'http://nodejs.org/' ],
directory: testDirname,
subdirectories: null,
ignoreErrors: true,
plugins: [
new MyPlugin()
]
Expand All @@ -52,4 +53,38 @@ describe('Functional: onResourceSaved and onResourceError callbacks in plugin',
should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!');
});
});

it('should call onResourceError callback if ignoreErrors = false', function() {
	// The 1st (successful) resource is not guaranteed to be saved before the error
	// occurs, so onResourceSaved is registered but deliberately not asserted on.
	nock('http://example.com/').get('/').reply(200, 'OK');
	nock('http://nodejs.org/').get('/').replyWithError('REQUEST ERROR!!');

	const resourceSavedStub = sinon.stub();
	const resourceErrorStub = sinon.stub();

	class MyPlugin {
		apply(addAction) {
			addAction('onResourceSaved', resourceSavedStub);
			addAction('onResourceError', resourceErrorStub);
		}
	}

	const options = {
		urls: [ 'http://example.com/', 'http://nodejs.org/' ],
		directory: testDirname,
		subdirectories: null,
		// Must be false here: this test covers the "stop and reject on error" mode
		// named in its title (the ignoreErrors = true mode has its own test).
		ignoreErrors: false,
		plugins: [
			new MyPlugin()
		]
	};

	// Two-argument then() is used on purpose: with .then(fail).catch(assert) the
	// forced failure would be caught by the catch handler and the test could pass
	// even when scrape() resolved.
	return scrape(options).then(function() {
		throw new Error('scrape was expected to be rejected when ignoreErrors = false');
	}, function() {
		should(resourceErrorStub.calledOnce).be.eql(true);
		should(resourceErrorStub.args[0][0].resource.url).be.eql('http://nodejs.org/');
		should(resourceErrorStub.args[0][0].error.message).be.eql('REQUEST ERROR!!');
	});
});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
const should = require('should');
const nock = require('nock');
const fs = require('fs-extra');
const scrape = require('../../../index');

// Output directory passed to the scraper as `directory`; removed in afterEach via fs.removeSync.
const testDirname = __dirname + '/.tmp';
// NOTE(review): mockDirname does not appear to be referenced in this test file - confirm whether it is still needed.
const mockDirname = __dirname + '/mocks';

describe('Functional: afterResponse action in plugin', function() {

	// Start every test from a clean nock state with real network access blocked.
	beforeEach(function() {
		nock.cleanAll();
		nock.disableNetConnect();
	});

	// Restore network access and drop whatever the scraper wrote to disk.
	afterEach(function() {
		nock.cleanAll();
		nock.enableNetConnect();
		fs.removeSync(testDirname);
	});

	it('should skip downloading resource if afterResponse returns null', function() {
		const exampleHost = nock('http://example.com/');
		exampleHost.get('/1.html').reply(200, 'content of 1.html');
		nock('http://example.com/').get('/2.html').reply(404);

		// afterResponse handler: null for a 404 response, body plus metadata otherwise.
		const skipNotFound = ({response}) => {
			if (response.statusCode === 404) {
				return null;
			}
			const metadata = {
				headers: response.headers,
				someOtherData: [ 1, 2, 3 ]
			};
			return { body: response.body, metadata: metadata };
		};

		class Skip404ResponseHandler {
			apply(registerAction) {
				registerAction('afterResponse', skipNotFound);
			}
		}

		const firstPage = { url: 'http://example.com/1.html', filename: '1.html' };
		const missingPage = { url: 'http://example.com/2.html', filename: '2.html' };

		const options = {
			urls: [ firstPage, missingPage ],
			directory: testDirname,
			plugins: [ new Skip404ResponseHandler() ]
		};

		return scrape(options).then(function(resources) {
			const savedResource = resources[0];
			const skippedResource = resources[1];
			should(savedResource).have.properties({ url: 'http://example.com/1.html', filename: '1.html', saved: true });
			should(skippedResource).have.properties({ url: 'http://example.com/2.html', filename: '2.html', saved: false });

			const savedPath = testDirname + '/1.html';
			should(fs.existsSync(savedPath)).be.eql(true);
			const savedContent = fs.readFileSync(savedPath).toString();
			should(savedContent).containEql('content of 1.html');

			// The resource answered with 404 must not have been written to disk.
			should(fs.existsSync(testDirname + '/2.html')).be.eql(false);
		});
	});
});
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@ describe('Functional: update missing sources', () => {
directory: testDirname,
subdirectories: null,
sources: [{ selector: 'img', attr: 'src' }],
plugins: [ new UpdateMissingResourceReferencePlugin() ]
plugins: [ new UpdateMissingResourceReferencePlugin() ],
ignoreErrors: true
};

nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
Expand Down Expand Up @@ -138,7 +139,8 @@ describe('Functional: update missing sources', () => {
directory: testDirname,
subdirectories: null,
sources: [{selector: 'style'}],
plugins: [ new UpdateMissingResourceReferencePlugin() ]
plugins: [ new UpdateMissingResourceReferencePlugin() ],
ignoreErrors: true
};

nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/path-containers.html');
Expand Down
42 changes: 20 additions & 22 deletions test/unit/scraper-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,6 @@ describe('Scraper', function () {
fs.removeSync(testDirname);
});

describe('#load', function() {
it('should return array of objects with url, filename and children', function() {
nock('http://first-url.com').get('/').reply(200, 'OK');
nock('http://second-url.com').get('/').reply(500);

var s = new Scraper({
urls: [
'http://first-url.com',
'http://second-url.com'
],
directory: testDirname
});

return s.load().then(function(res) {
res.should.be.instanceOf(Array);
res.should.have.length(2);
res[0].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
res[1].should.be.instanceOf(Resource).and.have.properties(['url', 'filename', 'children']);
});
});
});

describe('#errorCleanup', function() {
it('should throw error', function() {
var s = new Scraper({
Expand Down Expand Up @@ -345,6 +323,26 @@ describe('Scraper', function () {
err.message.should.be.eql('Awful error');
});
});

it('should return array of objects with url, filename and children', function() {
	// One url answers 200, the other 500; the assertions below expect a Resource for each.
	nock('http://first-url.com').get('/').reply(200, 'OK');
	nock('http://second-url.com').get('/').reply(500);

	const expectedProperties = ['url', 'filename', 'children'];
	const scraper = new Scraper({
		urls: [
			'http://first-url.com',
			'http://second-url.com'
		],
		directory: testDirname
	});

	return scraper.scrape().then(function(resources) {
		resources.should.be.instanceOf(Array);
		resources.should.have.length(2);

		const first = resources[0];
		const second = resources[1];
		first.should.be.instanceOf(Resource).and.have.properties(expectedProperties);
		second.should.be.instanceOf(Resource).and.have.properties(expectedProperties);
	});
});
});

describe('#runActions', () => {
Expand Down