diff --git a/README.md b/README.md
index 546ba256..9766a94c 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ npm install website-scraper
 ## Usage
 
 ```javascript
-var scraper = require('website-scraper'); 
+var scraper = require('website-scraper');
 var options = {
   urls: ['http://nodejs.org/'],
   directory: '/path/to/save/',
@@ -38,7 +38,7 @@ scraper.scrape(options).then(function (result) {
 ## API
 
 ### scrape(options, callback)
-Makes requests to `urls` and saves all files found with `sources` to `directory`. 
+Makes requests to `urls` and saves all files found with `sources` to `directory`.
 
 **options** - object containing the following options:
@@ -48,10 +48,12 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`.
  - `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
  - `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
  - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
-
+ - `recursive`: boolean, if `true` the scraper will follow anchors in HTML files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
+ - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
+
 
 **callback** - callback function *(optional)*, includes the following parameters:
-
+
 - `error:` if error - `Error` object, if success - `null`
 - `result:` if error - `null`, if success - array of objects containing:
 	- `url:` url of loaded page
@@ -59,20 +61,21 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`.
 ## Examples
-Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
+#### Example 1
+Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
 Imagine we want to load:
 - [Home page](http://nodejs.org/) to `index.html`
 - [About page](http://nodejs.org/about/) to `about.html`
 - [Blog](http://blog.nodejs.org/) to `blog.html`
-
+
 and separate files into directories:
- - `img` for .jpg, .png, .svg (full path `/path/to/save/img`) 
+ - `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
 - `js` for .js (full path `/path/to/save/js`)
 - `css` for .css (full path `/path/to/save/css`)
 
 ```javascript
-var scraper = require('website-scraper'); 
+var scraper = require('website-scraper');
 scraper.scrape({
   urls: [
     'http://nodejs.org/', // Will be saved with default filename 'index.html'
@@ -101,3 +104,16 @@ scraper.scrape({
   console.log(err);
 });
 ```
+
+#### Example 2. Recursive downloading
+```javascript
+// Links from example.com will be followed
+// Links inside those pages will be ignored because their depth = 2 is greater than maxDepth
+var scraper = require('website-scraper');
+scraper.scrape({
+  urls: ['http://example.com/'],
+  directory: '/path/to/save',
+  recursive: true,
+  maxDepth: 1
+}).then(console.log).catch(console.log);
+```
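For reference, the two calling styles documented in the API section side by side — a minimal sketch (URLs and paths are placeholders, and only one style should be used per run, since `directory` must not exist beforehand):

```javascript
var scraper = require('website-scraper');
var options = {
  urls: ['http://nodejs.org/'],
  directory: '/path/to/save/'
};

// Promise style: resolves with an array of result objects
scraper.scrape(options).then(function (result) {
  console.log(result); // [{ url: 'http://nodejs.org/', ... }]
}).catch(function (err) {
  console.log(err);
});

// Callback style: error is an Error object or null, result is the same array or null
scraper.scrape(options, function (error, result) {
  if (error) {
    return console.log(error);
  }
  console.log(result);
});
```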
diff --git a/lib/config/recursive-sources.js b/lib/config/recursive-sources.js
new file mode 100644
index 00000000..774056c1
--- /dev/null
+++ b/lib/config/recursive-sources.js
@@ -0,0 +1,3 @@
+module.exports = [
+	{ selector: 'a', attr: 'href' }
+];
\ No newline at end of file
diff --git a/lib/file-handlers/css.js b/lib/file-handlers/css.js
index 4a7cfcd9..17edc596 100644
--- a/lib/file-handlers/css.js
+++ b/lib/file-handlers/css.js
@@ -1,7 +1,6 @@
 var _ = require('underscore');
 var Promise = require('bluebird');
 var getCssUrls = require('css-url-parser');
-var Resource = require('../resource');
 var utils = require('../utils');
 
 function loadCss (context, resource) {
@@ -12,8 +11,7 @@ function loadCss (context, resource) {
 
 	var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
 		var resourceUrl = utils.getUrl(url, cssUrl);
-		var cssResource = new Resource(resourceUrl);
-		cssResource.setParent(resource);
+		var cssResource = resource.createChild(resourceUrl);
 
 		return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
 			var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
diff --git a/lib/file-handlers/html.js b/lib/file-handlers/html.js
index 95d28058..a99a0b7c 100644
--- a/lib/file-handlers/html.js
+++ b/lib/file-handlers/html.js
@@ -1,7 +1,6 @@
 var cheerio = require('cheerio');
 var Promise = require('bluebird');
 var utils = require('../utils');
-var Resource = require('../resource');
 
 function loadHtml (context, resource) {
 	var sources = context.getHtmlSources();
@@ -50,8 +49,7 @@ function loadResources (context, resource, source) {
 
 		if (attr) {
 			var resourceUrl = utils.getUrl(url, attr);
-			var htmlResource = new Resource(resourceUrl);
-			htmlResource.setParent(resource);
+			var htmlResource = resource.createChild(resourceUrl);
 			htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
 
 			return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
diff --git a/lib/resource.js b/lib/resource.js
index 81ea9d8a..781d1bab 100644
--- a/lib/resource.js
+++ b/lib/resource.js
@@ -15,6 +15,17 @@ function Resource (url, filename) {
 	this.filename = filename;
 }
 
+Resource.prototype.createChild = function createChild (url, filename) {
+	var child = new Resource(url, filename);
+
+	var currentDepth = this.getDepth();
+
+	child.setParent(this);
+	child.setDepth(++currentDepth);
+
+	return child;
+};
+
 Resource.prototype.getUrl = function getUrl () {
 	return this.url;
 };
@@ -43,6 +54,14 @@ Resource.prototype.setParent = function setParent (parent) {
 	this.parent = parent;
 };
 
+Resource.prototype.getDepth = function getDepth () {
+	return this.depth || 0;
+};
+
+Resource.prototype.setDepth = function setDepth (depth) {
+	this.depth = depth;
+};
+
 /**
  *
  * @param {Object} data - html element data
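Both file handlers now delegate to `resource.createChild`, which sets the parent link and the depth in one place. A quick sketch of how depth accumulates down a reference chain (URLs are placeholders; run from the repository root):

```javascript
var Resource = require('./lib/resource');

var page = new Resource('http://example.com/', 'index.html'); // root resource
var css = page.createChild('http://example.com/style.css');   // one hop from the root
var font = css.createChild('http://example.com/font.woff');   // two hops from the root

console.log(page.getDepth()); // 0 - getDepth() defaults to 0
console.log(css.getDepth());  // 1
console.log(font.getDepth()); // 2
console.log(font.parent === css); // true - parent is wired up by createChild
```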
diff --git a/lib/scraper.js b/lib/scraper.js
index 4c12472f..7b6f0bb6 100644
--- a/lib/scraper.js
+++ b/lib/scraper.js
@@ -1,28 +1,27 @@
 var Promise = require('bluebird');
-var fs = Promise.promisifyAll(require('fs-extra'));
+
+var fs = require('fs-extra');
+var existsAsync = Promise.promisify(fs.stat);
+var outputFileAsync = Promise.promisify(fs.outputFile);
+var ensureDirAsync = Promise.promisify(fs.ensureDir);
+
 var path = require('path');
 var _ = require('underscore');
 
 var defaults = require('./config/defaults');
 var types = require('./config/resource-types');
+var recursiveSources = require('./config/recursive-sources');
 var utils = require('./utils.js');
 var request = require('./request');
 var Resource = require('./resource');
-var loadHtml = require('./file-handlers/html');
-var loadCss = require('./file-handlers/css');
 var compareUrls = require('compare-urls');
 
-function getHandleFunction (resource) {
-	var type = resource.getType();
-	switch (type) {
-		case types.css: return loadCss;
-		case types.html: return function loadHtmlAndCss (context, po) {
-			return loadHtml(context, po).then(function (loaded) {
-				return loadCss(context, loaded);
-			});
-		};
-		default: return _.noop;
-	}
+var loadHtml = require('./file-handlers/html');
+var loadCss = require('./file-handlers/css');
+
+function loadHtmlAndCss (context, po) {
+	return loadHtml(context, po).then(function (loaded) {
+		return loadCss(context, loaded);
+	});
 }
 
 function Scraper (options) {
@@ -83,6 +82,20 @@ Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ext) {
 		.value() || '';
 };
 
+Scraper.prototype.getResourceHandler = function getResourceHandler (resource) {
+	var self = this;
+	var type = resource.getType();
+	var depth = resource.getDepth();
+	var depthGreaterThanMax = self.options.maxDepth && depth >= self.options.maxDepth;
+
+	switch (true) {
+		case depthGreaterThanMax: return _.noop;
+		case type == types.css: return loadCss;
+		case type == types.html: return loadHtmlAndCss;
+		default: return _.noop;
+	}
+};
+
 Scraper.prototype.loadResource = function loadResource (resource) {
 	var self = this;
 
@@ -102,12 +115,12 @@ Scraper.prototype.loadResource = function loadResource (resource) {
 	return self.makeRequest(url).then(function requestCompleted(data) {
 		resource.setUrl(data.url); // Url may be changed in redirects
 		resource.setText(data.body);
-		handleFile = getHandleFunction(resource);
+		handleFile = self.getResourceHandler(resource);
 		return handleFile(self, resource);
 	}).then(function fileHandled() {
 		var filename = path.join(self.options.directory, resource.getFilename());
 		var text = resource.getText();
-		return fs.outputFileAsync(filename, text, { encoding: 'binary' });
+		return outputFileAsync(filename, text, { encoding: 'binary' });
 	}).then(function fileSaved() {
 		return Promise.resolve(resource);
 	});
@@ -116,15 +129,16 @@ Scraper.prototype.loadResource = function loadResource (resource) {
 };
 
 Scraper.prototype.validate = function validate () {
-	if (fs.existsSync(this.options.directory)) {
-		return Promise.reject(new Error('Path ' + this.options.directory + ' exists'));
-	}
-	return Promise.resolve();
+	var dir = this.options.directory;
+	return existsAsync(dir).then(function handleDirectoryExist () {
+		return Promise.reject(new Error('Path ' + dir + ' exists'));
+	}, function handleDirectoryNotExist () {
+		return Promise.resolve();
+	});
 };
 
 Scraper.prototype.prepare = function prepare () {
 	var self = this;
-	fs.ensureDirSync(self.options.directory);
 
 	// Create makeRequest function with custom request params
 	self.makeRequest = request.makeRequest.bind(null, self.options.request);
@@ -136,7 +150,12 @@ Scraper.prototype.prepare = function prepare () {
 		var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename;
 		return new Resource(url, filename);
 	});
-	return Promise.resolve();
+
+	if (self.options.recursive) {
+		self.options.sources = _.union(self.options.sources, recursiveSources);
+	}
+
+	return ensureDirAsync(self.options.directory);
 };
 
 Scraper.prototype.load = function load () {
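One detail of the `switch (true)` dispatch above is easy to miss: a resource whose depth has reached `maxDepth` is still requested and saved by `loadResource` — it merely receives `_.noop` as its handler, so its own references are never parsed or followed. A standalone sketch of the same selection logic (the handler functions and the `'css'`/`'html'` strings are stand-ins for `lib/file-handlers/*` and the values in `lib/config/resource-types`):

```javascript
var _ = require('underscore');

// Stand-ins for the real file handlers
var loadCss = function () {};
var loadHtmlAndCss = function () {};

function pickHandler (type, depth, maxDepth) {
	var depthReachedMax = maxDepth && depth >= maxDepth;

	switch (true) {
		case depthReachedMax: return _.noop; // saved, but not parsed any further
		case type === 'css': return loadCss;
		case type === 'html': return loadHtmlAndCss;
		default: return _.noop;              // images, fonts and other binaries
	}
}

console.log(pickHandler('html', 0, 1) === loadHtmlAndCss); // true: the start page is parsed
console.log(pickHandler('html', 1, 1) === _.noop);         // true: at maxDepth, anchors are ignored
```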
diff --git a/package.json b/package.json
index 274b0678..cd9c98c9 100644
--- a/package.json
+++ b/package.json
@@ -44,7 +44,9 @@
     "istanbul": "^0.4.0",
     "mocha": "^2.2.5",
     "nock": "^2.9.1",
+    "proxyquire": "^1.7.3",
     "should": "^7.0.2",
-    "sinon": "^1.15.4"
+    "sinon": "^1.15.4",
+    "sinon-as-promised": "^4.0.0"
   }
 }
diff --git a/test/functional/mocks/recursive/about.html b/test/functional/mocks/recursive/about.html
new file mode 100644
index 00000000..54abd55d
--- /dev/null
+++ b/test/functional/mocks/recursive/about.html
@@ -0,0 +1,13 @@
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Title</title>
+</head>
+<body>
+
+<a href="link1.html">link1</a>
+<a href="link2.html">link2</a>
+<a href="link3.html">link3</a>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/test/functional/mocks/recursive/index.html b/test/functional/mocks/recursive/index.html
new file mode 100644
index 00000000..ff6e8b7b
--- /dev/null
+++ b/test/functional/mocks/recursive/index.html
@@ -0,0 +1,11 @@
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Title</title>
+</head>
+<body>
+
+<a href="about.html">about</a>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/test/functional/recursive-test.js b/test/functional/recursive-test.js
new file mode 100644
index 00000000..64f25bb5
--- /dev/null
+++ b/test/functional/recursive-test.js
@@ -0,0 +1,119 @@
+require('should');
+var nock = require('nock');
+var fs = require('fs-extra');
+var path = require('path');
+var _ = require('underscore');
+var scraper = require('../../index');
+
+var testDirname = __dirname + '/.recursive';
+var mockDirname = __dirname + '/mocks/recursive';
+
+describe('Functional recursive downloading', function() {
+
+	beforeEach(function() {
+		nock.cleanAll();
+		nock.disableNetConnect();
+	});
+
+	afterEach(function() {
+		nock.cleanAll();
+		nock.enableNetConnect();
+		fs.removeSync(testDirname);
+	});
+
+	it('should follow anchors if recursive flag is set', function(done) {
+		var options = {
+			urls: [ 'http://example.com/' ],
+			directory: testDirname,
+			subdirectories: null,
+			sources: [],
+			recursive: true
+		};
+
+		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+
+		// mock for anchors
+		nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
+		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
+		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
+		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
+
+		scraper.scrape(options).then(function() {
+			fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+
+			// index.html anchors loaded
+			fs.existsSync(testDirname + '/about.html').should.be.eql(true);
+
+			// about.html anchors loaded
+			fs.existsSync(testDirname + '/link1.html').should.be.eql(true);
+			fs.existsSync(testDirname + '/link2.html').should.be.eql(true);
+			fs.existsSync(testDirname + '/link3.html').should.be.eql(true);
+
+			done();
+		}).catch(done);
+	});
+
+	it('should follow anchors with depth < maxDepth if recursive flag and maxDepth are set', function(done) {
+		var options = {
+			urls: [ 'http://example.com/' ],
+			directory: testDirname,
+			subdirectories: null,
+			sources: [],
+			recursive: true,
+			maxDepth: 1
+		};
+
+		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+
+		// mock for anchors
+		nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
+		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
+		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
+		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
+
+		scraper.scrape(options).then(function() {
+			fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+
+			// index.html anchors loaded (about.html sits at depth 1 == maxDepth, so it is still saved)
+			fs.existsSync(testDirname + '/about.html').should.be.eql(true);
+
+			// about.html anchors not loaded (their depth 2 exceeds maxDepth)
+			fs.existsSync(testDirname + '/link1.html').should.be.eql(false);
+			fs.existsSync(testDirname + '/link2.html').should.be.eql(false);
+			fs.existsSync(testDirname + '/link3.html').should.be.eql(false);
+
+			done();
+		}).catch(done);
+	});
+
+	it('should not follow anchors if recursive flag is not set', function(done) {
+		var options = {
+			urls: [ 'http://example.com/' ],
+			directory: testDirname,
+			subdirectories: null,
+			sources: []
+		};
+
+		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+
+		// mock for anchors
+		nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
+		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
+		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
+		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
+
+		scraper.scrape(options).then(function() {
+			fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+
+			// index.html anchors not loaded (recursive flag is not set)
+			fs.existsSync(testDirname + '/about.html').should.be.eql(false);
+
+			// about.html anchors not loaded either
+			fs.existsSync(testDirname + '/link1.html').should.be.eql(false);
+			fs.existsSync(testDirname + '/link2.html').should.be.eql(false);
+			fs.existsSync(testDirname + '/link3.html').should.be.eql(false);
+
+			done();
+		}).catch(done);
+	});
+});
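The maxDepth test above becomes clearer with the depth arithmetic written out. A sketch using `lib/resource` directly (in the real run the depth-2 resources are never created at all, because `about.html` gets the noop handler):

```javascript
var Resource = require('./lib/resource');

var maxDepth = 1; // as in the test options
var index = new Resource('http://example.com/');                // depth 0
var about = index.createChild('http://example.com/about.html'); // depth 1
var link1 = about.createChild('http://example.com/link1.html'); // depth 2

[index, about, link1].forEach(function (r) {
	var parsed = !(maxDepth && r.getDepth() >= maxDepth);
	console.log(r.getUrl(), '-> depth', r.getDepth(), parsed ? 'parsed' : 'anchors ignored');
});
// index.html is parsed, about.html is saved but its anchors are ignored,
// so link1..link3 are never requested - exactly what the assertions check
```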
diff --git a/test/unit/resource-test.js b/test/unit/resource-test.js
index c2b2264b..3188b68d 100644
--- a/test/unit/resource-test.js
+++ b/test/unit/resource-test.js
@@ -4,7 +4,7 @@ var Resource = require('../../lib/resource');
 var types = require('../../lib/config/resource-types');
 
 describe('Resource', function() {
-	describe('#Resource', function() {
+	describe('#getType', function() {
 		it('should return correct type based on extension', function() {
 			var html = new Resource('http://example.com', 'index.html');
 			var htm = new Resource('http://example.com', 'index.htm');
@@ -68,4 +68,56 @@ describe('Resource', function() {
 			res.getType().should.be.eql(types.other);
 		});
 	});
+
+	describe('#setDepth', function () {
+		it('should set depth', function() {
+			var o = new Resource('http://google.com');
+			o.setDepth(555);
+			o.depth.should.be.eql(555);
+		});
+	});
+
+	describe('#getDepth', function () {
+		it('should return depth if object has it', function() {
+			var o = new Resource('http://google.com');
+			o.setDepth(123);
+			o.getDepth().should.be.eql(123);
+		});
+
+		it('should return 0 if object has no depth', function() {
+			var o = new Resource('http://google.com');
+			o.getDepth().should.be.eql(0);
+		});
+	});
+
+	describe('#createChild', function () {
+		it('should return Resource', function() {
+			var parent = new Resource('http://example.com');
+			var child = parent.createChild('http://google.com');
+			child.should.be.instanceOf(Resource);
+		});
+
+		it('should set correct url and filename', function() {
+			var parent = new Resource('http://example.com');
+			var child = parent.createChild('http://google.com', 'google.html');
+			child.getUrl().should.be.eql('http://google.com');
+			child.getFilename().should.be.eql('google.html');
+		});
+
+		it('should set parent', function() {
+			var parent = new Resource('http://example.com');
+			var child = parent.createChild('http://google.com');
+			child.parent.should.be.equal(parent);
+		});
+
+		it('should set depth', function() {
+			var parent = new Resource('http://example.com');
+			var child = parent.createChild('http://google.com');
+			child.getDepth().should.be.eql(1);
+
+			var childOfChild = child.createChild('http://google.com.ua');
+			childOfChild.getDepth().should.be.eql(2);
+		});
+	});
 });
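The unit tests below lean on the two devDependencies added in `package.json`: `sinon-as-promised` patches sinon stubs with `.resolves()`/`.rejects()`, and `proxyquire` swaps out a module's dependencies at require time. The pattern in miniature (paths as in the tests):

```javascript
var sinon = require('sinon');
require('sinon-as-promised');              // adds .resolves()/.rejects() to sinon stubs
var proxyquire = require('proxyquire');

var cssLoadStub = sinon.stub().resolves(); // a stub that returns a resolved promise

// Load lib/scraper with the real css handler swapped for the stub
var Scraper = proxyquire('../../lib/scraper', {
	'./file-handlers/css': cssLoadStub
});
// getResourceHandler now hands back cssLoadStub for css resources,
// so a test can assert which handler ran without touching the network or disk
```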
diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js
index 678b4a01..2be585b4 100644
--- a/test/unit/scraper-test.js
+++ b/test/unit/scraper-test.js
@@ -1,6 +1,8 @@
 var should = require('should');
 var sinon = require('sinon');
+require('sinon-as-promised');
 var nock = require('nock');
+var proxyquire = require('proxyquire');
 var fs = require('fs-extra');
 var path = require('path');
 var _ = require('underscore');
@@ -141,6 +143,24 @@ describe('Scraper', function () {
 			done();
 		}).catch(done);
 	});
+
+	it('should extend sources if recursive flag is set', function(done) {
+		var s = new Scraper({
+			urls: { url: 'http://first-url.com' },
+			directory: testDirname,
+			sources: [
+				{ selector: 'img', attr: 'src' }
+			],
+			recursive: true
+		});
+
+		s.prepare().then(function() {
+			s.options.sources.should.have.length(2);
+			s.options.sources.should.containEql({ selector: 'img', attr: 'src' });
+			s.options.sources.should.containEql({ selector: 'a', attr: 'href' });
+			done();
+		}).catch(done);
+	});
 });
 
 describe('#load', function() {
@@ -533,6 +553,93 @@ describe('Scraper', function () {
 		});
 	});
 
+	describe('#getResourceHandler', function() {
+		var Scraper;
+		var noopStub;
+		var cssLoadStub;
+		var htmlLoadStub;
+
+		beforeEach(function() {
+			noopStub = sinon.stub().resolves();
+			cssLoadStub = sinon.stub().resolves();
+			htmlLoadStub = sinon.stub().resolves();
+
+			Scraper = proxyquire('../../lib/scraper', {
+				'underscore': {
+					'noop': noopStub
+				},
+				'./file-handlers/html': htmlLoadStub,
+				'./file-handlers/css': cssLoadStub
+			});
+		});
+
+		it('should return noop if resource has depth > max', function(done) {
+			var s = new Scraper({
+				urls: 'http://example.com',
+				directory: testDirname,
+				maxDepth: 2
+			});
+
+			s.prepare().then(function() {
+				var r = new Resource('http://example.com/');
+				sinon.stub(r, 'getType').returns('html');
+				sinon.stub(r, 'getDepth').returns(10);
+
+				s.getResourceHandler(r).call(s, r).then(function() {
+					noopStub.called.should.be.eql(true);
+					cssLoadStub.called.should.be.eql(false);
+					htmlLoadStub.called.should.be.eql(false);
+
+					done();
+				});
+			}).catch(done);
+		});
+
+		it('should return css loader if file has css type', function(done) {
+			var s = new Scraper({
+				urls: 'http://example.com',
+				directory: testDirname,
+				maxDepth: 2
+			});
+
+			s.prepare().then(function() {
+				var r = new Resource('http://example.com/');
+				sinon.stub(r, 'getType').returns('css');
+				sinon.stub(r, 'getDepth').returns(1);
+
+				s.getResourceHandler(r).call(s, r).then(function() {
+					noopStub.called.should.be.eql(false);
+					cssLoadStub.called.should.be.eql(true);
+					htmlLoadStub.called.should.be.eql(false);
+
+					done();
+				});
+			}).catch(done);
+		});
+
+		it('should return html & css loader if file has html type', function(done) {
+			var s = new Scraper({
+				urls: 'http://example.com',
+				directory: testDirname,
+				maxDepth: 2
+			});
+
+			s.prepare().then(function() {
+				var r = new Resource('http://example.com/');
+				sinon.stub(r, 'getType').returns('html');
+				sinon.stub(r, 'getDepth').returns(1);
+
+				s.getResourceHandler(r).call(s, r).then(function() {
+					noopStub.called.should.be.eql(false);
+					cssLoadStub.called.should.be.eql(true);
+					htmlLoadStub.called.should.be.eql(true);
+
+					done();
+				});
+			}).catch(done);
+		});
+	});
+
 	describe('#scrape', function() {
 		it('should call methods in sequence', function(done) {
 			nock('http://example.com').get('/').reply(200, 'OK');