diff --git a/README.md b/README.md
index 546ba256..9766a94c 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ npm install website-scraper
 ## Usage
 
 ```javascript
-var scraper = require('website-scraper'); 
+var scraper = require('website-scraper');
 var options = {
   urls: ['http://nodejs.org/'],
   directory: '/path/to/save/',
@@ -38,7 +38,7 @@ scraper.scrape(options).then(function (result) {
 ## API
 ### scrape(options, callback)
-Makes requests to `urls` and saves all files found with `sources` to `directory`. 
+Makes requests to `urls` and saves all files found with `sources` to `directory`.
 
 **options** - object containing next options:
@@ -48,10 +48,12 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`
  - `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
  - `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
  - `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- 
- 
+ - `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
+ - `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
+
+
 **callback** - callback function *(optional)*, includes following parameters:
- 
+
  - `error:` if error - `Error` object, if success - `null`
  - `result:` if error - `null`, if success - array if objects containing:
     - `url:` url of loaded page
@@ -59,20 +61,21 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`
 
 ## Examples
-Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`. 
+#### Example 1
+Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
 Imagine we want to load:
 - [Home page](http://nodejs.org/) to `index.html`
 - [About page](http://nodejs.org/about/) to `about.html`
 - [Blog](http://blog.nodejs.org/) to `blog.html`
- 
+
 and separate files into directories:
- - `img` for .jpg, .png, .svg (full path `/path/to/save/img`) 
+ - `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
 - `js` for .js (full path `/path/to/save/js`)
 - `css` for .css (full path `/path/to/save/css`)
 
 ```javascript
-var scraper = require('website-scraper'); 
+var scraper = require('website-scraper');
 scraper.scrape({
   urls: [
     'http://nodejs.org/', // Will be saved with default filename 'index.html'
@@ -101,3 +104,16 @@ scraper.scrape({
   console.log(err);
 });
 ```
+
+#### Example 2. Recursive downloading
+```javascript
+// Links from example.com will be followed
+// Links found on those pages will be ignored because their depth (2) is greater than maxDepth
+var scraper = require('website-scraper');
+scraper.scrape({
+  urls: ['http://example.com/'],
+  directory: '/path/to/save',
+  recursive: true,
+  maxDepth: 1
+}).then(console.log).catch(console.log);
+```
diff --git a/lib/config/recursive-sources.js b/lib/config/recursive-sources.js
new file mode 100644
index 00000000..774056c1
--- /dev/null
+++ b/lib/config/recursive-sources.js
@@ -0,0 +1,3 @@
+module.exports = [
+  { selector: 'a', attr: 'href' }
+];
\ No newline at end of file
diff --git a/lib/file-handlers/css.js b/lib/file-handlers/css.js
index 4a7cfcd9..17edc596 100644
--- a/lib/file-handlers/css.js
+++ b/lib/file-handlers/css.js
@@ -1,7 +1,6 @@
 var _ = require('underscore');
 var Promise = require('bluebird');
 var getCssUrls = require('css-url-parser');
-var Resource = require('../resource');
 var utils = require('../utils');
 
 function loadCss (context, resource) {
@@ -12,8 +11,7 @@ function loadCss (context, resource) {
 
   var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
     var resourceUrl = utils.getUrl(url, cssUrl);
-    var cssResource = new Resource(resourceUrl);
-    cssResource.setParent(resource);
+    var cssResource = resource.createChild(resourceUrl);
 
     return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
       var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
diff --git a/lib/file-handlers/html.js b/lib/file-handlers/html.js
index 95d28058..a99a0b7c 100644
--- a/lib/file-handlers/html.js
+++ b/lib/file-handlers/html.js
@@ -1,7 +1,6 @@
 var cheerio = require('cheerio');
 var Promise = require('bluebird');
 var utils = require('../utils');
-var Resource = require('../resource');
 
 function loadHtml (context, resource) {
   var sources = context.getHtmlSources();
@@ -50,8 +49,7 @@ function loadResources (context, resource, source) {
 
     if (attr) {
       var resourceUrl = utils.getUrl(url, attr);
-      var htmlResource = new Resource(resourceUrl);
-      htmlResource.setParent(resource);
+      var htmlResource = resource.createChild(resourceUrl);
       htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });
 
       return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
diff --git a/lib/resource.js b/lib/resource.js
index 81ea9d8a..781d1bab 100644
--- a/lib/resource.js
+++ b/lib/resource.js
@@ -15,6 +15,17 @@ function Resource (url, filename) {
   this.filename = filename;
 }
 
+Resource.prototype.createChild = function createChild (url, filename) {
+  var child = new Resource(url, filename);
+
+  var currentDepth = this.getDepth();
+
+  child.setParent(this);
+  child.setDepth(++currentDepth);
+
+  return child;
+};
+
 Resource.prototype.getUrl = function getUrl () {
   return this.url;
 };
@@ -43,6 +54,14 @@ Resource.prototype.setParent = function setParent (parent) {
   this.parent = parent;
 };
 
+Resource.prototype.getDepth = function getDepth () {
+  return this.depth || 0;
+};
+
+Resource.prototype.setDepth = function setDepth (depth) {
+  this.depth = depth;
+};
+
 /**
  *
  * @param {Object} data - html element data
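As a quick illustration of the new depth tracking (not part of the change set): `createChild` links the child to its parent and bumps the depth by one. A minimal sketch, assuming it is run from the package root:

```javascript
var Resource = require('./lib/resource');

var page = new Resource('http://example.com/');                // depth defaults to 0
var link = page.createChild('http://example.com/about.html');  // parent: page, depth 1
var css = link.createChild('http://example.com/style.css');    // parent: link, depth 2

console.log(page.getDepth(), link.getDepth(), css.getDepth()); // 0 1 2
```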
diff --git a/lib/scraper.js b/lib/scraper.js
index 4c12472f..7b6f0bb6 100644
--- a/lib/scraper.js
+++ b/lib/scraper.js
@@ -1,28 +1,27 @@
 var Promise = require('bluebird');
-var fs = Promise.promisifyAll(require('fs-extra'));
+
+var fs = require('fs-extra');
+var existsAsync = Promise.promisify(fs.stat);
+var outputFileAsync = Promise.promisify(fs.outputFile);
+var ensureDirAsync = Promise.promisify(fs.ensureDir);
+
 var path = require('path');
 var _ = require('underscore');
 
 var defaults = require('./config/defaults');
 var types = require('./config/resource-types');
+var recursiveSources = require('./config/recursive-sources');
 var utils = require('./utils.js');
 var request = require('./request');
 var Resource = require('./resource');
-var loadHtml = require('./file-handlers/html');
-var loadCss = require('./file-handlers/css');
 var compareUrls = require('compare-urls');
 
-function getHandleFunction (resource) {
-  var type = resource.getType();
-  switch (type) {
-    case types.css: return loadCss;
-    case types.html: return function loadHtmlAndCss (context, po) {
-      return loadHtml(context, po).then(function (loaded) {
-        return loadCss(context, loaded);
-      });
-    };
-    default: return _.noop;
-  }
+var loadHtml = require('./file-handlers/html');
+var loadCss = require('./file-handlers/css');
+function loadHtmlAndCss (context, po) {
+  return loadHtml(context, po).then(function (loaded) {
+    return loadCss(context, loaded);
+  });
 }
 
 function Scraper (options) {
@@ -83,6 +82,20 @@ Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ex
     .value() || '';
 };
 
+Scraper.prototype.getResourceHandler = function getHandler (resource) {
+  var self = this;
+  var type = resource.getType();
+  var depth = resource.getDepth();
+  var depthGreaterThanMax = self.options.maxDepth && depth >= self.options.maxDepth;
+
+  switch (true) {
+    case depthGreaterThanMax: return _.noop;
+    case type == types.css: return loadCss;
+    case type == types.html: return loadHtmlAndCss;
+    default: return _.noop;
+  }
+};
+
 Scraper.prototype.loadResource = function loadResource (resource) {
   var self = this;
 
@@ -102,12 +115,12 @@ Scraper.prototype.loadResource = function loadResource (resource) {
   return self.makeRequest(url).then(function requestCompleted(data) {
     resource.setUrl(data.url); // Url may be changed in redirects
     resource.setText(data.body);
-    handleFile = getHandleFunction(resource);
+    handleFile = self.getResourceHandler(resource);
     return handleFile(self, resource);
   }).then(function fileHandled() {
     var filename = path.join(self.options.directory, resource.getFilename());
     var text = resource.getText();
-    return fs.outputFileAsync(filename, text, { encoding: 'binary' });
+    return outputFileAsync(filename, text, { encoding: 'binary' });
   }).then(function fileSaved() {
     return Promise.resolve(resource);
   });
@@ -116,15 +129,16 @@
 };
 
 Scraper.prototype.validate = function validate () {
-  if (fs.existsSync(this.options.directory)) {
-    return Promise.reject(new Error('Path ' + this.options.directory + ' exists'));
-  }
-  return Promise.resolve();
+  var dir = this.options.directory;
+  return existsAsync(dir).then(function handleDirectoryExist () {
+    return Promise.reject(new Error('Path ' + dir + ' exists'));
+  }, function handleDirectoryNotExist () {
+    return Promise.resolve();
+  });
 };
 
 Scraper.prototype.prepare = function prepare () {
   var self = this;
-  fs.ensureDirSync(self.options.directory);
 
   // Create makeRequest function with custom request params
   self.makeRequest = request.makeRequest.bind(null, self.options.request);
@@ -136,7 +150,12 @@
     var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename;
     return new Resource(url, filename);
   });
-  return Promise.resolve();
+
+  if (self.options.recursive) {
+    self.options.sources = _.union(self.options.sources, recursiveSources);
+  }
+
+  return ensureDirAsync(self.options.directory);
 };
 
 Scraper.prototype.load = function load () {
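Two notes on the scraper changes above. A resource whose depth has reached `maxDepth` is still requested and saved, but `getResourceHandler` returns `_.noop` for it, so its own references are not followed. And with `recursive: true`, `prepare` merges the anchor selector from `lib/config/recursive-sources.js` into the configured sources, roughly as in the sketch below (assuming `defaults.sources` holds the stock selector list from `lib/config/defaults.js`):

```javascript
var _ = require('underscore');
var defaults = require('./lib/config/defaults');
var recursiveSources = require('./lib/config/recursive-sources');

// What options.sources effectively becomes when the recursive option is set
var sources = _.union(defaults.sources, recursiveSources);
console.log(sources); // stock selectors plus { selector: 'a', attr: 'href' }
```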
diff --git a/package.json b/package.json
index 274b0678..cd9c98c9 100644
--- a/package.json
+++ b/package.json
@@ -44,7 +44,9 @@
     "istanbul": "^0.4.0",
     "mocha": "^2.2.5",
     "nock": "^2.9.1",
+    "proxyquire": "^1.7.3",
     "should": "^7.0.2",
-    "sinon": "^1.15.4"
+    "sinon": "^1.15.4",
+    "sinon-as-promised": "^4.0.0"
   }
 }
diff --git a/test/functional/mocks/recursive/about.html b/test/functional/mocks/recursive/about.html
new file mode 100644
index 00000000..54abd55d
--- /dev/null
+++ b/test/functional/mocks/recursive/about.html
@@ -0,0 +1,13 @@
+
+
+
+
+
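For context, a rough sketch of how the recursive mocks might be exercised in a functional test with `nock`. Illustrative only: it assumes an `index.html` mock next to `about.html` that links to `/about.html`, which is not part of this excerpt.

```javascript
var nock = require('nock');
var scraper = require('website-scraper');

// Serve the mocks instead of hitting the network
nock('http://example.com').get('/').replyWithFile(200, __dirname + '/test/functional/mocks/recursive/index.html');
nock('http://example.com').get('/about.html').replyWithFile(200, __dirname + '/test/functional/mocks/recursive/about.html');

scraper.scrape({
  urls: ['http://example.com/'],
  directory: __dirname + '/tmp/recursive', // must not exist yet - validate() rejects otherwise
  recursive: true,
  maxDepth: 1
}).then(function (result) {
  // index.html (depth 0) is parsed and its anchors followed;
  // about.html (depth 1) is saved but not parsed further, since depth >= maxDepth
  console.log(result);
}).catch(console.log);
```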