From 9e7a31f00cd8bb76d5aac18e754aa5a39e3096fa Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Thu, 20 Apr 2017 23:32:03 +0300 Subject: [PATCH 1/4] Add maxRecursiveDepth option --- lib/config/defaults.js | 1 + .../html/html-source-element.js | 32 ++++--- lib/resource-handler/html/index.js | 83 +++++++++++-------- lib/resource-handler/index.js | 11 ++- lib/scraper.js | 3 +- 5 files changed, 75 insertions(+), 55 deletions(-) diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 8fa7bcc4..f1f0970c 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -48,6 +48,7 @@ const config = { }, urlFilter: null, recursive: false, + maxRecursiveDepth: null, maxDepth: null, ignoreErrors: true, httpResponseHandler: null, diff --git a/lib/resource-handler/html/html-source-element.js b/lib/resource-handler/html/html-source-element.js index a733cc3e..8ce15c84 100644 --- a/lib/resource-handler/html/html-source-element.js +++ b/lib/resource-handler/html/html-source-element.js @@ -1,10 +1,11 @@ -var ImgSrcsetTag = require('../path-containers/html-img-srcset-tag'); -var CommonTag = require('../path-containers/html-common-tag'); -var CssText = require('../path-containers/css-text'); -var _ = require('lodash'); -var utils = require('../../utils'); +'use strict'; -var pathContainersByRule = [ +const ImgSrcsetTag = require('../path-containers/html-img-srcset-tag'); +const CommonTag = require('../path-containers/html-common-tag'); +const CssText = require('../path-containers/css-text'); +const utils = require('../../utils'); + +const pathContainersByRule = [ { selector: '[style]', attr: 'style', containerClass: CssText }, { selector: 'style', containerClass: CssText }, { selector: '*[srcset]', attr: 'srcset', containerClass: ImgSrcsetTag } @@ -28,7 +29,7 @@ function HtmlSourceElement (el, rule) { * @returns {string} */ HtmlSourceElement.prototype.getData = function getData () { - var text = this.rule.attr ? 
this.el.attr(this.rule.attr) : this.el.text(); + const text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text(); return utils.decodeHtmlEntities(text); }; @@ -37,17 +38,22 @@ HtmlSourceElement.prototype.setData = function setData (newData) { }; HtmlSourceElement.prototype.getPathContainerClass = function getPathContainerClass () { - var selectedRule = _.find(pathContainersByRule, (containerByRule) => { - return this.el.is(containerByRule.selector) && this.rule.attr === containerByRule.attr; - }); - + const selectedRule = this.findMatchedRule(pathContainersByRule); return selectedRule ? selectedRule.containerClass : CommonTag; }; HtmlSourceElement.prototype.getPathContainer = function getPathContainer () { - var ContainerClass = this.getPathContainerClass(); - var textWithResources = this.getData(); + const ContainerClass = this.getPathContainerClass(); + const textWithResources = this.getData(); return textWithResources ? new ContainerClass(textWithResources) : null; }; +HtmlSourceElement.prototype.matchesRule = function matchesRule (rule) { + return this.el.is(rule.selector) && this.rule.attr === rule.attr; +}; + +HtmlSourceElement.prototype.findMatchedRule = function findMatchedRule (rulesArray) { + return rulesArray.find(this.matchesRule, this); +}; + module.exports = HtmlSourceElement; diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 397d1eb5..a614309a 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -1,44 +1,57 @@ -var cheerio = require('cheerio'); -var Promise = require('bluebird'); -var utils = require('../../utils'); -var HtmlSourceElement = require('./html-source-element'); - -function HtmlResourceHandler (options, handleChildrenPaths) { - this.options = options; - this.handleChildrenPaths = handleChildrenPaths; -} +'use strict'; -HtmlResourceHandler.prototype.handle = function handle (resource) { - var $ = loadTextToCheerio(resource.getText()); - 
prepareToLoad($, resource); - - return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource)) - .then(function updateResource () { - resource.setText($.html()); - return resource; - }); -}; - -HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) { - var self = this; - var promises = $(rule.selector).map(function loadForElement () { - var el = new HtmlSourceElement($(this), rule); - var pathContainer = el.getPathContainer(); - if (!pathContainer) { - return Promise.resolve(); - } - return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el)); - }).get(); +const cheerio = require('cheerio'); +const Promise = require('bluebird'); +const utils = require('../../utils'); +const logger = require('../../logger'); +const HtmlSourceElement = require('./html-source-element'); + +class HtmlResourceHandler { + constructor (options, handleChildrenPaths) { + this.options = options; + this.handleChildrenPaths = handleChildrenPaths; + } + + handle (resource) { + const $ = loadTextToCheerio(resource.getText()); + prepareToLoad($, resource); + + return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource)) + .then(function updateResource () { + resource.setText($.html()); + return resource; + }); + } - return utils.waitAllFulfilled(promises); -}; + loadResourcesForRule ($, parentResource, rule) { + const self = this; + const promises = $(rule.selector).map(function loadForElement () { + const el = new HtmlSourceElement($(this), rule); + + const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources)); + const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth; + if (isRecursive && isDepthGreaterThanMax) { + logger.debug('filtering out by depth'); // TODO: log filtered el + return Promise.resolve(); + } + + const 
pathContainer = el.getPathContainer(); + if (!pathContainer) { + return Promise.resolve(); + } + return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el)); + }).get(); + + return utils.waitAllFulfilled(promises); + } +} function prepareToLoad ($, resource) { $('base').each(function handleBaseTag () { - var el = $(this); - var href = el.attr('href'); + const el = $(this); + const href = el.attr('href'); if (href) { - var newUrl = utils.getUrl(resource.getUrl(), href); + const newUrl = utils.getUrl(resource.getUrl(), href); resource.setUrl(newUrl); el.remove(); } diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js index 7d27d6a4..7ec3a8e9 100644 --- a/lib/resource-handler/index.js +++ b/lib/resource-handler/index.js @@ -6,15 +6,14 @@ var utils = require('../utils'); var HtmlHandler = require('./html'); var CssHandler = require('./css'); -var supportedOptions = ['prettifyUrls', 'sources', 'defaultFilename']; +var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename']; function ResourceHandler (options, context) { - var self = this; - self.options = _.pick(options, supportedOptions); - self.context = context; + this.options = _.pick(options, supportedOptions); + this.context = context; - self.htmlHandler = new HtmlHandler(self.options, self.handleChildrenResources.bind(self)); - self.cssHandler = new CssHandler(self.options, self.handleChildrenResources.bind(self)); + this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this)); + this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this)); } ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) { diff --git a/lib/scraper.js b/lib/scraper.js index a4fc7f57..c843e21b 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -29,8 +29,9 @@ function Scraper (options) { }); } + self.options.recursiveSources = recursiveSources; if 
(self.options.recursive) { - self.options.sources = _.union(self.options.sources, recursiveSources); + self.options.sources = _.union(self.options.sources, self.options.recursiveSources); } logger.info('init with options', self.options); From 141a15aafc961280b1734c53c631e561c3832ce0 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 29 Apr 2017 22:07:04 +0300 Subject: [PATCH 2/4] Add tests --- .../html/html-source-element.js | 91 +++++++------- lib/resource-handler/html/index.js | 2 +- test/functional/max-depth/max-depth.test.js | 113 ++++++++++++++++++ test/functional/max-depth/mocks/depth1.html | 12 ++ test/functional/max-depth/mocks/depth2.html | 12 ++ test/functional/max-depth/mocks/index.html | 13 ++ 6 files changed, 202 insertions(+), 41 deletions(-) create mode 100644 test/functional/max-depth/max-depth.test.js create mode 100644 test/functional/max-depth/mocks/depth1.html create mode 100644 test/functional/max-depth/mocks/depth2.html create mode 100644 test/functional/max-depth/mocks/index.html diff --git a/lib/resource-handler/html/html-source-element.js b/lib/resource-handler/html/html-source-element.js index 8ce15c84..3376388a 100644 --- a/lib/resource-handler/html/html-source-element.js +++ b/lib/resource-handler/html/html-source-element.js @@ -13,47 +13,58 @@ const pathContainersByRule = [ /** * Represents pair of cheerio element and rule to find text with children resources - * @param {Object} el - cheerio obj for dom element - * @param {Object} rule - rule used to find current element - * @param {string} rule.selector - cheerio selector - * @param {string} rule.attr - attribute to find text which contains resources. 
if not set - use inner html - * @constructor */ -function HtmlSourceElement (el, rule) { - this.el = el; - this.rule = rule; -} +class HtmlSourceElement { + /** + * @param {Object} el - cheerio obj for dom element + * @param {Object} rule - rule used to find current element + * @param {string} rule.selector - cheerio selector + * @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html + */ + constructor (el, rule) { + this.el = el; + this.rule = rule; + } -/** - * Get text from attr or from innerHtml of element based on rule - * @returns {string} - */ -HtmlSourceElement.prototype.getData = function getData () { - const text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text(); - return utils.decodeHtmlEntities(text); -}; - -HtmlSourceElement.prototype.setData = function setData (newData) { - this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData); -}; - -HtmlSourceElement.prototype.getPathContainerClass = function getPathContainerClass () { - const selectedRule = this.findMatchedRule(pathContainersByRule); - return selectedRule ? selectedRule.containerClass : CommonTag; -}; - -HtmlSourceElement.prototype.getPathContainer = function getPathContainer () { - const ContainerClass = this.getPathContainerClass(); - const textWithResources = this.getData(); - return textWithResources ? new ContainerClass(textWithResources) : null; -}; - -HtmlSourceElement.prototype.matchesRule = function matchesRule (rule) { - return this.el.is(rule.selector) && this.rule.attr === rule.attr; -}; - -HtmlSourceElement.prototype.findMatchedRule = function findMatchedRule (rulesArray) { - return rulesArray.find(this.matchesRule, this); -}; + /** + * Get resource data from element using rule + * @returns {string} + */ + getData () { + const text = this.rule.attr ? 
this.el.attr(this.rule.attr) : this.el.text(); + return utils.decodeHtmlEntities(text); + } + + /** + * Update attribute or inner text of el with new data + * @param {string} newData + */ + setData (newData) { + this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData); + } + + /** + * Returns PathContainer instance for element + * @returns {CssText|HtmlCommonTag|HtmlImgSrcSetTag|null} + */ + getPathContainer () { + const selectedRule = this.findMatchedRule(pathContainersByRule); + const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag; + const textWithResources = this.getData(); + return textWithResources ? new ContainerClass(textWithResources) : null; + } + + matchesRule (rule) { + return this.el.is(rule.selector) && this.rule.attr === rule.attr; + } + + findMatchedRule (rulesArray) { + return rulesArray.find(this.matchesRule, this); + } + + toString () { + return JSON.stringify({selector: this.rule.selector, attr: this.rule.attr, data: this.getData()}); + } +} module.exports = HtmlSourceElement; diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index a614309a..667810a2 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -31,7 +31,7 @@ class HtmlResourceHandler { const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources)); const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth; if (isRecursive && isDepthGreaterThanMax) { - logger.debug('filtering out by depth'); // TODO: log filtered el + logger.debug(`filtering out ${el} by max recursive depth`); return Promise.resolve(); } diff --git a/test/functional/max-depth/max-depth.test.js b/test/functional/max-depth/max-depth.test.js new file mode 100644 index 00000000..ae0792bf --- /dev/null +++ b/test/functional/max-depth/max-depth.test.js @@ -0,0 +1,113 @@ +require('should'); +const 
nock = require('nock'); +const fs = require('fs-extra'); +const scrape = require('../../../index'); + +const testDirname = __dirname + '/.tmp'; +const mockDirname = __dirname + '/mocks'; + +describe('Functional: maxDepth and maxRecursiveDepth ', () => { + + beforeEach(() => { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + it('should filter out all resources by depth > maxDepth', () => { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [ + { selector: 'img', attr: 'src' }, + { selector: 'script', attr: 'src' }, + { selector: 'a', attr: 'href' } + ], + maxDepth: 2 + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + + nock('http://example.com/').get('/depth1.html').replyWithFile(200, mockDirname + '/depth1.html'); + nock('http://example.com/').get('/img-depth1.jpg').reply(200, 'img-depth1.jpg'); + nock('http://example.com/').get('/script-depth1.js').reply(200, 'script-depth1.js'); + + nock('http://example.com/').get('/depth2.html').replyWithFile(200, mockDirname + '/depth2.html'); + nock('http://example.com/').get('/img-depth2.jpg').reply(200, 'img-depth2.jpg'); + nock('http://example.com/').get('/script-depth2.js').reply(200, 'script-depth2.js'); + + nock('http://example.com/').get('/depth3.html').reply(200, 'OK'); + nock('http://example.com/').get('/img-depth3.jpg').reply(200, 'img-depth3.jpg'); + nock('http://example.com/').get('/script-depth3.js').reply(200, 'script-depth3.js'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + + fs.existsSync(testDirname + '/depth1.html').should.be.eql(true); + fs.existsSync(testDirname + '/img-depth1.jpg').should.be.eql(true); + fs.existsSync(testDirname + '/script-depth1.js').should.be.eql(true); + + fs.existsSync(testDirname + 
'/depth2.html').should.be.eql(true); + fs.existsSync(testDirname + '/img-depth2.jpg').should.be.eql(true); + fs.existsSync(testDirname + '/script-depth2.js').should.be.eql(true); + + fs.existsSync(testDirname + '/depth3.html').should.be.eql(false); + fs.existsSync(testDirname + '/img-depth3.jpg').should.be.eql(false); + fs.existsSync(testDirname + '/script-depth3.js').should.be.eql(false); + }); + }); + + + it('should filter out only anchors by depth > maxRecursiveDepth', () => { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [ + { selector: 'img', attr: 'src' }, + { selector: 'script', attr: 'src' }, + { selector: 'a', attr: 'href' } + ], + maxRecursiveDepth: 2 + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + + nock('http://example.com/').get('/depth1.html').replyWithFile(200, mockDirname + '/depth1.html'); + nock('http://example.com/').get('/img-depth1.jpg').reply(200, 'img-depth1.jpg'); + nock('http://example.com/').get('/script-depth1.js').reply(200, 'script-depth1.js'); + + nock('http://example.com/').get('/depth2.html').replyWithFile(200, mockDirname + '/depth2.html'); + nock('http://example.com/').get('/img-depth2.jpg').reply(200, 'img-depth2.jpg'); + nock('http://example.com/').get('/script-depth2.js').reply(200, 'script-depth2.js'); + + nock('http://example.com/').get('/depth3.html').reply(200, 'OK'); + nock('http://example.com/').get('/img-depth3.jpg').reply(200, 'img-depth3.jpg'); + nock('http://example.com/').get('/script-depth3.js').reply(200, 'script-depth3.js'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + + fs.existsSync(testDirname + '/depth1.html').should.be.eql(true); + fs.existsSync(testDirname + '/img-depth1.jpg').should.be.eql(true); + fs.existsSync(testDirname + '/script-depth1.js').should.be.eql(true); + + fs.existsSync(testDirname + 
'/depth2.html').should.be.eql(true); + fs.existsSync(testDirname + '/img-depth2.jpg').should.be.eql(true); + fs.existsSync(testDirname + '/script-depth2.js').should.be.eql(true); + + fs.existsSync(testDirname + '/depth3.html').should.be.eql(false); + // img-depth3.jpg and script-depth3.js - dependencies of depth2.html + // they should be loaded because maxRecursiveDepth applies only to anchors (hyperlinks), not to other dependencies + fs.existsSync(testDirname + '/img-depth3.jpg').should.be.eql(true); + fs.existsSync(testDirname + '/script-depth3.js').should.be.eql(true); + }); + }); + +}); diff --git a/test/functional/max-depth/mocks/depth1.html b/test/functional/max-depth/mocks/depth1.html new file mode 100644 index 00000000..ed3849d5 --- /dev/null +++ b/test/functional/max-depth/mocks/depth1.html @@ -0,0 +1,12 @@ + + + + + Title + + + + + + + \ No newline at end of file diff --git a/test/functional/max-depth/mocks/depth2.html b/test/functional/max-depth/mocks/depth2.html new file mode 100644 index 00000000..5c7b0728 --- /dev/null +++ b/test/functional/max-depth/mocks/depth2.html @@ -0,0 +1,12 @@ + + + + + Title + + + + + + + \ No newline at end of file diff --git a/test/functional/max-depth/mocks/index.html b/test/functional/max-depth/mocks/index.html new file mode 100644 index 00000000..52c566f5 --- /dev/null +++ b/test/functional/max-depth/mocks/index.html @@ -0,0 +1,13 @@ + + + + + Title + + + + + + + + \ No newline at end of file From 4490d6e7bfcbe69f00c3caa825c324caa4514db0 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 29 Apr 2017 22:09:17 +0300 Subject: [PATCH 3/4] Add newlines --- test/functional/max-depth/mocks/depth1.html | 2 +- test/functional/max-depth/mocks/depth2.html | 6 +++--- test/functional/max-depth/mocks/index.html | 7 +++---- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/test/functional/max-depth/mocks/depth1.html b/test/functional/max-depth/mocks/depth1.html index ed3849d5..78f672ff 100644 --- a/test/functional/max-depth/mocks/depth1.html +++ 
b/test/functional/max-depth/mocks/depth1.html @@ -9,4 +9,4 @@ - \ No newline at end of file + diff --git a/test/functional/max-depth/mocks/depth2.html b/test/functional/max-depth/mocks/depth2.html index 5c7b0728..025129b2 100644 --- a/test/functional/max-depth/mocks/depth2.html +++ b/test/functional/max-depth/mocks/depth2.html @@ -6,7 +6,7 @@ - - + + - \ No newline at end of file + diff --git a/test/functional/max-depth/mocks/index.html b/test/functional/max-depth/mocks/index.html index 52c566f5..4cd57ac0 100644 --- a/test/functional/max-depth/mocks/index.html +++ b/test/functional/max-depth/mocks/index.html @@ -6,8 +6,7 @@ - - - + + - \ No newline at end of file + From 030aab4b847d154d5c147041bf47c45c9d783bb8 Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Sat, 29 Apr 2017 22:33:14 +0300 Subject: [PATCH 4/4] Update README.md --- README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4e0706a6..0a0198ae 100644 --- a/README.md +++ b/README.md @@ -48,8 +48,9 @@ scrape(options, (error, result) => { * [urls](#urls) - urls to download, *required* * [directory](#directory) - path to save files, *required* * [sources](#sources) - selects which resources should be downloaded -* [recursive](#recursive) - follow anchors in html files -* [maxDepth](#maxdepth) - maximum depth for dependencies +* [recursive](#recursive) - follow hyperlinks in html files +* [maxRecursiveDepth](#maxrecursivedepth) - maximum depth for hyperlinks +* [maxDepth](#maxdepth) - maximum depth for all dependencies * [request](#request) - custom options for for [request](https://github.com/request/request) * [subdirectories](#subdirectories) - subdirectories for file extensions * [defaultFilename](#defaultfilename) - filename for index page @@ -96,10 +97,13 @@ scrape({ ``` #### recursive -Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`. 
+Boolean, if `true` scraper will follow hyperlinks in html files. Don't forget to set `maxRecursiveDepth` to avoid infinite downloading. Defaults to `false`. + +#### maxRecursiveDepth +Positive number, maximum allowed depth for hyperlinks. Other dependencies will be saved regardless of their depth. Defaults to `null` - no maximum recursive depth set. #### maxDepth -Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set. +Positive number, maximum allowed depth for all dependencies. Defaults to `null` - no maximum depth set. #### request Object, custom options for [request](https://github.com/request/request#requestoptions-callback). Allows to set cookies, userAgent, etc.