diff --git a/README.md b/README.md index cdefb46a..63e74738 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ scrape(options, (error, result) => { * [resourceSaver](#resourcesaver) - customize resources saving * [onResourceSaved](#onresourcesaved) - callback called when resource is saved * [onResourceError](#onresourceerror) - callback called when resource's downloading is failed +* [updateMissingSources](#updatemissingsources) - update url for missing sources with absolute url Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`. @@ -145,7 +146,7 @@ String, filename for index page. Defaults to `index.html`. Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`. #### ignoreErrors -Boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error. Defaults to `true`. +Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`. #### urlFilter Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied. @@ -253,6 +254,30 @@ scrape({ }) ``` +#### updateMissingSources +Boolean, if `true` scraper will set absolute urls for all failing `sources`, if `false` - it will leave them as is (which may cause incorrectly displayed page). +Also can contain array of `sources` to update (structure is similar to [sources](#sources)). +Defaults to `false`. +```javascript +// update all failing img srcs with absolute url +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + sources: [{selector: 'img', attr: 'src'}], + updateMissingSources: true +}); + +// download nothing, just update all img srcs with absolute urls +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + sources: [], + updateMissingSources: [{selector: 'img', attr: 'src'}] +}); + +``` + + ## callback Callback function, optional, includes following parameters: - `error`: if error - `Error` object, if success - `null` diff --git a/lib/config/defaults.js b/lib/config/defaults.js index f00f3f7a..dc1e337e 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -56,7 +56,8 @@ const config = { httpResponseHandler: null, onResourceSaved: null, onResourceError: null, - resourceSaver: null + resourceSaver: null, + updateMissingSources: false }; module.exports = config; diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index 09a05e0c..b88865a5 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -1,16 +1,21 @@ -var CssText = require('./../path-containers/css-text'); +'use strict'; -function CssResourceHandler (options, handleChildrenPaths) { - this.options = options; - this.handleChildrenPaths = handleChildrenPaths; -} +const CssText = require('./../path-containers/css-text'); + +class CssResourceHandler { + constructor (options, methods) { + this.options = options; + this.downloadChildrenPaths = methods.downloadChildrenPaths; + this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources); + } -CssResourceHandler.prototype.handle = function handle (resource) { - var pathContainer = new CssText(resource.getText()); - return this.handleChildrenPaths(pathContainer, resource).then(function updateText (updatedText) { - resource.setText(updatedText); - return resource; - }); -}; + handle (resource) { + const pathContainer = new CssText(resource.getText()); + return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) { + resource.setText(updatedText); + return resource; + }); + } +} module.exports = CssResourceHandler; diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 0b0f32d5..e781a238 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -7,17 +7,30 @@ const logger = require('../../logger'); const HtmlSourceElement = require('./html-source-element'); class HtmlResourceHandler { - constructor (options, handleChildrenPaths) { + constructor (options, methods) { this.options = options; - this.handleChildrenPaths = handleChildrenPaths; + this.downloadChildrenPaths = methods.downloadChildrenPaths; + this.updateChildrenPaths = methods.updateChildrenPaths; + + this.recursiveSources = this.options.recursiveSources || []; + this.downloadSources = this.options.sources; + this.updateSources = []; + + if (this.options.updateMissingSources === true) { + this.updateSources = this.downloadSources; + } else if (Array.isArray(this.options.updateMissingSources)) { + this.updateSources = this.options.updateMissingSources; + } + + this.allSources = utils.union(this.downloadSources, this.updateSources); } handle (resource) { const $ = loadTextToCheerio(resource.getText()); prepareToLoad($, resource); - return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource)) - .then(function updateResource () { + return Promise.mapSeries(this.allSources, this.loadResourcesForRule.bind(this, $, resource)) + .then(() => { resource.setText($.html()); return resource; }); @@ -27,31 +40,53 @@ class HtmlResourceHandler { const self = this; const promises = $(rule.selector).map((i, element) => { const el = new HtmlSourceElement($(element), rule); + const pathContainer = el.getPathContainer(); - const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources)); - const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth; - if (isRecursive && isDepthGreaterThanMax) { + if (!pathContainer) { + return Promise.resolve(null); + } + + const needToDownloadElement = this.needToDownload(el); + const needToUpdateElement = this.needToUpdate(el); + + if (this.exceedMaxRecursiveDepth(el, parentResource)) { logger.debug(`filtering out ${el} by max recursive depth`); - return Promise.resolve(); + return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el)); } - const pathContainer = el.getPathContainer(); - if (!pathContainer) { - return Promise.resolve(); + if (!needToDownloadElement) { + return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el)); } - return self.handleChildrenPaths(pathContainer, parentResource).then((updatedText) => { - el.setData(updatedText); - el.removeIntegrityCheck(); - }); + + return self.downloadChildrenPaths(pathContainer, parentResource, needToUpdateElement) + .then((updatedText) => { + el.setData(updatedText); + el.removeIntegrityCheck(); + }); + }).get(); return utils.waitAllFulfilled(promises); } + + exceedMaxRecursiveDepth (el, parentResource) { + const isRecursive = Boolean(el.findMatchedRule(this.recursiveSources)); + const isDepthGreaterThanMax = this.options.maxRecursiveDepth && parentResource.getDepth() >= this.options.maxRecursiveDepth; + return isRecursive && isDepthGreaterThanMax; + } + + needToDownload (el) { + return Boolean(el.findMatchedRule(this.downloadSources)); + } + + needToUpdate (el) { + return Boolean(el.findMatchedRule(this.updateSources)); + } } function prepareToLoad ($, resource) { - $('base').each(function handleBaseTag () { - const el = $(this); + $('base').each((i, element) => { + const el = $(element); const href = el.attr('href'); if (href) { const newUrl = utils.getUrl(resource.getUrl(), href); diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js index 7ec3a8e9..d149d22c 100644 --- a/lib/resource-handler/index.js +++ b/lib/resource-handler/index.js @@ -1,82 +1,109 @@ -var _ = require('lodash'); -var Promise = require('bluebird'); -var logger = require('../logger'); -var utils = require('../utils'); +'use strict'; -var HtmlHandler = require('./html'); -var CssHandler = require('./css'); +const _ = require('lodash'); +const Promise = require('bluebird'); +const logger = require('../logger'); +const utils = require('../utils'); -var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename']; +const HtmlHandler = require('./html'); +const CssHandler = require('./css'); -function ResourceHandler (options, context) { - this.options = _.pick(options, supportedOptions); - this.context = context; +const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources']; - this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this)); - this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this)); -} +class ResourceHandler { + constructor (options, context) { + this.options = _.pick(options, supportedOptions); + this.context = context; + + const methods = { + downloadChildrenPaths: this.downloadChildrenResources.bind(this), + updateChildrenPaths: this.updateChildrenResources.bind(this) + }; -ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) { - switch (true) { - case resource.isCss(): - logger.debug('using css handler for ' + resource); - return this.cssHandler; - case resource.isHtml(): - logger.debug('using html handler for ' + resource); - return this.htmlHandler; - default: - logger.debug('using no handler for ' + resource); - return null; + this.htmlHandler = new HtmlHandler(this.options, methods); + this.cssHandler = new CssHandler(this.options, methods); } -}; - -/** - * Request all resources from pathContainers paths - * @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources - * @param {Resource} parentResource - * @returns {Promise} - resolved when all resources from pathContainer were requested - * and original paths in parentResource were updated with local paths for children resources - */ -ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource) { - var self = this; - var childrenPaths = pathContainer.getPaths(); - var pathsToUpdate = []; - - var childrenPromises = childrenPaths.map(function loadChildPath (childPath) { - var childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath); - var childResource = parentResource.createChild(childResourceUrl); - - return self.context.requestResource(childResource).then(function updateChildPath (respondedResource) { - if (respondedResource) { - parentResource.updateChild(childResource, respondedResource); - - var relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename()); - if (self.options.prettifyUrls) { - relativePath = relativePath.replace(self.options.defaultFilename, ''); - } - var hash = utils.getHashFromUrl(childPath); - if (hash) { - relativePath = relativePath.concat(hash); + getResourceHandler (resource) { + switch (true) { + case resource.isCss(): + logger.debug('using css handler for ' + resource); + return this.cssHandler; + case resource.isHtml(): + logger.debug('using html handler for ' + resource); + return this.htmlHandler; + default: + logger.debug('using no handler for ' + resource); + return null; + } + } + + /** + * Request all resources from pathContainers paths + * @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources + * @param {Resource} parentResource + * @param {boolean} updateIfFailed - if true - failed resources will be updated with absolute links + * @returns {Promise} - resolved when all resources from pathContainer were requested + * and original paths in parentResource were updated with local paths for children resources + */ + downloadChildrenResources (pathContainer, parentResource, updateIfFailed) { + const self = this; + const childrenPaths = pathContainer.getPaths(); + const pathsToUpdate = []; + + const childrenPromises = childrenPaths.map((childPath) => { + const childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath); + const childResource = parentResource.createChild(childResourceUrl); + + return self.context.requestResource(childResource).then((respondedResource) => { + if (respondedResource) { + parentResource.updateChild(childResource, respondedResource); + + let relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename()); + if (self.options.prettifyUrls) { + relativePath = relativePath.replace(self.options.defaultFilename, ''); + } + const hash = utils.getHashFromUrl(childPath); + + if (hash) { + relativePath = relativePath.concat(hash); + } + + pathsToUpdate.push({ oldPath: childPath, newPath: relativePath}); + } else { + if (updateIfFailed) { + pathsToUpdate.push({ oldPath: childPath, newPath: childResourceUrl}); + } } + return null; // Prevent Bluebird warnings + }); + }); - pathsToUpdate.push({ oldPath: childPath, newPath: relativePath}); - } - return null; // Prevent Bluebird warnings + return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () { + return pathContainer.updateText(pathsToUpdate); }); - }); + } - return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () { - return pathContainer.updateText(pathsToUpdate); - }); -}; + updateChildrenResources (pathContainer, parentResource, needToUpdate) { + if (!needToUpdate) { + return Promise.resolve(pathContainer.updateText([])); + } + const parentUrl = parentResource.getUrl(); + const pathsToUpdate = []; + pathContainer.getPaths().forEach((path) => { + const childAbsoluteUrl = utils.getUrl(parentUrl, path); + pathsToUpdate.push({ oldPath: path, newPath: childAbsoluteUrl }); + }); + return Promise.resolve(pathContainer.updateText(pathsToUpdate)); + } -ResourceHandler.prototype.handleResource = function handleResource (resource) { - var resourceHandler = this.getResourceHandler(resource); - if (resourceHandler && resourceHandler.handle) { - return resourceHandler.handle(resource); + handleResource (resource) { + const resourceHandler = this.getResourceHandler(resource); + if (resourceHandler && resourceHandler.handle) { + return resourceHandler.handle(resource); + } + return Promise.resolve(resource); } - return Promise.resolve(resource); -}; +} module.exports = ResourceHandler; diff --git a/lib/resource-handler/path-containers/css-text.js b/lib/resource-handler/path-containers/css-text.js index c2165c39..9888c4a1 100644 --- a/lib/resource-handler/path-containers/css-text.js +++ b/lib/resource-handler/path-containers/css-text.js @@ -1,33 +1,38 @@ -var getCssUrls = require('css-url-parser'); -var _ = require('lodash'); -var format = require('util').format; +'use strict'; + +const getCssUrls = require('css-url-parser'); +const _ = require('lodash'); +const format = require('util').format; function changeExactlyMatchedUrl (text, oldUrl, newUrl) { // starts with ' " ( ends with ' " ) - var exactlyMatchedPattern = format('([\'"\\(\\s])%s([\'"\\)\\s])', _.escapeRegExp(oldUrl)); - var exactlyMatchedRegexp = new RegExp(exactlyMatchedPattern, 'g'); + const exactlyMatchedPattern = format('([\'"\\(\\s])%s([\'"\\)\\s])', _.escapeRegExp(oldUrl)); + const exactlyMatchedRegexp = new RegExp(exactlyMatchedPattern, 'g'); text = text.replace(exactlyMatchedRegexp, function changeUrl (match, g1, g2) { return g1 + newUrl + g2; }); return text; } -function CssText (text) { - this.text = text || ''; - this.paths = getCssUrls(this.text); -} +class CssText { + constructor (text) { + this.text = text || ''; + this.paths = getCssUrls(this.text); + } -CssText.prototype.getPaths = function getPaths () { - return this.paths; -}; + getPaths () { + return this.paths; + } + + updateText (pathsToUpdate) { + let updatedText = this.text; + pathsToUpdate.forEach(function updatePath (path) { + updatedText = changeExactlyMatchedUrl(updatedText, path.oldPath, path.newPath); + }); + return updatedText; + } +} -CssText.prototype.updateText = function updateText (pathsToUpdate) { - var updatedText = this.text; - pathsToUpdate.forEach(function updatePath (path) { - updatedText = changeExactlyMatchedUrl(updatedText, path.oldPath, path.newPath); - }); - return updatedText; -}; module.exports = CssText; diff --git a/lib/resource-handler/path-containers/html-common-tag.js b/lib/resource-handler/path-containers/html-common-tag.js index dcae29e2..bdf630e5 100644 --- a/lib/resource-handler/path-containers/html-common-tag.js +++ b/lib/resource-handler/path-containers/html-common-tag.js @@ -1,28 +1,32 @@ -var _ = require('lodash'); -var utils = require('../../utils'); +'use strict'; + +const _ = require('lodash'); +const utils = require('../../utils'); function getPaths (text) { - var isSamePageId = _.startsWith(text, '#'); - var isUriSchemaSupported = utils.isUriSchemaSupported(text); + const isSamePageId = _.startsWith(text, '#'); + const isUriSchemaSupported = utils.isUriSchemaSupported(text); if (isSamePageId || !isUriSchemaSupported) { return []; } return [text]; } -function HtmlCommonTag (text) { - this.text = text || ''; - this.paths = getPaths(this.text); -} +class HtmlCommonTag { + constructor (text) { + this.text = text || ''; + this.paths = getPaths(this.text); + } -HtmlCommonTag.prototype.getPaths = function getPaths () { - return this.paths; -}; + getPaths () { + return this.paths; + } -HtmlCommonTag.prototype.updateText = function updateText (pathsToUpdate) { - var pathToUpdate = _.find(pathsToUpdate, {oldPath: this.paths[0]}); - return pathToUpdate ? pathToUpdate.newPath : this.text; -}; + updateText (pathsToUpdate) { + const pathToUpdate = _.find(pathsToUpdate, {oldPath: this.paths[0]}); + return pathToUpdate ? pathToUpdate.newPath : this.text; + } +} module.exports = HtmlCommonTag; diff --git a/lib/resource-handler/path-containers/html-img-srcset-tag.js b/lib/resource-handler/path-containers/html-img-srcset-tag.js index 623ec37c..25d3b013 100644 --- a/lib/resource-handler/path-containers/html-img-srcset-tag.js +++ b/lib/resource-handler/path-containers/html-img-srcset-tag.js @@ -1,27 +1,29 @@ -var srcset = require('srcset'); -var _ = require('lodash'); +'use strict'; -function HtmlImgSrcSetTag (text) { - this.text = text || ''; - this.imgSrcsetParts = srcset.parse(this.text); - this.paths = this.imgSrcsetParts.map(function getPath (imgSrcset) { - return imgSrcset.url; - }); -} +const srcset = require('srcset'); +const _ = require('lodash'); + +class HtmlImgSrcSetTag { + constructor (text) { + this.text = text || ''; + this.imgSrcsetParts = srcset.parse(this.text); + this.paths = this.imgSrcsetParts.map(imgSrcset => imgSrcset.url); + } -HtmlImgSrcSetTag.prototype.getPaths = function getPaths () { - return this.paths; -}; + getPaths () { + return this.paths; + } -HtmlImgSrcSetTag.prototype.updateText = function updateText (pathsToUpdate) { - var imgSrcsetParts = this.imgSrcsetParts; - pathsToUpdate.forEach(function updatePath (path) { - var srcsToUpdate = _.filter(imgSrcsetParts, {url: path.oldPath}); - srcsToUpdate.forEach((srcToUpdate) => { - srcToUpdate.url = path.newPath; + updateText (pathsToUpdate) { + const imgSrcsetParts = this.imgSrcsetParts; + pathsToUpdate.forEach(function updatePath (path) { + const srcsToUpdate = _.filter(imgSrcsetParts, {url: path.oldPath}); + srcsToUpdate.forEach((srcToUpdate) => { + srcToUpdate.url = path.newPath; + }); }); - }); - return srcset.stringify(imgSrcsetParts); -}; + return srcset.stringify(imgSrcsetParts); + } +} module.exports = HtmlImgSrcSetTag; diff --git a/lib/scraper.js b/lib/scraper.js index c843e21b..3886e471 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -31,7 +31,7 @@ function Scraper (options) { self.options.recursiveSources = recursiveSources; if (self.options.recursive) { - self.options.sources = _.union(self.options.sources, self.options.recursiveSources); + self.options.sources = u.union(self.options.sources, self.options.recursiveSources); } logger.info('init with options', self.options); diff --git a/lib/utils/index.js b/lib/utils/index.js index 09a3d474..23e7ed44 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -1,15 +1,18 @@ -var url = require('url'); -var path = require('path'); -var Promise = require('bluebird'); -var normalizeUrl = require('normalize-url'); -var htmlEntities = require('he'); -var typeByMime = require('../config/resource-type-by-mime'); -var typeByExt = require('../config/resource-type-by-ext'); +'use strict'; -var logger = require('../logger'); +const url = require('url'); +const path = require('path'); +const Promise = require('bluebird'); +const normalizeUrl = require('normalize-url'); +const htmlEntities = require('he'); +const _ = require('lodash'); +const typeByMime = require('../config/resource-type-by-mime'); +const typeByExt = require('../config/resource-type-by-ext'); -var MAX_FILENAME_LENGTH = 255; -var IS_URL = /^((http[s]?:)?\/\/)/; +const logger = require('../logger'); + +const MAX_FILENAME_LENGTH = 255; +const IS_URL = /^((http[s]?:)?\/\/)/; function isUrl (path) { return IS_URL.test(path); @@ -136,6 +139,10 @@ function extend (first, second) { return Object.assign({}, first, second); } +function union (first, second) { + return _.unionWith(first, second, _.isEqual); +} + module.exports = { isUrl, getUrl, @@ -155,5 +162,6 @@ module.exports = { getTypeByFilename, decodeHtmlEntities, clone, - extend + extend, + union }; diff --git a/package.json b/package.json index abd3f3b9..6121ccd2 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "nock": "^9.0.2", "proxyquire": "^1.7.3", "should": "^11.1.0", - "sinon": "^2.1.0" + "sinon": "^2.2.0" }, "files": [ "index.js", diff --git a/test/functional/update-missing-sources/mocks/index.html b/test/functional/update-missing-sources/mocks/index.html new file mode 100644 index 00000000..6bd3ccb1 --- /dev/null +++ b/test/functional/update-missing-sources/mocks/index.html @@ -0,0 +1,12 @@ + + + + + Title + + + + + + + \ No newline at end of file diff --git a/test/functional/update-missing-sources/mocks/link1.html b/test/functional/update-missing-sources/mocks/link1.html new file mode 100644 index 00000000..90b61c96 --- /dev/null +++ b/test/functional/update-missing-sources/mocks/link1.html @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/test/functional/update-missing-sources/mocks/path-containers.html b/test/functional/update-missing-sources/mocks/path-containers.html new file mode 100644 index 00000000..2a9d7e93 --- /dev/null +++ b/test/functional/update-missing-sources/mocks/path-containers.html @@ -0,0 +1,12 @@ + + + + + + + + \ No newline at end of file diff --git a/test/functional/update-missing-sources/update-missing-sources.test.js b/test/functional/update-missing-sources/update-missing-sources.test.js new file mode 100644 index 00000000..0ff01433 --- /dev/null +++ b/test/functional/update-missing-sources/update-missing-sources.test.js @@ -0,0 +1,208 @@ +require('should'); +const nock = require('nock'); +const fs = require('fs-extra'); +const scrape = require('../../../index'); + +const testDirname = __dirname + '/.tmp'; +const mockDirname = __dirname + '/mocks'; + +describe('Functional: update missing sources', () => { + + beforeEach(() => { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + it('should not update missing sources if updateMissingSources = false', () => { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [{ selector: 'img', attr: 'src' }], + updateMissingSources: false + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + + + const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); + indexBody.should.containEql(' { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [{ selector: 'img', attr: 'src' }], + updateMissingSources: true + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + + + const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); + indexBody.should.containEql(' { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [ + { selector: 'img', attr: 'src' }, + { selector: 'script', attr: 'src' } + ], + updateMissingSources: [ + { selector: 'img', attr: 'src' } + ] + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE'); + nock('http://example.com/').get('/missing-script.js').replyWithError('COULDN\'T DOWNLOAD SCRIPT'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + fs.existsSync(testDirname + '/missing-script.js').should.be.eql(false); + + + const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); + indexBody.should.containEql(' { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [{ selector: 'img', attr: 'src' }], + updateMissingSources: true, + urlFilter: function (url) { + return url.indexOf('/missing-img.png') === -1; + } + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/missing-img.png').reply(200, 'ok'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + + + const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); + indexBody.should.containEql(' { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [], + recursive: true, + maxRecursiveDepth: 1, + updateMissingSources: true, + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/link1.html').replyWithFile(200, mockDirname + '/link1.html'); + nock('http://example.com/').get('/missing-link.html').reply(200, 'ok'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/link1.html').should.be.eql(true); + fs.existsSync(testDirname + '/missing-link.html').should.be.eql(false); + + + const link = fs.readFileSync(testDirname + '/link1.html').toString(); + link.should.containEql(' { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [{selector: 'style'}], + updateMissingSources: true, + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/path-containers.html'); + nock('http://example.com/').get('/a.png').reply(200, 'ok'); + nock('http://example.com/').get('/b.png').replyWithError('Failed!'); + nock('http://example.com/').get('/c.png').reply(200, 'ok'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/a.png').should.be.eql(true); + fs.existsSync(testDirname + '/b.png').should.be.eql(false); + fs.existsSync(testDirname + '/c.png').should.be.eql(true); + + + const index = fs.readFileSync(testDirname + '/index.html').toString(); + index.should.containEql(`.a { background: url('a.png') }`); + index.should.containEql(`.b { background: url('http://example.com/b.png') }`); + index.should.containEql(`.c { background: url('c.png') }`); + }); + }); + + it('should update all and download nothing', () => { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: null, + sources: [], + updateMissingSources: [ + { selector: 'img', attr: 'src' }, + { selector: 'script', attr: 'src' }, + { selector: 'a', attr: 'href' }, + ] + }; + + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/missing-img.png').reply(200, 'ok'); + nock('http://example.com/').get('/missing-script.js').reply(200, 'ok'); + nock('http://example.com/').get('/link1.html').reply(200, 'ok'); + + return scrape(options).then(() => { + fs.existsSync(testDirname + '/index.html').should.be.eql(true); + fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false); + fs.existsSync(testDirname + '/missing-script.js').should.be.eql(false); + fs.existsSync(testDirname + '/link1.html').should.be.eql(false); + + + const indexBody = fs.readFileSync(testDirname + '/index.html').toString(); + indexBody.should.containEql(' { + it('should call downloadChildrenResources and set returned text to resource', () => { + const downloadChildrenPaths = sinon.stub().resolves('updated text'); - return cssHandler.handle(originalResource).then(function (updatedResource) { + const originalResource = new Resource('http://example.com'); + const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths}); + + return cssHandler.handle(originalResource).then((updatedResource) => { should(updatedResource).be.equal(originalResource); should(updatedResource.getText()).be.eql('updated text'); }); diff --git a/test/unit/resource-handler/html.test.js b/test/unit/resource-handler/html.test.js index 4c895c23..cc575671 100644 --- a/test/unit/resource-handler/html.test.js +++ b/test/unit/resource-handler/html.test.js @@ -11,13 +11,68 @@ const HtmlCommonTag = require('../../../lib/resource-handler/path-containers/htm const CssText = require('../../../lib/resource-handler/path-containers/css-text'); describe('ResourceHandler: Html', () => { - let htmlHandler; + let downloadChildrenPaths, htmlHandler; beforeEach(() => { - htmlHandler = new HtmlHandler({ sources: [] }, sinon.stub().returns(Promise.resolve())); + downloadChildrenPaths = sinon.stub().usingPromise(Promise).resolves(); + }); + + describe('constructor', () => { + describe('sources', () => { + it('should initialize sources if updateMissingSources was not passed', () => { + const sources = [{ selector: 'img', attr: 'src'}]; + htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); + + htmlHandler.downloadSources.should.eql(sources); + htmlHandler.updateSources.should.eql([]); + htmlHandler.allSources.should.eql(sources); + }); + + it('should initialize sources if updateMissingSources = false', () => { + const sources = [{ selector: 'img', attr: 'src'}]; + htmlHandler = new HtmlHandler({sources, updateMissingSources: false}, {downloadChildrenPaths}); + + htmlHandler.downloadSources.should.eql(sources); + htmlHandler.updateSources.should.eql([]); + htmlHandler.allSources.should.eql(sources); + }); + + it('should initialize sources if updateMissingSources = true', () => { + const sources = [{ selector: 'img', attr: 'src'}]; + htmlHandler = new HtmlHandler({sources, updateMissingSources: true}, {downloadChildrenPaths}); + + htmlHandler.downloadSources.should.eql(sources); + htmlHandler.updateSources.should.eql(sources); + htmlHandler.allSources.should.eql(sources); + }); + + it('should initialize sources if updateMissingSources is array of sources', () => { + const sources = [{ selector: 'img', attr: 'src'}]; + const updateMissingSources = [{ selector: 'a', attr: 'href'}]; + htmlHandler = new HtmlHandler({sources, updateMissingSources}, {downloadChildrenPaths}); + + htmlHandler.downloadSources.should.eql(sources); + htmlHandler.updateSources.should.eql(updateMissingSources); + htmlHandler.allSources.should.eql([{ selector: 'img', attr: 'src'}, { selector: 'a', attr: 'href'}]); + }); + + it('should initialize sources without duplicates if updateMissingSources is array of sources', () => { + const sources = [{ selector: 'img', attr: 'src'}]; + const updateMissingSources = [{ selector: 'img', attr: 'src'}, { selector: 'a', attr: 'href'}]; + htmlHandler = new HtmlHandler({sources, updateMissingSources}, {downloadChildrenPaths}); + + htmlHandler.downloadSources.should.eql(sources); + htmlHandler.updateSources.should.eql(updateMissingSources); + htmlHandler.allSources.should.eql([{ selector: 'img', attr: 'src'}, { selector: 'a', attr: 'href'}]); + }); + }); }); describe(' tag', () => { + beforeEach(() => { + htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); + }); + it('should remove base tag from text and update resource url for absolute href', () => { const html = ` @@ -74,6 +129,7 @@ describe('ResourceHandler: Html', () => { }); it('should not encode text to html entities', () => { + htmlHandler = new HtmlHandler({ sources: [] }, {downloadChildrenPaths}); const html = ` @@ -90,8 +146,9 @@ describe('ResourceHandler: Html', () => { }); }); - it('should call handleChildrenResources for each source', () => { - htmlHandler.options.sources.push({ selector: 'img', attr: 'src' }); + it('should call downloadChildrenResources for each source', () => { + const sources = [{ selector: 'img', attr: 'src' }]; + htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); const html = ` @@ -108,12 +165,13 @@ describe('ResourceHandler: Html', () => { resource.setText(html); return htmlHandler.handle(resource).then(() =>{ - htmlHandler.handleChildrenPaths.calledThrice.should.be.eql(true); + htmlHandler.downloadChildrenPaths.calledThrice.should.be.eql(true); }); }); - it('should not call handleChildrenResources if source attr is empty', () =>{ - htmlHandler.options.sources.push({ selector: 'img', attr: 'src' }); + it('should not call downloadChildrenResources if source attr is empty', () =>{ + const sources = [{ selector: 'img', attr: 'src' }]; + htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); const html = ` @@ -126,14 +184,17 @@ describe('ResourceHandler: Html', () => { resource.setText(html); return htmlHandler.handle(resource).then(() =>{ - htmlHandler.handleChildrenPaths.called.should.be.eql(false); + htmlHandler.downloadChildrenPaths.called.should.be.eql(false); }); }); - it('should use correct path containers based on tag', () =>{ - htmlHandler.options.sources.push({ selector: 'img', attr: 'src' }); - htmlHandler.options.sources.push({ selector: 'img', attr: 'srcset' }); - htmlHandler.options.sources.push({ selector: '.styled', attr: 'style' }); + it('should use correct path containers based on tag', () => { + const sources = [ + { selector: 'img', attr: 'src' }, + { selector: 'img', attr: 'srcset' }, + { selector: '.styled', attr: 'style' } + ]; + htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); const html = ` @@ -150,15 +211,18 @@ describe('ResourceHandler: Html', () => { resource.setText(html); return htmlHandler.handle(resource).then(() =>{ - htmlHandler.handleChildrenPaths.calledThrice.should.be.eql(true); - htmlHandler.handleChildrenPaths.args[0][0].should.be.instanceOf(HtmlCommonTag); - htmlHandler.handleChildrenPaths.args[1][0].should.be.instanceOf(HtmlImgSrcsetTag); - htmlHandler.handleChildrenPaths.args[2][0].should.be.instanceOf(CssText); + htmlHandler.downloadChildrenPaths.calledThrice.should.be.eql(true); + htmlHandler.downloadChildrenPaths.args[0][0].should.be.instanceOf(HtmlCommonTag); + htmlHandler.downloadChildrenPaths.args[1][0].should.be.instanceOf(HtmlImgSrcsetTag); + htmlHandler.downloadChildrenPaths.args[2][0].should.be.instanceOf(CssText); }); }); it('should remove SRI check for loaded resources', () => { - htmlHandler.options.sources.push({ selector: 'script', attr: 'src' }); + const sources = [ + { selector: 'script', attr: 'src'} + ]; + htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths}); const html = ` diff --git a/test/unit/resource-handler/index.test.js b/test/unit/resource-handler/index.test.js index 1efb1ad4..3d961ff2 100644 --- a/test/unit/resource-handler/index.test.js +++ b/test/unit/resource-handler/index.test.js @@ -1,9 +1,11 @@ -var should = require('should'); -var sinon = require('sinon'); -var Promise = require('bluebird'); -var proxyquire = require('proxyquire'); -var Resource = require('../../../lib/resource'); -var ResourceHandler = require('../../../lib/resource-handler'); +'use strict'; + +const should = require('should'); +const sinon = require('sinon'); +const Promise = require('bluebird'); +const proxyquire = require('proxyquire'); +const Resource = require('../../../lib/resource'); +const ResourceHandler = require('../../../lib/resource-handler'); describe('ResourceHandler', function() { describe('constructor', function() { @@ -38,7 +40,7 @@ describe('ResourceHandler', function() { './css': cssHandlerStub }); - var handleChildResStub = sinon.stub(ResourceHandler.prototype, 'handleChildrenResources').returns(Promise.resolve()); + var handleChildResStub = sinon.stub(ResourceHandler.prototype, 'downloadChildrenResources').usingPromise(Promise).resolves(); var options = { defaultFilename: 'test' }; var context = { dummy: 'context' }; @@ -50,19 +52,19 @@ describe('ResourceHandler', function() { htmlHandlerStub.calledOnce.should.be.eql(true); htmlHandlerStub.args[0][0].should.be.eql(options); - htmlHandlerStub.args[0][1](); + htmlHandlerStub.args[0][1].downloadChildrenPaths(); handleChildResStub.calledOnce.should.be.eql(true); cssHandlerStub.calledOnce.should.be.eql(true); cssHandlerStub.args[0][0].should.be.eql(options); - cssHandlerStub.args[0][1](); + cssHandlerStub.args[0][1].downloadChildrenPaths(); handleChildResStub.calledTwice.should.be.eql(true); handleChildResStub.restore(); }); }); - describe('#getResourceHandler', function() { + describe('#getResourceHandler', () => { var resourceHandler; beforeEach(function() { @@ -94,7 +96,7 @@ describe('ResourceHandler', function() { }); }); - describe('#handleResource', function() { + describe('#handleResource', () => { var resHandler; beforeEach(function() { @@ -128,10 +130,10 @@ describe('ResourceHandler', function() { }); }); - describe('#handleChildrenResources', function () { - var pathContainer, parentResource, scraperContext, resHandler; + describe('#downloadChildrenResources', () => { + let pathContainer, parentResource, scraperContext, resHandler; - beforeEach(function () { + beforeEach(() => { pathContainer = {}; pathContainer.getPaths = sinon.stub(); pathContainer.updateText = sinon.stub(); @@ -146,29 +148,29 @@ describe('ResourceHandler', function() { resHandler = new ResourceHandler({defaultFilename: 'index.html'}, scraperContext); }); - it('should not call requestResource if no paths in text', function () { + it('should not call requestResource if no paths in text', () => { pathContainer.getPaths = sinon.stub().returns([]); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { scraperContext.requestResource.called.should.be.eql(false); }); }); - it('should call requestResource once with correct params', function () { + it('should call requestResource once with correct params', () => { pathContainer.getPaths.returns(['test.png']); parentResource.getUrl = sinon.stub().returns('http://test.com'); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { scraperContext.requestResource.calledOnce.should.be.eql(true); scraperContext.requestResource.args[0][0].url.should.be.eql('http://test.com/test.png'); }); }); - it('should call requestResource for each found source with correct params', function () { + it('should call requestResource for each found source with correct params', () => { pathContainer.getPaths.returns(['a.jpg', 'b.jpg', 'c.jpg']); parentResource.getUrl = sinon.stub().returns('http://test.com'); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { scraperContext.requestResource.calledThrice.should.be.eql(true); scraperContext.requestResource.args[0][0].url.should.be.eql('http://test.com/a.jpg'); scraperContext.requestResource.args[1][0].url.should.be.eql('http://test.com/b.jpg'); @@ -176,7 +178,7 @@ describe('ResourceHandler', function() { }); }); - it('should update paths in text with local files returned by requestResource', function () { + it('should update paths in text with local files returned by requestResource', () => { pathContainer.getPaths.returns([ 'http://first.com/img/a.jpg', 'http://first.com/b.jpg', @@ -189,7 +191,7 @@ describe('ResourceHandler', function() { var updateChildSpy = sinon.spy(parentResource, 'updateChild'); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { var updateTextStub = pathContainer.updateText; updateTextStub.calledOnce.should.be.eql(true); updateTextStub.args[0][0].length.should.be.eql(3); @@ -209,7 +211,7 @@ describe('ResourceHandler', function() { }); }); - it('should not update paths in text, for which requestResource returned null', function () { + it('should not update paths in text, for which requestResource returned null', () => { pathContainer.getPaths.returns([ 'http://first.com/img/a.jpg', 'http://first.com/b.jpg', @@ -222,7 +224,7 @@ describe('ResourceHandler', function() { var updateChildSpy = sinon.spy(parentResource, 'updateChild'); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { var updateTextStub = pathContainer.updateText; updateTextStub.calledOnce.should.be.eql(true); updateTextStub.args[0][0].length.should.be.eql(1); @@ -234,7 +236,7 @@ describe('ResourceHandler', function() { }); }); - it('should wait for all children promises fulfilled and then return updated text', function () { + it('should wait for all children promises fulfilled and then return updated text', () => { pathContainer.getPaths.returns([ 'http://first.com/img/a.jpg', 'http://first.com/b.jpg', @@ -247,12 +249,12 @@ describe('ResourceHandler', function() { scraperContext.requestResource.onSecondCall().returns(Promise.resolve(null)); scraperContext.requestResource.onThirdCall().returns(Promise.reject(new Error('some error'))); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function (updatedText) { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function (updatedText) { updatedText.should.be.eql('UPDATED TEXT'); }); }); - describe('hash in urls', function () { + describe('hash in urls', () => { it('should keep hash in urls', function () { var resourceStub = new Resource('http://example.com/page1.html', 'local/page1.html'); sinon.stub(resourceStub, 'getType').returns('html'); @@ -260,7 +262,7 @@ describe('ResourceHandler', function() { pathContainer.getPaths.returns(['http://example.com/page1.html#hash']); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { var updateTextStub = pathContainer.updateText; updateTextStub.calledOnce.should.be.eql(true); updateTextStub.args[0][0].length.should.be.eql(1); @@ -272,7 +274,7 @@ describe('ResourceHandler', function() { }); }); - describe('prettifyUrls', function () { + describe('prettifyUrls', () => { it('should not prettifyUrls by default', function() { var resourceStub = new Resource('http://example.com/other-page/index.html', 'other-page/index.html'); scraperContext.requestResource.onFirstCall().returns(Promise.resolve(resourceStub)); @@ -280,7 +282,7 @@ describe('ResourceHandler', function() { pathContainer.getPaths.returns(['http://example.com/other-page/index.html']); - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { var updateTextStub = pathContainer.updateText; updateTextStub.calledOnce.should.be.eql(true); updateTextStub.args[0][0].length.should.be.eql(1); @@ -291,14 +293,14 @@ describe('ResourceHandler', function() { }); }); - it('should prettifyUrls if specified', function() { + it('should prettifyUrls if specified', () => { var resourceStub = new Resource('http://example.com/other-page/index.html', 'other-page/index.html'); scraperContext.requestResource.onFirstCall().returns(Promise.resolve(resourceStub)); pathContainer.getPaths.returns(['http://example.com/other-page/index.html']); resHandler.options.prettifyUrls = true; - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { + return resHandler.downloadChildrenResources(pathContainer, parentResource).then(function () { var updateTextStub = pathContainer.updateText; updateTextStub.calledOnce.should.be.eql(true); updateTextStub.args[0][0].length.should.be.eql(1); @@ -309,6 +311,90 @@ describe('ResourceHandler', function() { }); }); }); + + describe('updateMissingSources', () => { + beforeEach(() => { + pathContainer.getPaths.returns([ + '/success.png', + '/failed.png' + ]); + scraperContext.requestResource.onFirstCall().returns(Promise.resolve(new Resource('http://example.com/success.png', 'local/success.png'))); + scraperContext.requestResource.onSecondCall().returns(Promise.resolve(null)); + }); + + it('should update missing path with absolute url if updateIfFailed = true', () => { + const updateIfFailed = true; + + return resHandler.downloadChildrenResources(pathContainer, parentResource, updateIfFailed).then(() => { + pathContainer.updateText.calledOnce.should.be.eql(true); + pathContainer.updateText.args[0][0].length.should.be.eql(2); + pathContainer.updateText.args[0][0].should.containEql({ + oldPath: '/success.png', + newPath: 'local/success.png' + }); + pathContainer.updateText.args[0][0].should.containEql({ + oldPath: '/failed.png', + newPath: 'http://example.com/failed.png' + }); + }); + }); + + it('should not update missing path with absolute url if updateIfFailed = false', () => { + const updateIfFailed = false; + + return resHandler.downloadChildrenResources(pathContainer, parentResource, updateIfFailed).then(() => { + pathContainer.updateText.calledOnce.should.be.eql(true); + pathContainer.updateText.args[0][0].length.should.be.eql(1); + pathContainer.updateText.args[0][0].should.containEql({ + oldPath: '/success.png', + newPath: 'local/success.png' + }); + }); + }) + }); }); + describe('#updateChildrenResources', () => { + describe('updateMissingSources', () => { + let pathContainer, parentResource, resHandler; + + beforeEach(() => { + pathContainer = {}; + pathContainer.getPaths = sinon.stub().returns([ + '/failed1.png', + '/failed2.png' + ]); + pathContainer.updateText = sinon.stub(); + parentResource = new Resource('http://example.com', 'index.html'); + resHandler = new ResourceHandler({}); + }); + + it('should update all paths with absolute urls if updateIfFailed = true', () => { + const updateIfFailed = true; + + return resHandler.updateChildrenResources(pathContainer, parentResource, updateIfFailed).then(() => { + pathContainer.updateText.calledOnce.should.be.eql(true); + pathContainer.updateText.args[0][0].length.should.be.eql(2); + pathContainer.updateText.args[0][0].should.containEql({ + oldPath: '/failed1.png', + newPath: 'http://example.com/failed1.png' + }); + pathContainer.updateText.args[0][0].should.containEql({ + oldPath: '/failed2.png', + newPath: 'http://example.com/failed2.png' + }); + }); + }); + + it('should not update paths if updateIfFailed = false', () => { + const updateIfFailed = false; + + return resHandler.updateChildrenResources(pathContainer, parentResource, updateIfFailed).then(() => { + pathContainer.updateText.calledOnce.should.be.eql(true); + pathContainer.updateText.args[0][0].length.should.be.eql(0); + }); + }) + }); + + }); });