diff --git a/README.md b/README.md index cdefb46a..63e74738 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,7 @@ scrape(options, (error, result) => { * [resourceSaver](#resourcesaver) - customize resources saving * [onResourceSaved](#onresourcesaved) - callback called when resource is saved * [onResourceError](#onresourceerror) - callback called when resource's downloading is failed +* [updateMissingSources](#updatemissingsources) - update urls of missing sources with absolute urls Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`. @@ -145,7 +146,7 @@ String, filename for index page. Defaults to `index.html`. Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`. #### ignoreErrors -Boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error. Defaults to `true`. +Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`. #### urlFilter Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied. @@ -253,6 +254,30 @@ scrape({ }) ``` +#### updateMissingSources +Boolean, if `true` scraper will set absolute urls for all failing `sources`, if `false` - it will leave them as is (which may cause the page to be displayed incorrectly). +Can also be an array of `sources` to update (structure is similar to [sources](#sources)). +Defaults to `false`. 
+```javascript +// update all failing img srcs with absolute url +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + sources: [{selector: 'img', attr: 'src'}], + updateMissingSources: true +}); + +// download nothing, just update all img srcs with absolute urls +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + sources: [], + updateMissingSources: [{selector: 'img', attr: 'src'}] +}); + +``` + + ## callback Callback function, optional, includes following parameters: - `error`: if error - `Error` object, if success - `null` diff --git a/lib/config/defaults.js b/lib/config/defaults.js index f00f3f7a..dc1e337e 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -56,7 +56,8 @@ const config = { httpResponseHandler: null, onResourceSaved: null, onResourceError: null, - resourceSaver: null + resourceSaver: null, + updateMissingSources: false }; module.exports = config; diff --git a/lib/resource-handler/css/index.js b/lib/resource-handler/css/index.js index 09a05e0c..b88865a5 100644 --- a/lib/resource-handler/css/index.js +++ b/lib/resource-handler/css/index.js @@ -1,16 +1,21 @@ -var CssText = require('./../path-containers/css-text'); +'use strict'; -function CssResourceHandler (options, handleChildrenPaths) { - this.options = options; - this.handleChildrenPaths = handleChildrenPaths; -} +const CssText = require('./../path-containers/css-text'); + +class CssResourceHandler { + constructor (options, methods) { + this.options = options; + this.downloadChildrenPaths = methods.downloadChildrenPaths; + this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources); + } -CssResourceHandler.prototype.handle = function handle (resource) { - var pathContainer = new CssText(resource.getText()); - return this.handleChildrenPaths(pathContainer, resource).then(function updateText (updatedText) { - resource.setText(updatedText); - return resource; - }); -}; + handle 
(resource) { + const pathContainer = new CssText(resource.getText()); + return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) { + resource.setText(updatedText); + return resource; + }); + } +} module.exports = CssResourceHandler; diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index 0b0f32d5..e781a238 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -7,17 +7,30 @@ const logger = require('../../logger'); const HtmlSourceElement = require('./html-source-element'); class HtmlResourceHandler { - constructor (options, handleChildrenPaths) { + constructor (options, methods) { this.options = options; - this.handleChildrenPaths = handleChildrenPaths; + this.downloadChildrenPaths = methods.downloadChildrenPaths; + this.updateChildrenPaths = methods.updateChildrenPaths; + + this.recursiveSources = this.options.recursiveSources || []; + this.downloadSources = this.options.sources; + this.updateSources = []; + + if (this.options.updateMissingSources === true) { + this.updateSources = this.downloadSources; + } else if (Array.isArray(this.options.updateMissingSources)) { + this.updateSources = this.options.updateMissingSources; + } + + this.allSources = utils.union(this.downloadSources, this.updateSources); } handle (resource) { const $ = loadTextToCheerio(resource.getText()); prepareToLoad($, resource); - return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource)) - .then(function updateResource () { + return Promise.mapSeries(this.allSources, this.loadResourcesForRule.bind(this, $, resource)) + .then(() => { resource.setText($.html()); return resource; }); @@ -27,31 +40,53 @@ class HtmlResourceHandler { const self = this; const promises = $(rule.selector).map((i, element) => { const el = new HtmlSourceElement($(element), rule); + const pathContainer = el.getPathContainer(); - const 
isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources)); - const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth; - if (isRecursive && isDepthGreaterThanMax) { + if (!pathContainer) { + return Promise.resolve(null); + } + + const needToDownloadElement = this.needToDownload(el); + const needToUpdateElement = this.needToUpdate(el); + + if (this.exceedMaxRecursiveDepth(el, parentResource)) { logger.debug(`filtering out ${el} by max recursive depth`); - return Promise.resolve(); + return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el)); } - const pathContainer = el.getPathContainer(); - if (!pathContainer) { - return Promise.resolve(); + if (!needToDownloadElement) { + return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el)); } - return self.handleChildrenPaths(pathContainer, parentResource).then((updatedText) => { - el.setData(updatedText); - el.removeIntegrityCheck(); - }); + + return self.downloadChildrenPaths(pathContainer, parentResource, needToUpdateElement) + .then((updatedText) => { + el.setData(updatedText); + el.removeIntegrityCheck(); + }); + }).get(); return utils.waitAllFulfilled(promises); } + + exceedMaxRecursiveDepth (el, parentResource) { + const isRecursive = Boolean(el.findMatchedRule(this.recursiveSources)); + const isDepthGreaterThanMax = this.options.maxRecursiveDepth && parentResource.getDepth() >= this.options.maxRecursiveDepth; + return isRecursive && isDepthGreaterThanMax; + } + + needToDownload (el) { + return Boolean(el.findMatchedRule(this.downloadSources)); + } + + needToUpdate (el) { + return Boolean(el.findMatchedRule(this.updateSources)); + } } function prepareToLoad ($, resource) { - $('base').each(function handleBaseTag () { - const el = $(this); + $('base').each((i, element) => { + const el = $(element); const 
href = el.attr('href'); if (href) { const newUrl = utils.getUrl(resource.getUrl(), href); diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js index 7ec3a8e9..d149d22c 100644 --- a/lib/resource-handler/index.js +++ b/lib/resource-handler/index.js @@ -1,82 +1,109 @@ -var _ = require('lodash'); -var Promise = require('bluebird'); -var logger = require('../logger'); -var utils = require('../utils'); +'use strict'; -var HtmlHandler = require('./html'); -var CssHandler = require('./css'); +const _ = require('lodash'); +const Promise = require('bluebird'); +const logger = require('../logger'); +const utils = require('../utils'); -var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename']; +const HtmlHandler = require('./html'); +const CssHandler = require('./css'); -function ResourceHandler (options, context) { - this.options = _.pick(options, supportedOptions); - this.context = context; +const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources']; - this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this)); - this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this)); -} +class ResourceHandler { + constructor (options, context) { + this.options = _.pick(options, supportedOptions); + this.context = context; + + const methods = { + downloadChildrenPaths: this.downloadChildrenResources.bind(this), + updateChildrenPaths: this.updateChildrenResources.bind(this) + }; -ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) { - switch (true) { - case resource.isCss(): - logger.debug('using css handler for ' + resource); - return this.cssHandler; - case resource.isHtml(): - logger.debug('using html handler for ' + resource); - return this.htmlHandler; - default: - logger.debug('using no handler for ' + resource); - return null; + 
this.htmlHandler = new HtmlHandler(this.options, methods); + this.cssHandler = new CssHandler(this.options, methods); } -}; - -/** - * Request all resources from pathContainers paths - * @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources - * @param {Resource} parentResource - * @returns {Promise} - resolved when all resources from pathContainer were requested - * and original paths in parentResource were updated with local paths for children resources - */ -ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource) { - var self = this; - var childrenPaths = pathContainer.getPaths(); - var pathsToUpdate = []; - - var childrenPromises = childrenPaths.map(function loadChildPath (childPath) { - var childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath); - var childResource = parentResource.createChild(childResourceUrl); - - return self.context.requestResource(childResource).then(function updateChildPath (respondedResource) { - if (respondedResource) { - parentResource.updateChild(childResource, respondedResource); - - var relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename()); - if (self.options.prettifyUrls) { - relativePath = relativePath.replace(self.options.defaultFilename, ''); - } - var hash = utils.getHashFromUrl(childPath); - if (hash) { - relativePath = relativePath.concat(hash); + getResourceHandler (resource) { + switch (true) { + case resource.isCss(): + logger.debug('using css handler for ' + resource); + return this.cssHandler; + case resource.isHtml(): + logger.debug('using html handler for ' + resource); + return this.htmlHandler; + default: + logger.debug('using no handler for ' + resource); + return null; + } + } + + /** + * Request all resources from pathContainers paths + * @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for 
resources + * @param {Resource} parentResource + * @param {boolean} updateIfFailed - if true - failed resources will be updated with absolute links + * @returns {Promise} - resolved when all resources from pathContainer were requested + * and original paths in parentResource were updated with local paths for children resources + */ + downloadChildrenResources (pathContainer, parentResource, updateIfFailed) { + const self = this; + const childrenPaths = pathContainer.getPaths(); + const pathsToUpdate = []; + + const childrenPromises = childrenPaths.map((childPath) => { + const childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath); + const childResource = parentResource.createChild(childResourceUrl); + + return self.context.requestResource(childResource).then((respondedResource) => { + if (respondedResource) { + parentResource.updateChild(childResource, respondedResource); + + let relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename()); + if (self.options.prettifyUrls) { + relativePath = relativePath.replace(self.options.defaultFilename, ''); + } + const hash = utils.getHashFromUrl(childPath); + + if (hash) { + relativePath = relativePath.concat(hash); + } + + pathsToUpdate.push({ oldPath: childPath, newPath: relativePath}); + } else { + if (updateIfFailed) { + pathsToUpdate.push({ oldPath: childPath, newPath: childResourceUrl}); + } } + return null; // Prevent Bluebird warnings + }); + }); - pathsToUpdate.push({ oldPath: childPath, newPath: relativePath}); - } - return null; // Prevent Bluebird warnings + return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () { + return pathContainer.updateText(pathsToUpdate); }); - }); + } - return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () { - return pathContainer.updateText(pathsToUpdate); - }); -}; + updateChildrenResources (pathContainer, parentResource, needToUpdate) { + if (!needToUpdate) { + return 
Promise.resolve(pathContainer.updateText([])); + } + const parentUrl = parentResource.getUrl(); + const pathsToUpdate = []; + pathContainer.getPaths().forEach((path) => { + const childAbsoluteUrl = utils.getUrl(parentUrl, path); + pathsToUpdate.push({ oldPath: path, newPath: childAbsoluteUrl }); + }); + return Promise.resolve(pathContainer.updateText(pathsToUpdate)); + } -ResourceHandler.prototype.handleResource = function handleResource (resource) { - var resourceHandler = this.getResourceHandler(resource); - if (resourceHandler && resourceHandler.handle) { - return resourceHandler.handle(resource); + handleResource (resource) { + const resourceHandler = this.getResourceHandler(resource); + if (resourceHandler && resourceHandler.handle) { + return resourceHandler.handle(resource); + } + return Promise.resolve(resource); } - return Promise.resolve(resource); -}; +} module.exports = ResourceHandler; diff --git a/lib/resource-handler/path-containers/css-text.js b/lib/resource-handler/path-containers/css-text.js index c2165c39..9888c4a1 100644 --- a/lib/resource-handler/path-containers/css-text.js +++ b/lib/resource-handler/path-containers/css-text.js @@ -1,33 +1,38 @@ -var getCssUrls = require('css-url-parser'); -var _ = require('lodash'); -var format = require('util').format; +'use strict'; + +const getCssUrls = require('css-url-parser'); +const _ = require('lodash'); +const format = require('util').format; function changeExactlyMatchedUrl (text, oldUrl, newUrl) { // starts with ' " ( ends with ' " ) - var exactlyMatchedPattern = format('([\'"\\(\\s])%s([\'"\\)\\s])', _.escapeRegExp(oldUrl)); - var exactlyMatchedRegexp = new RegExp(exactlyMatchedPattern, 'g'); + const exactlyMatchedPattern = format('([\'"\\(\\s])%s([\'"\\)\\s])', _.escapeRegExp(oldUrl)); + const exactlyMatchedRegexp = new RegExp(exactlyMatchedPattern, 'g'); text = text.replace(exactlyMatchedRegexp, function changeUrl (match, g1, g2) { return g1 + newUrl + g2; }); return text; } -function CssText 
(text) { - this.text = text || ''; - this.paths = getCssUrls(this.text); -} +class CssText { + constructor (text) { + this.text = text || ''; + this.paths = getCssUrls(this.text); + } -CssText.prototype.getPaths = function getPaths () { - return this.paths; -}; + getPaths () { + return this.paths; + } + + updateText (pathsToUpdate) { + let updatedText = this.text; + pathsToUpdate.forEach(function updatePath (path) { + updatedText = changeExactlyMatchedUrl(updatedText, path.oldPath, path.newPath); + }); + return updatedText; + } +} -CssText.prototype.updateText = function updateText (pathsToUpdate) { - var updatedText = this.text; - pathsToUpdate.forEach(function updatePath (path) { - updatedText = changeExactlyMatchedUrl(updatedText, path.oldPath, path.newPath); - }); - return updatedText; -}; module.exports = CssText; diff --git a/lib/resource-handler/path-containers/html-common-tag.js b/lib/resource-handler/path-containers/html-common-tag.js index dcae29e2..bdf630e5 100644 --- a/lib/resource-handler/path-containers/html-common-tag.js +++ b/lib/resource-handler/path-containers/html-common-tag.js @@ -1,28 +1,32 @@ -var _ = require('lodash'); -var utils = require('../../utils'); +'use strict'; + +const _ = require('lodash'); +const utils = require('../../utils'); function getPaths (text) { - var isSamePageId = _.startsWith(text, '#'); - var isUriSchemaSupported = utils.isUriSchemaSupported(text); + const isSamePageId = _.startsWith(text, '#'); + const isUriSchemaSupported = utils.isUriSchemaSupported(text); if (isSamePageId || !isUriSchemaSupported) { return []; } return [text]; } -function HtmlCommonTag (text) { - this.text = text || ''; - this.paths = getPaths(this.text); -} +class HtmlCommonTag { + constructor (text) { + this.text = text || ''; + this.paths = getPaths(this.text); + } -HtmlCommonTag.prototype.getPaths = function getPaths () { - return this.paths; -}; + getPaths () { + return this.paths; + } -HtmlCommonTag.prototype.updateText = function updateText 
(pathsToUpdate) { - var pathToUpdate = _.find(pathsToUpdate, {oldPath: this.paths[0]}); - return pathToUpdate ? pathToUpdate.newPath : this.text; -}; + updateText (pathsToUpdate) { + const pathToUpdate = _.find(pathsToUpdate, {oldPath: this.paths[0]}); + return pathToUpdate ? pathToUpdate.newPath : this.text; + } +} module.exports = HtmlCommonTag; diff --git a/lib/resource-handler/path-containers/html-img-srcset-tag.js b/lib/resource-handler/path-containers/html-img-srcset-tag.js index 623ec37c..25d3b013 100644 --- a/lib/resource-handler/path-containers/html-img-srcset-tag.js +++ b/lib/resource-handler/path-containers/html-img-srcset-tag.js @@ -1,27 +1,29 @@ -var srcset = require('srcset'); -var _ = require('lodash'); +'use strict'; -function HtmlImgSrcSetTag (text) { - this.text = text || ''; - this.imgSrcsetParts = srcset.parse(this.text); - this.paths = this.imgSrcsetParts.map(function getPath (imgSrcset) { - return imgSrcset.url; - }); -} +const srcset = require('srcset'); +const _ = require('lodash'); + +class HtmlImgSrcSetTag { + constructor (text) { + this.text = text || ''; + this.imgSrcsetParts = srcset.parse(this.text); + this.paths = this.imgSrcsetParts.map(imgSrcset => imgSrcset.url); + } -HtmlImgSrcSetTag.prototype.getPaths = function getPaths () { - return this.paths; -}; + getPaths () { + return this.paths; + } -HtmlImgSrcSetTag.prototype.updateText = function updateText (pathsToUpdate) { - var imgSrcsetParts = this.imgSrcsetParts; - pathsToUpdate.forEach(function updatePath (path) { - var srcsToUpdate = _.filter(imgSrcsetParts, {url: path.oldPath}); - srcsToUpdate.forEach((srcToUpdate) => { - srcToUpdate.url = path.newPath; + updateText (pathsToUpdate) { + const imgSrcsetParts = this.imgSrcsetParts; + pathsToUpdate.forEach(function updatePath (path) { + const srcsToUpdate = _.filter(imgSrcsetParts, {url: path.oldPath}); + srcsToUpdate.forEach((srcToUpdate) => { + srcToUpdate.url = path.newPath; + }); }); - }); - return 
srcset.stringify(imgSrcsetParts); -}; + return srcset.stringify(imgSrcsetParts); + } +} module.exports = HtmlImgSrcSetTag; diff --git a/lib/scraper.js b/lib/scraper.js index c843e21b..3886e471 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -31,7 +31,7 @@ function Scraper (options) { self.options.recursiveSources = recursiveSources; if (self.options.recursive) { - self.options.sources = _.union(self.options.sources, self.options.recursiveSources); + self.options.sources = u.union(self.options.sources, self.options.recursiveSources); } logger.info('init with options', self.options); diff --git a/lib/utils/index.js b/lib/utils/index.js index 09a3d474..23e7ed44 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -1,15 +1,18 @@ -var url = require('url'); -var path = require('path'); -var Promise = require('bluebird'); -var normalizeUrl = require('normalize-url'); -var htmlEntities = require('he'); -var typeByMime = require('../config/resource-type-by-mime'); -var typeByExt = require('../config/resource-type-by-ext'); +'use strict'; -var logger = require('../logger'); +const url = require('url'); +const path = require('path'); +const Promise = require('bluebird'); +const normalizeUrl = require('normalize-url'); +const htmlEntities = require('he'); +const _ = require('lodash'); +const typeByMime = require('../config/resource-type-by-mime'); +const typeByExt = require('../config/resource-type-by-ext'); -var MAX_FILENAME_LENGTH = 255; -var IS_URL = /^((http[s]?:)?\/\/)/; +const logger = require('../logger'); + +const MAX_FILENAME_LENGTH = 255; +const IS_URL = /^((http[s]?:)?\/\/)/; function isUrl (path) { return IS_URL.test(path); @@ -136,6 +139,10 @@ function extend (first, second) { return Object.assign({}, first, second); } +function union (first, second) { + return _.unionWith(first, second, _.isEqual); +} + module.exports = { isUrl, getUrl, @@ -155,5 +162,6 @@ module.exports = { getTypeByFilename, decodeHtmlEntities, clone, - extend + extend, + union }; 
diff --git a/package.json b/package.json index abd3f3b9..6121ccd2 100644 --- a/package.json +++ b/package.json @@ -53,7 +53,7 @@ "nock": "^9.0.2", "proxyquire": "^1.7.3", "should": "^11.1.0", - "sinon": "^2.1.0" + "sinon": "^2.2.0" }, "files": [ "index.js", diff --git a/test/functional/update-missing-sources/mocks/index.html b/test/functional/update-missing-sources/mocks/index.html new file mode 100644 index 00000000..6bd3ccb1 --- /dev/null +++ b/test/functional/update-missing-sources/mocks/index.html @@ -0,0 +1,12 @@ + + +
+ +
+
+
+
+
\ No newline at end of file
diff --git a/test/functional/update-missing-sources/mocks/link1.html b/test/functional/update-missing-sources/mocks/link1.html
new file mode 100644
index 00000000..90b61c96
--- /dev/null
+++ b/test/functional/update-missing-sources/mocks/link1.html
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/functional/update-missing-sources/mocks/path-containers.html b/test/functional/update-missing-sources/mocks/path-containers.html
new file mode 100644
index 00000000..2a9d7e93
--- /dev/null
+++ b/test/functional/update-missing-sources/mocks/path-containers.html
@@ -0,0 +1,12 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/functional/update-missing-sources/update-missing-sources.test.js b/test/functional/update-missing-sources/update-missing-sources.test.js
new file mode 100644
index 00000000..0ff01433
--- /dev/null
+++ b/test/functional/update-missing-sources/update-missing-sources.test.js
@@ -0,0 +1,208 @@
+require('should');
+const nock = require('nock');
+const fs = require('fs-extra');
+const scrape = require('../../../index');
+
+const testDirname = __dirname + '/.tmp';
+const mockDirname = __dirname + '/mocks';
+
+describe('Functional: update missing sources', () => {
+
+ beforeEach(() => {
+ nock.cleanAll();
+ nock.disableNetConnect();
+ });
+
+ afterEach(() => {
+ nock.cleanAll();
+ nock.enableNetConnect();
+ fs.removeSync(testDirname);
+ });
+
+ it('should not update missing sources if updateMissingSources = false', () => {
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [{ selector: 'img', attr: 'src' }],
+ updateMissingSources: false
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false);
+
+
+ const indexBody = fs.readFileSync(testDirname + '/index.html').toString();
+ indexBody.should.containEql('
{
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [{ selector: 'img', attr: 'src' }],
+ updateMissingSources: true
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false);
+
+
+ const indexBody = fs.readFileSync(testDirname + '/index.html').toString();
+ indexBody.should.containEql('
{
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [
+ { selector: 'img', attr: 'src' },
+ { selector: 'script', attr: 'src' }
+ ],
+ updateMissingSources: [
+ { selector: 'img', attr: 'src' }
+ ]
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/missing-img.png').replyWithError('COULDN\'T DOWNLOAD IMAGE');
+ nock('http://example.com/').get('/missing-script.js').replyWithError('COULDN\'T DOWNLOAD SCRIPT');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false);
+ fs.existsSync(testDirname + '/missing-script.js').should.be.eql(false);
+
+
+ const indexBody = fs.readFileSync(testDirname + '/index.html').toString();
+ indexBody.should.containEql('
{
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [{ selector: 'img', attr: 'src' }],
+ updateMissingSources: true,
+ urlFilter: function (url) {
+ return url.indexOf('/missing-img.png') === -1;
+ }
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/missing-img.png').reply(200, 'ok');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false);
+
+
+ const indexBody = fs.readFileSync(testDirname + '/index.html').toString();
+ indexBody.should.containEql('
{
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [],
+ recursive: true,
+ maxRecursiveDepth: 1,
+ updateMissingSources: true,
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/link1.html').replyWithFile(200, mockDirname + '/link1.html');
+ nock('http://example.com/').get('/missing-link.html').reply(200, 'ok');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/link1.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/missing-link.html').should.be.eql(false);
+
+
+ const link = fs.readFileSync(testDirname + '/link1.html').toString();
+ link.should.containEql(' {
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [{selector: 'style'}],
+ updateMissingSources: true,
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/path-containers.html');
+ nock('http://example.com/').get('/a.png').reply(200, 'ok');
+ nock('http://example.com/').get('/b.png').replyWithError('Failed!');
+ nock('http://example.com/').get('/c.png').reply(200, 'ok');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/a.png').should.be.eql(true);
+ fs.existsSync(testDirname + '/b.png').should.be.eql(false);
+ fs.existsSync(testDirname + '/c.png').should.be.eql(true);
+
+
+ const index = fs.readFileSync(testDirname + '/index.html').toString();
+ index.should.containEql(`.a { background: url('a.png') }`);
+ index.should.containEql(`.b { background: url('http://example.com/b.png') }`);
+ index.should.containEql(`.c { background: url('c.png') }`);
+ });
+ });
+
+ it('should update all and download nothing', () => {
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: null,
+ sources: [],
+ updateMissingSources: [
+ { selector: 'img', attr: 'src' },
+ { selector: 'script', attr: 'src' },
+ { selector: 'a', attr: 'href' },
+ ]
+ };
+
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/missing-img.png').reply(200, 'ok');
+ nock('http://example.com/').get('/missing-script.js').reply(200, 'ok');
+ nock('http://example.com/').get('/link1.html').reply(200, 'ok');
+
+ return scrape(options).then(() => {
+ fs.existsSync(testDirname + '/index.html').should.be.eql(true);
+ fs.existsSync(testDirname + '/missing-img.png').should.be.eql(false);
+ fs.existsSync(testDirname + '/missing-script.js').should.be.eql(false);
+ fs.existsSync(testDirname + '/link1.html').should.be.eql(false);
+
+
+ const indexBody = fs.readFileSync(testDirname + '/index.html').toString();
+ indexBody.should.containEql('
{
+ it('should call downloadChildrenResources and set returned text to resource', () => {
+ const downloadChildrenPaths = sinon.stub().resolves('updated text');
- return cssHandler.handle(originalResource).then(function (updatedResource) {
+ const originalResource = new Resource('http://example.com');
+ const cssHandler = new CssResourceHandler({}, {downloadChildrenPaths});
+
+ return cssHandler.handle(originalResource).then((updatedResource) => {
should(updatedResource).be.equal(originalResource);
should(updatedResource.getText()).be.eql('updated text');
});
diff --git a/test/unit/resource-handler/html.test.js b/test/unit/resource-handler/html.test.js
index 4c895c23..cc575671 100644
--- a/test/unit/resource-handler/html.test.js
+++ b/test/unit/resource-handler/html.test.js
@@ -11,13 +11,68 @@ const HtmlCommonTag = require('../../../lib/resource-handler/path-containers/htm
const CssText = require('../../../lib/resource-handler/path-containers/css-text');
describe('ResourceHandler: Html', () => {
- let htmlHandler;
+ let downloadChildrenPaths, htmlHandler;
beforeEach(() => {
- htmlHandler = new HtmlHandler({ sources: [] }, sinon.stub().returns(Promise.resolve()));
+ downloadChildrenPaths = sinon.stub().usingPromise(Promise).resolves();
+ });
+
+ describe('constructor', () => {
+ describe('sources', () => {
+ it('should initialize sources if updateMissingSources was not passed', () => {
+ const sources = [{ selector: 'img', attr: 'src'}];
+ htmlHandler = new HtmlHandler({sources}, {downloadChildrenPaths});
+
+ htmlHandler.downloadSources.should.eql(sources);
+ htmlHandler.updateSources.should.eql([]);
+ htmlHandler.allSources.should.eql(sources);
+ });
+
+ it('should initialize sources if updateMissingSources = false', () => {
+ const sources = [{ selector: 'img', attr: 'src'}];
+ htmlHandler = new HtmlHandler({sources, updateMissingSources: false}, {downloadChildrenPaths});
+
+ htmlHandler.downloadSources.should.eql(sources);
+ htmlHandler.updateSources.should.eql([]);
+ htmlHandler.allSources.should.eql(sources);
+ });
+
+ it('should initialize sources if updateMissingSources = true', () => {
+ const sources = [{ selector: 'img', attr: 'src'}];
+ htmlHandler = new HtmlHandler({sources, updateMissingSources: true}, {downloadChildrenPaths});
+
+ htmlHandler.downloadSources.should.eql(sources);
+ htmlHandler.updateSources.should.eql(sources);
+ htmlHandler.allSources.should.eql(sources);
+ });
+
+ it('should initialize sources if updateMissingSources is array of sources', () => {
+ const sources = [{ selector: 'img', attr: 'src'}];
+ const updateMissingSources = [{ selector: 'a', attr: 'href'}];
+ htmlHandler = new HtmlHandler({sources, updateMissingSources}, {downloadChildrenPaths});
+
+ htmlHandler.downloadSources.should.eql(sources);
+ htmlHandler.updateSources.should.eql(updateMissingSources);
+ htmlHandler.allSources.should.eql([{ selector: 'img', attr: 'src'}, { selector: 'a', attr: 'href'}]);
+ });
+
+ it('should initialize sources without duplicates if updateMissingSources is array of sources', () => {
+ const sources = [{ selector: 'img', attr: 'src'}];
+ const updateMissingSources = [{ selector: 'img', attr: 'src'}, { selector: 'a', attr: 'href'}];
+ htmlHandler = new HtmlHandler({sources, updateMissingSources}, {downloadChildrenPaths});
+
+ htmlHandler.downloadSources.should.eql(sources);
+ htmlHandler.updateSources.should.eql(updateMissingSources);
+ htmlHandler.allSources.should.eql([{ selector: 'img', attr: 'src'}, { selector: 'a', attr: 'href'}]);
+ });
+ });
});
describe('