Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ scrape(options, (error, result) => {
* [resourceSaver](#resourcesaver) - customize resources saving
* [onResourceSaved](#onresourcesaved) - callback called when resource is saved
* [onResourceError](#onresourceerror) - callback called when downloading a resource fails
* [updateMissingSources](#updatemissingsources) - replace urls of missing sources with absolute urls

Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`.

Expand Down Expand Up @@ -145,7 +146,7 @@ String, filename for index page. Defaults to `index.html`.
Boolean, whether urls should be 'prettified', by having the `defaultFilename` removed. Defaults to `false`.

#### ignoreErrors
Boolean, if `true` scraper will continue downloading resources after error occured, if `false` - scraper will finish process and return error. Defaults to `true`.
Boolean, if `true` scraper will continue downloading resources after error occurred, if `false` - scraper will finish process and return error. Defaults to `true`.

#### urlFilter
Function which is called for each url to check whether it should be scraped. Defaults to `null` - no url filter will be applied.
Expand Down Expand Up @@ -253,6 +254,30 @@ scrape({
})
```

#### updateMissingSources
Boolean, if `true` scraper will set absolute urls for all failing `sources`, if `false` - it will leave them as is (which may cause the page to be displayed incorrectly).
Can also be an array of `sources` to update (the structure is similar to [sources](#sources)).
Defaults to `false`.
```javascript
// update all failing img srcs with absolute url
scrape({
urls: ['http://example.com/'],
directory: '/path/to/save',
sources: [{selector: 'img', attr: 'src'}],
updateMissingSources: true
});

// download nothing, just update all img srcs with absolute urls
scrape({
urls: ['http://example.com/'],
directory: '/path/to/save',
sources: [],
updateMissingSources: [{selector: 'img', attr: 'src'}]
});

```


## callback
Callback function, optional, includes the following parameters:
- `error`: if error - `Error` object, if success - `null`
Expand Down
3 changes: 2 additions & 1 deletion lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ const config = {
httpResponseHandler: null,
onResourceSaved: null,
onResourceError: null,
resourceSaver: null
resourceSaver: null,
updateMissingSources: false
};

module.exports = config;
29 changes: 17 additions & 12 deletions lib/resource-handler/css/index.js
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
var CssText = require('./../path-containers/css-text');
'use strict';

function CssResourceHandler (options, handleChildrenPaths) {
this.options = options;
this.handleChildrenPaths = handleChildrenPaths;
}
const CssText = require('./../path-containers/css-text');

class CssResourceHandler {
constructor (options, methods) {
this.options = options;
this.downloadChildrenPaths = methods.downloadChildrenPaths;
this.updateMissingSources = this.options.updateMissingSources === true || Array.isArray(this.options.updateMissingSources);
}

CssResourceHandler.prototype.handle = function handle (resource) {
var pathContainer = new CssText(resource.getText());
return this.handleChildrenPaths(pathContainer, resource).then(function updateText (updatedText) {
resource.setText(updatedText);
return resource;
});
};
handle (resource) {
const pathContainer = new CssText(resource.getText());
return this.downloadChildrenPaths(pathContainer, resource, this.updateMissingSources).then(function updateText (updatedText) {
resource.setText(updatedText);
return resource;
});
}
}

module.exports = CssResourceHandler;
69 changes: 52 additions & 17 deletions lib/resource-handler/html/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,30 @@ const logger = require('../../logger');
const HtmlSourceElement = require('./html-source-element');

class HtmlResourceHandler {
constructor (options, handleChildrenPaths) {
constructor (options, methods) {
this.options = options;
this.handleChildrenPaths = handleChildrenPaths;
this.downloadChildrenPaths = methods.downloadChildrenPaths;
this.updateChildrenPaths = methods.updateChildrenPaths;

this.recursiveSources = this.options.recursiveSources || [];
this.downloadSources = this.options.sources;
this.updateSources = [];

if (this.options.updateMissingSources === true) {
this.updateSources = this.downloadSources;
} else if (Array.isArray(this.options.updateMissingSources)) {
this.updateSources = this.options.updateMissingSources;
}

this.allSources = utils.union(this.downloadSources, this.updateSources);
}

handle (resource) {
const $ = loadTextToCheerio(resource.getText());
prepareToLoad($, resource);

return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
.then(function updateResource () {
return Promise.mapSeries(this.allSources, this.loadResourcesForRule.bind(this, $, resource))
.then(() => {
resource.setText($.html());
return resource;
});
Expand All @@ -27,31 +40,53 @@ class HtmlResourceHandler {
const self = this;
const promises = $(rule.selector).map((i, element) => {
const el = new HtmlSourceElement($(element), rule);
const pathContainer = el.getPathContainer();

const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources));
const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth;
if (isRecursive && isDepthGreaterThanMax) {
if (!pathContainer) {
return Promise.resolve(null);
}

const needToDownloadElement = this.needToDownload(el);
const needToUpdateElement = this.needToUpdate(el);

if (this.exceedMaxRecursiveDepth(el, parentResource)) {
logger.debug(`filtering out ${el} by max recursive depth`);
return Promise.resolve();
return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el));
}

const pathContainer = el.getPathContainer();
if (!pathContainer) {
return Promise.resolve();
if (!needToDownloadElement) {
return self.updateChildrenPaths(pathContainer, parentResource, needToUpdateElement).then(el.setData.bind(el));
}
return self.handleChildrenPaths(pathContainer, parentResource).then((updatedText) => {
el.setData(updatedText);
el.removeIntegrityCheck();
});

return self.downloadChildrenPaths(pathContainer, parentResource, needToUpdateElement)
.then((updatedText) => {
el.setData(updatedText);
el.removeIntegrityCheck();
});

}).get();

return utils.waitAllFulfilled(promises);
}

exceedMaxRecursiveDepth (el, parentResource) {
const isRecursive = Boolean(el.findMatchedRule(this.recursiveSources));
const isDepthGreaterThanMax = this.options.maxRecursiveDepth && parentResource.getDepth() >= this.options.maxRecursiveDepth;
return isRecursive && isDepthGreaterThanMax;
}

needToDownload (el) {
return Boolean(el.findMatchedRule(this.downloadSources));
}

needToUpdate (el) {
return Boolean(el.findMatchedRule(this.updateSources));
}
}

function prepareToLoad ($, resource) {
$('base').each(function handleBaseTag () {
const el = $(this);
$('base').each((i, element) => {
const el = $(element);
const href = el.attr('href');
if (href) {
const newUrl = utils.getUrl(resource.getUrl(), href);
Expand Down
161 changes: 94 additions & 67 deletions lib/resource-handler/index.js
Original file line number Diff line number Diff line change
@@ -1,82 +1,109 @@
var _ = require('lodash');
var Promise = require('bluebird');
var logger = require('../logger');
var utils = require('../utils');
'use strict';

var HtmlHandler = require('./html');
var CssHandler = require('./css');
const _ = require('lodash');
const Promise = require('bluebird');
const logger = require('../logger');
const utils = require('../utils');

var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename'];
const HtmlHandler = require('./html');
const CssHandler = require('./css');

function ResourceHandler (options, context) {
this.options = _.pick(options, supportedOptions);
this.context = context;
const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources'];

this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this));
this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this));
}
class ResourceHandler {
constructor (options, context) {
this.options = _.pick(options, supportedOptions);
this.context = context;

const methods = {
downloadChildrenPaths: this.downloadChildrenResources.bind(this),
updateChildrenPaths: this.updateChildrenResources.bind(this)
};

ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) {
switch (true) {
case resource.isCss():
logger.debug('using css handler for ' + resource);
return this.cssHandler;
case resource.isHtml():
logger.debug('using html handler for ' + resource);
return this.htmlHandler;
default:
logger.debug('using no handler for ' + resource);
return null;
this.htmlHandler = new HtmlHandler(this.options, methods);
this.cssHandler = new CssHandler(this.options, methods);
}
};

/**
* Request all resources from pathContainers paths
* @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources
* @param {Resource} parentResource
* @returns {Promise} - resolved when all resources from pathContainer were requested
* and original paths in parentResource were updated with local paths for children resources
*/
ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource) {
var self = this;
var childrenPaths = pathContainer.getPaths();
var pathsToUpdate = [];

var childrenPromises = childrenPaths.map(function loadChildPath (childPath) {
var childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath);
var childResource = parentResource.createChild(childResourceUrl);

return self.context.requestResource(childResource).then(function updateChildPath (respondedResource) {
if (respondedResource) {
parentResource.updateChild(childResource, respondedResource);

var relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename());
if (self.options.prettifyUrls) {
relativePath = relativePath.replace(self.options.defaultFilename, '');
}
var hash = utils.getHashFromUrl(childPath);

if (hash) {
relativePath = relativePath.concat(hash);
getResourceHandler (resource) {
switch (true) {
case resource.isCss():
logger.debug('using css handler for ' + resource);
return this.cssHandler;
case resource.isHtml():
logger.debug('using html handler for ' + resource);
return this.htmlHandler;
default:
logger.debug('using no handler for ' + resource);
return null;
}
}

/**
* Request all resources from pathContainers paths
* @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources
* @param {Resource} parentResource
* @param {boolean} updateIfFailed - if true - failed resources will be updated with absolute links
* @returns {Promise} - resolved when all resources from pathContainer were requested
* and original paths in parentResource were updated with local paths for children resources
*/
downloadChildrenResources (pathContainer, parentResource, updateIfFailed) {
const self = this;
const childrenPaths = pathContainer.getPaths();
const pathsToUpdate = [];

const childrenPromises = childrenPaths.map((childPath) => {
const childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath);
const childResource = parentResource.createChild(childResourceUrl);

return self.context.requestResource(childResource).then((respondedResource) => {
if (respondedResource) {
parentResource.updateChild(childResource, respondedResource);

let relativePath = utils.getRelativePath(parentResource.getFilename(), respondedResource.getFilename());
if (self.options.prettifyUrls) {
relativePath = relativePath.replace(self.options.defaultFilename, '');
}
const hash = utils.getHashFromUrl(childPath);

if (hash) {
relativePath = relativePath.concat(hash);
}

pathsToUpdate.push({ oldPath: childPath, newPath: relativePath});
} else {
if (updateIfFailed) {
pathsToUpdate.push({ oldPath: childPath, newPath: childResourceUrl});
}
}
return null; // Prevent Bluebird warnings
});
});

pathsToUpdate.push({ oldPath: childPath, newPath: relativePath});
}
return null; // Prevent Bluebird warnings
return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
return pathContainer.updateText(pathsToUpdate);
});
});
}

return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
return pathContainer.updateText(pathsToUpdate);
});
};
updateChildrenResources (pathContainer, parentResource, needToUpdate) {
if (!needToUpdate) {
return Promise.resolve(pathContainer.updateText([]));
}
const parentUrl = parentResource.getUrl();
const pathsToUpdate = [];
pathContainer.getPaths().forEach((path) => {
const childAbsoluteUrl = utils.getUrl(parentUrl, path);
pathsToUpdate.push({ oldPath: path, newPath: childAbsoluteUrl });
});
return Promise.resolve(pathContainer.updateText(pathsToUpdate));
}

ResourceHandler.prototype.handleResource = function handleResource (resource) {
var resourceHandler = this.getResourceHandler(resource);
if (resourceHandler && resourceHandler.handle) {
return resourceHandler.handle(resource);
handleResource (resource) {
const resourceHandler = this.getResourceHandler(resource);
if (resourceHandler && resourceHandler.handle) {
return resourceHandler.handle(resource);
}
return Promise.resolve(resource);
}
return Promise.resolve(resource);
};
}

module.exports = ResourceHandler;
Loading