Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ scrape(options, (error, result) => {
* [urls](#urls) - urls to download, *required*
* [directory](#directory) - path to save files, *required*
* [sources](#sources) - selects which resources should be downloaded
* [recursive](#recursive) - follow anchors in html files
* [maxDepth](#maxdepth) - maximum depth for dependencies
* [recursive](#recursive) - follow hyperlinks in html files
* [maxRecursiveDepth](#maxrecursivedepth) - maximum depth for hyperlinks
* [maxDepth](#maxdepth) - maximum depth for all dependencies
* [request](#request) - custom options for [request](https://github.com/request/request)
* [subdirectories](#subdirectories) - subdirectories for file extensions
* [defaultFilename](#defaultfilename) - filename for index page
Expand Down Expand Up @@ -96,10 +97,13 @@ scrape({
```

#### recursive
Boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading. Defaults to `false`.
Boolean, if `true` scraper will follow hyperlinks in html files. Don't forget to set `maxRecursiveDepth` to avoid infinite downloading. Defaults to `false`.

#### maxRecursiveDepth
Positive number, maximum allowed depth for hyperlinks. Other dependencies will be saved regardless of their depth. Defaults to `null` - no maximum recursive depth set.

#### maxDepth
Positive number, maximum allowed depth for dependencies. Defaults to `null` - no maximum depth set.
Positive number, maximum allowed depth for all dependencies. Defaults to `null` - no maximum depth set.

#### request
Object, custom options for [request](https://github.com/request/request#requestoptions-callback). Allows you to set cookies, userAgent, etc.
Expand Down
1 change: 1 addition & 0 deletions lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ const config = {
},
urlFilter: null,
recursive: false,
maxRecursiveDepth: null,
maxDepth: null,
ignoreErrors: true,
httpResponseHandler: null,
Expand Down
99 changes: 58 additions & 41 deletions lib/resource-handler/html/html-source-element.js
Original file line number Diff line number Diff line change
@@ -1,53 +1,70 @@
var ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
var CommonTag = require('../path-containers/html-common-tag');
var CssText = require('../path-containers/css-text');
var _ = require('lodash');
var utils = require('../../utils');
'use strict';

var pathContainersByRule = [
const ImgSrcsetTag = require('../path-containers/html-img-srcset-tag');
const CommonTag = require('../path-containers/html-common-tag');
const CssText = require('../path-containers/css-text');
const utils = require('../../utils');

const pathContainersByRule = [
{ selector: '[style]', attr: 'style', containerClass: CssText },
{ selector: 'style', containerClass: CssText },
{ selector: '*[srcset]', attr: 'srcset', containerClass: ImgSrcsetTag }
];

/**
 * Pairs a cheerio element with the rule that selected it
 * @param {Object} el - cheerio obj for dom element
 * @param {Object} rule - rule used to find current element
 * @param {string} rule.selector - cheerio selector
 * @param {string} rule.attr - attribute holding the text with resources. if not set - the element's text is used
 * @constructor
 */
function HtmlSourceElement (el, rule) {
    // el is assigned before rule, so own-key order stays the same
    Object.assign(this, { el: el, rule: rule });
}
class HtmlSourceElement {
/**
* @param {Object} el - cheerio obj for dom element
* @param {Object} rule - rule used to find current element
* @param {string} rule.selector - cheerio selector
* @param {string} rule.attr - attribute to find text which contains resources. if not set - use inner html
*/
constructor (el, rule) {
this.el = el;
this.rule = rule;
}

/**
 * Read the text that holds resources: the rule's attribute when the rule
 * names one, the element's text otherwise. The text is passed through
 * utils.decodeHtmlEntities before it is returned
 * @returns {string}
 */
HtmlSourceElement.prototype.getData = function getData () {
    var attrName = this.rule.attr;
    var rawText;
    if (attrName) {
        rawText = this.el.attr(attrName);
    } else {
        rawText = this.el.text();
    }
    return utils.decodeHtmlEntities(rawText);
};

/**
 * Write new data to the rule's attribute when the rule names one,
 * otherwise to the element's text
 * @param {string} newData
 */
HtmlSourceElement.prototype.setData = function setData (newData) {
    if (this.rule.attr) {
        this.el.attr(this.rule.attr, newData);
    } else {
        this.el.text(newData);
    }
};

/**
 * Pick the path container class for this element: the containerClass of the
 * first entry in pathContainersByRule whose selector matches the element and
 * whose attr equals the current rule's attr, CommonTag when no entry matches
 * @returns {Function}
 */
HtmlSourceElement.prototype.getPathContainerClass = function getPathContainerClass () {
    var isRuleForThisElement = (candidate) => {
        return this.el.is(candidate.selector) && this.rule.attr === candidate.attr;
    };
    var matchedRule = _.find(pathContainersByRule, isRuleForThisElement);

    if (matchedRule) {
        return matchedRule.containerClass;
    }
    return CommonTag;
};

/**
 * Build a path container for the element's text
 * @returns {Object|null} container instance, null when the element has no text with resources
 */
HtmlSourceElement.prototype.getPathContainer = function getPathContainer () {
    var PathContainer = this.getPathContainerClass();
    var data = this.getData();
    if (!data) {
        return null;
    }
    return new PathContainer(data);
};
/**
* Get resource data from element using rule
* @returns {string}
*/
getData () {
const text = this.rule.attr ? this.el.attr(this.rule.attr) : this.el.text();
return utils.decodeHtmlEntities(text);
}

/**
* Update attribute or inner text of el with new data
* @param {string} newData
*/
setData (newData) {
this.rule.attr ? this.el.attr(this.rule.attr, newData) : this.el.text(newData);
}

/**
* Returns PathContainer instance for element
* @returns {CssText|HtmlCommonTag|HtmlImgSrcSetTag|null}
*/
getPathContainer () {
const selectedRule = this.findMatchedRule(pathContainersByRule);
const ContainerClass = selectedRule ? selectedRule.containerClass : CommonTag;
const textWithResources = this.getData();
return textWithResources ? new ContainerClass(textWithResources) : null;
}

matchesRule (rule) {
return this.el.is(rule.selector) && this.rule.attr === rule.attr;
}

findMatchedRule (rulesArray) {
return rulesArray.find(this.matchesRule, this);
}

toString () {
return JSON.stringify({selector: this.rule.selector, attr: this.rule.attr, data: this.getData()});
}
}

module.exports = HtmlSourceElement;
83 changes: 48 additions & 35 deletions lib/resource-handler/html/index.js
Original file line number Diff line number Diff line change
@@ -1,44 +1,57 @@
var cheerio = require('cheerio');
var Promise = require('bluebird');
var utils = require('../../utils');
var HtmlSourceElement = require('./html-source-element');

/**
 * Handler for html resources
 * @param {Object} options - stored on the instance as `options`
 * @param {Function} handleChildrenPaths - stored on the instance as `handleChildrenPaths`
 * @constructor
 */
function HtmlResourceHandler (options, handleChildrenPaths) {
    // options is assigned before handleChildrenPaths, so own-key order stays the same
    Object.assign(this, { options: options, handleChildrenPaths: handleChildrenPaths });
}
'use strict';

/**
 * Parse the resource text, run loadResourcesForRule for every rule in
 * options.sources one after another, then write the resulting html back
 * to the resource
 * @param {Object} resource
 * @returns {Promise} resolves with the same resource after its text was updated
 */
HtmlResourceHandler.prototype.handle = function handle (resource) {
    var $ = loadTextToCheerio(resource.getText());
    prepareToLoad($, resource);

    var sources = this.options.sources;
    var loadRule = this.loadResourcesForRule.bind(this, $, resource);

    return Promise.mapSeries(sources, loadRule).then(() => {
        resource.setText($.html());
        return resource;
    });
};

HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) {
var self = this;
var promises = $(rule.selector).map(function loadForElement () {
var el = new HtmlSourceElement($(this), rule);
var pathContainer = el.getPathContainer();
if (!pathContainer) {
return Promise.resolve();
}
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
}).get();
const cheerio = require('cheerio');
const Promise = require('bluebird');
const utils = require('../../utils');
const logger = require('../../logger');
const HtmlSourceElement = require('./html-source-element');

class HtmlResourceHandler {
constructor (options, handleChildrenPaths) {
this.options = options;
this.handleChildrenPaths = handleChildrenPaths;
}

handle (resource) {
const $ = loadTextToCheerio(resource.getText());
prepareToLoad($, resource);

return Promise.mapSeries(this.options.sources, this.loadResourcesForRule.bind(this, $, resource))
.then(function updateResource () {
resource.setText($.html());
return resource;
});
}

return utils.waitAllFulfilled(promises);
};
loadResourcesForRule ($, parentResource, rule) {
const self = this;
const promises = $(rule.selector).map(function loadForElement () {
const el = new HtmlSourceElement($(this), rule);

const isRecursive = self.options.recursiveSources && Boolean(el.findMatchedRule(self.options.recursiveSources));
const isDepthGreaterThanMax = self.options.maxRecursiveDepth && parentResource.getDepth() >= self.options.maxRecursiveDepth;
if (isRecursive && isDepthGreaterThanMax) {
logger.debug(`filtering out ${el} by max recursive depth`);
return Promise.resolve();
}

const pathContainer = el.getPathContainer();
if (!pathContainer) {
return Promise.resolve();
}
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
}).get();

return utils.waitAllFulfilled(promises);
}
}

function prepareToLoad ($, resource) {
$('base').each(function handleBaseTag () {
var el = $(this);
var href = el.attr('href');
const el = $(this);
const href = el.attr('href');
if (href) {
var newUrl = utils.getUrl(resource.getUrl(), href);
const newUrl = utils.getUrl(resource.getUrl(), href);
resource.setUrl(newUrl);
el.remove();
}
Expand Down
11 changes: 5 additions & 6 deletions lib/resource-handler/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,14 @@ var utils = require('../utils');
var HtmlHandler = require('./html');
var CssHandler = require('./css');

var supportedOptions = ['prettifyUrls', 'sources', 'defaultFilename'];
var supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename'];

/**
 * Keeps the supported subset of the scraper options and creates the html and css handlers
 * @param {Object} options - scraper options; only keys listed in supportedOptions are kept
 * @param {Object} context - stored as-is on the instance
 * @constructor
 */
function ResourceHandler (options, context) {
var self = this;
self.options = _.pick(options, supportedOptions);
self.context = context;
this.options = _.pick(options, supportedOptions);
this.context = context;

// both handlers get the picked options and handleChildrenResources bound to this instance
self.htmlHandler = new HtmlHandler(self.options, self.handleChildrenResources.bind(self));
self.cssHandler = new CssHandler(self.options, self.handleChildrenResources.bind(self));
this.htmlHandler = new HtmlHandler(this.options, this.handleChildrenResources.bind(this));
this.cssHandler = new CssHandler(this.options, this.handleChildrenResources.bind(this));
}

ResourceHandler.prototype.getResourceHandler = function getResourceHandler (resource) {
Expand Down
3 changes: 2 additions & 1 deletion lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ function Scraper (options) {
});
}

self.options.recursiveSources = recursiveSources;
if (self.options.recursive) {
self.options.sources = _.union(self.options.sources, recursiveSources);
self.options.sources = _.union(self.options.sources, self.options.recursiveSources);
}

logger.info('init with options', self.options);
Expand Down
113 changes: 113 additions & 0 deletions test/functional/max-depth/max-depth.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
require('should');
const nock = require('nock');
const fs = require('fs-extra');
const scrape = require('../../../index');

const testDirname = __dirname + '/.tmp';
const mockDirname = __dirname + '/mocks';

describe('Functional: maxDepth and maxRecursiveDepth ', () => {

    // Scraper options shared by both tests, extended with the depth limit under test
    const buildOptions = (depthLimit) => Object.assign({
        urls: [ 'http://example.com/' ],
        directory: testDirname,
        subdirectories: null,
        sources: [
            { selector: 'img', attr: 'src' },
            { selector: 'script', attr: 'src' },
            { selector: 'a', attr: 'href' }
        ]
    }, depthLimit);

    const mockFile = (urlPath, mockFilename) => {
        nock('http://example.com/').get(urlPath).replyWithFile(200, mockDirname + mockFilename);
    };

    const mockBody = (urlPath, body) => {
        nock('http://example.com/').get(urlPath).reply(200, body);
    };

    // Registers the index page plus an html page, an image and a script for each of depths 1-3
    const mockSite = () => {
        mockFile('/', '/index.html');

        mockFile('/depth1.html', '/depth1.html');
        mockBody('/img-depth1.jpg', 'img-depth1.jpg');
        mockBody('/script-depth1.js', 'script-depth1.js');

        mockFile('/depth2.html', '/depth2.html');
        mockBody('/img-depth2.jpg', 'img-depth2.jpg');
        mockBody('/script-depth2.js', 'script-depth2.js');

        mockBody('/depth3.html', 'OK');
        mockBody('/img-depth3.jpg', 'img-depth3.jpg');
        mockBody('/script-depth3.js', 'script-depth3.js');
    };

    // Asserts, in the given order, whether each file was written to the output directory
    const shouldExist = (filenames, expected) => {
        filenames.forEach((filename) => {
            fs.existsSync(testDirname + filename).should.be.eql(expected);
        });
    };

    const savedUpToDepth2 = [
        '/index.html',
        '/depth1.html', '/img-depth1.jpg', '/script-depth1.js',
        '/depth2.html', '/img-depth2.jpg', '/script-depth2.js'
    ];

    beforeEach(() => {
        nock.cleanAll();
        nock.disableNetConnect();
    });

    afterEach(() => {
        nock.cleanAll();
        nock.enableNetConnect();
        fs.removeSync(testDirname);
    });

    it('should filter out all resources by depth > maxDepth', () => {
        const options = buildOptions({ maxDepth: 2 });
        mockSite();

        return scrape(options).then(() => {
            shouldExist(savedUpToDepth2, true);
            shouldExist(['/depth3.html', '/img-depth3.jpg', '/script-depth3.js'], false);
        });
    });

    it('should filter out only anchors by depth > maxRecursiveDepth', () => {
        const options = buildOptions({ maxRecursiveDepth: 2 });
        mockSite();

        return scrape(options).then(() => {
            shouldExist(savedUpToDepth2, true);
            shouldExist(['/depth3.html'], false);
            // img-depth3.jpg and script-depth3.js are dependencies of depth2.html,
            // they have to be loaded because maxRecursiveDepth applies only to <a href=''>
            shouldExist(['/img-depth3.jpg', '/script-depth3.js'], true);
        });
    });

});
12 changes: 12 additions & 0 deletions test/functional/max-depth/mocks/depth1.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
<script src="script-depth2.js"></script>
</head>
<body>
<a href="/depth2.html"></a>
<img src="img-depth2.jpg"/>
</body>
</html>
Loading