Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 25 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ npm install website-scraper

## Usage
```javascript
var scraper = require('website-scraper');
var scraper = require('website-scraper');
var options = {
urls: ['http://nodejs.org/'],
directory: '/path/to/save/',
Expand All @@ -38,7 +38,7 @@ scraper.scrape(options).then(function (result) {

## API
### scrape(options, callback)
Makes requests to `urls` and saves all files found with `sources` to `directory`.
Makes requests to `urls` and saves all files found with `sources` to `directory`.

**options** - object containing the following options:

Expand All @@ -48,31 +48,34 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`
- `sources:` array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see default value in `lib/config/defaults.js`)*
- `subdirectories:` array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*


- `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
- `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*


**callback** - callback function *(optional)*, receives the following parameters:

- `error:` if error - `Error` object, if success - `null`
- `result:` if error - `null`, if success - array of objects containing:
- `url:` url of loaded page
- `filename:` filename where page was saved (relative to `directory`)


## Examples
Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
#### Example 1
Let's scrape some pages from [http://nodejs.org/](http://nodejs.org/) with images, css, js files and save them to `/path/to/save/`.
Imagine we want to load:
- [Home page](http://nodejs.org/) to `index.html`
- [About page](http://nodejs.org/about/) to `about.html`
- [Blog](http://blog.nodejs.org/) to `blog.html`

and separate files into directories:

- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
- `img` for .jpg, .png, .svg (full path `/path/to/save/img`)
- `js` for .js (full path `/path/to/save/js`)
- `css` for .css (full path `/path/to/save/css`)

```javascript
var scraper = require('website-scraper');
var scraper = require('website-scraper');
scraper.scrape({
urls: [
'http://nodejs.org/', // Will be saved with default filename 'index.html'
Expand Down Expand Up @@ -101,3 +104,16 @@ scraper.scrape({
console.log(err);
});
```

#### Example 2. Recursive downloading
```javascript
// Links from example.com will be followed
// Links found on those followed pages will be ignored because their depth = 2 is greater than maxDepth
var scraper = require('website-scraper');
scraper.scrape({
urls: ['http://example.com/'],
directory: '/path/to/save',
recursive: true,
maxDepth: 1
}).then(console.log).catch(console.log);
```
3 changes: 3 additions & 0 deletions lib/config/recursive-sources.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Extra html sources used for recursive scraping: each entry names a selector
// and the attribute that holds the url to load. Here: the `href` of every `a` element.
module.exports = [
{ selector: 'a', attr: 'href' }
];
4 changes: 1 addition & 3 deletions lib/file-handlers/css.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
var _ = require('underscore');
var Promise = require('bluebird');
var getCssUrls = require('css-url-parser');
var Resource = require('../resource');
var utils = require('../utils');

function loadCss (context, resource) {
Expand All @@ -12,8 +11,7 @@ function loadCss (context, resource) {

var promises = _.map(cssUrls, function loadResourceFromCssUrl (cssUrl) {
var resourceUrl = utils.getUrl(url, cssUrl);
var cssResource = new Resource(resourceUrl);
cssResource.setParent(resource);
var cssResource = resource.createChild(resourceUrl);

return context.loadResource(cssResource).then(function handleLoadedSource (loadedResource) {
var relativePath = utils.getRelativePath(filename, loadedResource.getFilename());
Expand Down
4 changes: 1 addition & 3 deletions lib/file-handlers/html.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
var cheerio = require('cheerio');
var Promise = require('bluebird');
var utils = require('../utils');
var Resource = require('../resource');

function loadHtml (context, resource) {
var sources = context.getHtmlSources();
Expand Down Expand Up @@ -50,8 +49,7 @@ function loadResources (context, resource, source) {

if (attr) {
var resourceUrl = utils.getUrl(url, attr);
var htmlResource = new Resource(resourceUrl);
htmlResource.setParent(resource);
var htmlResource = resource.createChild(resourceUrl);
htmlResource.setHtmlData({ tagName: el[0].name, attributeName: source.attr });

return context.loadResource(htmlResource).then(function handleLoadedSource (loadedResource) {
Expand Down
19 changes: 19 additions & 0 deletions lib/resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,17 @@ function Resource (url, filename) {
this.filename = filename;
}

/**
 * Create a resource that is a child of this one.
 * The child gets this resource as its parent and a depth one greater than
 * this resource's depth.
 *
 * @param {string} url - url of the child resource
 * @param {string} [filename] - filename passed through to the Resource constructor
 * @returns {Resource} the newly created child
 */
Resource.prototype.createChild = function createChild (url, filename) {
	var childResource = new Resource(url, filename);
	var childDepth = this.getDepth() + 1;

	childResource.setParent(this);
	childResource.setDepth(childDepth);

	return childResource;
};

Resource.prototype.getUrl = function getUrl () {
return this.url;
};
Expand Down Expand Up @@ -43,6 +54,14 @@ Resource.prototype.setParent = function setParent (parent) {
this.parent = parent;
};

/**
 * Get the depth stored on this resource.
 *
 * @returns {*} the stored depth, or 0 when no depth (or a falsy one) has been stored
 */
Resource.prototype.getDepth = function getDepth () {
	var storedDepth = this.depth;

	return storedDepth ? storedDepth : 0;
};

/**
 * Store the depth of this resource.
 *
 * @param {*} depth - value to store; it is read back by getDepth()
 */
Resource.prototype.setDepth = function setDepth (depth) {
this.depth = depth;
};

/**
*
* @param {Object} data - html element data
Expand Down
63 changes: 41 additions & 22 deletions lib/scraper.js
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
var Promise = require('bluebird');
var fs = Promise.promisifyAll(require('fs-extra'));

var fs = require('fs-extra');
var existsAsync = Promise.promisify(fs.stat);
var outputFileAsync = Promise.promisify(fs.outputFile);
var ensureDirAsync = Promise.promisify(fs.ensureDir);

var path = require('path');
var _ = require('underscore');

var defaults = require('./config/defaults');
var types = require('./config/resource-types');
var recursiveSources = require('./config/recursive-sources');
var utils = require('./utils.js');
var request = require('./request');
var Resource = require('./resource');
var loadHtml = require('./file-handlers/html');
var loadCss = require('./file-handlers/css');
var compareUrls = require('compare-urls');

function getHandleFunction (resource) {
var type = resource.getType();
switch (type) {
case types.css: return loadCss;
case types.html: return function loadHtmlAndCss (context, po) {
return loadHtml(context, po).then(function (loaded) {
return loadCss(context, loaded);
});
};
default: return _.noop;
}
var loadHtml = require('./file-handlers/html');
var loadCss = require('./file-handlers/css');
/**
 * Run the html handler on a resource and then the css handler on its result.
 *
 * @param {Object} context - passed unchanged to both loadHtml and loadCss
 * @param {Object} po - resource handed to loadHtml
 * @returns {Promise} promise returned by loadCss, chained after loadHtml
 */
function loadHtmlAndCss (context, po) {
	function loadCssForLoadedHtml (loaded) {
		return loadCss(context, loaded);
	}

	return loadHtml(context, po).then(loadCssForLoadedHtml);
}

function Scraper (options) {
Expand Down Expand Up @@ -83,6 +82,20 @@ Scraper.prototype.getDirectoryByExtension = function getDirectoryByExtension (ex
.value() || '';
};

/**
 * Choose the function used to process a loaded resource.
 *
 * Returns a no-op when `options.maxDepth` is set (truthy) and the resource's
 * depth has reached it, so the resource's own children are not loaded.
 * Otherwise css resources get `loadCss`, html resources get `loadHtmlAndCss`,
 * and every other type gets a no-op.
 *
 * @param {Object} resource - resource exposing getType() and getDepth()
 * @returns {Function} handler to call with (scraper, resource)
 */
Scraper.prototype.getResourceHandler = function getHandler (resource) {
	var resourceType = resource.getType();
	var resourceDepth = resource.getDepth();
	var maxDepth = this.options.maxDepth;

	// A falsy maxDepth (unset or 0) means no depth limit is applied.
	if (maxDepth && resourceDepth >= maxDepth) {
		return _.noop;
	}

	if (resourceType == types.css) {
		return loadCss;
	}

	if (resourceType == types.html) {
		return loadHtmlAndCss;
	}

	return _.noop;
};

Scraper.prototype.loadResource = function loadResource (resource) {
var self = this;

Expand All @@ -102,12 +115,12 @@ Scraper.prototype.loadResource = function loadResource (resource) {
return self.makeRequest(url).then(function requestCompleted(data) {
resource.setUrl(data.url); // Url may be changed in redirects
resource.setText(data.body);
handleFile = getHandleFunction(resource);
handleFile = self.getResourceHandler(resource);
return handleFile(self, resource);
}).then(function fileHandled() {
var filename = path.join(self.options.directory, resource.getFilename());
var text = resource.getText();
return fs.outputFileAsync(filename, text, { encoding: 'binary' });
return outputFileAsync(filename, text, { encoding: 'binary' });
}).then(function fileSaved() {
return Promise.resolve(resource);
});
Expand All @@ -116,15 +129,16 @@ Scraper.prototype.loadResource = function loadResource (resource) {
};

Scraper.prototype.validate = function validate () {
if (fs.existsSync(this.options.directory)) {
return Promise.reject(new Error('Path ' + this.options.directory + ' exists'));
}
return Promise.resolve();
var dir = this.options.directory;
return existsAsync(dir).then(function handleDirectoryExist () {
return Promise.reject(new Error('Path ' + dir + ' exists'));
}, function handleDirectoryNotExist () {
return Promise.resolve();
});
};

Scraper.prototype.prepare = function prepare () {
var self = this;
fs.ensureDirSync(self.options.directory);

// Create makeRequest function with custom request params
self.makeRequest = request.makeRequest.bind(null, self.options.request);
Expand All @@ -136,7 +150,12 @@ Scraper.prototype.prepare = function prepare () {
var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename;
return new Resource(url, filename);
});
return Promise.resolve();

if (self.options.recursive) {
self.options.sources = _.union(self.options.sources, recursiveSources);
}

return ensureDirAsync(self.options.directory);
};

Scraper.prototype.load = function load () {
Expand Down
4 changes: 3 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,9 @@
"istanbul": "^0.4.0",
"mocha": "^2.2.5",
"nock": "^2.9.1",
"proxyquire": "^1.7.3",
"should": "^7.0.2",
"sinon": "^1.15.4"
"sinon": "^1.15.4",
"sinon-as-promised": "^4.0.0"
}
}
13 changes: 13 additions & 0 deletions test/functional/mocks/recursive/about.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<a href="/link1.html"></a>
<a href="/link2.html"></a>
<a href="/link3.html"></a>

</body>
</html>
11 changes: 11 additions & 0 deletions test/functional/mocks/recursive/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<a href="/about.html"></a>

</body>
</html>
Loading