Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 38 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,18 @@ Makes requests to `urls` and saves all files found with `sources` to `directory`
**options** - object containing next options:

- `urls`: array of urls to load and filenames for them *(required, see example below)*
- `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
- `directory`: path to save loaded files *(required)*
- `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
- `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
- `sources`: array of objects to load, specifies selectors and attribute values to select files for loading *(optional, see example below)*
- `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- `recursive`: boolean, if `true` scraper will follow anchors in html files. Don't forget to set `maxDepth` to avoid infinite downloading *(optional, see example below)*
- `maxDepth`: positive number, maximum allowed depth for dependencies *(optional, see example below)*
- `request`: object, custom options for [request](https://github.com/request/request#requestoptions-callback) *(optional, see example below)*
- `subdirectories`: array of objects, specifies subdirectories for file extensions. If `null` all files will be saved to `directory` *(optional, see example below)*
- `defaultFilename`: filename for index page *(optional, default: 'index.html')*
- `prettifyUrls`: whether urls should be 'prettified', by having the `defaultFilename` removed *(optional, default: false)*
- `ignoreErrors`: boolean, if `true` scraper will continue downloading resources after an error occurred, if `false` - scraper will finish process and return error *(optional, default: true)*
- `urlFilter`: function which is called for each url to check whether it should be scraped. *(optional, see example below)*
- `filenameGenerator`: name of one of the bundled filenameGenerators, or a custom filenameGenerator function *(optional, default: 'byType')*
- `httpResponseHandler`: function which is called on each response, allows to customize resource or reject its downloading *(optional, see example below)*

Default options you can find in [lib/config/defaults.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/config/defaults.js).

Expand All @@ -85,6 +86,14 @@ When the `bySiteStructure` filenameGenerator is used the downloaded files are sa
- `/about` => `DIRECTORY/about/index.html`
- `/resources/javascript/libraries/jquery.min.js` => `DIRECTORY/resources/javascript/libraries/jquery.min.js`

### Http Response Handlers
HttpResponseHandler is used to reject resource downloading or customize resource text based on response data (for example, status code, content type, etc.)
Function takes a `response` argument - the response object of the [request](https://github.com/request/request) module - and should return a `Promise` that is resolved if the resource should be downloaded, or rejected with an `Error` if it should be skipped.
Promise should be resolved with:
* `string` which contains response body
* or an object with properties `body` (response body, string) and `metadata` - everything you want to save for this resource (like headers, original text, timestamps, etc.); the scraper will not use this field at all, it is only included in the result.

See [example of using httpResponseHandler](#example-5-rejecting-resources-with-404-status-and-adding-metadata).

## Examples
#### Example 1
Expand Down Expand Up @@ -176,6 +185,29 @@ scrape({
}).then(console.log).catch(console.log);
```

#### Example 5. Rejecting resources with 404 status and adding metadata
```javascript
var scrape = require('website-scraper');
scrape({
urls: ['http://example.com/'],
directory: '/path/to/save',
httpResponseHandler: (response) => {
if (response.statusCode === 404) {
return Promise.reject(new Error('status is 404'));
} else {
// if you don't need metadata - you can just return Promise.resolve(response.body)
return Promise.resolve({
body: response.body,
metadata: {
headers: response.headers,
someOtherData: [ 1, 2, 3 ]
}
});
}
}
}).then(console.log).catch(console.log);
```

## Log and debug
This module uses [debug](https://github.com/visionmedia/debug) to log events. To enable logs you should use environment variable `DEBUG`.
Next command will log everything from website-scraper
Expand Down
7 changes: 3 additions & 4 deletions lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,11 @@ var config = {
jar: true,
gzip: true
},
urlFilter: function urlFilter () {
return true;
},
urlFilter: null,
recursive: false,
maxDepth: null,
ignoreErrors: true
ignoreErrors: true,
httpResponseHandler: null
};

module.exports = config;
89 changes: 68 additions & 21 deletions lib/request.js
Original file line number Diff line number Diff line change
@@ -1,32 +1,79 @@
var _ = require('lodash');
var Promise = require('bluebird');
var request = require('request');
var get = Promise.promisify(request.get);
var logger = require('./logger');
'use strict';

const _ = require('lodash');
const Promise = require('bluebird');
const request = require('request');
const get = Promise.promisify(request.get);
const logger = require('./logger');

/**
 * Extracts the mime type from a content-type header value,
 * dropping any parameters (e.g. "; charset=utf-8").
 * @param {string|null|undefined} contentType - value of the content-type header
 * @return {string|null} mime type, or null when the header is absent
 */
function getMimeType (contentType) {
	if (!contentType) {
		return null;
	}
	const [mimeType] = contentType.split(';');
	return mimeType;
}

/**
 * Default response handler used when no `httpResponseHandler` option is given:
 * accepts every response and resolves with its body unchanged.
 * (Stale remnants of the removed `makeRequest` function were left fused before
 * this definition, breaking the file syntactically; they are removed here.)
 * @param {Object} response - response object of the `request` module
 * @return {Promise<string>} resolves with the response body
 */
function defaultResponseHandler (response) {
	return Promise.resolve(response.body);
}

/**
 * Normalizes the value returned by a response handler into a uniform
 * object with `body` and `metadata` properties.
 * @param {string|Object} result - string body, or object with `body` and optional `metadata`
 * @return {{body: string, metadata: ?Object}}
 * @throws {Error} when result is neither a string nor a plain object
 */
function transformResult (result) {
	if (_.isString(result)) {
		return { body: result, metadata: null };
	}
	if (_.isPlainObject(result)) {
		return {
			body: result.body,
			metadata: result.metadata || null
		};
	}
	throw new Error('Wrong response handler result. Expected string or object, but received ' + typeof result);
}

/**
 * Wraps the `request` module: performs GET requests, optionally setting a
 * referer header, and delegates response processing to a handler which may
 * reject the download or customize the body/metadata.
 * (The diff left stale removed lines from the old `makeRequest` implementation
 * interleaved inside this class, breaking the file; the added version is
 * reconstructed here.)
 */
class Request {
	/**
	 * @param {Object} options
	 * @param {function} [options.httpResponseHandler] - custom response handler
	 * @param {Object} [options.request] - custom options for request module
	 */
	constructor (options) {
		// Fall back to the default handler (accepts everything, resolves with body).
		this.handleResponse = options && options.httpResponseHandler ? options.httpResponseHandler : defaultResponseHandler;
		this.options = options && options.request ? _.clone(options.request) : {};
	}

	/**
	 * Performs get request to url and returns data for resource
	 * @param {string} url - url of resource
	 * @param {string} referer - url of parent resource
	 * @return {Promise} resolves with {url, mimeType, body, metadata},
	 *   rejects when the response handler rejects the resource
	 */
	get (url, referer) {
		let requestOptions = _.clone(this.options);
		requestOptions.url = url;

		if (referer) {
			requestOptions.headers = requestOptions.headers || {};
			requestOptions.headers.referer = referer;
		}

		logger.debug(`[request] sending request for url ${url}, referer ${referer}`);

		return get(requestOptions).then((response) => {
			logger.debug(`[request] received response for ${response.request.href}, statusCode ${response.statusCode}`);
			// Promise.resolve guards against user handlers that return a plain
			// value instead of a promise (README only asks for a Promise, but
			// this keeps a plain string/object from crashing on `.then`).
			return Promise.resolve(this.handleResponse(response))
				.then(transformResult)
				.then((responseHandlerResult) => {
					return {
						url: response.request.href,
						mimeType: getMimeType(response.headers['content-type']),
						body: responseHandlerResult.body,
						metadata: responseHandlerResult.metadata
					};
				});
		});
	}
}

module.exports = Request;
4 changes: 4 additions & 0 deletions lib/resource.js
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,8 @@ Resource.prototype.setSaved = function setSaved () {
this.saved = true;
};

/**
 * Stores arbitrary metadata for this resource (e.g. headers, timestamps
 * produced by a custom httpResponseHandler). The value is kept on the
 * resource only for inclusion in the scraper's result — presumably it is
 * never read by the scraper itself; confirm against lib/scraper.js.
 * @param {Object} metadata - anything the response handler chose to save
 */
Resource.prototype.setMetadata = function setMetadata (metadata) {
	this.metadata = metadata;
};

module.exports = Resource;
12 changes: 8 additions & 4 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ var recursiveSources = require('./config/recursive-sources');
var Resource = require('./resource');

var FilenameGenerator = require('./filename-generator');
var makeRequest = require('./request');
var Request = require('./request');
var ResourceHandler = require('./resource-handler');
var FSAdapter = require('./fs-adaper');
var utils = require('./utils');
Expand All @@ -28,7 +28,7 @@ function Scraper (options) {

logger.info('init with options', self.options);

self.makeRequest = makeRequest.bind(null, self.options.request);
self.request = new Request(self.options);
self.resourceHandler = new ResourceHandler(self.options, self);
self.filenameGenerator = new FilenameGenerator(self.options);
self.fsAdapter = new FSAdapter(self.options);
Expand Down Expand Up @@ -80,7 +80,7 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) {
var requestPromise = Promise.resolve()
.then(function makeRequest () {
var referer = resource.parent ? resource.parent.getUrl() : null;
return self.makeRequest(url, referer);
return self.request.get(url, referer);
}).then(function requestCompleted (responseData) {

if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects
Expand All @@ -104,6 +104,10 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) {
resource.setType(utils.getTypeByFilename(filename));
}

if (responseData.metadata) {
resource.setMetadata(responseData.metadata);
}

resource.setText(responseData.body);
self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
return resource;
Expand All @@ -120,7 +124,7 @@ Scraper.prototype.requestResource = function requestResource (resource) {
var self = this;
var url = resource.getUrl();

if (!self.options.urlFilter(url)) {
if (self.options.urlFilter && !self.options.urlFilter(url)) {
logger.debug('filtering out ' + resource + ' by url filter');
return Promise.resolve(null);
}
Expand Down
Loading