Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ scrape(options, (error, result) => {
* [onResourceError](#onresourceerror) - callback called when downloading a resource fails
* [updateMissingSources](#updatemissingsources) - update url for missing sources with absolute url
* [requestConcurrency](#requestconcurrency) - set maximum concurrent requests
* [updateSources](#updatesources) - set to false to keep all html content unmodified

You can find the default options in [lib/config/defaults.js](https://github.com/website-scraper/node-website-scraper/blob/master/lib/config/defaults.js) or get them using `scrape.defaults`.

Expand Down Expand Up @@ -296,6 +297,12 @@ scrape({
Number, maximum amount of concurrent requests. Defaults to `Infinity`.


#### updateSources
Boolean. Defaults to `true`. Use `false` when the structure of the scraped site does not
fit your custom filename generator, or if you do not want the HTML content to be
modified in any way.


## callback
Callback function, optional, includes the following parameters:
- `error`: if error - `Error` object, if success - `null`
Expand Down
3 changes: 2 additions & 1 deletion lib/config/defaults.js
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ const config = {
onResourceSaved: null,
onResourceError: null,
resourceSaver: null,
updateMissingSources: false
updateMissingSources: false,
updateSources: true,
};

module.exports = config;
7 changes: 5 additions & 2 deletions lib/resource-handler/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ const utils = require('../utils');
const HtmlHandler = require('./html');
const CssHandler = require('./css');

const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources'];
const supportedOptions = ['prettifyUrls', 'sources', 'recursiveSources', 'maxRecursiveDepth', 'defaultFilename', 'updateMissingSources', 'updateSources'];

class ResourceHandler {
constructor (options, context) {
Expand Down Expand Up @@ -83,12 +83,15 @@ class ResourceHandler {
});

return utils.waitAllFulfilled(childrenPromises).then(function updateChildrenPaths () {
if (self.options.updateSources === false) {
return pathContainer.updateText([]);
}
return pathContainer.updateText(pathsToUpdate);
});
}

updateChildrenResources (pathContainer, parentResource, needToUpdate) {
if (!needToUpdate) {
if (!needToUpdate || this.options.updateSources === false) {
return Promise.resolve(pathContainer.updateText([]));
}
const parentUrl = parentResource.getUrl();
Expand Down
13 changes: 13 additions & 0 deletions test/functional/recursive/mocks/data.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Data</title>
</head>
<body>
<a href="http://example.com/about.html">About 1</a>
<a href="//example.com/about.html">About 2</a>
<a href="//about.html">About 3</a>
<a href="../about.html">About 4</a>
</body>
</html>
7 changes: 6 additions & 1 deletion test/functional/recursive/mocks/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@
<title>Title</title>
</head>
<body>
<a href="/about.html"></a>
<a href="/about.html">About</a>
<a href="http://example.com/data/data.html">Data 1</a>
<a href="//example.com/data/data.html">Data 2</a>
<a href="//data/data.html">Data 3</a>
<a href="/data/data.html">Data 4</a>
<a href="/data/data/data.html">Data 5</a>

</body>
</html>
94 changes: 92 additions & 2 deletions test/functional/recursive/recursive.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ var nock = require('nock');
var fs = require('fs-extra');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
var testDirname = __dirname + '/.tmp';
var URL = require('url');

describe('Functional recursive downloading', function() {

Expand All @@ -25,7 +26,8 @@ describe('Functional recursive downloading', function() {
directory: testDirname,
subdirectories: null,
sources: [],
recursive: true
recursive: true,
updateSources: true,
};

nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
Expand All @@ -35,6 +37,7 @@ describe('Functional recursive downloading', function() {
nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');

return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
Expand All @@ -49,6 +52,93 @@ describe('Functional recursive downloading', function() {
});
});

it('should follow anchors if recursive flag is set and links not replaced', function () {
	// Small local helpers so the assertions below read as plain statements.
	const readText = (filename) => fs.readFileSync(filename).toString();
	const isSaved = (name) => fs.existsSync(testDirname + name);

	// start page
	nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');

	// pages reachable through anchors
	nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
	nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
	nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
	nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
	nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');

	return scrape({
		urls: [ 'http://example.com/' ],
		directory: testDirname,
		subdirectories: null,
		sources: [],
		recursive: true,
		updateSources: false
	}).then(() => {
		isSaved('/index.html').should.be.eql(true);

		// with updateSources disabled, the saved html text must equal the mock it was served from
		readText(testDirname + '/data.html').should.eql(readText(mockDirname + '/data.html'));
		readText(testDirname + '/index.html').should.eql(readText(mockDirname + '/index.html'));

		// index.html anchors loaded
		isSaved('/about.html').should.be.eql(true);

		// about.html anchors loaded
		isSaved('/link1.html').should.be.eql(true);
		isSaved('/link2.html').should.be.eql(true);
		isSaved('/link3.html').should.be.eql(true);
	});
});

it('should follow anchors if recursive flag is set and custom filename generator follows exact site structure',
	function () {
		/**
		 * Maps a url to a relative filename of the form <hostname><pathname>,
		 * storing the site root ('/') as <hostname>/index.html.
		 * @param {string} url - absolute url of the resource
		 * @returns {string} filename relative to the scrape directory
		 */
		var generateFilename = function (url) {
			var parsedUrl = URL.parse(url);
			if (parsedUrl.pathname === '/') {
				// pathname is already '/', so appending '/index.html' to it
				// would produce a double slash ('example.com//index.html')
				return parsedUrl.hostname + '/index.html';
			}
			return parsedUrl.hostname + parsedUrl.pathname;
		};
		var options = {
			urls: [ 'http://example.com/' ],
			directory: testDirname,
			subdirectories: null,
			sources: [],
			recursive: true,
			updateSources: false,
			// only the resource url is needed to derive the filename
			filenameGenerator: (resource) => generateFilename(resource.url)
		};

		nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');

		// mock for anchors
		nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
		nock('http://example.com/').get('/link1.html').reply(200, 'content 1');
		nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
		nock('http://example.com/').get('/link3.html').reply(200, 'content 3');
		nock('http://example.com/').get('/data/data.html').replyWithFile(200, mockDirname + '/data.html');

		return scrape(options).then(function() {
			fs.existsSync(testDirname + '/example.com/index.html').should.be.eql(true);

			// updateSources is false, so the saved html text must equal the served mocks
			fs.readFileSync(testDirname + '/example.com/data/data.html').toString().should.eql(
				fs.readFileSync(mockDirname + '/data.html').toString());

			fs.readFileSync(testDirname + '/example.com/index.html').toString().should.eql(
				fs.readFileSync(mockDirname + '/index.html').toString());

			// index.html anchors loaded
			fs.existsSync(testDirname + '/example.com/about.html').should.be.eql(true);

			// about.html anchors loaded
			fs.existsSync(testDirname + '/example.com/link1.html').should.be.eql(true);
			fs.existsSync(testDirname + '/example.com/link2.html').should.be.eql(true);
			fs.existsSync(testDirname + '/example.com/link3.html').should.be.eql(true);
		});
	});

it('should follow anchors with depth <= maxDepth if recursive flag and maxDepth are set', function () {
var options = {
urls: [ 'http://example.com/' ],
Expand Down