Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,19 @@ npm install website-scraper

## Usage
```javascript
var scraper = require('website-scraper');
var scrape = require('website-scraper');
var options = {
urls: ['http://nodejs.org/'],
directory: '/path/to/save/',
};

// with callback
scraper.scrape(options, function (error, result) {
scrape(options, function (error, result) {
/* some code here */
});

// or with promise
scraper.scrape(options).then(function (result) {
scrape(options).then(function (result) {
/* some code here */
});
```
Expand Down Expand Up @@ -98,8 +98,8 @@ and separate files into directories:
- `css` for .css (full path `/path/to/save/css`)

```javascript
var scraper = require('website-scraper');
scraper.scrape({
var scrape = require('website-scraper');
scrape({
urls: [
'http://nodejs.org/', // Will be saved with default filename 'index.html'
{url: 'http://nodejs.org/about', filename: 'about.html'},
Expand Down Expand Up @@ -132,8 +132,8 @@ scraper.scrape({
```javascript
// Links from example.com will be followed
// Links found on those followed pages will be ignored because their depth = 2 is greater than maxDepth
var scraper = require('website-scraper');
scraper.scrape({
var scrape = require('website-scraper');
scrape({
urls: ['http://example.com/'],
directory: '/path/to/save',
recursive: true,
Expand All @@ -144,8 +144,8 @@ scraper.scrape({
#### Example 3. Filtering out external resources
```javascript
// Links to other websites are filtered out by the urlFilter
var scraper = require('website-scraper');
scraper.scrape({
var scrape = require('website-scraper');
scrape({
urls: ['http://example.com/'],
urlFilter: function(url){
return url.indexOf('http://example.com') === 0;
Expand All @@ -159,8 +159,8 @@ scraper.scrape({
// Downloads all the crawlable files of example.com.
// The files are saved in the same structure as the structure of the website, by using the `bySiteStructure` filenameGenerator.
// Links to other websites are filtered out by the urlFilter
var scraper = require('website-scraper');
scraper.scrape({
var scrape = require('website-scraper');
scrape({
urls: ['http://example.com/'],
urlFilter: function(url){
return url.indexOf('http://example.com') === 0;
Expand Down
2 changes: 1 addition & 1 deletion index.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
var Scraper = require('./lib/scraper.js');

module.exports.scrape = function scrape (options, callback) {
module.exports = function scrape (options, callback) {
return new Scraper(options).scrape(callback);
};
6 changes: 3 additions & 3 deletions test/e2e/e2e-test.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
var should = require('should');
var scraper = require('../../index');
var scrape = require('../../index');
var URL = require('url');
var fs = require('fs-extra');
var _ = require('lodash');
Expand All @@ -26,7 +26,7 @@ describe('E2E', function() {
scraperOptions.directory = resultDirname + '/' + hostname + '-byType';
scraperOptions.urls = [ { url: url, filename: 'index.html' } ];
scraperOptions.filenameGenerator = 'byType';
return scraper.scrape(scraperOptions).then(function(result) {
return scrape(scraperOptions).then(function(result) {
result.should.be.ok();
});
});
Expand All @@ -37,7 +37,7 @@ describe('E2E', function() {
scraperOptions.directory = resultDirname + '/' + hostname + '-bySiteStructure';
scraperOptions.urls = [ { url: url } ];
scraperOptions.filenameGenerator = 'bySiteStructure';
return scraper.scrape(scraperOptions).then(function(result) {
return scrape(scraperOptions).then(function(result) {
result.should.be.ok();
});
});
Expand Down
4 changes: 2 additions & 2 deletions test/functional/base/base.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ require('should');
var nock = require('nock');
var fs = require('fs-extra');
var cheerio = require('cheerio');
var scraper = require('../../../index');
var scrape = require('../../../index');
var Resource = require('../../../lib/resource');

var testDirname = __dirname + '/.tmp';
Expand Down Expand Up @@ -65,7 +65,7 @@ describe('Functional base', function() {
// mocks for blog.html
nock('http://blog.example.com/').get('/files/fail-1.png').replyWithError('something awful happened');

return scraper.scrape(options).then(function(result) {
return scrape(options).then(function(result) {
// should return right result
result.should.be.instanceOf(Array).and.have.length(3);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
require('should');
var nock = require('nock');
var fs = require('fs-extra');
var scraper = require('../../../index');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
Expand Down Expand Up @@ -38,7 +38,7 @@ describe('Functional circular dependencies', function() {
nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css');

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
fs.existsSync(testDirname + '/about.html').should.be.eql(true);
fs.existsSync(testDirname + '/style.css').should.be.eql(true);
Expand Down
5 changes: 2 additions & 3 deletions test/functional/css-handling/css-handling.test.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
var should = require('should');
var nock = require('nock');
var fs = require('fs-extra');
var Scraper = require('../../../lib/scraper');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
Expand Down Expand Up @@ -38,9 +38,8 @@ describe('Functional: css handling', function() {
{ directory: 'local', extensions: ['.png', '.css'] }
]
};
var scraper = new Scraper(options);

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
fs.existsSync(testDirname + '/local/style.css').should.be.eql(true);
fs.existsSync(testDirname + '/local/style-import-1.css').should.be.eql(true);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
var should = require('should');
var nock = require('nock');
var fs = require('fs-extra');
var Scraper = require('../../../lib/scraper');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
Expand Down Expand Up @@ -47,9 +47,8 @@ describe('Functional: html entities in url', function() {
],
ignoreErrors: false
};
var scraper = new Scraper(options);

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
var indexHtml = fs.readFileSync(testDirname + '/index.html').toString();

Expand Down
5 changes: 2 additions & 3 deletions test/functional/html-id-href/html-id-href.test.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
var should = require('should');
var nock = require('nock');
var fs = require('fs-extra');
var Scraper = require('../../../lib/scraper');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
Expand Down Expand Up @@ -39,9 +39,8 @@ describe('Functional html id href', function() {
{ directory: 'local', extensions: ['.png', '.svg'] }
]
};
var scraper = new Scraper(options);

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
fs.existsSync(testDirname + '/other.html').should.be.eql(true);
fs.existsSync(testDirname + '/local/sprite.svg').should.be.eql(true);
Expand Down
8 changes: 4 additions & 4 deletions test/functional/recursive/recursive.test.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
require('should');
var nock = require('nock');
var fs = require('fs-extra');
var scraper = require('../../../index');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
Expand Down Expand Up @@ -36,7 +36,7 @@ describe('Functional recursive downloading', function() {
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);

// index.html anchors loaded
Expand Down Expand Up @@ -73,7 +73,7 @@ describe('Functional recursive downloading', function() {
nock('http://example.com/').get('/link1-1.html').reply(200, 'content 1-1');
nock('http://example.com/').get('/link1-2.html').reply(200, 'content 1-2');

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);

// index.html anchors loaded (depth 1)
Expand Down Expand Up @@ -106,7 +106,7 @@ describe('Functional recursive downloading', function() {
nock('http://example.com/').get('/link2.html').reply(200, 'content 2');
nock('http://example.com/').get('/link3.html').reply(200, 'content 3');

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);

// index.html anchors loaded
Expand Down
2 changes: 1 addition & 1 deletion test/functional/redirect/redirect.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ describe('Functional redirects', function() {
var scraper = new Scraper(options);
var loadToFsSpy = sinon.spy(scraper.fsAdapter, 'saveResource');

return scraper.scrape(options).then(function() {
return scraper.scrape().then(function() {
loadToFsSpy.callCount.should.be.eql(2);
loadToFsSpy.args[0][0].filename.should.be.eql('index.html');
loadToFsSpy.args[1][0].filename.should.be.eql('true-page.html');
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
require('should');
var nock = require('nock');
var fs = require('fs-extra');
var scraper = require('../../../index');
var scrape = require('../../../index');

var testDirname = __dirname + '/.tmp';
var mockDirname = __dirname + '/mocks';
Expand Down Expand Up @@ -51,7 +51,7 @@ describe('Functional resources without extensions', function() {
nock('http://google.com').get('/').replyWithFile(200, mockDirname + '/google.html');
nock('http://google.com').get('/google.png').reply(200, 'OK');

return scraper.scrape(options).then(function() {
return scrape(options).then(function() {
// should load css file and fonts from css file
fs.existsSync(testDirname + '/css.css').should.be.eql(true); // http://fonts.googleapis.com/css?family=Lato
fs.existsSync(testDirname + '/UyBMtLsHKBKXelqf4x7VRQ.woff2').should.be.eql(true);
Expand Down