From 11333eb145956385feb85dbb126490f34ccc1425 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 10:54:00 +0300 Subject: [PATCH 01/14] Rewrite FSAdaper --- index.js | 9 +- lib/fs-adaper.js | 90 ++++++++++--------- lib/scraper.js | 19 ++-- .../error-handling/error-handling.test.js | 8 +- test/unit/fs-adapter-test.js | 86 +++++++++++++----- test/unit/scraper-test.js | 85 +----------------- 6 files changed, 131 insertions(+), 166 deletions(-) diff --git a/index.js b/index.js index 9b698f3c..8a38d324 100644 --- a/index.js +++ b/index.js @@ -1,7 +1,10 @@ -var Scraper = require('./lib/scraper.js'); +const Promise = require('bluebird'); +const Scraper = require('./lib/scraper.js'); -module.exports = function scrape (options, callback) { - return new Scraper(options).scrape(callback); +module.exports = (options, callback) => { + return Promise.try(() => { + return new Scraper(options).scrape(callback); + }); }; module.exports.defaults = Scraper.defaults; diff --git a/lib/fs-adaper.js b/lib/fs-adaper.js index cdc1c90f..7556818e 100644 --- a/lib/fs-adaper.js +++ b/lib/fs-adaper.js @@ -1,59 +1,61 @@ -var path = require('path'); -var _ = require('lodash'); -var Promise = require('bluebird'); +const path = require('path'); +const _ = require('lodash'); +const Promise = require('bluebird'); -var fs = require('fs-extra'); -var existsAsync = Promise.promisify(fs.stat); -var outputFileAsync = Promise.promisify(fs.outputFile); -var ensureDirAsync = Promise.promisify(fs.ensureDir); -var removeAsync = Promise.promisify(fs.remove); +const fs = require('fs-extra'); +const outputFileAsync = Promise.promisify(fs.outputFile); +const removeAsync = Promise.promisify(fs.remove); -var supportedOptions = [ 'directory' ]; +const supportedOptions = [ 'directory' ]; -function FSAdapter (options) { - var self = this; +class FSAdapter { + constructor(options) { + this.options = _.pick(options, supportedOptions); - self.loadedResources = []; // Array of loaded Resources + if (!this.options.directory || !_.isString(this.options.directory)) { + throw new Error('Incorrect directory ' + this.options.directory); + } - self.options = _.pick(options, supportedOptions); + this.absoluteDirectoryPath = path.resolve(process.cwd(), this.options.directory); - if (self.options.directory) { - self.absoluteDirectoryPath = path.resolve(process.cwd(), self.options.directory); + if (exists(this.absoluteDirectoryPath)) { + throw new Error('Directory ' + this.absoluteDirectoryPath + ' exists'); + } + + this.loadedResources = []; } -} -FSAdapter.prototype.validateDirectory = function validateDirectory () { - var self = this; - if (_.isEmpty(self.options.directory) || !_.isString(self.options.directory)) { - return Promise.reject(new Error('Incorrect directory ' + self.options.directory)); + saveResource (resource) { + const filename = path.join(this.absoluteDirectoryPath, resource.getFilename()); + const text = resource.getText(); + return outputFileAsync(filename, text, { encoding: 'binary' }).then(() => { + this.loadedResources.push(resource); + }); } - return existsAsync(self.absoluteDirectoryPath).then(function handleDirectoryExist () { - return Promise.reject(new Error('Directory ' + self.absoluteDirectoryPath + ' exists')); - }, function handleDirectoryNotExist () { + handleError () { + if (!_.isEmpty(this.loadedResources)) { + return removeAsync(this.absoluteDirectoryPath); + } return Promise.resolve(); - }); -}; - -FSAdapter.prototype.createDirectory = function createDirectory () { - return ensureDirAsync(this.absoluteDirectoryPath); -}; + } +} -FSAdapter.prototype.cleanDirectory = function cleanDirectory () { - if (!_.isEmpty(this.loadedResources)) { - return removeAsync(this.absoluteDirectoryPath); +function exists (path) { + let exists; + try { + if (fs.statSync(path)) { + exists = true; + } + } catch (e) { + if (e.code === 'ENOENT') { + exists = false; + } else { + throw e; + } } - return Promise.resolve(); -}; - -FSAdapter.prototype.saveResource = function saveResource (resource) { - var self = this; - - var filename = path.join(self.absoluteDirectoryPath, resource.getFilename()); - var text = resource.getText(); - return outputFileAsync(filename, text, { encoding: 'binary' }).then(function resourceSaved () { - self.loadedResources.push(resource); - }); -}; + + return exists; +} module.exports = FSAdapter; diff --git a/lib/scraper.js b/lib/scraper.js index f3872429..5133fe19 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -41,9 +41,9 @@ function Scraper (options) { // Custom structures // Array of Resources for downloading - self.originalResources = _.map(self.options.urls, function createResource (obj) { - var url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj; - var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename; + self.originalResources = self.options.urls.map((obj) => { + const url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj; + const filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename; return new Resource(url, filename); }); @@ -152,15 +152,11 @@ Scraper.prototype.requestResource = function requestResource (resource) { return self.createNewRequest(resource); }; -Scraper.prototype.validate = function validate () { - return this.fsAdapter.validateDirectory(); -}; - Scraper.prototype.load = function load () { var self = this; - return self.fsAdapter.createDirectory().then(function loadAllResources () { - return Promise.map(self.originalResources, self.requestResource.bind(self)); - }).then(self.waitForLoad.bind(self)); + return Promise + .map(this.originalResources, this.requestResource.bind(this)) + .then(this.waitForLoad.bind(this)); }; // Returns a promise which gets resolved when all resources are loaded. @@ -192,7 +188,7 @@ Scraper.prototype.handleError = function handleError (err, resource) { Scraper.prototype.errorCleanup = function errorCleanup (error) { logger.error('finishing with error: ' + error.message); - return this.fsAdapter.cleanDirectory().then(function loadedDataRemoved () { + return this.fsAdapter.handleError().then(function loadedDataRemoved () { return Promise.reject(error); }); }; @@ -200,7 +196,6 @@ Scraper.prototype.errorCleanup = function errorCleanup (error) { Scraper.prototype.scrape = function scrape (callback) { var self = this; return Promise.bind(self) - .then(self.validate) .then(self.load) .catch(self.errorCleanup) .asCallback(callback); diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 3371d4d5..6a993af2 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -41,12 +41,13 @@ describe('Functional error handling', function() { }); describe('FS Error', function () { - var loadToFsStub; + let loadToFsStub, handleErrorSpy; beforeEach(function() { scraper.fsAdapter.loadedResources = [1, 2]; loadToFsStub = sinon.stub(scraper.fsAdapter, 'saveResource').resolves(); loadToFsStub.onCall(2).rejects(new Error('FS FAILED!')); + handleErrorSpy = sinon.spy(scraper.fsAdapter, 'handleError'); }); it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () { @@ -55,9 +56,10 @@ describe('Functional error handling', function() { return scraper.scrape().then(function() { should(true).be.eql(false); }).catch(function (err) { - fs.existsSync(testDirname).should.be.eql(false); should(err.message).be.eql('FS FAILED!'); should(loadToFsStub.callCount).be.eql(3); + should(handleErrorSpy.callCount).be.eql(1); + fs.existsSync(testDirname).should.be.eql(false); }); }); @@ -66,7 +68,7 @@ describe('Functional error handling', function() { return scraper.scrape().then(function() { should(loadToFsStub.callCount).be.eql(7); - fs.existsSync(testDirname).should.be.eql(true); + should(handleErrorSpy.callCount).be.eql(0); }); }); }); diff --git a/test/unit/fs-adapter-test.js b/test/unit/fs-adapter-test.js index d7419004..6906ce25 100644 --- a/test/unit/fs-adapter-test.js +++ b/test/unit/fs-adapter-test.js @@ -1,34 +1,80 @@ -var should = require('should'); -var path = require('path'); -var FSAdapter = require('../../lib/fs-adaper'); +const should = require('should'); +const sinon = require('sinon'); +const proxyquire = require('proxyquire'); +const path = require('path'); +const FSAdapter = require('../../lib/fs-adaper'); describe('FSAdapter', function () { describe('constructor', function() { it('should pick supported options', function() { - var options = { a: 1, b: 2, directory: 'test' }; - var fsAdapter = new FSAdapter(options); - fsAdapter.options.should.eql({directory: 'test'}); + const options = { a: 1, b: 2, directory: 'myDirectory' }; + const fsAdapter = new FSAdapter(options); + fsAdapter.options.should.eql({directory: 'myDirectory'}); }); - it('should create absolute path if directory is relative path', function () { - var options = { directory: 'my/relative/path' }; + describe('absoluteDirectoryPath', () => { + it('should create absolute path if directory is relative path', function () { + const options = { directory: 'my/relative/path' }; + const fsAdapter = new FSAdapter(options); + const expected = path.join(process.cwd(), 'my/relative/path'); + fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); + }); - var fsAdapter = new FSAdapter(options); - var expected = path.join(process.cwd(), 'my/relative/path'); - fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); + it('should use directory if directory is absolute path', function () { + const options = { directory: '/my/absolute/path' }; + const fsAdapter = new FSAdapter(options); + const expected = '/my/absolute/path'; + fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); + }); }); - it('should use directory if directory is absolute path', function () { - var options = { directory: '/my/absolute/path' }; - var fsAdapter = new FSAdapter(options); - var expected = '/my/absolute/path'; - fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); + describe('incorrect directory', () => { + it('should throw error if no directory were passed', function () { + const options = {}; + function createFsAdapter () { + new FSAdapter(options); + } + should(createFsAdapter).throw(/Incorrect directory/); + }); + + it('should throw error if empty directory were passed', function () { + const options = { + directory: '' + }; + function createFsAdapter () { + new FSAdapter(options); + } + should(createFsAdapter).throw(/Incorrect directory/); + }); + + it('should throw error if incorrect directory passed', function () { + const options = { + directory: {} + }; + function createFsAdapter () { + new FSAdapter(options); + } + should(createFsAdapter).throw(/Incorrect directory/); + }); }); - it('should not define absoluteDirectoryPath if no directory were passed', function () { - var options = {}; - var fsAdapter = new FSAdapter(options); - should(fsAdapter.absoluteDirectoryPath).eql(undefined); + describe('existing directory', () => { + it('should throw error if directory exists', () => { + const FSAdapter = proxyquire('../../lib/fs-adaper', { + 'fs-extra': { + statSync: sinon.stub().returns('fake-stat') + } + }); + + const options = { + directory: 'fake-directory' + }; + function createFsAdapter () { + new FSAdapter(options); + } + should(createFsAdapter).throw(/Directory (.*?) exists/); + }); }); + }); }); diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index e36b8c37..ecd3f198 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -25,86 +25,6 @@ describe('Scraper', function () { fs.removeSync(testDirname); }); - describe('#validate', function () { - it('should return resolved promise if everything is ok', function () { - var s = new Scraper({ - urls: urls, - directory: 'good/directory' - }); - - return s.validate().then(function() { - should(true).eql(true); - }); - }); - - it('should return rejected promise if no directory was provided', function () { - var s = new Scraper({ - urls: urls - }); - - return s.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - }); - - it('should return rejected promise if directory is not correct', function () { - var s1 = new Scraper({ - urls: urls, - directory: '' - }); - - return s1.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - - var s2 = new Scraper({ - urls: urls, - directory: { name: '/incorrect/directory' } - }); - - return s2.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - - var s3 = new Scraper({ - urls: urls, - directory: 42 - }); - - return s3.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - }); - - it('should return rejected promise if directory exists', function() { - fs.mkdirpSync(testDirname); - - var s = new Scraper({ - urls: urls, - directory: testDirname - }); - - return s.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Directory (.*?) exists/); - }); - }); - }); - describe('#load', function() { it('should create directory', function() { nock('http://example.com').get('/').reply(200, 'OK'); @@ -440,7 +360,7 @@ describe('Scraper', function () { }); describe('#scrape', function() { - it('should call methods in sequence', function() { + it('should call load', function() { nock('http://example.com').get('/').reply(200, 'OK'); var s = new Scraper({ @@ -448,13 +368,10 @@ describe('Scraper', function () { directory: testDirname }); - var validateSpy = sinon.spy(s, 'validate'); var loadSpy = sinon.spy(s, 'load'); return s.scrape().then(function() { - validateSpy.calledOnce.should.be.eql(true); loadSpy.calledOnce.should.be.eql(true); - loadSpy.calledAfter(validateSpy).should.be.eql(true); }); }); From 838162bb02ef64f931c3090a7f46334a92b86b9c Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 12:04:16 +0300 Subject: [PATCH 02/14] Add fsAdapter options, change var to const --- lib/config/defaults.js | 5 +- lib/fs-adaper.js | 13 ++- lib/scraper.js | 86 +++++++++---------- .../error-handling/error-handling.test.js | 2 +- test/unit/scraper-init-test.js | 20 ++--- 5 files changed, 66 insertions(+), 60 deletions(-) diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 2091cfbb..df6a0cb7 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -1,4 +1,4 @@ -var config = { +const config = { filenameGenerator: 'byType', defaultFilename: 'index.html', prettifyUrls: false, @@ -51,7 +51,8 @@ var config = { ignoreErrors: true, httpResponseHandler: null, onResourceSaved: null, - onResourceError: null + onResourceError: null, + fsAdapter: null }; module.exports = config; diff --git a/lib/fs-adaper.js b/lib/fs-adaper.js index 7556818e..69382c74 100644 --- a/lib/fs-adaper.js +++ b/lib/fs-adaper.js @@ -12,7 +12,7 @@ class FSAdapter { constructor(options) { this.options = _.pick(options, supportedOptions); - if (!this.options.directory || !_.isString(this.options.directory)) { + if (!this.options.directory || typeof this.options.directory !== 'string') { throw new Error('Incorrect directory ' + this.options.directory); } @@ -25,6 +25,11 @@ class FSAdapter { this.loadedResources = []; } + /** + * Save resource to file system + * @param {Resource} resource + * @returns {Promise} + */ saveResource (resource) { const filename = path.join(this.absoluteDirectoryPath, resource.getFilename()); const text = resource.getText(); @@ -33,7 +38,11 @@ class FSAdapter { }); } - handleError () { + /** + * Remove all files that were saved before + * @returns {Promise} + */ + removeSavedResources () { if (!_.isEmpty(this.loadedResources)) { return removeAsync(this.absoluteDirectoryPath); } diff --git a/lib/scraper.js b/lib/scraper.js index 5133fe19..8158c8fa 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -1,26 +1,25 @@ -var Promise = require('bluebird'); -var _ = require('lodash'); +const Promise = require('bluebird'); +const _ = require('lodash'); -var logger = require('./logger'); +const logger = require('./logger'); -var defaults = require('./config/defaults'); -var recursiveSources = require('./config/recursive-sources'); -var Resource = require('./resource'); +const defaults = require('./config/defaults'); +const recursiveSources = require('./config/recursive-sources'); +const Resource = require('./resource'); -var FilenameGenerator = require('./filename-generator'); -var Request = require('./request'); -var ResourceHandler = require('./resource-handler'); -var FSAdapter = require('./fs-adaper'); -var utils = require('./utils'); -var NormalizedUrlMap = require('./utils/normalized-url-map'); +const FilenameGenerator = require('./filename-generator'); +const Request = require('./request'); +const ResourceHandler = require('./resource-handler'); +const FSAdapter = require('./fs-adaper'); +const utils = require('./utils'); +const NormalizedUrlMap = require('./utils/normalized-url-map'); function Scraper (options) { - var self = this; + const self = this; - // Extend options - self.options = _.extend({}, defaults, options); - self.options.request = _.extend({}, defaults.request, options.request); - self.options.urls = _.isArray(self.options.urls) ? self.options.urls : [self.options.urls]; + self.options = Object.assign({}, defaults, options); + self.options.request = Object.assign({}, defaults.request, options.request); + self.options.urls = Array.isArray(self.options.urls) ? self.options.urls : [self.options.urls]; if (self.options.subdirectories) { self.options.subdirectories.forEach((element) => { @@ -37,13 +36,13 @@ function Scraper (options) { self.request = new Request(self.options); self.resourceHandler = new ResourceHandler(self.options, self); self.filenameGenerator = new FilenameGenerator(self.options); - self.fsAdapter = new FSAdapter(self.options); + self.fsAdapter = self.options.fsAdapter ? new self.options.fsAdapter(self.options) : new FSAdapter(self.options); // Custom structures // Array of Resources for downloading - self.originalResources = self.options.urls.map((obj) => { - const url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj; - const filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename; + self.resources = self.options.urls.map((obj) => { + const url = (obj && obj.url) ? obj.url : obj; + const filename = (obj && obj.filename) ? obj.filename : self.options.defaultFilename; return new Resource(url, filename); }); @@ -52,19 +51,18 @@ function Scraper (options) { } Scraper.prototype.loadResource = function loadResource (resource) { - var self = this; - var url = resource.getUrl(); + const url = resource.getUrl(); - if (self.loadedResources.has(url)) { + if (this.loadedResources.has(url)) { logger.debug('found loaded resource for ' + resource); } else { logger.debug('add loaded resource ' + resource); - self.loadedResources.set(url, resource); + this.loadedResources.set(url, resource); } }; Scraper.prototype.saveResource = function saveResource (resource) { - var self = this; + const self = this; resource.setSaved(); return Promise.resolve() @@ -84,12 +82,12 @@ Scraper.prototype.saveResource = function saveResource (resource) { }; Scraper.prototype.createNewRequest = function createNewRequest (resource) { - var self = this; - var url = resource.getUrl(); + const self = this; + const url = resource.getUrl(); - var requestPromise = Promise.resolve() + const requestPromise = Promise.resolve() .then(function makeRequest () { - var referer = resource.parent ? resource.parent.getUrl() : null; + const referer = resource.parent ? resource.parent.getUrl() : null; return self.request.get(url, referer); }).then(function requestCompleted (responseData) { @@ -106,7 +104,7 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { resource.setType(utils.getTypeByMime(responseData.mimeType)); - var filename = self.filenameGenerator.generateFilename(resource); + const filename = self.filenameGenerator.generateFilename(resource); resource.setFilename(filename); // if type was not determined by mime we can try to get it from filename after it was generated @@ -131,8 +129,8 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { }; Scraper.prototype.requestResource = function requestResource (resource) { - var self = this; - var url = resource.getUrl(); + const self = this; + const url = resource.getUrl(); if (self.options.urlFilter && !self.options.urlFilter(url)) { logger.debug('filtering out ' + resource + ' by url filter'); @@ -153,9 +151,8 @@ Scraper.prototype.requestResource = function requestResource (resource) { }; Scraper.prototype.load = function load () { - var self = this; return Promise - .map(this.originalResources, this.requestResource.bind(this)) + .map(this.resources, this.requestResource.bind(this)) .then(this.waitForLoad.bind(this)); }; @@ -163,16 +160,16 @@ Scraper.prototype.load = function load () { // 1. Get all not saved resources and save them // 2. Recursion if any new not saved resource were added during this time. If not, loading is done. Scraper.prototype.waitForLoad = function waitForLoad () { - var self = this; - var resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved()); - var loadingIsFinished = _.isEmpty(resourcesToSave); + const self = this; + const resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved()); + const loadingIsFinished = _.isEmpty(resourcesToSave); if (!loadingIsFinished) { return Promise.mapSeries(resourcesToSave, self.saveResource.bind(self)) .then(self.waitForLoad.bind(self)); } logger.info('downloading is finished successfully'); - return Promise.resolve(self.originalResources); + return Promise.resolve(self.resources); }; Scraper.prototype.handleError = function handleError (err, resource) { @@ -188,19 +185,18 @@ Scraper.prototype.handleError = function handleError (err, resource) { Scraper.prototype.errorCleanup = function errorCleanup (error) { logger.error('finishing with error: ' + error.message); - return this.fsAdapter.handleError().then(function loadedDataRemoved () { + return this.fsAdapter.removeSavedResources().then(function loadedDataRemoved () { return Promise.reject(error); }); }; Scraper.prototype.scrape = function scrape (callback) { - var self = this; - return Promise.bind(self) - .then(self.load) - .catch(self.errorCleanup) + return Promise.bind(this) + .then(this.load) + .catch(this.errorCleanup) .asCallback(callback); }; -Scraper.defaults = _.clone(defaults); +Scraper.defaults = Object.assign({}, defaults); module.exports = Scraper; diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 6a993af2..2ce00a9a 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -47,7 +47,7 @@ describe('Functional error handling', function() { scraper.fsAdapter.loadedResources = [1, 2]; loadToFsStub = sinon.stub(scraper.fsAdapter, 'saveResource').resolves(); loadToFsStub.onCall(2).rejects(new Error('FS FAILED!')); - handleErrorSpy = sinon.spy(scraper.fsAdapter, 'handleError'); + handleErrorSpy = sinon.spy(scraper.fsAdapter, 'removeSavedResources'); }); it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () { diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 5d84701d..74208b34 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -224,7 +224,7 @@ describe('Scraper initialization', function () { }); }); - describe('originalResources', function () { + describe('resources', function () { it('should create Resource object for each url', function() { var s = new Scraper({ urls: [ @@ -235,13 +235,13 @@ describe('Scraper initialization', function () { directory: testDirname }); - s.originalResources.should.be.an.instanceOf(Array).and.have.length(3); - s.originalResources[0].should.be.an.instanceOf(Resource); - s.originalResources[0].url.should.be.eql('http://first-url.com'); - s.originalResources[1].should.be.an.instanceOf(Resource); - s.originalResources[1].url.should.be.eql('http://second-url.com'); - s.originalResources[2].should.be.an.instanceOf(Resource); - s.originalResources[2].url.should.be.eql('http://third-url.com'); + s.resources.should.be.an.instanceOf(Array).and.have.length(3); + s.resources[0].should.be.an.instanceOf(Resource); + s.resources[0].url.should.be.eql('http://first-url.com'); + s.resources[1].should.be.an.instanceOf(Resource); + s.resources[1].url.should.be.eql('http://second-url.com'); + s.resources[2].should.be.an.instanceOf(Resource); + s.resources[2].url.should.be.eql('http://third-url.com'); }); it('should use urls filename', function() { @@ -249,7 +249,7 @@ describe('Scraper initialization', function () { urls: { url: 'http://first-url.com', filename: 'first.html' }, directory: testDirname }); - s.originalResources[0].getFilename().should.equalFileSystemPath('first.html'); + s.resources[0].getFilename().should.equalFileSystemPath('first.html'); }); it('should use default filename if no url filename was provided', function() { @@ -258,7 +258,7 @@ describe('Scraper initialization', function () { defaultFilename: 'default.html', directory: testDirname }); - s.originalResources[0].getFilename().should.equalFileSystemPath('default.html'); + s.resources[0].getFilename().should.equalFileSystemPath('default.html'); }); }); }); \ No newline at end of file From 13fbdf32e5c1e3a355a5328fc78a516be90968dc Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 12:16:40 +0300 Subject: [PATCH 03/14] Fix failing build in node 4 and 5 --- index.js | 2 ++ lib/config/defaults.js | 1 + lib/fs-adaper.js | 4 +++- lib/scraper.js | 2 ++ test/unit/fs-adapter-test.js | 2 ++ 5 files changed, 10 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index 8a38d324..06c18c69 100644 --- a/index.js +++ b/index.js @@ -1,3 +1,5 @@ +'use strict'; + const Promise = require('bluebird'); const Scraper = require('./lib/scraper.js'); diff --git a/lib/config/defaults.js b/lib/config/defaults.js index df6a0cb7..5262fb75 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -1,3 +1,4 @@ +'use strict'; const config = { filenameGenerator: 'byType', defaultFilename: 'index.html', diff --git a/lib/fs-adaper.js b/lib/fs-adaper.js index 69382c74..3a6c7a34 100644 --- a/lib/fs-adaper.js +++ b/lib/fs-adaper.js @@ -1,3 +1,5 @@ +'use strict'; + const path = require('path'); const _ = require('lodash'); const Promise = require('bluebird'); @@ -9,7 +11,7 @@ const removeAsync = Promise.promisify(fs.remove); const supportedOptions = [ 'directory' ]; class FSAdapter { - constructor(options) { + constructor (options) { this.options = _.pick(options, supportedOptions); if (!this.options.directory || typeof this.options.directory !== 'string') { diff --git a/lib/scraper.js b/lib/scraper.js index 8158c8fa..087bd843 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -1,3 +1,5 @@ +'use strict'; + const Promise = require('bluebird'); const _ = require('lodash'); diff --git a/test/unit/fs-adapter-test.js b/test/unit/fs-adapter-test.js index 6906ce25..5a3928bd 100644 --- a/test/unit/fs-adapter-test.js +++ b/test/unit/fs-adapter-test.js @@ -1,3 +1,5 @@ +'use strict'; + const should = require('should'); const sinon = require('sinon'); const proxyquire = require('proxyquire'); From f4b2389ff4434f3ded730e1939207712e0f1ab1d Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 12:44:09 +0300 Subject: [PATCH 04/14] Add nodejs 7 to travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 7940d5c0..5e1d2555 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ node_js: - '4' - '5' - '6' + - '7' after_success: - codeclimate-test-reporter < coverage/lcov.info - coveralls < coverage/lcov.info From 38b0c959adb594107ebedaa35c7f8dec4b484cf3 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 12:45:46 +0300 Subject: [PATCH 05/14] Add nodejs 7 to appveyor --- appveyor.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/appveyor.yml b/appveyor.yml index f12f1a5c..7fd2e426 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,6 +3,7 @@ environment: - nodejs_version: "4" - nodejs_version: "5" - nodejs_version: "6" + - nodejs_version: "7" install: - ps: Install-Product node $env:nodejs_version From 7ffea161c4b93717d3bba6a0012c1e4f023faafb Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 12:48:56 +0300 Subject: [PATCH 06/14] Fix test --- .../error-handling/error-handling.test.js | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 2ce00a9a..210b5f36 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -1,13 +1,15 @@ -var should = require('should'); -var nock = require('nock'); -var sinon = require('sinon'); -var fs = require('fs-extra'); -var Promise = require('bluebird'); -var Scraper = require('../../../lib/scraper'); - -var testDirname = __dirname + '/.tmp'; -var mockDirname = __dirname + '/mocks'; -var scraper; +'use strict'; + +const should = require('should'); +const nock = require('nock'); +const sinon = require('sinon'); +const fs = require('fs-extra'); +const Promise = require('bluebird'); +const Scraper = require('../../../lib/scraper'); + +const testDirname = __dirname + '/.tmp'; +const mockDirname = __dirname + '/mocks'; +let scraper; describe('Functional error handling', function() { From 0630ba902a06da17a5456f1cf2db55782d3fa769 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 13:43:01 +0300 Subject: [PATCH 07/14] Rename fsAdapter -> resourceStorage --- lib/config/defaults.js | 2 +- lib/{fs-adaper.js => resource-storage.js} | 4 +- lib/scraper.js | 9 ++-- .../error-handling/error-handling.test.js | 6 +-- test/functional/redirect/redirect.test.js | 2 +- test/unit/fs-adapter-test.js | 42 +++++++++---------- 6 files changed, 32 insertions(+), 33 deletions(-) rename lib/{fs-adaper.js => resource-storage.js} (96%) diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 5262fb75..afbd1872 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -53,7 +53,7 @@ const config = { httpResponseHandler: null, onResourceSaved: null, onResourceError: null, - fsAdapter: null + resourceStorage: null }; module.exports = config; diff --git a/lib/fs-adaper.js b/lib/resource-storage.js similarity index 96% rename from lib/fs-adaper.js rename to lib/resource-storage.js index 3a6c7a34..f0d67177 100644 --- a/lib/fs-adaper.js +++ b/lib/resource-storage.js @@ -10,7 +10,7 @@ const removeAsync = Promise.promisify(fs.remove); const supportedOptions = [ 'directory' ]; -class FSAdapter { +class ResourceStorage { constructor (options) { this.options = _.pick(options, supportedOptions); @@ -69,4 +69,4 @@ function exists (path) { return exists; } -module.exports = FSAdapter; +module.exports = ResourceStorage; diff --git a/lib/scraper.js b/lib/scraper.js index 087bd843..bba1dada 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -12,7 +12,7 @@ const Resource = require('./resource'); const FilenameGenerator = require('./filename-generator'); const Request = require('./request'); const ResourceHandler = require('./resource-handler'); -const FSAdapter = require('./fs-adaper'); +const ResourceStorage = require('./resource-storage'); const utils = require('./utils'); const NormalizedUrlMap = require('./utils/normalized-url-map'); @@ -38,9 +38,8 @@ function Scraper (options) { self.request = new Request(self.options); self.resourceHandler = new ResourceHandler(self.options, self); self.filenameGenerator = new FilenameGenerator(self.options); - self.fsAdapter = self.options.fsAdapter ? new self.options.fsAdapter(self.options) : new FSAdapter(self.options); + self.resourceStorage = self.options.resourceStorage ? new self.options.resourceStorage(self.options) : new ResourceStorage(self.options); - // Custom structures // Array of Resources for downloading self.resources = self.options.urls.map((obj) => { const url = (obj && obj.url) ? obj.url : obj; @@ -72,7 +71,7 @@ Scraper.prototype.saveResource = function saveResource (resource) { return self.resourceHandler.handleResource(resource); }).then(function fileHandled () { logger.info('saving resource ' + resource + ' to fs'); - return self.fsAdapter.saveResource(resource); + return self.resourceStorage.saveResource(resource); }).then(function afterResourceSaved () { if (self.options.onResourceSaved) { self.options.onResourceSaved(resource); @@ -187,7 +186,7 @@ Scraper.prototype.handleError = function handleError (err, resource) { Scraper.prototype.errorCleanup = function errorCleanup (error) { logger.error('finishing with error: ' + error.message); - return this.fsAdapter.removeSavedResources().then(function loadedDataRemoved () { + return this.resourceStorage.removeSavedResources().then(function loadedDataRemoved () { return Promise.reject(error); }); }; diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 210b5f36..67067c99 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -46,10 +46,10 @@ describe('Functional error handling', function() { let loadToFsStub, handleErrorSpy; beforeEach(function() { - scraper.fsAdapter.loadedResources = [1, 2]; - loadToFsStub = sinon.stub(scraper.fsAdapter, 'saveResource').resolves(); + scraper.resourceStorage.loadedResources = [1, 2]; + loadToFsStub = sinon.stub(scraper.resourceStorage, 'saveResource').resolves(); loadToFsStub.onCall(2).rejects(new Error('FS FAILED!')); - handleErrorSpy = sinon.spy(scraper.fsAdapter, 'removeSavedResources'); + handleErrorSpy = sinon.spy(scraper.resourceStorage, 'removeSavedResources'); }); it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () { diff --git a/test/functional/redirect/redirect.test.js b/test/functional/redirect/redirect.test.js index 322e1fa4..1ef858b4 100644 --- a/test/functional/redirect/redirect.test.js +++ b/test/functional/redirect/redirect.test.js @@ -41,7 +41,7 @@ describe('Functional redirects', function() { sources: [] }; var scraper = new Scraper(options); - var loadToFsSpy = sinon.spy(scraper.fsAdapter, 'saveResource'); + var loadToFsSpy = sinon.spy(scraper.resourceStorage, 'saveResource'); return scraper.scrape().then(function() { loadToFsSpy.callCount.should.be.eql(2); diff --git a/test/unit/fs-adapter-test.js b/test/unit/fs-adapter-test.js index 5a3928bd..b5ebe028 100644 --- a/test/unit/fs-adapter-test.js +++ b/test/unit/fs-adapter-test.js @@ -4,65 +4,65 @@ const should = require('should'); const sinon = require('sinon'); const proxyquire = require('proxyquire'); const path = require('path'); -const FSAdapter = require('../../lib/fs-adaper'); +const ResourceStorage = require('../../lib/resource-storage'); -describe('FSAdapter', function () { +describe('ResourceStorage', function () { describe('constructor', function() { it('should pick supported options', function() { const options = { a: 1, b: 2, directory: 'myDirectory' }; - const fsAdapter = new FSAdapter(options); - fsAdapter.options.should.eql({directory: 'myDirectory'}); + const resourceStorage = new ResourceStorage(options); + resourceStorage.options.should.eql({directory: 'myDirectory'}); }); describe('absoluteDirectoryPath', () => { it('should create absolute path if directory is relative path', function () { const options = { directory: 'my/relative/path' }; - const fsAdapter = new FSAdapter(options); + const resourceStorage = new ResourceStorage(options); const expected = path.join(process.cwd(), 'my/relative/path'); - fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); + resourceStorage.absoluteDirectoryPath.should.equalFileSystemPath(expected); }); it('should use directory if directory is absolute path', function () { const options = { directory: '/my/absolute/path' }; - const fsAdapter = new FSAdapter(options); + const resourceStorage = new ResourceStorage(options); const expected = '/my/absolute/path'; - fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); + resourceStorage.absoluteDirectoryPath.should.equalFileSystemPath(expected); }); }); describe('incorrect directory', () => { it('should throw error if no directory were passed', function () { const options = {}; - function createFsAdapter () { - new FSAdapter(options); + function createResourceStorage () { + new ResourceStorage(options); } - should(createFsAdapter).throw(/Incorrect directory/); + should(createResourceStorage).throw(/Incorrect directory/); }); it('should throw error if empty directory were passed', function () { const options = { directory: '' }; - function createFsAdapter () { - new FSAdapter(options); + function createResourceStorage () { + new ResourceStorage(options); } - should(createFsAdapter).throw(/Incorrect directory/); + should(createResourceStorage).throw(/Incorrect directory/); }); it('should throw error if incorrect directory passed', function () { const options = { directory: {} }; - function createFsAdapter () { - new FSAdapter(options); + function createResourceStorage () { + new ResourceStorage(options); } - should(createFsAdapter).throw(/Incorrect directory/); + should(createResourceStorage).throw(/Incorrect directory/); }); }); describe('existing directory', () => { it('should throw error if directory exists', () => { - const FSAdapter = proxyquire('../../lib/fs-adaper', { + const ResourceStorage = proxyquire('../../lib/resource-storage', { 'fs-extra': { statSync: sinon.stub().returns('fake-stat') } @@ -71,10 +71,10 @@ describe('FSAdapter', function () { const options = { directory: 'fake-directory' }; - function createFsAdapter () { - new FSAdapter(options); + function createResourceStorage () { + new ResourceStorage(options); } - should(createFsAdapter).throw(/Directory (.*?) exists/); + should(createResourceStorage).throw(/Directory (.*?) exists/); }); }); From 3fea5658c026e6c3466dbd8f1e0be278f1dc2236 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 14:34:19 +0300 Subject: [PATCH 08/14] Add tests --- .../resource-storage/resource-storage.test.js | 68 +++++++++++++++++++ ...apter-test.js => resource-storage.test.js} | 0 test/unit/scraper-init-test.js | 57 +++++++++++++--- 3 files changed, 117 insertions(+), 8 deletions(-) create mode 100644 test/functional/resource-storage/resource-storage.test.js rename test/unit/{fs-adapter-test.js => resource-storage.test.js} (100%) diff --git a/test/functional/resource-storage/resource-storage.test.js b/test/functional/resource-storage/resource-storage.test.js new file mode 100644 index 00000000..c3a0da37 --- /dev/null +++ b/test/functional/resource-storage/resource-storage.test.js @@ -0,0 +1,68 @@ +'use strict'; + +const should = require('should'); +const nock = require('nock'); +const fs = require('fs-extra'); +const sinon = require('sinon'); +const scrape = require('../../../index'); + +const testDirname = __dirname + '/.tmp'; + +describe('Functional resourceStorage', () => { + + beforeEach(() => { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + it('should use passed resourceStorage when saving resource', function() { + nock('http://example.com/').get('/').reply(200, 'OK'); + + class MyResourceStorage { + saveResource() {} + removeSavedResources() {} + } + + const saveResourceStub = sinon.stub(MyResourceStorage.prototype, 'saveResource').resolves(); + + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + resourceStorage: MyResourceStorage + }; + + return scrape(options).catch(function() { + should(saveResourceStub.calledOnce).be.eql(true); + should(saveResourceStub.args[0][0].url).be.eql('http://example.com/'); + }); + }); + + it('should use passed resourceStorage on error', function() { + nock('http://example.com/').get('/').replyWithError('ERROR'); + + class MyResourceStorage { + saveResource() {} + removeSavedResources() {} + } + + const removeResourcesStub = sinon.stub(MyResourceStorage.prototype, 'removeSavedResources').resolves(); + + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + resourceStorage: MyResourceStorage, + ignoreErrors: false + }; + + return scrape(options).catch(function() { + should(removeResourcesStub.calledOnce).be.eql(true); + }); + }); + +}); diff --git a/test/unit/fs-adapter-test.js b/test/unit/resource-storage.test.js similarity index 100% rename from test/unit/fs-adapter-test.js rename to test/unit/resource-storage.test.js diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 74208b34..650f959f 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -1,12 +1,14 @@ -var should = require('should'); -var proxyquire = require('proxyquire').noCallThru(); -var sinon = require('sinon'); -var path = require('path'); -var Scraper = require('../../lib/scraper'); -var Resource = require('../../lib/resource'); +'use strict'; -var testDirname = __dirname + '/.scraper-init-test'; -var urls = [ 'http://example.com' ]; +const should = require('should'); +const proxyquire = require('proxyquire').noCallThru(); +const sinon = require('sinon'); +const path = require('path'); +const Scraper = require('../../lib/scraper'); +const Resource = require('../../lib/resource'); + +const testDirname = __dirname + '/.scraper-init-test'; +const urls = [ 'http://example.com' ]; describe('Scraper initialization', function () { describe('defaultFilename', function() { @@ -261,4 +263,43 @@ describe('Scraper initialization', function () { s.resources[0].getFilename().should.equalFileSystemPath('default.html'); }); }); + + describe('resourceStorage', () => { + it('should create default resourceStorage with correct params', () => { + const ResourceStorageStub = sinon.stub(); + const Scraper = proxyquire('../../lib/scraper', { + './resource-storage': ResourceStorageStub + }); + + const options = { + urls: { url: 'http://first-url.com' }, + directory: testDirname, + maxDepth: 100 + }; + + const s = new Scraper(options); + ResourceStorageStub.calledOnce.should.be.eql(true); + ResourceStorageStub.args[0][0].should.be.eql(s.options); + }); + + it('should create custom resourceStorage with correct params', () => { + const DefaultResourceStorageStub = sinon.stub(); + const Scraper = proxyquire('../../lib/scraper', { + './resource-storage': DefaultResourceStorageStub + }); + const CustomResourceStorageStub = sinon.stub(); + + const options = { + urls: { url: 'http://first-url.com' }, + directory: testDirname, + maxDepth: 100, + resourceStorage: CustomResourceStorageStub + }; + + const s = new Scraper(options); + CustomResourceStorageStub.calledOnce.should.be.eql(true); + CustomResourceStorageStub.args[0][0].should.be.eql(s.options); + DefaultResourceStorageStub.called.should.be.eql(false); + }); + }); }); \ No newline at end of file From 2778d937565269a18afbe8dc9c6a93071fd70d47 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Sat, 8 Apr 2017 14:45:53 +0300 Subject: [PATCH 09/14] Add tests --- .../error-handling/error-handling.test.js | 2 +- test/unit/resource-storage.test.js | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 67067c99..821f04f6 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -81,7 +81,7 @@ describe('Functional error handling', function() { beforeEach(function() { var originalHandleResource = scraper.resourceHandler.handleResource; var callCount = 0; - handleResourceStub = sinon.stub(scraper.resourceHandler, 'handleResource', function() { + handleResourceStub = sinon.stub(scraper.resourceHandler, 'handleResource').callsFake(function() { if (callCount++ === 3) { return Promise.reject(new Error('RESOURCE HANDLER FAILED!')); } diff --git a/test/unit/resource-storage.test.js b/test/unit/resource-storage.test.js index b5ebe028..430bdff8 100644 --- a/test/unit/resource-storage.test.js +++ b/test/unit/resource-storage.test.js @@ -76,6 +76,22 @@ describe('ResourceStorage', function () { } should(createResourceStorage).throw(/Directory (.*?) exists/); }); + + it('should throw other errors as is', () => { + const ResourceStorage = proxyquire('../../lib/resource-storage', { + 'fs-extra': { + statSync: sinon.stub().throws(new Error('other fs error')) + } + }); + + const options = { + directory: 'fake-directory' + }; + function createResourceStorage () { + new ResourceStorage(options); + } + should(createResourceStorage).throw('other fs error'); + }); }); }); From 05e303d1f401792347add58997eaacbefbf0fd42 Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Sat, 8 Apr 2017 15:05:28 +0300 Subject: [PATCH 10/14] Add description for resourceStorage --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index cc1020bc..3b87deac 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ scrape(options, (error, result) => { * [urlFilter](#urlfilter) - skip some urls * [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource * [httpResponseHandler](#httpresponsehandler) - customize http response handling +* [resourceStorage](#resourcestorage) - customize resources saving * [onResourceSaved](#onresourcesaved) - callback called when resource is saved * [onResourceError](#onresourceerror) - callback called when resource's downloading is failed @@ -211,6 +212,19 @@ scrape({ ``` Scrape function resolves with array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain `metadata` property from `httpResponseHandler`. +#### resourceStorage +Class which saves [Resources](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js), should have methods `saveResource` and `removeSavedResources` which return Promises. Use it to save files where you need: to dropbox, amazon S3, existing directory, etc. By default all files are saved in local file system to new directory passed in `directory` option (see [lib/resource-storage.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource-storage.js)). +```javascript +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + resourceStorage: class MyResourceStorage { + saveResource (resource) {/* code to save file where you need */} + removeSavedResources() {/* code to remove all previously saved files in case of error */} + } +}).then(console.log).catch(console.log); +``` + #### onResourceSaved Function called each time when resource is saved to file system. Callback is called with [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object. Defaults to `null` - no callback will be called. ```javascript From 0a8cf623750aff11720f9c833c3856b402e2e505 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Tue, 11 Apr 2017 22:55:05 +0300 Subject: [PATCH 11/14] Rename resourceStorage -> resourceSaver --- README.md | 10 ++-- lib/config/defaults.js | 2 +- .../index.js} | 6 +-- lib/scraper.js | 43 ++++++++-------- lib/utils/index.js | 42 ++++++++++------ .../error-handling/error-handling.test.js | 6 +-- test/functional/redirect/redirect.test.js | 2 +- .../resource-saver.test.js} | 25 +++++----- test/unit/resource-storage.test.js | 50 +++++++++---------- test/unit/scraper-init-test.js | 28 +++++------ 10 files changed, 112 insertions(+), 102 deletions(-) rename lib/{resource-storage.js => resource-saver/index.js} (94%) rename test/functional/{resource-storage/resource-storage.test.js => resource-saver/resource-saver.test.js} (59%) diff --git a/README.md b/README.md index 3b87deac..91b2e9cf 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ scrape(options, (error, result) => { * [urlFilter](#urlfilter) - skip some urls * [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource * [httpResponseHandler](#httpresponsehandler) - customize http response handling -* [resourceStorage](#resourcestorage) - customize resources saving +* [resourceSaver](#resourcesaver) - customize resources saving * [onResourceSaved](#onresourcesaved) - callback called when resource is saved * [onResourceError](#onresourceerror) - callback called when resource's downloading is failed @@ -212,15 +212,15 @@ scrape({ ``` Scrape function resolves with array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain `metadata` property from `httpResponseHandler`. -#### resourceStorage -Class which saves [Resources](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js), should have methods `saveResource` and `removeSavedResources` which return Promises. Use it to save files where you need: to dropbox, amazon S3, existing directory, etc. By default all files are saved in local file system to new directory passed in `directory` option (see [lib/resource-storage.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource-storage.js)). +#### resourceSaver +Class which saves [Resources](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js), should have methods `saveResource` and `errorCleanup` which return Promises. Use it to save files where you need: to dropbox, amazon S3, existing directory, etc. By default all files are saved in local file system to new directory passed in `directory` option (see [lib/resource-saver.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource-saver/index.js)). ```javascript scrape({ urls: ['http://example.com/'], directory: '/path/to/save', - resourceStorage: class MyResourceStorage { + resourceSaver: class MyResourceSaver { saveResource (resource) {/* code to save file where you need */} - removeSavedResources() {/* code to remove all previously saved files in case of error */} + errorCleanup() {/* code to remove all previously saved files in case of error */} } }).then(console.log).catch(console.log); ``` diff --git a/lib/config/defaults.js b/lib/config/defaults.js index afbd1872..8fa7bcc4 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -53,7 +53,7 @@ const config = { httpResponseHandler: null, onResourceSaved: null, onResourceError: null, - resourceStorage: null + resourceSaver: null }; module.exports = config; diff --git a/lib/resource-storage.js b/lib/resource-saver/index.js similarity index 94% rename from lib/resource-storage.js rename to lib/resource-saver/index.js index f0d67177..11196ac3 100644 --- a/lib/resource-storage.js +++ b/lib/resource-saver/index.js @@ -10,7 +10,7 @@ const removeAsync = Promise.promisify(fs.remove); const supportedOptions = [ 'directory' ]; -class ResourceStorage { +class ResourceSaver { constructor (options) { this.options = _.pick(options, supportedOptions); @@ -44,7 +44,7 @@ class ResourceStorage { * Remove all files that were saved before * @returns {Promise} */ - removeSavedResources () { + errorCleanup () { if (!_.isEmpty(this.loadedResources)) { return removeAsync(this.absoluteDirectoryPath); } @@ -69,4 +69,4 @@ function exists (path) { return exists; } -module.exports = ResourceStorage; +module.exports = ResourceSaver; diff --git a/lib/scraper.js b/lib/scraper.js index bba1dada..a4fc7f57 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -12,15 +12,15 @@ const Resource = require('./resource'); const FilenameGenerator = require('./filename-generator'); const Request = require('./request'); const ResourceHandler = require('./resource-handler'); -const ResourceStorage = require('./resource-storage'); -const utils = require('./utils'); +const ResourceSaver = require('./resource-saver'); +const u = require('./utils'); const NormalizedUrlMap = require('./utils/normalized-url-map'); function Scraper (options) { const self = this; - self.options = Object.assign({}, defaults, options); - self.options.request = Object.assign({}, defaults.request, options.request); + self.options = u.extend(defaults, options); + self.options.request = u.extend(defaults.request, options.request); self.options.urls = Array.isArray(self.options.urls) ? self.options.urls : [self.options.urls]; if (self.options.subdirectories) { @@ -38,7 +38,7 @@ function Scraper (options) { self.request = new Request(self.options); self.resourceHandler = new ResourceHandler(self.options, self); self.filenameGenerator = new FilenameGenerator(self.options); - self.resourceStorage = self.options.resourceStorage ? new self.options.resourceStorage(self.options) : new ResourceStorage(self.options); + self.resourceSaver = self.options.resourceSaver ? new self.options.resourceSaver(u.clone(self.options)) : new ResourceSaver(self.options); // Array of Resources for downloading self.resources = self.options.urls.map((obj) => { @@ -71,7 +71,7 @@ Scraper.prototype.saveResource = function saveResource (resource) { return self.resourceHandler.handleResource(resource); }).then(function fileHandled () { logger.info('saving resource ' + resource + ' to fs'); - return self.resourceStorage.saveResource(resource); + return self.resourceSaver.saveResource(resource); }).then(function afterResourceSaved () { if (self.options.onResourceSaved) { self.options.onResourceSaved(resource); @@ -92,7 +92,7 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { return self.request.get(url, referer); }).then(function requestCompleted (responseData) { - if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects + if (!u.urlsEqual(responseData.url, url)) { // Url may be changed in redirects logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); if (self.requestedResourcePromises.has(responseData.url)) { @@ -103,14 +103,14 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { self.requestedResourcePromises.set(responseData.url, requestPromise); } - resource.setType(utils.getTypeByMime(responseData.mimeType)); + resource.setType(u.getTypeByMime(responseData.mimeType)); const filename = self.filenameGenerator.generateFilename(resource); resource.setFilename(filename); // if type was not determined by mime we can try to get it from filename after it was generated if (!resource.getType()) { - resource.setType(utils.getTypeByFilename(filename)); + resource.setType(u.getTypeByFilename(filename)); } if (responseData.metadata) { @@ -130,25 +130,24 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { }; Scraper.prototype.requestResource = function requestResource (resource) { - const self = this; const url = resource.getUrl(); - if (self.options.urlFilter && !self.options.urlFilter(url)) { + if (this.options.urlFilter && !this.options.urlFilter(url)) { logger.debug('filtering out ' + resource + ' by url filter'); return Promise.resolve(null); } - if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) { + if (this.options.maxDepth && resource.getDepth() > this.options.maxDepth) { logger.debug('filtering out ' + resource + ' by depth'); return Promise.resolve(null); } - if (self.requestedResourcePromises.has(url)) { + if (this.requestedResourcePromises.has(url)) { logger.debug('found requested resource for ' + resource); - return self.requestedResourcePromises.get(url); + return this.requestedResourcePromises.get(url); } - return self.createNewRequest(resource); + return this.createNewRequest(resource); }; Scraper.prototype.load = function load () { @@ -161,16 +160,16 @@ Scraper.prototype.load = function load () { // 1. Get all not saved resources and save them // 2. Recursion if any new not saved resource were added during this time. If not, loading is done. Scraper.prototype.waitForLoad = function waitForLoad () { - const self = this; - const resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved()); + const resourcesToSave = Array.from(this.loadedResources.values()).filter((r) => !r.isSaved()); const loadingIsFinished = _.isEmpty(resourcesToSave); if (!loadingIsFinished) { - return Promise.mapSeries(resourcesToSave, self.saveResource.bind(self)) - .then(self.waitForLoad.bind(self)); + return Promise + .mapSeries(resourcesToSave, this.saveResource.bind(this)) + .then(this.waitForLoad.bind(this)); } logger.info('downloading is finished successfully'); - return Promise.resolve(self.resources); + return Promise.resolve(this.resources); }; Scraper.prototype.handleError = function handleError (err, resource) { @@ -186,7 +185,7 @@ Scraper.prototype.handleError = function handleError (err, resource) { Scraper.prototype.errorCleanup = function errorCleanup (error) { logger.error('finishing with error: ' + error.message); - return this.resourceStorage.removeSavedResources().then(function loadedDataRemoved () { + return this.resourceSaver.errorCleanup(error).then(() => { return Promise.reject(error); }); }; @@ -198,6 +197,6 @@ Scraper.prototype.scrape = function scrape (callback) { .asCallback(callback); }; -Scraper.defaults = Object.assign({}, defaults); +Scraper.defaults = u.clone(defaults); module.exports = Scraper; diff --git a/lib/utils/index.js b/lib/utils/index.js index ff7f277b..90f29d4d 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -118,21 +118,31 @@ function decodeHtmlEntities (text) { return typeof text === 'string' ? htmlEntities.decode(text) : ''; } +function clone (obj) { + return Object.assign({}, obj); +} + +function extend (first, second) { + return Object.assign({}, first, second); +} + module.exports = { - isUrl: isUrl, - getUrl: getUrl, - getUnixPath: getUnixPath, - getRelativePath: getRelativePath, - getFilenameFromUrl: getFilenameFromUrl, - getFilepathFromUrl: getFilepathFromUrl, - getFilenameExtension: getFilenameExtension, - getHashFromUrl: getHashFromUrl, - shortenFilename: shortenFilename, - waitAllFulfilled: waitAllFulfilled, - normalizeUrl: normalizeUrl, - urlsEqual: urlsEqual, - isUriSchemaSupported: isUriSchemaSupported, - getTypeByMime: getTypeByMime, - getTypeByFilename: getTypeByFilename, - decodeHtmlEntities: decodeHtmlEntities + isUrl, + getUrl, + getUnixPath, + getRelativePath, + getFilenameFromUrl, + getFilepathFromUrl, + getFilenameExtension, + getHashFromUrl, + shortenFilename, + waitAllFulfilled, + normalizeUrl, + urlsEqual, + isUriSchemaSupported, + getTypeByMime, + getTypeByFilename, + decodeHtmlEntities, + clone, + extend }; diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 821f04f6..da7952b6 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -46,10 +46,10 @@ describe('Functional error handling', function() { let loadToFsStub, handleErrorSpy; beforeEach(function() { - scraper.resourceStorage.loadedResources = [1, 2]; - loadToFsStub = sinon.stub(scraper.resourceStorage, 'saveResource').resolves(); + scraper.resourceSaver.loadedResources = [1, 2]; + loadToFsStub = sinon.stub(scraper.resourceSaver, 'saveResource').resolves(); loadToFsStub.onCall(2).rejects(new Error('FS FAILED!')); - handleErrorSpy = sinon.spy(scraper.resourceStorage, 'removeSavedResources'); + handleErrorSpy = sinon.spy(scraper.resourceSaver, 'errorCleanup'); }); it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () { diff --git a/test/functional/redirect/redirect.test.js b/test/functional/redirect/redirect.test.js index 1ef858b4..2ddf81e8 100644 --- a/test/functional/redirect/redirect.test.js +++ b/test/functional/redirect/redirect.test.js @@ -41,7 +41,7 @@ describe('Functional redirects', function() { sources: [] }; var scraper = new Scraper(options); - var loadToFsSpy = sinon.spy(scraper.resourceStorage, 'saveResource'); + var loadToFsSpy = sinon.spy(scraper.resourceSaver, 'saveResource'); return scraper.scrape().then(function() { loadToFsSpy.callCount.should.be.eql(2); diff --git a/test/functional/resource-storage/resource-storage.test.js b/test/functional/resource-saver/resource-saver.test.js similarity index 59% rename from test/functional/resource-storage/resource-storage.test.js rename to test/functional/resource-saver/resource-saver.test.js index c3a0da37..37598d88 100644 --- a/test/functional/resource-storage/resource-storage.test.js +++ b/test/functional/resource-saver/resource-saver.test.js @@ -8,7 +8,7 @@ const scrape = require('../../../index'); const testDirname = __dirname + '/.tmp'; -describe('Functional resourceStorage', () => { +describe('Functional resourceSaver', () => { beforeEach(() => { nock.cleanAll(); @@ -21,20 +21,20 @@ describe('Functional resourceStorage', () => { fs.removeSync(testDirname); }); - it('should use passed resourceStorage when saving resource', function() { + it('should use passed resourceSaver when saving resource', function() { nock('http://example.com/').get('/').reply(200, 'OK'); - class MyResourceStorage { + class MyResourceSaver { saveResource() {} - removeSavedResources() {} + errorCleanup() {} } - const saveResourceStub = sinon.stub(MyResourceStorage.prototype, 'saveResource').resolves(); + const saveResourceStub = sinon.stub(MyResourceSaver.prototype, 'saveResource').resolves(); const options = { urls: [ 'http://example.com/' ], directory: testDirname, - resourceStorage: MyResourceStorage + resourceSaver: MyResourceSaver }; return scrape(options).catch(function() { @@ -43,25 +43,26 @@ describe('Functional resourceStorage', () => { }); }); - it('should use passed resourceStorage on error', function() { - nock('http://example.com/').get('/').replyWithError('ERROR'); + it('should use passed resourceSaver on error', function() { + nock('http://example.com/').get('/').replyWithError('SCRAPER AWFUL ERROR'); - class MyResourceStorage { + class MyResourceSaver { saveResource() {} - removeSavedResources() {} + errorCleanup() {} } - const removeResourcesStub = sinon.stub(MyResourceStorage.prototype, 'removeSavedResources').resolves(); + const removeResourcesStub = sinon.stub(MyResourceSaver.prototype, 'errorCleanup').resolves(); const options = { urls: [ 'http://example.com/' ], directory: testDirname, - resourceStorage: MyResourceStorage, + resourceSaver: MyResourceSaver, ignoreErrors: false }; return scrape(options).catch(function() { should(removeResourcesStub.calledOnce).be.eql(true); + should(removeResourcesStub.args[0][0].message).be.eql('SCRAPER AWFUL ERROR'); }); }); diff --git a/test/unit/resource-storage.test.js b/test/unit/resource-storage.test.js index 430bdff8..a807748a 100644 --- a/test/unit/resource-storage.test.js +++ b/test/unit/resource-storage.test.js @@ -4,65 +4,65 @@ const should = require('should'); const sinon = require('sinon'); const proxyquire = require('proxyquire'); const path = require('path'); -const ResourceStorage = require('../../lib/resource-storage'); +const ResourceSaver = require('../../lib/resource-saver'); -describe('ResourceStorage', function () { +describe('ResourceSaver', function () { describe('constructor', function() { it('should pick supported options', function() { const options = { a: 1, b: 2, directory: 'myDirectory' }; - const resourceStorage = new ResourceStorage(options); - resourceStorage.options.should.eql({directory: 'myDirectory'}); + const resourceSaver = new ResourceSaver(options); + resourceSaver.options.should.eql({directory: 'myDirectory'}); }); describe('absoluteDirectoryPath', () => { it('should create absolute path if directory is relative path', function () { const options = { directory: 'my/relative/path' }; - const resourceStorage = new ResourceStorage(options); + const resourceSaver = new ResourceSaver(options); const expected = path.join(process.cwd(), 'my/relative/path'); - resourceStorage.absoluteDirectoryPath.should.equalFileSystemPath(expected); + resourceSaver.absoluteDirectoryPath.should.equalFileSystemPath(expected); }); it('should use directory if directory is absolute path', function () { const options = { directory: '/my/absolute/path' }; - const resourceStorage = new ResourceStorage(options); + const resourceSaver = new ResourceSaver(options); const expected = '/my/absolute/path'; - resourceStorage.absoluteDirectoryPath.should.equalFileSystemPath(expected); + resourceSaver.absoluteDirectoryPath.should.equalFileSystemPath(expected); }); }); describe('incorrect directory', () => { it('should throw error if no directory were passed', function () { const options = {}; - function createResourceStorage () { - new ResourceStorage(options); + function createResourceSaver () { + new ResourceSaver(options); } - should(createResourceStorage).throw(/Incorrect directory/); + should(createResourceSaver).throw(/Incorrect directory/); }); it('should throw error if empty directory were passed', function () { const options = { directory: '' }; - function createResourceStorage () { - new ResourceStorage(options); + function createResourceSaver () { + new ResourceSaver(options); } - should(createResourceStorage).throw(/Incorrect directory/); + should(createResourceSaver).throw(/Incorrect directory/); }); it('should throw error if incorrect directory passed', function () { const options = { directory: {} }; - function createResourceStorage () { - new ResourceStorage(options); + function createResourceSaver () { + new ResourceSaver(options); } - should(createResourceStorage).throw(/Incorrect directory/); + should(createResourceSaver).throw(/Incorrect directory/); }); }); describe('existing directory', () => { it('should throw error if directory exists', () => { - const ResourceStorage = proxyquire('../../lib/resource-storage', { + const ResourceSaver = proxyquire('../../lib/resource-saver', { 'fs-extra': { statSync: sinon.stub().returns('fake-stat') } @@ -71,14 +71,14 @@ describe('ResourceStorage', function () { const options = { directory: 'fake-directory' }; - function createResourceStorage () { - new ResourceStorage(options); + function createResourceSaver () { + new ResourceSaver(options); } - should(createResourceStorage).throw(/Directory (.*?) exists/); + should(createResourceSaver).throw(/Directory (.*?) exists/); }); it('should throw other errors as is', () => { - const ResourceStorage = proxyquire('../../lib/resource-storage', { + const ResourceSaver = proxyquire('../../lib/resource-saver', { 'fs-extra': { statSync: sinon.stub().throws(new Error('other fs error')) } @@ -87,10 +87,10 @@ describe('ResourceStorage', function () { const options = { directory: 'fake-directory' }; - function createResourceStorage () { - new ResourceStorage(options); + function createResourceSaver () { + new ResourceSaver(options); } - should(createResourceStorage).throw('other fs error'); + should(createResourceSaver).throw('other fs error'); }); }); diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 650f959f..05d7e14f 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -264,11 +264,11 @@ describe('Scraper initialization', function () { }); }); - describe('resourceStorage', () => { - it('should create default resourceStorage with correct params', () => { - const ResourceStorageStub = sinon.stub(); + describe('resourceSaver', () => { + it('should create default resourceSaver with correct params', () => { + const ResourceSaverStub = sinon.stub(); const Scraper = proxyquire('../../lib/scraper', { - './resource-storage': ResourceStorageStub + './resource-saver': ResourceSaverStub }); const options = { @@ -278,28 +278,28 @@ describe('Scraper initialization', function () { }; const s = new Scraper(options); - ResourceStorageStub.calledOnce.should.be.eql(true); - ResourceStorageStub.args[0][0].should.be.eql(s.options); + ResourceSaverStub.calledOnce.should.be.eql(true); + ResourceSaverStub.args[0][0].should.be.eql(s.options); }); - it('should create custom resourceStorage with correct params', () => { - const DefaultResourceStorageStub = sinon.stub(); + it('should create custom resourceSaver with correct params', () => { + const DefaultResourceSaverStub = sinon.stub(); const Scraper = proxyquire('../../lib/scraper', { - './resource-storage': DefaultResourceStorageStub + './resource-saver': DefaultResourceSaverStub }); - const CustomResourceStorageStub = sinon.stub(); + const CustomResourceSaverStub = sinon.stub(); const options = { urls: { url: 'http://first-url.com' }, directory: testDirname, maxDepth: 100, - resourceStorage: CustomResourceStorageStub + resourceSaver: CustomResourceSaverStub }; const s = new Scraper(options); - CustomResourceStorageStub.calledOnce.should.be.eql(true); - CustomResourceStorageStub.args[0][0].should.be.eql(s.options); - DefaultResourceStorageStub.called.should.be.eql(false); + CustomResourceSaverStub.calledOnce.should.be.eql(true); + CustomResourceSaverStub.args[0][0].should.be.eql(s.options); + DefaultResourceSaverStub.called.should.be.eql(false); }); }); }); \ No newline at end of file From 327c15d1cba35dde9864bb29049c7debf46bb7e7 Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Tue, 11 Apr 2017 23:06:17 +0300 Subject: [PATCH 12/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 91b2e9cf..c0b9db8c 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,7 @@ scrape({ Scrape function resolves with array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain `metadata` property from `httpResponseHandler`. #### resourceSaver -Class which saves [Resources](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js), should have methods `saveResource` and `errorCleanup` which return Promises. Use it to save files where you need: to dropbox, amazon S3, existing directory, etc. By default all files are saved in local file system to new directory passed in `directory` option (see [lib/resource-saver.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource-saver/index.js)). +Class which saves [Resources](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js), should have methods `saveResource` and `errorCleanup` which return Promises. Use it to save files where you need: to dropbox, amazon S3, existing directory, etc. By default all files are saved in local file system to new directory passed in `directory` option (see [lib/resource-saver/index.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource-saver/index.js)). ```javascript scrape({ urls: ['http://example.com/'], From 177873edbc0f59b25aa6ebcd1714459b04b503f2 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Tue, 11 Apr 2017 23:10:03 +0300 Subject: [PATCH 13/14] Rename test file --- test/unit/{resource-storage.test.js => resource-saver.test.js} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/unit/{resource-storage.test.js => resource-saver.test.js} (100%) diff --git a/test/unit/resource-storage.test.js b/test/unit/resource-saver.test.js similarity index 100% rename from test/unit/resource-storage.test.js rename to test/unit/resource-saver.test.js From cab57fc6aa780a139d4fe32b517d635b148cd01b Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Tue, 11 Apr 2017 23:18:44 +0300 Subject: [PATCH 14/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c0b9db8c..4e0706a6 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ scrape({ directory: '/path/to/save', resourceSaver: class MyResourceSaver { saveResource (resource) {/* code to save file where you need */} - errorCleanup() {/* code to remove all previously saved files in case of error */} + errorCleanup (err) {/* code to remove all previously saved files in case of error */} } }).then(console.log).catch(console.log); ```