diff --git a/.travis.yml b/.travis.yml index 7940d5c0..5e1d2555 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ node_js: - '4' - '5' - '6' + - '7' after_success: - codeclimate-test-reporter < coverage/lcov.info - coveralls < coverage/lcov.info diff --git a/README.md b/README.md index cc1020bc..4e0706a6 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,7 @@ scrape(options, (error, result) => { * [urlFilter](#urlfilter) - skip some urls * [filenameGenerator](#filenamegenerator) - generate filename for downloaded resource * [httpResponseHandler](#httpresponsehandler) - customize http response handling +* [resourceSaver](#resourcesaver) - customize how resources are saved * [onResourceSaved](#onresourcesaved) - callback called when resource is saved * [onResourceError](#onresourceerror) - callback called when resource's downloading is failed @@ -211,6 +212,19 @@ scrape({ ``` Scrape function resolves with array of [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) objects which contain `metadata` property from `httpResponseHandler`. +#### resourceSaver +Class which saves [Resources](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js). It should have the methods `saveResource` and `errorCleanup`, both of which return Promises. Use it to save files wherever you need: to Dropbox, Amazon S3, an existing directory, etc. By default all files are saved to the local file system, in the new directory passed in the `directory` option (see [lib/resource-saver/index.js](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource-saver/index.js)).
+```javascript +scrape({ + urls: ['http://example.com/'], + directory: '/path/to/save', + resourceSaver: class MyResourceSaver { + saveResource (resource) {/* code to save file where you need */} + errorCleanup (err) {/* code to remove all previously saved files in case of error */} + } +}).then(console.log).catch(console.log); +``` + #### onResourceSaved Function called each time when resource is saved to file system. Callback is called with [Resource](https://github.com/s0ph1e/node-website-scraper/blob/master/lib/resource.js) object. Defaults to `null` - no callback will be called. ```javascript diff --git a/appveyor.yml b/appveyor.yml index f12f1a5c..7fd2e426 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -3,6 +3,7 @@ environment: - nodejs_version: "4" - nodejs_version: "5" - nodejs_version: "6" + - nodejs_version: "7" install: - ps: Install-Product node $env:nodejs_version diff --git a/index.js b/index.js index 9b698f3c..06c18c69 100644 --- a/index.js +++ b/index.js @@ -1,7 +1,12 @@ -var Scraper = require('./lib/scraper.js'); +'use strict'; -module.exports = function scrape (options, callback) { - return new Scraper(options).scrape(callback); +const Promise = require('bluebird'); +const Scraper = require('./lib/scraper.js'); + +module.exports = (options, callback) => { + return Promise.try(() => { + return new Scraper(options).scrape(); + }).asCallback(callback); // callback also receives errors thrown by the Scraper constructor }; module.exports.defaults = Scraper.defaults; diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 2091cfbb..8fa7bcc4 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -1,4 +1,5 @@ -var config = { +'use strict'; +const config = { filenameGenerator: 'byType', defaultFilename: 'index.html', prettifyUrls: false, @@ -51,7 +52,8 @@ var config = { ignoreErrors: true, httpResponseHandler: null, onResourceSaved: null, - onResourceError: null + onResourceError: null, + resourceSaver: null }; module.exports = config; diff --git a/lib/fs-adaper.js b/lib/fs-adaper.js deleted file mode 100644
index cdc1c90f..00000000 --- a/lib/fs-adaper.js +++ /dev/null @@ -1,59 +0,0 @@ -var path = require('path'); -var _ = require('lodash'); -var Promise = require('bluebird'); - -var fs = require('fs-extra'); -var existsAsync = Promise.promisify(fs.stat); -var outputFileAsync = Promise.promisify(fs.outputFile); -var ensureDirAsync = Promise.promisify(fs.ensureDir); -var removeAsync = Promise.promisify(fs.remove); - -var supportedOptions = [ 'directory' ]; - -function FSAdapter (options) { - var self = this; - - self.loadedResources = []; // Array of loaded Resources - - self.options = _.pick(options, supportedOptions); - - if (self.options.directory) { - self.absoluteDirectoryPath = path.resolve(process.cwd(), self.options.directory); - } -} - -FSAdapter.prototype.validateDirectory = function validateDirectory () { - var self = this; - if (_.isEmpty(self.options.directory) || !_.isString(self.options.directory)) { - return Promise.reject(new Error('Incorrect directory ' + self.options.directory)); - } - - return existsAsync(self.absoluteDirectoryPath).then(function handleDirectoryExist () { - return Promise.reject(new Error('Directory ' + self.absoluteDirectoryPath + ' exists')); - }, function handleDirectoryNotExist () { - return Promise.resolve(); - }); -}; - -FSAdapter.prototype.createDirectory = function createDirectory () { - return ensureDirAsync(this.absoluteDirectoryPath); -}; - -FSAdapter.prototype.cleanDirectory = function cleanDirectory () { - if (!_.isEmpty(this.loadedResources)) { - return removeAsync(this.absoluteDirectoryPath); - } - return Promise.resolve(); -}; - -FSAdapter.prototype.saveResource = function saveResource (resource) { - var self = this; - - var filename = path.join(self.absoluteDirectoryPath, resource.getFilename()); - var text = resource.getText(); - return outputFileAsync(filename, text, { encoding: 'binary' }).then(function resourceSaved () { - self.loadedResources.push(resource); - }); -}; - -module.exports = FSAdapter; diff --git 
a/lib/resource-saver/index.js b/lib/resource-saver/index.js new file mode 100644 index 00000000..11196ac3 --- /dev/null +++ b/lib/resource-saver/index.js @@ -0,0 +1,72 @@ +'use strict'; + +const path = require('path'); +const _ = require('lodash'); +const Promise = require('bluebird'); + +const fs = require('fs-extra'); +const outputFileAsync = Promise.promisify(fs.outputFile); +const removeAsync = Promise.promisify(fs.remove); + +const supportedOptions = [ 'directory' ]; + +class ResourceSaver { + constructor (options) { + this.options = _.pick(options, supportedOptions); + + if (!this.options.directory || typeof this.options.directory !== 'string') { + throw new Error('Incorrect directory ' + this.options.directory); + } + + this.absoluteDirectoryPath = path.resolve(process.cwd(), this.options.directory); + + if (exists(this.absoluteDirectoryPath)) { + throw new Error('Directory ' + this.absoluteDirectoryPath + ' exists'); + } + + this.loadedResources = []; + } + + /** + * Save resource to file system + * @param {Resource} resource + * @returns {Promise} + */ + saveResource (resource) { + const filename = path.join(this.absoluteDirectoryPath, resource.getFilename()); + const text = resource.getText(); + return outputFileAsync(filename, text, { encoding: 'binary' }).then(() => { + this.loadedResources.push(resource); + }); + } + + /** + * Remove all files that were saved before + * @returns {Promise} + */ + errorCleanup () { + if (!_.isEmpty(this.loadedResources)) { + return removeAsync(this.absoluteDirectoryPath); + } + return Promise.resolve(); + } +} + +function exists (path) { + let exists; + try { + if (fs.statSync(path)) { + exists = true; + } + } catch (e) { + if (e.code === 'ENOENT') { + exists = false; + } else { + throw e; + } + } + + return exists; +} + +module.exports = ResourceSaver; diff --git a/lib/scraper.js b/lib/scraper.js index f3872429..a4fc7f57 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -1,26 +1,27 @@ -var Promise = 
require('bluebird'); -var _ = require('lodash'); +'use strict'; -var logger = require('./logger'); +const Promise = require('bluebird'); +const _ = require('lodash'); -var defaults = require('./config/defaults'); -var recursiveSources = require('./config/recursive-sources'); -var Resource = require('./resource'); +const logger = require('./logger'); -var FilenameGenerator = require('./filename-generator'); -var Request = require('./request'); -var ResourceHandler = require('./resource-handler'); -var FSAdapter = require('./fs-adaper'); -var utils = require('./utils'); -var NormalizedUrlMap = require('./utils/normalized-url-map'); +const defaults = require('./config/defaults'); +const recursiveSources = require('./config/recursive-sources'); +const Resource = require('./resource'); + +const FilenameGenerator = require('./filename-generator'); +const Request = require('./request'); +const ResourceHandler = require('./resource-handler'); +const ResourceSaver = require('./resource-saver'); +const u = require('./utils'); +const NormalizedUrlMap = require('./utils/normalized-url-map'); function Scraper (options) { - var self = this; + const self = this; - // Extend options - self.options = _.extend({}, defaults, options); - self.options.request = _.extend({}, defaults.request, options.request); - self.options.urls = _.isArray(self.options.urls) ? self.options.urls : [self.options.urls]; + self.options = u.extend(defaults, options); + self.options.request = u.extend(defaults.request, options.request); + self.options.urls = Array.isArray(self.options.urls) ? 
self.options.urls : [self.options.urls]; if (self.options.subdirectories) { self.options.subdirectories.forEach((element) => { @@ -37,13 +38,12 @@ function Scraper (options) { self.request = new Request(self.options); self.resourceHandler = new ResourceHandler(self.options, self); self.filenameGenerator = new FilenameGenerator(self.options); - self.fsAdapter = new FSAdapter(self.options); + self.resourceSaver = self.options.resourceSaver ? new self.options.resourceSaver(u.clone(self.options)) : new ResourceSaver(self.options); - // Custom structures // Array of Resources for downloading - self.originalResources = _.map(self.options.urls, function createResource (obj) { - var url = _.isObject(obj) && _.has(obj, 'url') ? obj.url : obj; - var filename = _.isObject(obj) && _.has(obj, 'filename') ? obj.filename : self.options.defaultFilename; + self.resources = self.options.urls.map((obj) => { + const url = (obj && obj.url) ? obj.url : obj; + const filename = (obj && obj.filename) ? obj.filename : self.options.defaultFilename; return new Resource(url, filename); }); @@ -52,19 +52,18 @@ function Scraper (options) { } Scraper.prototype.loadResource = function loadResource (resource) { - var self = this; - var url = resource.getUrl(); + const url = resource.getUrl(); - if (self.loadedResources.has(url)) { + if (this.loadedResources.has(url)) { logger.debug('found loaded resource for ' + resource); } else { logger.debug('add loaded resource ' + resource); - self.loadedResources.set(url, resource); + this.loadedResources.set(url, resource); } }; Scraper.prototype.saveResource = function saveResource (resource) { - var self = this; + const self = this; resource.setSaved(); return Promise.resolve() @@ -72,7 +71,7 @@ Scraper.prototype.saveResource = function saveResource (resource) { return self.resourceHandler.handleResource(resource); }).then(function fileHandled () { logger.info('saving resource ' + resource + ' to fs'); - return self.fsAdapter.saveResource(resource); + 
return self.resourceSaver.saveResource(resource); }).then(function afterResourceSaved () { if (self.options.onResourceSaved) { self.options.onResourceSaved(resource); @@ -84,16 +83,16 @@ Scraper.prototype.saveResource = function saveResource (resource) { }; Scraper.prototype.createNewRequest = function createNewRequest (resource) { - var self = this; - var url = resource.getUrl(); + const self = this; + const url = resource.getUrl(); - var requestPromise = Promise.resolve() + const requestPromise = Promise.resolve() .then(function makeRequest () { - var referer = resource.parent ? resource.parent.getUrl() : null; + const referer = resource.parent ? resource.parent.getUrl() : null; return self.request.get(url, referer); }).then(function requestCompleted (responseData) { - if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects + if (!u.urlsEqual(responseData.url, url)) { // Url may be changed in redirects logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); if (self.requestedResourcePromises.has(responseData.url)) { @@ -104,14 +103,14 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { self.requestedResourcePromises.set(responseData.url, requestPromise); } - resource.setType(utils.getTypeByMime(responseData.mimeType)); + resource.setType(u.getTypeByMime(responseData.mimeType)); - var filename = self.filenameGenerator.generateFilename(resource); + const filename = self.filenameGenerator.generateFilename(resource); resource.setFilename(filename); // if type was not determined by mime we can try to get it from filename after it was generated if (!resource.getType()) { - resource.setType(utils.getTypeByFilename(filename)); + resource.setType(u.getTypeByFilename(filename)); } if (responseData.metadata) { @@ -131,52 +130,46 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { }; Scraper.prototype.requestResource = function requestResource (resource) { - var self = 
this; - var url = resource.getUrl(); + const url = resource.getUrl(); - if (self.options.urlFilter && !self.options.urlFilter(url)) { + if (this.options.urlFilter && !this.options.urlFilter(url)) { logger.debug('filtering out ' + resource + ' by url filter'); return Promise.resolve(null); } - if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) { + if (this.options.maxDepth && resource.getDepth() > this.options.maxDepth) { logger.debug('filtering out ' + resource + ' by depth'); return Promise.resolve(null); } - if (self.requestedResourcePromises.has(url)) { + if (this.requestedResourcePromises.has(url)) { logger.debug('found requested resource for ' + resource); - return self.requestedResourcePromises.get(url); + return this.requestedResourcePromises.get(url); } - return self.createNewRequest(resource); -}; - -Scraper.prototype.validate = function validate () { - return this.fsAdapter.validateDirectory(); + return this.createNewRequest(resource); }; Scraper.prototype.load = function load () { - var self = this; - return self.fsAdapter.createDirectory().then(function loadAllResources () { - return Promise.map(self.originalResources, self.requestResource.bind(self)); - }).then(self.waitForLoad.bind(self)); + return Promise + .map(this.resources, this.requestResource.bind(this)) + .then(this.waitForLoad.bind(this)); }; // Returns a promise which gets resolved when all resources are loaded. // 1. Get all not saved resources and save them // 2. Recursion if any new not saved resource were added during this time. If not, loading is done. 
Scraper.prototype.waitForLoad = function waitForLoad () { - var self = this; - var resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved()); - var loadingIsFinished = _.isEmpty(resourcesToSave); + const resourcesToSave = Array.from(this.loadedResources.values()).filter((r) => !r.isSaved()); + const loadingIsFinished = _.isEmpty(resourcesToSave); if (!loadingIsFinished) { - return Promise.mapSeries(resourcesToSave, self.saveResource.bind(self)) - .then(self.waitForLoad.bind(self)); + return Promise + .mapSeries(resourcesToSave, this.saveResource.bind(this)) + .then(this.waitForLoad.bind(this)); } logger.info('downloading is finished successfully'); - return Promise.resolve(self.originalResources); + return Promise.resolve(this.resources); }; Scraper.prototype.handleError = function handleError (err, resource) { @@ -192,20 +185,18 @@ Scraper.prototype.handleError = function handleError (err, resource) { Scraper.prototype.errorCleanup = function errorCleanup (error) { logger.error('finishing with error: ' + error.message); - return this.fsAdapter.cleanDirectory().then(function loadedDataRemoved () { + return this.resourceSaver.errorCleanup(error).then(() => { return Promise.reject(error); }); }; Scraper.prototype.scrape = function scrape (callback) { - var self = this; - return Promise.bind(self) - .then(self.validate) - .then(self.load) - .catch(self.errorCleanup) + return Promise.bind(this) + .then(this.load) + .catch(this.errorCleanup) .asCallback(callback); }; -Scraper.defaults = _.clone(defaults); +Scraper.defaults = u.clone(defaults); module.exports = Scraper; diff --git a/lib/utils/index.js b/lib/utils/index.js index ff7f277b..90f29d4d 100644 --- a/lib/utils/index.js +++ b/lib/utils/index.js @@ -118,21 +118,31 @@ function decodeHtmlEntities (text) { return typeof text === 'string' ? 
htmlEntities.decode(text) : ''; } +function clone (obj) { + return Object.assign({}, obj); +} + +function extend (first, second) { + return Object.assign({}, first, second); +} + module.exports = { - isUrl: isUrl, - getUrl: getUrl, - getUnixPath: getUnixPath, - getRelativePath: getRelativePath, - getFilenameFromUrl: getFilenameFromUrl, - getFilepathFromUrl: getFilepathFromUrl, - getFilenameExtension: getFilenameExtension, - getHashFromUrl: getHashFromUrl, - shortenFilename: shortenFilename, - waitAllFulfilled: waitAllFulfilled, - normalizeUrl: normalizeUrl, - urlsEqual: urlsEqual, - isUriSchemaSupported: isUriSchemaSupported, - getTypeByMime: getTypeByMime, - getTypeByFilename: getTypeByFilename, - decodeHtmlEntities: decodeHtmlEntities + isUrl, + getUrl, + getUnixPath, + getRelativePath, + getFilenameFromUrl, + getFilepathFromUrl, + getFilenameExtension, + getHashFromUrl, + shortenFilename, + waitAllFulfilled, + normalizeUrl, + urlsEqual, + isUriSchemaSupported, + getTypeByMime, + getTypeByFilename, + decodeHtmlEntities, + clone, + extend }; diff --git a/test/functional/error-handling/error-handling.test.js b/test/functional/error-handling/error-handling.test.js index 3371d4d5..da7952b6 100644 --- a/test/functional/error-handling/error-handling.test.js +++ b/test/functional/error-handling/error-handling.test.js @@ -1,13 +1,15 @@ -var should = require('should'); -var nock = require('nock'); -var sinon = require('sinon'); -var fs = require('fs-extra'); -var Promise = require('bluebird'); -var Scraper = require('../../../lib/scraper'); +'use strict'; -var testDirname = __dirname + '/.tmp'; -var mockDirname = __dirname + '/mocks'; -var scraper; +const should = require('should'); +const nock = require('nock'); +const sinon = require('sinon'); +const fs = require('fs-extra'); +const Promise = require('bluebird'); +const Scraper = require('../../../lib/scraper'); + +const testDirname = __dirname + '/.tmp'; +const mockDirname = __dirname + '/mocks'; +let scraper; 
describe('Functional error handling', function() { @@ -41,12 +43,13 @@ describe('Functional error handling', function() { }); describe('FS Error', function () { - var loadToFsStub; + let loadToFsStub, handleErrorSpy; beforeEach(function() { - scraper.fsAdapter.loadedResources = [1, 2]; - loadToFsStub = sinon.stub(scraper.fsAdapter, 'saveResource').resolves(); + scraper.resourceSaver.loadedResources = [1, 2]; + loadToFsStub = sinon.stub(scraper.resourceSaver, 'saveResource').resolves(); loadToFsStub.onCall(2).rejects(new Error('FS FAILED!')); + handleErrorSpy = sinon.spy(scraper.resourceSaver, 'errorCleanup'); }); it('should remove directory and immediately reject on fs error if ignoreErrors is false', function () { @@ -55,9 +58,10 @@ describe('Functional error handling', function() { return scraper.scrape().then(function() { should(true).be.eql(false); }).catch(function (err) { - fs.existsSync(testDirname).should.be.eql(false); should(err.message).be.eql('FS FAILED!'); should(loadToFsStub.callCount).be.eql(3); + should(handleErrorSpy.callCount).be.eql(1); + fs.existsSync(testDirname).should.be.eql(false); }); }); @@ -66,7 +70,7 @@ describe('Functional error handling', function() { return scraper.scrape().then(function() { should(loadToFsStub.callCount).be.eql(7); - fs.existsSync(testDirname).should.be.eql(true); + should(handleErrorSpy.callCount).be.eql(0); }); }); }); @@ -77,7 +81,7 @@ describe('Functional error handling', function() { beforeEach(function() { var originalHandleResource = scraper.resourceHandler.handleResource; var callCount = 0; - handleResourceStub = sinon.stub(scraper.resourceHandler, 'handleResource', function() { + handleResourceStub = sinon.stub(scraper.resourceHandler, 'handleResource').callsFake(function() { if (callCount++ === 3) { return Promise.reject(new Error('RESOURCE HANDLER FAILED!')); } diff --git a/test/functional/redirect/redirect.test.js b/test/functional/redirect/redirect.test.js index 322e1fa4..2ddf81e8 100644 --- 
a/test/functional/redirect/redirect.test.js +++ b/test/functional/redirect/redirect.test.js @@ -41,7 +41,7 @@ describe('Functional redirects', function() { sources: [] }; var scraper = new Scraper(options); - var loadToFsSpy = sinon.spy(scraper.fsAdapter, 'saveResource'); + var loadToFsSpy = sinon.spy(scraper.resourceSaver, 'saveResource'); return scraper.scrape().then(function() { loadToFsSpy.callCount.should.be.eql(2); diff --git a/test/functional/resource-saver/resource-saver.test.js b/test/functional/resource-saver/resource-saver.test.js new file mode 100644 index 00000000..37598d88 --- /dev/null +++ b/test/functional/resource-saver/resource-saver.test.js @@ -0,0 +1,69 @@ +'use strict'; + +const should = require('should'); +const nock = require('nock'); +const fs = require('fs-extra'); +const sinon = require('sinon'); +const scrape = require('../../../index'); + +const testDirname = __dirname + '/.tmp'; + +describe('Functional resourceSaver', () => { + + beforeEach(() => { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + it('should use passed resourceSaver when saving resource', function() { + nock('http://example.com/').get('/').reply(200, 'OK'); + + class MyResourceSaver { + saveResource() {} + errorCleanup() {} + } + + const saveResourceStub = sinon.stub(MyResourceSaver.prototype, 'saveResource').resolves(); + + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + resourceSaver: MyResourceSaver + }; + + return scrape(options).then(function() { + should(saveResourceStub.calledOnce).be.eql(true); + should(saveResourceStub.args[0][0].url).be.eql('http://example.com/'); + }); + }); + + it('should use passed resourceSaver on error', function() { + nock('http://example.com/').get('/').replyWithError('SCRAPER AWFUL ERROR'); + + class MyResourceSaver { + saveResource() {} + errorCleanup() {} + } + + const removeResourcesStub =
sinon.stub(MyResourceSaver.prototype, 'errorCleanup').resolves(); + + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + resourceSaver: MyResourceSaver, + ignoreErrors: false + }; + + return scrape(options).catch(function() { + should(removeResourcesStub.calledOnce).be.eql(true); + should(removeResourcesStub.args[0][0].message).be.eql('SCRAPER AWFUL ERROR'); + }); + }); + +}); diff --git a/test/unit/fs-adapter-test.js b/test/unit/fs-adapter-test.js deleted file mode 100644 index d7419004..00000000 --- a/test/unit/fs-adapter-test.js +++ /dev/null @@ -1,34 +0,0 @@ -var should = require('should'); -var path = require('path'); -var FSAdapter = require('../../lib/fs-adaper'); - -describe('FSAdapter', function () { - describe('constructor', function() { - it('should pick supported options', function() { - var options = { a: 1, b: 2, directory: 'test' }; - var fsAdapter = new FSAdapter(options); - fsAdapter.options.should.eql({directory: 'test'}); - }); - - it('should create absolute path if directory is relative path', function () { - var options = { directory: 'my/relative/path' }; - - var fsAdapter = new FSAdapter(options); - var expected = path.join(process.cwd(), 'my/relative/path'); - fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); - }); - - it('should use directory if directory is absolute path', function () { - var options = { directory: '/my/absolute/path' }; - var fsAdapter = new FSAdapter(options); - var expected = '/my/absolute/path'; - fsAdapter.absoluteDirectoryPath.should.equalFileSystemPath(expected); - }); - - it('should not define absoluteDirectoryPath if no directory were passed', function () { - var options = {}; - var fsAdapter = new FSAdapter(options); - should(fsAdapter.absoluteDirectoryPath).eql(undefined); - }); - }); -}); diff --git a/test/unit/resource-saver.test.js b/test/unit/resource-saver.test.js new file mode 100644 index 00000000..a807748a --- /dev/null +++ 
b/test/unit/resource-saver.test.js @@ -0,0 +1,98 @@ +'use strict'; + +const should = require('should'); +const sinon = require('sinon'); +const proxyquire = require('proxyquire'); +const path = require('path'); +const ResourceSaver = require('../../lib/resource-saver'); + +describe('ResourceSaver', function () { + describe('constructor', function() { + it('should pick supported options', function() { + const options = { a: 1, b: 2, directory: 'myDirectory' }; + const resourceSaver = new ResourceSaver(options); + resourceSaver.options.should.eql({directory: 'myDirectory'}); + }); + + describe('absoluteDirectoryPath', () => { + it('should create absolute path if directory is relative path', function () { + const options = { directory: 'my/relative/path' }; + const resourceSaver = new ResourceSaver(options); + const expected = path.join(process.cwd(), 'my/relative/path'); + resourceSaver.absoluteDirectoryPath.should.equalFileSystemPath(expected); + }); + + it('should use directory if directory is absolute path', function () { + const options = { directory: '/my/absolute/path' }; + const resourceSaver = new ResourceSaver(options); + const expected = '/my/absolute/path'; + resourceSaver.absoluteDirectoryPath.should.equalFileSystemPath(expected); + }); + }); + + describe('incorrect directory', () => { + it('should throw error if no directory were passed', function () { + const options = {}; + function createResourceSaver () { + new ResourceSaver(options); + } + should(createResourceSaver).throw(/Incorrect directory/); + }); + + it('should throw error if empty directory were passed', function () { + const options = { + directory: '' + }; + function createResourceSaver () { + new ResourceSaver(options); + } + should(createResourceSaver).throw(/Incorrect directory/); + }); + + it('should throw error if incorrect directory passed', function () { + const options = { + directory: {} + }; + function createResourceSaver () { + new ResourceSaver(options); + } + 
should(createResourceSaver).throw(/Incorrect directory/); + }); + }); + + describe('existing directory', () => { + it('should throw error if directory exists', () => { + const ResourceSaver = proxyquire('../../lib/resource-saver', { + 'fs-extra': { + statSync: sinon.stub().returns('fake-stat') + } + }); + + const options = { + directory: 'fake-directory' + }; + function createResourceSaver () { + new ResourceSaver(options); + } + should(createResourceSaver).throw(/Directory (.*?) exists/); + }); + + it('should throw other errors as is', () => { + const ResourceSaver = proxyquire('../../lib/resource-saver', { + 'fs-extra': { + statSync: sinon.stub().throws(new Error('other fs error')) + } + }); + + const options = { + directory: 'fake-directory' + }; + function createResourceSaver () { + new ResourceSaver(options); + } + should(createResourceSaver).throw('other fs error'); + }); + }); + + }); +}); diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 5d84701d..05d7e14f 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -1,12 +1,14 @@ -var should = require('should'); -var proxyquire = require('proxyquire').noCallThru(); -var sinon = require('sinon'); -var path = require('path'); -var Scraper = require('../../lib/scraper'); -var Resource = require('../../lib/resource'); +'use strict'; -var testDirname = __dirname + '/.scraper-init-test'; -var urls = [ 'http://example.com' ]; +const should = require('should'); +const proxyquire = require('proxyquire').noCallThru(); +const sinon = require('sinon'); +const path = require('path'); +const Scraper = require('../../lib/scraper'); +const Resource = require('../../lib/resource'); + +const testDirname = __dirname + '/.scraper-init-test'; +const urls = [ 'http://example.com' ]; describe('Scraper initialization', function () { describe('defaultFilename', function() { @@ -224,7 +226,7 @@ describe('Scraper initialization', function () { }); }); - 
describe('originalResources', function () { + describe('resources', function () { it('should create Resource object for each url', function() { var s = new Scraper({ urls: [ @@ -235,13 +237,13 @@ describe('Scraper initialization', function () { directory: testDirname }); - s.originalResources.should.be.an.instanceOf(Array).and.have.length(3); - s.originalResources[0].should.be.an.instanceOf(Resource); - s.originalResources[0].url.should.be.eql('http://first-url.com'); - s.originalResources[1].should.be.an.instanceOf(Resource); - s.originalResources[1].url.should.be.eql('http://second-url.com'); - s.originalResources[2].should.be.an.instanceOf(Resource); - s.originalResources[2].url.should.be.eql('http://third-url.com'); + s.resources.should.be.an.instanceOf(Array).and.have.length(3); + s.resources[0].should.be.an.instanceOf(Resource); + s.resources[0].url.should.be.eql('http://first-url.com'); + s.resources[1].should.be.an.instanceOf(Resource); + s.resources[1].url.should.be.eql('http://second-url.com'); + s.resources[2].should.be.an.instanceOf(Resource); + s.resources[2].url.should.be.eql('http://third-url.com'); }); it('should use urls filename', function() { @@ -249,7 +251,7 @@ describe('Scraper initialization', function () { urls: { url: 'http://first-url.com', filename: 'first.html' }, directory: testDirname }); - s.originalResources[0].getFilename().should.equalFileSystemPath('first.html'); + s.resources[0].getFilename().should.equalFileSystemPath('first.html'); }); it('should use default filename if no url filename was provided', function() { @@ -258,7 +260,46 @@ describe('Scraper initialization', function () { defaultFilename: 'default.html', directory: testDirname }); - s.originalResources[0].getFilename().should.equalFileSystemPath('default.html'); + s.resources[0].getFilename().should.equalFileSystemPath('default.html'); + }); + }); + + describe('resourceSaver', () => { + it('should create default resourceSaver with correct params', () => { + const 
ResourceSaverStub = sinon.stub(); + const Scraper = proxyquire('../../lib/scraper', { + './resource-saver': ResourceSaverStub + }); + + const options = { + urls: { url: 'http://first-url.com' }, + directory: testDirname, + maxDepth: 100 + }; + + const s = new Scraper(options); + ResourceSaverStub.calledOnce.should.be.eql(true); + ResourceSaverStub.args[0][0].should.be.eql(s.options); + }); + + it('should create custom resourceSaver with correct params', () => { + const DefaultResourceSaverStub = sinon.stub(); + const Scraper = proxyquire('../../lib/scraper', { + './resource-saver': DefaultResourceSaverStub + }); + const CustomResourceSaverStub = sinon.stub(); + + const options = { + urls: { url: 'http://first-url.com' }, + directory: testDirname, + maxDepth: 100, + resourceSaver: CustomResourceSaverStub + }; + + const s = new Scraper(options); + CustomResourceSaverStub.calledOnce.should.be.eql(true); + CustomResourceSaverStub.args[0][0].should.be.eql(s.options); + DefaultResourceSaverStub.called.should.be.eql(false); }); }); }); \ No newline at end of file diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index e36b8c37..ecd3f198 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -25,86 +25,6 @@ describe('Scraper', function () { fs.removeSync(testDirname); }); - describe('#validate', function () { - it('should return resolved promise if everything is ok', function () { - var s = new Scraper({ - urls: urls, - directory: 'good/directory' - }); - - return s.validate().then(function() { - should(true).eql(true); - }); - }); - - it('should return rejected promise if no directory was provided', function () { - var s = new Scraper({ - urls: urls - }); - - return s.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - }); - - it('should return rejected promise if directory is not correct', function () { - var s1 
= new Scraper({ - urls: urls, - directory: '' - }); - - return s1.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - - var s2 = new Scraper({ - urls: urls, - directory: { name: '/incorrect/directory' } - }); - - return s2.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - - var s3 = new Scraper({ - urls: urls, - directory: 42 - }); - - return s3.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Incorrect directory/); - }); - }); - - it('should return rejected promise if directory exists', function() { - fs.mkdirpSync(testDirname); - - var s = new Scraper({ - urls: urls, - directory: testDirname - }); - - return s.validate().then(function() { - should(true).be.eql(false); - }, function(err) { - err.should.be.an.instanceOf(Error); - err.message.should.match(/^Directory (.*?) exists/); - }); - }); - }); - describe('#load', function() { it('should create directory', function() { nock('http://example.com').get('/').reply(200, 'OK'); @@ -440,7 +360,7 @@ describe('Scraper', function () { }); describe('#scrape', function() { - it('should call methods in sequence', function() { + it('should call load', function() { nock('http://example.com').get('/').reply(200, 'OK'); var s = new Scraper({ @@ -448,13 +368,10 @@ describe('Scraper', function () { directory: testDirname }); - var validateSpy = sinon.spy(s, 'validate'); var loadSpy = sinon.spy(s, 'load'); return s.scrape().then(function() { - validateSpy.calledOnce.should.be.eql(true); loadSpy.calledOnce.should.be.eql(true); - loadSpy.calledAfter(validateSpy).should.be.eql(true); }); });