diff --git a/.eslintrc.yml b/.eslintrc.yml index a9420280..d348fb2d 100644 --- a/.eslintrc.yml +++ b/.eslintrc.yml @@ -3,6 +3,7 @@ parserOptions: ecmaVersion: 6 env: node: true + es6: true rules: consistent-return: "error" curly: "error" diff --git a/lib/filename-generator/by-type.js b/lib/filename-generator/by-type.js index eb95ddc2..50c8456d 100644 --- a/lib/filename-generator/by-type.js +++ b/lib/filename-generator/by-type.js @@ -1,6 +1,6 @@ var _ = require('lodash'); var path = require('path'); -var utils = require('../utils.js'); +var utils = require('../utils'); var typeExtensions = require('../config/resource-ext-by-type'); module.exports = function generateFilename (resource, options, occupiedFileNames) { diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js index 6e3ba7e1..7d27d6a4 100644 --- a/lib/resource-handler/index.js +++ b/lib/resource-handler/index.js @@ -62,9 +62,6 @@ ResourceHandler.prototype.handleChildrenResources = function handleChildrenResou } pathsToUpdate.push({ oldPath: childPath, newPath: relativePath}); - - // Do not wait for loadResource here, to prevent deadlock, see scraper.waitForLoad - self.context.loadResource(respondedResource); } return null; // Prevent Bluebird warnings }); diff --git a/lib/scraper.js b/lib/scraper.js index 9fb1ce50..d4b44dab 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -12,6 +12,7 @@ var makeRequest = require('./request'); var ResourceHandler = require('./resource-handler'); var FSAdapter = require('./fs-adaper'); var utils = require('./utils'); +var NormalizedUrlMap = require('./utils/normalized-url-map'); function Scraper (options) { var self = this; @@ -40,36 +41,19 @@ function Scraper (options) { return new Resource(url, filename); }); - self.respondedResourcePromises = {}; // Map url -> request promise - self.loadedResources = {}; // Map url -> resource + self.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise + self.loadedResources = new NormalizedUrlMap(); // Map url -> resource } -Scraper.prototype.addRespondedResourcePromise = function addRespondedResourcePromise (url, promise) { - this.respondedResourcePromises[utils.normalizeUrl(url)] = promise; -}; - -Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePromise (url) { - return this.respondedResourcePromises[utils.normalizeUrl(url)]; -}; - -Scraper.prototype.addLoadedResource = function addLoadedResourcePromise (url, promise) { - this.loadedResources[utils.normalizeUrl(url)] = promise; -}; - -Scraper.prototype.getLoadedResource = function getLoadedResourcePromise (url) { - return this.loadedResources[utils.normalizeUrl(url)]; -}; - Scraper.prototype.loadResource = function loadResource (resource) { var self = this; var url = resource.getUrl(); - var loadedResource = self.getLoadedResource(url); - if (loadedResource) { + if (self.loadedResources.has(url)) { logger.debug('found loaded resource for ' + resource); } else { - logger.debug('start loading resource ' + resource); - self.addLoadedResource(url, resource); + logger.debug('add loaded resource ' + resource); + self.loadedResources.set(url, resource); } }; @@ -89,27 +73,11 @@ Scraper.prototype.saveResource = function saveResource (resource) { }); }; -Scraper.prototype.requestResource = function requestResource (resource) { +Scraper.prototype.createNewRequest = function createNewRequest (resource) { var self = this; var url = resource.getUrl(); - if (!self.options.urlFilter(url)) { - logger.debug('filtering out ' + resource + ' by url filter'); - return Promise.resolve(null); - } - - if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) { - logger.debug('filtering out ' + resource + ' by depth'); - return Promise.resolve(null); - } - - var respondedResourcePromise = self.getRespondedResourcePromise(url); - if (respondedResourcePromise) { - logger.debug('found responded resource for ' + resource); - return respondedResourcePromise; - } - - respondedResourcePromise = Promise.resolve() + var requestPromise = Promise.resolve() .then(function makeRequest () { var referer = resource.parent ? resource.parent.getUrl() : null; return self.makeRequest(url, referer); @@ -117,12 +85,13 @@ Scraper.prototype.requestResource = function requestResource (resource) { if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); - var respondedNewUrlResource = self.getRespondedResourcePromise(responseData.url); - if (respondedNewUrlResource) { - return respondedNewUrlResource; + + if (self.requestedResourcePromises.has(responseData.url)) { + return self.requestedResourcePromises.get(responseData.url); } + resource.setUrl(responseData.url); - self.addRespondedResourcePromise(responseData.url, respondedResourcePromise); + self.requestedResourcePromises.set(responseData.url, requestPromise); } resource.setType(utils.getTypeByMime(responseData.mimeType)); @@ -130,22 +99,43 @@ Scraper.prototype.requestResource = function requestResource (resource) { var filename = self.filenameGenerator.generateFilename(resource); resource.setFilename(filename); - // if type was not determined by mime - // we can try to get it from filename after it was generated + // if type was not determined by mime we can try to get it from filename after it was generated if (!resource.getType()) { resource.setType(utils.getTypeByFilename(filename)); } resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad return resource; }).catch(function handleError (err) { logger.warn('failed to request resource ' + resource); return self.handleError(err); }); - self.addRespondedResourcePromise(url, respondedResourcePromise); + self.requestedResourcePromises.set(url, requestPromise); + return requestPromise; +}; - return respondedResourcePromise; +Scraper.prototype.requestResource = function requestResource (resource) { + var self = this; + var url = resource.getUrl(); + + if (!self.options.urlFilter(url)) { + logger.debug('filtering out ' + resource + ' by url filter'); + return Promise.resolve(null); + } + + if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) { + logger.debug('filtering out ' + resource + ' by depth'); + return Promise.resolve(null); + } + + if (self.requestedResourcePromises.has(url)) { + logger.debug('found requested resource for ' + resource); + return self.requestedResourcePromises.get(url); + } + + return self.createNewRequest(resource); }; Scraper.prototype.validate = function validate () { @@ -155,15 +145,8 @@ Scraper.prototype.validate = function validate () { Scraper.prototype.load = function load () { var self = this; return self.fsAdapter.createDirectory().then(function loadAllResources () { - return Promise.map(self.originalResources, function loadResource (originalResource) { - return self.requestResource(originalResource).then(function receivedResponse (resOriginalResource) { - if (resOriginalResource) { - // Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad - self.loadResource(resOriginalResource); - } - }); - }).then(self.waitForLoad.bind(self)); - }); + return Promise.map(self.originalResources, self.requestResource.bind(self)); + }).then(self.waitForLoad.bind(self)); }; // Returns a promise which gets resolved when all resources are loaded. @@ -171,7 +154,7 @@ Scraper.prototype.load = function load () { // 2. Recursion if any new not saved resource were added during this time. If not, loading is done. Scraper.prototype.waitForLoad = function waitForLoad () { var self = this; - var resourcesToSave = _.filter(self.loadedResources, (r) => !r.isSaved()); + var resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved()); var loadingIsFinished = _.isEmpty(resourcesToSave); if (!loadingIsFinished) { diff --git a/lib/utils.js b/lib/utils/index.js similarity index 95% rename from lib/utils.js rename to lib/utils/index.js index 2f89279c..bdfacbe3 100644 --- a/lib/utils.js +++ b/lib/utils/index.js @@ -3,10 +3,10 @@ var path = require('path'); var Promise = require('bluebird'); var normalizeUrl = require('normalize-url'); var htmlEntities = require('he'); -var typeByMime = require('./config/resource-type-by-mime'); -var typeByExt = require('./config/resource-type-by-ext'); +var typeByMime = require('../config/resource-type-by-mime'); +var typeByExt = require('../config/resource-type-by-ext'); -var logger = require('./logger'); +var logger = require('../logger'); var MAX_FILENAME_LENGTH = 255; var IS_URL = /^((http[s]?:)?\/\/)/; diff --git a/lib/utils/normalized-url-map.js b/lib/utils/normalized-url-map.js new file mode 100644 index 00000000..50ff0b7a --- /dev/null +++ b/lib/utils/normalized-url-map.js @@ -0,0 +1,19 @@ +'use strict'; + +var normalizeUrl = require('./index').normalizeUrl; + +class NormalizedUrlMap extends Map { + get (key) { + return super.get(normalizeUrl(key)); + } + + set (key, value) { + return super.set(normalizeUrl(key), value); + } + + has (key) { + return super.has(normalizeUrl(key)); + } +} + +module.exports = NormalizedUrlMap; diff --git a/test/unit/resource-handler/index.test.js b/test/unit/resource-handler/index.test.js index 686c56e8..57dbc907 100644 --- a/test/unit/resource-handler/index.test.js +++ b/test/unit/resource-handler/index.test.js @@ -177,18 +177,6 @@ describe('ResourceHandler', function() { }); }); - it('should call loadResource with resource returned by requestResource', function () { - pathContainer.getPaths.returns(['http://example.com/child']); - - var childResourceRespondedMock = new Resource('http://example.com/child', 'child.png'); - scraperContext.requestResource.resolves(childResourceRespondedMock); - - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { - scraperContext.loadResource.calledOnce.should.be.eql(true); - scraperContext.loadResource.args[0][0].should.be.eql(childResourceRespondedMock); - }); - }); - it('should update paths in text with local files returned by requestResource', function () { pathContainer.getPaths.returns([ 'http://first.com/img/a.jpg', diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 1c2dd182..9c3420d3 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -120,47 +120,6 @@ describe('Scraper', function () { }); }); - it('should call loadResource for each url', function() { - nock('http://first-url.com').get('/').reply(200, 'OK'); - nock('http://second-url.com').get('/').reply(200, 'OK'); - - var s = new Scraper({ - urls: [ - 'http://first-url.com', - 'http://second-url.com' - ], - directory: testDirname - }); - - var loadResourceSpy = sinon.spy(s, 'loadResource'); - return s.load().then(function() { - loadResourceSpy.calledTwice.should.be.eql(true); - }); - }); - - it('should not call loadResource if no resource was returned', function() { - nock('http://first-url.com').get('/').reply(200, 'OK'); - nock('http://second-url.com').get('/').reply(200, 'OK'); - - var s = new Scraper({ - urls: [ - 'http://first-url.com', - 'http://second-url.com' - ], - directory: testDirname - }); - - var loadResourceSpy = sinon.spy(s, 'loadResource'); - var requestStub = sinon.stub(s, 'requestResource'); - requestStub.onCall(0).resolves(null); - requestStub.onCall(1).resolves(new Resource('http://second-url.com')); - - return s.load().then(function() { - loadResourceSpy.calledOnce.should.be.eql(true); - loadResourceSpy.args[0][0].url.should.be.eql('http://second-url.com'); - }); - }); - it('should return array of objects with url, filename and children', function() { nock('http://first-url.com').get('/').reply(200, 'OK'); nock('http://second-url.com').get('/').reply(500); @@ -237,59 +196,34 @@ describe('Scraper', function () { }); }); - describe('#getLoadedResource', function() { - it('should find nothing if no resource with same url was loaded',function() { - var s = new Scraper({ - urls: 'http://example.com', - directory: testDirname - }); - var a = new Resource('http://first-resource.com'); - var loaded = s.getLoadedResource(a.getUrl()); - should(loaded).be.eql(undefined); - }); - - it('should find loaded resource with same url', function() { + describe('#loadResource', function() { + it('should add different resource to the map', function() { var s = new Scraper({ urls: 'http://example.com', directory: testDirname }); - var a = new Resource('http://first-resource.com'); - s.addLoadedResource(a.getUrl(), a); + var r1 = new Resource('http://example.com/a1.png', 'a1.png'); + var r2 = new Resource('http://example.com/a2.png', 'a2.png'); + var r3 = new Resource('http://example.com/a3.png', 'a3.png'); - var b = new Resource('http://first-resource.com'); - var c = new Resource('http://first-resource.com/'); - var d = new Resource('http://first-resource.com?'); - should(s.getLoadedResource(b.getUrl())).be.equal(a); - should(s.getLoadedResource(c.getUrl())).be.equal(a); - should(s.getLoadedResource(d.getUrl())).be.equal(a); + s.loadResource(r1); + s.loadResource(r2); + s.loadResource(r3); + s.loadedResources.size.should.be.eql(3); }); - }); - describe('#loadResource', function() { - it('should not save the same resource twice (should skip already loaded)', function() { + it('should not add the same resource twice', function() { var s = new Scraper({ urls: 'http://example.com', directory: testDirname }); - s.getResourceHandler = sinon.stub().returns(_.noop); - - sinon.stub(s, 'getLoadedResource') - .withArgs('http://example.com/a.png') - .onFirstCall().returns() - .onSecondCall().returns(Promise.resolve()); - - sinon.spy(s, 'addLoadedResource'); var r = new Resource('http://example.com/a.png', 'a.png'); s.loadResource(r); - s.getLoadedResource.calledOnce.should.be.eql(true); - s.addLoadedResource.calledOnce.should.be.eql(true); - s.loadResource(r); - s.getLoadedResource.calledTwice.should.be.eql(true); - s.addLoadedResource.calledOnce.should.be.eql(true); + s.loadedResources.size.should.be.eql(1); }); }); @@ -299,12 +233,12 @@ describe('Scraper', function () { urls: 'http://example.com', directory: testDirname }); - s.getResourceHandler = sinon.stub().returns(_.noop); - sinon.spy(s, 'addLoadedResource'); var r = new Resource('http://example.com/a.png', 'a.png'); r.setText('some text'); + s.resourceHandler.handleResource = sinon.stub().resolves(r); + return s.saveResource(r).then(function() { var text = fs.readFileSync(path.join(testDirname, r.getFilename())).toString(); text.should.be.eql(r.getText()); diff --git a/test/unit/utils/normalized-url-map-test.js b/test/unit/utils/normalized-url-map-test.js new file mode 100644 index 00000000..16913230 --- /dev/null +++ b/test/unit/utils/normalized-url-map-test.js @@ -0,0 +1,30 @@ +var should = require('should'); +var NormalizedUrlMap = require('../../../lib/utils/normalized-url-map'); + +describe('NormalizedUrlMap', function () { + describe('#get', function() { + it('should find nothing if no value with same url was set',function() { + var map = new NormalizedUrlMap(); + should(map.get('http://first-resource.com')).be.eql(undefined); + }); + + it('should return value with same url', function() { + var map = new NormalizedUrlMap(); + var a = { test: 'hello' }; + map.set('http://first-resource.com', a); + + should(map.get('http://first-resource.com')).be.eql(a); + should(map.get('http://first-resource.com/')).be.eql(a); + should(map.get('http://first-resource.com?')).be.eql(a); + + + var b = { foo: 'bar' }; + map.set('http://example.com?a=1&b=2&c=3', b); + + should(map.get('http://example.com/?a=1&c=3&b=2')).be.eql(b); + should(map.get('http://example.com/?c=3&b=2&a=1&')).be.eql(b); + should(map.get('http://example.com/?c=3&a=1&b=2')).be.eql(b); + should(map.get('http://example.com/?b=2&a=1&c=3')).be.eql(b); + }); + }); +}); diff --git a/test/unit/utils-test.js b/test/unit/utils/utils-test.js similarity index 99% rename from test/unit/utils-test.js rename to test/unit/utils/utils-test.js index 7c64d26e..9024c754 100644 --- a/test/unit/utils-test.js +++ b/test/unit/utils/utils-test.js @@ -1,6 +1,6 @@ var should = require('should'); var _ = require('lodash'); -var utils = require('../../lib/utils'); +var utils = require('../../../lib/utils'); var Promise = require('bluebird'); describe('Utils', function () {