From dad796b497a86f791049adb0cf2b2a355083f7f4 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Tue, 31 Jan 2017 14:59:01 +0200 Subject: [PATCH 1/3] Refactor code duplicates, call loadResource inside requestResource --- .eslintrc.yml | 1 + lib/filename-generator/by-type.js | 2 +- lib/resource-handler/index.js | 3 - lib/scraper.js | 64 +++++---------- lib/{utils.js => utils/index.js} | 6 +- lib/utils/normalized-url-map.js | 19 +++++ test/unit/resource-handler/index.test.js | 12 --- test/unit/scraper-test.js | 92 +++------------------- test/unit/utils/normalized-url-map-test.js | 30 +++++++ test/unit/{ => utils}/utils-test.js | 2 +- 10 files changed, 89 insertions(+), 142 deletions(-) rename lib/{utils.js => utils/index.js} (95%) create mode 100644 lib/utils/normalized-url-map.js create mode 100644 test/unit/utils/normalized-url-map-test.js rename test/unit/{ => utils}/utils-test.js (99%) diff --git a/.eslintrc.yml b/.eslintrc.yml index a9420280..d348fb2d 100644 --- a/.eslintrc.yml +++ b/.eslintrc.yml @@ -3,6 +3,7 @@ parserOptions: ecmaVersion: 6 env: node: true + es6: true rules: consistent-return: "error" curly: "error" diff --git a/lib/filename-generator/by-type.js b/lib/filename-generator/by-type.js index eb95ddc2..50c8456d 100644 --- a/lib/filename-generator/by-type.js +++ b/lib/filename-generator/by-type.js @@ -1,6 +1,6 @@ var _ = require('lodash'); var path = require('path'); -var utils = require('../utils.js'); +var utils = require('../utils'); var typeExtensions = require('../config/resource-ext-by-type'); module.exports = function generateFilename (resource, options, occupiedFileNames) { diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js index 6e3ba7e1..7d27d6a4 100644 --- a/lib/resource-handler/index.js +++ b/lib/resource-handler/index.js @@ -62,9 +62,6 @@ ResourceHandler.prototype.handleChildrenResources = function handleChildrenResou } pathsToUpdate.push({ oldPath: childPath, newPath: relativePath}); - - // Do not wait for loadResource here, to prevent deadlock, see scraper.waitForLoad - self.context.loadResource(respondedResource); } return null; // Prevent Bluebird warnings }); diff --git a/lib/scraper.js b/lib/scraper.js index 9fb1ce50..6614a7f5 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -12,6 +12,7 @@ var makeRequest = require('./request'); var ResourceHandler = require('./resource-handler'); var FSAdapter = require('./fs-adaper'); var utils = require('./utils'); +var NormalizedUrlMap = require('./utils/normalized-url-map'); function Scraper (options) { var self = this; @@ -40,36 +41,19 @@ function Scraper (options) { return new Resource(url, filename); }); - self.respondedResourcePromises = {}; // Map url -> request promise - self.loadedResources = {}; // Map url -> resource + self.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise + self.loadedResources = new NormalizedUrlMap(); // Map url -> resource } -Scraper.prototype.addRespondedResourcePromise = function addRespondedResourcePromise (url, promise) { - this.respondedResourcePromises[utils.normalizeUrl(url)] = promise; -}; - -Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePromise (url) { - return this.respondedResourcePromises[utils.normalizeUrl(url)]; -}; - -Scraper.prototype.addLoadedResource = function addLoadedResourcePromise (url, promise) { - this.loadedResources[utils.normalizeUrl(url)] = promise; -}; - -Scraper.prototype.getLoadedResource = function getLoadedResourcePromise (url) { - return this.loadedResources[utils.normalizeUrl(url)]; -}; - Scraper.prototype.loadResource = function loadResource (resource) { var self = this; var url = resource.getUrl(); - var loadedResource = self.getLoadedResource(url); - if (loadedResource) { + if (self.loadedResources.has(url)) { logger.debug('found loaded resource for ' + resource); } else { - logger.debug('start loading resource ' + resource); - self.addLoadedResource(url, resource); + logger.debug('add loaded resource ' + resource); + self.loadedResources.set(url, resource); } }; @@ -103,13 +87,12 @@ Scraper.prototype.requestResource = function requestResource (resource) { return Promise.resolve(null); } - var respondedResourcePromise = self.getRespondedResourcePromise(url); - if (respondedResourcePromise) { - logger.debug('found responded resource for ' + resource); - return respondedResourcePromise; + if (self.requestedResourcePromises.has(url)) { + logger.debug('found requested resource for ' + resource); + return self.requestedResourcePromises.get(url); } - respondedResourcePromise = Promise.resolve() + var requestedResourcePromise = Promise.resolve() .then(function makeRequest () { var referer = resource.parent ? resource.parent.getUrl() : null; return self.makeRequest(url, referer); @@ -117,12 +100,13 @@ Scraper.prototype.requestResource = function requestResource (resource) { if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); - var respondedNewUrlResource = self.getRespondedResourcePromise(responseData.url); - if (respondedNewUrlResource) { - return respondedNewUrlResource; + + if (self.requestedResourcePromises.has(responseData.url)) { + return self.requestedResourcePromises.get(responseData.url); } + resource.setUrl(responseData.url); - self.addRespondedResourcePromise(responseData.url, respondedResourcePromise); + self.requestedResourcePromises.set(responseData.url, requestedResourcePromise); } resource.setType(utils.getTypeByMime(responseData.mimeType)); @@ -130,22 +114,21 @@ Scraper.prototype.requestResource = function requestResource (resource) { var filename = self.filenameGenerator.generateFilename(resource); resource.setFilename(filename); - // if type was not determined by mime - // we can try to get it from filename after it was generated + // if type was not determined by mime we can try to get it from filename after it was generated if (!resource.getType()) { resource.setType(utils.getTypeByFilename(filename)); } resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad return resource; }).catch(function handleError (err) { logger.warn('failed to request resource ' + resource); return self.handleError(err); }); - self.addRespondedResourcePromise(url, respondedResourcePromise); - - return respondedResourcePromise; + self.requestedResourcePromises.set(url, requestedResourcePromise); + return requestedResourcePromise; }; Scraper.prototype.validate = function validate () { @@ -156,12 +139,7 @@ Scraper.prototype.load = function load () { var self = this; return self.fsAdapter.createDirectory().then(function loadAllResources () { return Promise.map(self.originalResources, function loadResource (originalResource) { - return self.requestResource(originalResource).then(function receivedResponse (resOriginalResource) { - if (resOriginalResource) { - // Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad - self.loadResource(resOriginalResource); - } - }); + return self.requestResource(originalResource); }).then(self.waitForLoad.bind(self)); }); }; @@ -171,7 +149,7 @@ Scraper.prototype.load = function load () { // 2. Recursion if any new not saved resource were added during this time. If not, loading is done. Scraper.prototype.waitForLoad = function waitForLoad () { var self = this; - var resourcesToSave = _.filter(self.loadedResources, (r) => !r.isSaved()); + var resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved()); var loadingIsFinished = _.isEmpty(resourcesToSave); if (!loadingIsFinished) { diff --git a/lib/utils.js b/lib/utils/index.js similarity index 95% rename from lib/utils.js rename to lib/utils/index.js index 2f89279c..bdfacbe3 100644 --- a/lib/utils.js +++ b/lib/utils/index.js @@ -3,10 +3,10 @@ var path = require('path'); var Promise = require('bluebird'); var normalizeUrl = require('normalize-url'); var htmlEntities = require('he'); -var typeByMime = require('./config/resource-type-by-mime'); -var typeByExt = require('./config/resource-type-by-ext'); +var typeByMime = require('../config/resource-type-by-mime'); +var typeByExt = require('../config/resource-type-by-ext'); -var logger = require('./logger'); +var logger = require('../logger'); var MAX_FILENAME_LENGTH = 255; var IS_URL = /^((http[s]?:)?\/\/)/; diff --git a/lib/utils/normalized-url-map.js b/lib/utils/normalized-url-map.js new file mode 100644 index 00000000..50ff0b7a --- /dev/null +++ b/lib/utils/normalized-url-map.js @@ -0,0 +1,19 @@ +'use strict'; + +var normalizeUrl = require('./index').normalizeUrl; + +class NormalizedUrlMap extends Map { + get (key) { + return super.get(normalizeUrl(key)); + } + + set (key, value) { + return super.set(normalizeUrl(key), value); + } + + has (key) { + return super.has(normalizeUrl(key)); + } +} + +module.exports = NormalizedUrlMap; diff --git a/test/unit/resource-handler/index.test.js b/test/unit/resource-handler/index.test.js index 686c56e8..57dbc907 100644 --- a/test/unit/resource-handler/index.test.js +++ b/test/unit/resource-handler/index.test.js @@ -177,18 +177,6 @@ describe('ResourceHandler', function() { }); }); - it('should call loadResource with resource returned by requestResource', function () { - pathContainer.getPaths.returns(['http://example.com/child']); - - var childResourceRespondedMock = new Resource('http://example.com/child', 'child.png'); - scraperContext.requestResource.resolves(childResourceRespondedMock); - - return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () { - scraperContext.loadResource.calledOnce.should.be.eql(true); - scraperContext.loadResource.args[0][0].should.be.eql(childResourceRespondedMock); - }); - }); - it('should update paths in text with local files returned by requestResource', function () { pathContainer.getPaths.returns([ 'http://first.com/img/a.jpg', diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 1c2dd182..9c3420d3 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -120,47 +120,6 @@ describe('Scraper', function () { }); }); - it('should call loadResource for each url', function() { - nock('http://first-url.com').get('/').reply(200, 'OK'); - nock('http://second-url.com').get('/').reply(200, 'OK'); - - var s = new Scraper({ - urls: [ - 'http://first-url.com', - 'http://second-url.com' - ], - directory: testDirname - }); - - var loadResourceSpy = sinon.spy(s, 'loadResource'); - return s.load().then(function() { - loadResourceSpy.calledTwice.should.be.eql(true); - }); - }); - - it('should not call loadResource if no resource was returned', function() { - nock('http://first-url.com').get('/').reply(200, 'OK'); - nock('http://second-url.com').get('/').reply(200, 'OK'); - - var s = new Scraper({ - urls: [ - 'http://first-url.com', - 'http://second-url.com' - ], - directory: testDirname - }); - - var loadResourceSpy = sinon.spy(s, 'loadResource'); - var requestStub = sinon.stub(s, 'requestResource'); - requestStub.onCall(0).resolves(null); - requestStub.onCall(1).resolves(new Resource('http://second-url.com')); - - return s.load().then(function() { - loadResourceSpy.calledOnce.should.be.eql(true); - loadResourceSpy.args[0][0].url.should.be.eql('http://second-url.com'); - }); - }); - it('should return array of objects with url, filename and children', function() { nock('http://first-url.com').get('/').reply(200, 'OK'); nock('http://second-url.com').get('/').reply(500); @@ -237,59 +196,34 @@ describe('Scraper', function () { }); }); - describe('#getLoadedResource', function() { - it('should find nothing if no resource with same url was loaded',function() { - var s = new Scraper({ - urls: 'http://example.com', - directory: testDirname - }); - var a = new Resource('http://first-resource.com'); - var loaded = s.getLoadedResource(a.getUrl()); - should(loaded).be.eql(undefined); - }); - - it('should find loaded resource with same url', function() { + describe('#loadResource', function() { + it('should add different resource to the map', function() { var s = new Scraper({ urls: 'http://example.com', directory: testDirname }); - var a = new Resource('http://first-resource.com'); - s.addLoadedResource(a.getUrl(), a); + var r1 = new Resource('http://example.com/a1.png', 'a1.png'); + var r2 = new Resource('http://example.com/a2.png', 'a2.png'); + var r3 = new Resource('http://example.com/a3.png', 'a3.png'); - var b = new Resource('http://first-resource.com'); - var c = new Resource('http://first-resource.com/'); - var d = new Resource('http://first-resource.com?'); - should(s.getLoadedResource(b.getUrl())).be.equal(a); - should(s.getLoadedResource(c.getUrl())).be.equal(a); - should(s.getLoadedResource(d.getUrl())).be.equal(a); + s.loadResource(r1); + s.loadResource(r2); + s.loadResource(r3); + s.loadedResources.size.should.be.eql(3); }); - }); - describe('#loadResource', function() { - it('should not save the same resource twice (should skip already loaded)', function() { + it('should not add the same resource twice', function() { var s = new Scraper({ urls: 'http://example.com', directory: testDirname }); - s.getResourceHandler = sinon.stub().returns(_.noop); - - sinon.stub(s, 'getLoadedResource') - .withArgs('http://example.com/a.png') - .onFirstCall().returns() - .onSecondCall().returns(Promise.resolve()); - - sinon.spy(s, 'addLoadedResource'); var r = new Resource('http://example.com/a.png', 'a.png'); s.loadResource(r); - s.getLoadedResource.calledOnce.should.be.eql(true); - s.addLoadedResource.calledOnce.should.be.eql(true); - s.loadResource(r); - s.getLoadedResource.calledTwice.should.be.eql(true); - s.addLoadedResource.calledOnce.should.be.eql(true); + s.loadedResources.size.should.be.eql(1); }); }); @@ -299,12 +233,12 @@ describe('Scraper', function () { urls: 'http://example.com', directory: testDirname }); - s.getResourceHandler = sinon.stub().returns(_.noop); - sinon.spy(s, 'addLoadedResource'); var r = new Resource('http://example.com/a.png', 'a.png'); r.setText('some text'); + s.resourceHandler.handleResource = sinon.stub().resolves(r); + return s.saveResource(r).then(function() { var text = fs.readFileSync(path.join(testDirname, r.getFilename())).toString(); text.should.be.eql(r.getText()); diff --git a/test/unit/utils/normalized-url-map-test.js b/test/unit/utils/normalized-url-map-test.js new file mode 100644 index 00000000..16913230 --- /dev/null +++ b/test/unit/utils/normalized-url-map-test.js @@ -0,0 +1,30 @@ +var should = require('should'); +var NormalizedUrlMap = require('../../../lib/utils/normalized-url-map'); + +describe('NormalizedUrlMap', function () { + describe('#get', function() { + it('should find nothing if no value with same url was set',function() { + var map = new NormalizedUrlMap(); + should(map.get('http://first-resource.com')).be.eql(undefined); + }); + + it('should return value with same url', function() { + var map = new NormalizedUrlMap(); + var a = { test: 'hello' }; + map.set('http://first-resource.com', a); + + should(map.get('http://first-resource.com')).be.eql(a); + should(map.get('http://first-resource.com/')).be.eql(a); + should(map.get('http://first-resource.com?')).be.eql(a); + + + var b = { foo: 'bar' }; + map.set('http://example.com?a=1&b=2&c=3', b); + + should(map.get('http://example.com/?a=1&c=3&b=2')).be.eql(b); + should(map.get('http://example.com/?c=3&b=2&a=1&')).be.eql(b); + should(map.get('http://example.com/?c=3&a=1&b=2')).be.eql(b); + should(map.get('http://example.com/?b=2&a=1&c=3')).be.eql(b); + }); + }); +}); diff --git a/test/unit/utils-test.js b/test/unit/utils/utils-test.js similarity index 99% rename from test/unit/utils-test.js rename to test/unit/utils/utils-test.js index 7c64d26e..9024c754 100644 --- a/test/unit/utils-test.js +++ b/test/unit/utils/utils-test.js @@ -1,6 +1,6 @@ var should = require('should'); var _ = require('lodash'); -var utils = require('../../lib/utils'); +var utils = require('../../../lib/utils'); var Promise = require('bluebird'); describe('Utils', function () { From 73b22eb00b9d4fb71a5b92e7c4fc9d222af13b9c Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Tue, 31 Jan 2017 20:29:47 +0200 Subject: [PATCH 2/3] Split scraper.requestResource into 2 functions --- lib/scraper.js | 87 ++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/lib/scraper.js b/lib/scraper.js index 6614a7f5..9d38429f 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -73,6 +73,49 @@ Scraper.prototype.saveResource = function saveResource (resource) { }); }; +Scraper.prototype.createNewRequest = function createNewRequest (resource) { + var self = this; + var url = resource.getUrl(); + + var requestPromise = Promise.resolve() + .then(function makeRequest () { + var referer = resource.parent ? resource.parent.getUrl() : null; + return self.makeRequest(url, referer); + }).then(function requestCompleted (responseData) { + + if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects + logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); + + if (self.requestedResourcePromises.has(responseData.url)) { + return self.requestedResourcePromises.get(responseData.url); + } + + resource.setUrl(responseData.url); + self.requestedResourcePromises.set(responseData.url, requestPromise); + } + + resource.setType(utils.getTypeByMime(responseData.mimeType)); + + var filename = self.filenameGenerator.generateFilename(resource); + resource.setFilename(filename); + + // if type was not determined by mime we can try to get it from filename after it was generated + if (!resource.getType()) { + resource.setType(utils.getTypeByFilename(filename)); + } + + resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad + return resource; + }).catch(function handleError (err) { + logger.warn('failed to request resource ' + resource); + return self.handleError(err); + }); + + self.requestedResourcePromises.set(url, requestPromise); + return requestPromise; +}; + Scraper.prototype.requestResource = function requestResource (resource) { var self = this; var url = resource.getUrl(); @@ -92,43 +135,7 @@ Scraper.prototype.requestResource = function requestResource (resource) { return self.requestedResourcePromises.get(url); } - var requestedResourcePromise = Promise.resolve() - .then(function makeRequest () { - var referer = resource.parent ? resource.parent.getUrl() : null; - return self.makeRequest(url, referer); - }).then(function requestCompleted (responseData) { - - if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects - logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); - - if (self.requestedResourcePromises.has(responseData.url)) { - return self.requestedResourcePromises.get(responseData.url); - } - - resource.setUrl(responseData.url); - self.requestedResourcePromises.set(responseData.url, requestedResourcePromise); - } - - resource.setType(utils.getTypeByMime(responseData.mimeType)); - - var filename = self.filenameGenerator.generateFilename(resource); - resource.setFilename(filename); - - // if type was not determined by mime we can try to get it from filename after it was generated - if (!resource.getType()) { - resource.setType(utils.getTypeByFilename(filename)); - } - - resource.setText(responseData.body); - self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad - return resource; - }).catch(function handleError (err) { - logger.warn('failed to request resource ' + resource); - return self.handleError(err); - }); - - self.requestedResourcePromises.set(url, requestedResourcePromise); - return requestedResourcePromise; + return self.createNewRequest(resource); }; Scraper.prototype.validate = function validate () { @@ -138,10 +145,8 @@ Scraper.prototype.validate = function validate () { Scraper.prototype.load = function load () { var self = this; return self.fsAdapter.createDirectory().then(function loadAllResources () { - return Promise.map(self.originalResources, function loadResource (originalResource) { - return self.requestResource(originalResource); - }).then(self.waitForLoad.bind(self)); - }); + return Promise.map(self.originalResources, self.requestResource.bind(self)) + }).then(self.waitForLoad.bind(self)); }; // Returns a promise which gets resolved when all resources are loaded. From 5e2d8d0708de9f7a61d2b9b1c66c49dd228ef61d Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Tue, 31 Jan 2017 20:33:46 +0200 Subject: [PATCH 3/3] Fix intendation --- lib/scraper.js | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/lib/scraper.js b/lib/scraper.js index 9d38429f..d4b44dab 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -83,34 +83,34 @@ Scraper.prototype.createNewRequest = function createNewRequest (resource) { return self.makeRequest(url, referer); }).then(function requestCompleted (responseData) { - if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects - logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); + if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects + logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url); - if (self.requestedResourcePromises.has(responseData.url)) { - return self.requestedResourcePromises.get(responseData.url); - } + if (self.requestedResourcePromises.has(responseData.url)) { + return self.requestedResourcePromises.get(responseData.url); + } - resource.setUrl(responseData.url); - self.requestedResourcePromises.set(responseData.url, requestPromise); - } + resource.setUrl(responseData.url); + self.requestedResourcePromises.set(responseData.url, requestPromise); + } - resource.setType(utils.getTypeByMime(responseData.mimeType)); + resource.setType(utils.getTypeByMime(responseData.mimeType)); - var filename = self.filenameGenerator.generateFilename(resource); - resource.setFilename(filename); + var filename = self.filenameGenerator.generateFilename(resource); + resource.setFilename(filename); - // if type was not determined by mime we can try to get it from filename after it was generated - if (!resource.getType()) { - resource.setType(utils.getTypeByFilename(filename)); - } + // if type was not determined by mime we can try to get it from filename after it was generated + if (!resource.getType()) { + resource.setType(utils.getTypeByFilename(filename)); + } - resource.setText(responseData.body); - self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad - return resource; - }).catch(function handleError (err) { - logger.warn('failed to request resource ' + resource); - return self.handleError(err); - }); + resource.setText(responseData.body); + self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad + return resource; + }).catch(function handleError (err) { + logger.warn('failed to request resource ' + resource); + return self.handleError(err); + }); self.requestedResourcePromises.set(url, requestPromise); return requestPromise; @@ -145,7 +145,7 @@ Scraper.prototype.validate = function validate () { Scraper.prototype.load = function load () { var self = this; return self.fsAdapter.createDirectory().then(function loadAllResources () { - return Promise.map(self.originalResources, self.requestResource.bind(self)) + return Promise.map(self.originalResources, self.requestResource.bind(self)); }).then(self.waitForLoad.bind(self)); };