Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .eslintrc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ parserOptions:
ecmaVersion: 6
env:
node: true
es6: true
rules:
consistent-return: "error"
curly: "error"
Expand Down
2 changes: 1 addition & 1 deletion lib/filename-generator/by-type.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
var _ = require('lodash');
var path = require('path');
var utils = require('../utils.js');
var utils = require('../utils');
var typeExtensions = require('../config/resource-ext-by-type');

module.exports = function generateFilename (resource, options, occupiedFileNames) {
Expand Down
3 changes: 0 additions & 3 deletions lib/resource-handler/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,6 @@ ResourceHandler.prototype.handleChildrenResources = function handleChildrenResou
}

pathsToUpdate.push({ oldPath: childPath, newPath: relativePath});

// Do not wait for loadResource here, to prevent deadlock, see scraper.waitForLoad
self.context.loadResource(respondedResource);
}
return null; // Prevent Bluebird warnings
});
Expand Down
99 changes: 41 additions & 58 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ var makeRequest = require('./request');
var ResourceHandler = require('./resource-handler');
var FSAdapter = require('./fs-adaper');
var utils = require('./utils');
var NormalizedUrlMap = require('./utils/normalized-url-map');

function Scraper (options) {
var self = this;
Expand Down Expand Up @@ -40,36 +41,19 @@ function Scraper (options) {
return new Resource(url, filename);
});

self.respondedResourcePromises = {}; // Map url -> request promise
self.loadedResources = {}; // Map url -> resource
self.requestedResourcePromises = new NormalizedUrlMap(); // Map url -> request promise
self.loadedResources = new NormalizedUrlMap(); // Map url -> resource
}

Scraper.prototype.addRespondedResourcePromise = function addRespondedResourcePromise (url, promise) {
this.respondedResourcePromises[utils.normalizeUrl(url)] = promise;
};

Scraper.prototype.getRespondedResourcePromise = function getRespondedResourcePromise (url) {
return this.respondedResourcePromises[utils.normalizeUrl(url)];
};

Scraper.prototype.addLoadedResource = function addLoadedResourcePromise (url, promise) {
this.loadedResources[utils.normalizeUrl(url)] = promise;
};

Scraper.prototype.getLoadedResource = function getLoadedResourcePromise (url) {
return this.loadedResources[utils.normalizeUrl(url)];
};

Scraper.prototype.loadResource = function loadResource (resource) {
var self = this;
var url = resource.getUrl();

var loadedResource = self.getLoadedResource(url);
if (loadedResource) {
if (self.loadedResources.has(url)) {
logger.debug('found loaded resource for ' + resource);
} else {
logger.debug('start loading resource ' + resource);
self.addLoadedResource(url, resource);
logger.debug('add loaded resource ' + resource);
self.loadedResources.set(url, resource);
}
};

Expand All @@ -89,63 +73,69 @@ Scraper.prototype.saveResource = function saveResource (resource) {
});
};

Scraper.prototype.requestResource = function requestResource (resource) {
Scraper.prototype.createNewRequest = function createNewRequest (resource) {
var self = this;
var url = resource.getUrl();

if (!self.options.urlFilter(url)) {
logger.debug('filtering out ' + resource + ' by url filter');
return Promise.resolve(null);
}

if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) {
logger.debug('filtering out ' + resource + ' by depth');
return Promise.resolve(null);
}

var respondedResourcePromise = self.getRespondedResourcePromise(url);
if (respondedResourcePromise) {
logger.debug('found responded resource for ' + resource);
return respondedResourcePromise;
}

respondedResourcePromise = Promise.resolve()
var requestPromise = Promise.resolve()
.then(function makeRequest () {
var referer = resource.parent ? resource.parent.getUrl() : null;
return self.makeRequest(url, referer);
}).then(function requestCompleted (responseData) {

if (!utils.urlsEqual(responseData.url, url)) { // Url may be changed in redirects
logger.debug('url changed. old url = ' + url + ', new url = ' + responseData.url);
var respondedNewUrlResource = self.getRespondedResourcePromise(responseData.url);
if (respondedNewUrlResource) {
return respondedNewUrlResource;

if (self.requestedResourcePromises.has(responseData.url)) {
return self.requestedResourcePromises.get(responseData.url);
}

resource.setUrl(responseData.url);
self.addRespondedResourcePromise(responseData.url, respondedResourcePromise);
self.requestedResourcePromises.set(responseData.url, requestPromise);
}

resource.setType(utils.getTypeByMime(responseData.mimeType));

var filename = self.filenameGenerator.generateFilename(resource);
resource.setFilename(filename);

// if type was not determined by mime
// we can try to get it from filename after it was generated
// if type was not determined by mime we can try to get it from filename after it was generated
if (!resource.getType()) {
resource.setType(utils.getTypeByFilename(filename));
}

resource.setText(responseData.body);
self.loadResource(resource); // Add resource to list for future downloading, see Scraper.waitForLoad
return resource;
}).catch(function handleError (err) {
logger.warn('failed to request resource ' + resource);
return self.handleError(err);
});

self.addRespondedResourcePromise(url, respondedResourcePromise);
self.requestedResourcePromises.set(url, requestPromise);
return requestPromise;
};

return respondedResourcePromise;
Scraper.prototype.requestResource = function requestResource (resource) {
var self = this;
var url = resource.getUrl();

if (!self.options.urlFilter(url)) {
logger.debug('filtering out ' + resource + ' by url filter');
return Promise.resolve(null);
}

if (self.options.maxDepth && resource.getDepth() > self.options.maxDepth) {
logger.debug('filtering out ' + resource + ' by depth');
return Promise.resolve(null);
}

if (self.requestedResourcePromises.has(url)) {
logger.debug('found requested resource for ' + resource);
return self.requestedResourcePromises.get(url);
}

return self.createNewRequest(resource);
};

Scraper.prototype.validate = function validate () {
Expand All @@ -155,23 +145,16 @@ Scraper.prototype.validate = function validate () {
Scraper.prototype.load = function load () {
var self = this;
return self.fsAdapter.createDirectory().then(function loadAllResources () {
return Promise.map(self.originalResources, function loadResource (originalResource) {
return self.requestResource(originalResource).then(function receivedResponse (resOriginalResource) {
if (resOriginalResource) {
// Do not wait for loadResource here, to prevent deadlock, scraper.waitForLoad
self.loadResource(resOriginalResource);
}
});
}).then(self.waitForLoad.bind(self));
});
return Promise.map(self.originalResources, self.requestResource.bind(self));
}).then(self.waitForLoad.bind(self));
};

// Returns a promise which gets resolved when all resources are loaded.
// 1. Get all not saved resources and save them
// 2. Recursion if any new not saved resource were added during this time. If not, loading is done.
Scraper.prototype.waitForLoad = function waitForLoad () {
var self = this;
var resourcesToSave = _.filter(self.loadedResources, (r) => !r.isSaved());
var resourcesToSave = Array.from(self.loadedResources.values()).filter((r) => !r.isSaved());
var loadingIsFinished = _.isEmpty(resourcesToSave);

if (!loadingIsFinished) {
Expand Down
6 changes: 3 additions & 3 deletions lib/utils.js → lib/utils/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ var path = require('path');
var Promise = require('bluebird');
var normalizeUrl = require('normalize-url');
var htmlEntities = require('he');
var typeByMime = require('./config/resource-type-by-mime');
var typeByExt = require('./config/resource-type-by-ext');
var typeByMime = require('../config/resource-type-by-mime');
var typeByExt = require('../config/resource-type-by-ext');

var logger = require('./logger');
var logger = require('../logger');

var MAX_FILENAME_LENGTH = 255;
var IS_URL = /^((http[s]?:)?\/\/)/;
Expand Down
19 changes: 19 additions & 0 deletions lib/utils/normalized-url-map.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
'use strict';

var normalizeUrl = require('./index').normalizeUrl;

class NormalizedUrlMap extends Map {
get (key) {
return super.get(normalizeUrl(key));
}

set (key, value) {
return super.set(normalizeUrl(key), value);
}

has (key) {
return super.has(normalizeUrl(key));
}
}

module.exports = NormalizedUrlMap;
12 changes: 0 additions & 12 deletions test/unit/resource-handler/index.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -177,18 +177,6 @@ describe('ResourceHandler', function() {
});
});

it('should call loadResource with resource returned by requestResource', function () {
pathContainer.getPaths.returns(['http://example.com/child']);

var childResourceRespondedMock = new Resource('http://example.com/child', 'child.png');
scraperContext.requestResource.resolves(childResourceRespondedMock);

return resHandler.handleChildrenResources(pathContainer, parentResource).then(function () {
scraperContext.loadResource.calledOnce.should.be.eql(true);
scraperContext.loadResource.args[0][0].should.be.eql(childResourceRespondedMock);
});
});

it('should update paths in text with local files returned by requestResource', function () {
pathContainer.getPaths.returns([
'http://first.com/img/a.jpg',
Expand Down
92 changes: 13 additions & 79 deletions test/unit/scraper-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -120,47 +120,6 @@ describe('Scraper', function () {
});
});

it('should call loadResource for each url', function() {
nock('http://first-url.com').get('/').reply(200, 'OK');
nock('http://second-url.com').get('/').reply(200, 'OK');

var s = new Scraper({
urls: [
'http://first-url.com',
'http://second-url.com'
],
directory: testDirname
});

var loadResourceSpy = sinon.spy(s, 'loadResource');
return s.load().then(function() {
loadResourceSpy.calledTwice.should.be.eql(true);
});
});

it('should not call loadResource if no resource was returned', function() {
nock('http://first-url.com').get('/').reply(200, 'OK');
nock('http://second-url.com').get('/').reply(200, 'OK');

var s = new Scraper({
urls: [
'http://first-url.com',
'http://second-url.com'
],
directory: testDirname
});

var loadResourceSpy = sinon.spy(s, 'loadResource');
var requestStub = sinon.stub(s, 'requestResource');
requestStub.onCall(0).resolves(null);
requestStub.onCall(1).resolves(new Resource('http://second-url.com'));

return s.load().then(function() {
loadResourceSpy.calledOnce.should.be.eql(true);
loadResourceSpy.args[0][0].url.should.be.eql('http://second-url.com');
});
});

it('should return array of objects with url, filename and children', function() {
nock('http://first-url.com').get('/').reply(200, 'OK');
nock('http://second-url.com').get('/').reply(500);
Expand Down Expand Up @@ -237,59 +196,34 @@ describe('Scraper', function () {
});
});

describe('#getLoadedResource', function() {
it('should find nothing if no resource with same url was loaded',function() {
var s = new Scraper({
urls: 'http://example.com',
directory: testDirname
});
var a = new Resource('http://first-resource.com');
var loaded = s.getLoadedResource(a.getUrl());
should(loaded).be.eql(undefined);
});

it('should find loaded resource with same url', function() {
describe('#loadResource', function() {
it('should add different resource to the map', function() {
var s = new Scraper({
urls: 'http://example.com',
directory: testDirname
});

var a = new Resource('http://first-resource.com');
s.addLoadedResource(a.getUrl(), a);
var r1 = new Resource('http://example.com/a1.png', 'a1.png');
var r2 = new Resource('http://example.com/a2.png', 'a2.png');
var r3 = new Resource('http://example.com/a3.png', 'a3.png');

var b = new Resource('http://first-resource.com');
var c = new Resource('http://first-resource.com/');
var d = new Resource('http://first-resource.com?');
should(s.getLoadedResource(b.getUrl())).be.equal(a);
should(s.getLoadedResource(c.getUrl())).be.equal(a);
should(s.getLoadedResource(d.getUrl())).be.equal(a);
s.loadResource(r1);
s.loadResource(r2);
s.loadResource(r3);
s.loadedResources.size.should.be.eql(3);
});
});

describe('#loadResource', function() {
it('should not save the same resource twice (should skip already loaded)', function() {
it('should not add the same resource twice', function() {
var s = new Scraper({
urls: 'http://example.com',
directory: testDirname
});
s.getResourceHandler = sinon.stub().returns(_.noop);

sinon.stub(s, 'getLoadedResource')
.withArgs('http://example.com/a.png')
.onFirstCall().returns()
.onSecondCall().returns(Promise.resolve());

sinon.spy(s, 'addLoadedResource');

var r = new Resource('http://example.com/a.png', 'a.png');

s.loadResource(r);
s.getLoadedResource.calledOnce.should.be.eql(true);
s.addLoadedResource.calledOnce.should.be.eql(true);

s.loadResource(r);
s.getLoadedResource.calledTwice.should.be.eql(true);
s.addLoadedResource.calledOnce.should.be.eql(true);
s.loadedResources.size.should.be.eql(1);
});
});

Expand All @@ -299,12 +233,12 @@ describe('Scraper', function () {
urls: 'http://example.com',
directory: testDirname
});
s.getResourceHandler = sinon.stub().returns(_.noop);
sinon.spy(s, 'addLoadedResource');

var r = new Resource('http://example.com/a.png', 'a.png');
r.setText('some text');

s.resourceHandler.handleResource = sinon.stub().resolves(r);

return s.saveResource(r).then(function() {
var text = fs.readFileSync(path.join(testDirname, r.getFilename())).toString();
text.should.be.eql(r.getText());
Expand Down
Loading