Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions lib/config/resource-ext-by-type.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
var resourceTypes = require('./resource-types');

/**
 * Builds the map from resource type to the list of filename extensions known for that type.
 * should contain same data as ./resource-type-by-ext
 * @returns {Object} extension lists keyed by resource type
 */
function buildExtensionsByType () {
	var extensionsByType = {};
	extensionsByType[resourceTypes.html] = [ '.html', '.htm' ];
	extensionsByType[resourceTypes.css] = [ '.css' ];
	return extensionsByType;
}

module.exports = buildExtensionsByType();
14 changes: 0 additions & 14 deletions lib/config/resource-extensions-by-type.js

This file was deleted.

8 changes: 8 additions & 0 deletions lib/config/resource-type-by-ext.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
var resourceTypes = require('./resource-types');

// Resource type for each known filename extension.
// should contain same data as ./resource-ext-by-type
var typeByExtension = {};
typeByExtension['.html'] = resourceTypes.html;
typeByExtension['.htm'] = resourceTypes.html;
typeByExtension['.css'] = resourceTypes.css;

module.exports = typeByExtension;
6 changes: 6 additions & 0 deletions lib/config/resource-type-by-mime.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
var resourceTypes = require('./resource-types');

// Resource type for each known mime type.
var typeByMimeType = {};
typeByMimeType['text/html'] = resourceTypes.html;
typeByMimeType['text/css'] = resourceTypes.css;

module.exports = typeByMimeType;
14 changes: 0 additions & 14 deletions lib/config/resource-types-by-tag.js

This file was deleted.

3 changes: 1 addition & 2 deletions lib/config/resource-types.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
var types = {
css: 'css',
html: 'html',
other: 'other'
html: 'html'
};

module.exports = types;
4 changes: 2 additions & 2 deletions lib/filename-generator/by-site-structure.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ var _ = require('lodash');
var path = require('path');
var utils = require('../utils');
var resourceTypes = require('../config/resource-types');
var resourceTypeExtensions = require('../config/resource-extensions-by-type');
var resourceTypeExtensions = require('../config/resource-ext-by-type');

module.exports = function generateFilename (resource, options) {
var resourceUrl = resource.getUrl();
Expand All @@ -11,7 +11,7 @@ module.exports = function generateFilename (resource, options) {

// If we have HTML from 'http://example.com/path' => set 'path/index.html' as filepath
if (resource.isHtml()) {
var htmlExtensions = resourceTypeExtensions[resourceTypes.html].possibleExtensions;
var htmlExtensions = resourceTypeExtensions[resourceTypes.html];
var resourceHasHtmlExtension = _.includes(htmlExtensions, extension);
// add index.html only if filepath has ext != html '/path/test.com' => '/path/test.com/index.html'
if (!resourceHasHtmlExtension) {
Expand Down
4 changes: 2 additions & 2 deletions lib/filename-generator/by-type.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
var _ = require('lodash');
var path = require('path');
var utils = require('../utils.js');
var typeExtensions = require('../config/resource-extensions-by-type');
var typeExtensions = require('../config/resource-ext-by-type');

module.exports = function generateFilename (resource, options, occupiedFileNames) {
var occupiedNames = getSubDirectoryNames(options).concat(occupiedFileNames);
Expand Down Expand Up @@ -33,7 +33,7 @@ function getFilenameForResource (resource, options) {
var extension = utils.getFilenameExtension(filename);

if (!extension && typeExtensions[resourceType]) {
extension = typeExtensions[resourceType].defaultExtension;
extension = typeExtensions[resourceType][0];
filename += extension;
}

Expand Down
5 changes: 5 additions & 0 deletions lib/request.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,18 @@ var Promise = require('bluebird');
var request = require('request');
var get = Promise.promisify(request.get);

/**
 * Extracts the bare media type from a Content-Type header value.
 * Parameters (everything after the first ';') are dropped. The result is trimmed and
 * lower-cased because media types are case-insensitive, so it can be compared with
 * lower-case names such as 'text/html'.
 * @param {string} [contentType] - raw Content-Type header value, e.g. 'text/html; charset=utf-8'
 * @returns {string|null} media type, or null when the header is missing or holds no media type
 */
function getMimeType (contentType) {
	if (typeof contentType !== 'string') {
		return null;
	}

	var mimeType = contentType.split(';')[0].trim().toLowerCase();
	return mimeType || null;
}

function makeRequest (options, url) {
var requestOptions = _.clone(options);
requestOptions.url = url;

return get(requestOptions).then(function handleResponse (data) {
return {
url: data.request.href,
mimeType: getMimeType(data.headers['content-type']),
body: data.body
};
});
Expand Down
16 changes: 2 additions & 14 deletions lib/resource-handler/html/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@ HtmlResourceHandler.prototype.handle = function handle (resource) {
});
};

HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, resource, rule) {
HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) {
var self = this;
var promises = $(rule.selector).map(function loadForElement () {
var el = new HtmlSourceElement($(this), rule);
var pathContainer = el.getPathContainer();
if (!pathContainer) {
return Promise.resolve();
}
return self.handleChildrenPaths(pathContainer, resource, createHtmlData(el)).then(el.setData.bind(el));
return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el));
}).get();

return utils.waitAllFulfilled(promises);
Expand All @@ -51,16 +51,4 @@ function loadTextToCheerio (text) {
});
}

/**
 * Describes the html element wrapped by an HtmlSourceElement.
 * @param {HtmlSourceElement} htmlSourceEl
 * @returns {HtmlData} tag name of the element, the rule's attribute name and that attribute's value
 */
function createHtmlData (htmlSourceEl) {
	var element = htmlSourceEl.el;
	var tagName = element[0].name;
	var attributeName = htmlSourceEl.rule.attr;

	var htmlData = {};
	htmlData.tagName = tagName;
	htmlData.attributeName = attributeName;
	htmlData.attributeValue = element.attr(attributeName);
	return htmlData;
}

module.exports = HtmlResourceHandler;
7 changes: 2 additions & 5 deletions lib/resource-handler/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,19 @@ ResourceHandler.prototype.getResourceHandler = function getResourceHandler (reso

/**
* Request all resources from pathContainers paths
* @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources
* @param {Resource} parentResource
* @param {HtmlData} [childResourceHtmlData]
* @returns {Promise} - resolved when all resources from pathContainer were requested
* and original paths in parentResource were updated with local paths for children resources
*/
ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource, childResourceHtmlData) {
ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource) {
var self = this;
var childrenPaths = pathContainer.getPaths();
var pathsToUpdate = [];

var childrenPromises = childrenPaths.map(function loadChildPath (childPath) {
var childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath);

var childResource = parentResource.createChild(childResourceUrl);
childResource.setHtmlData(childResourceHtmlData);

return self.context.requestResource(childResource).then(function updateChildPath (respondedResource) {
if (respondedResource) {
Expand Down
76 changes: 14 additions & 62 deletions lib/resource.js
Original file line number Diff line number Diff line change
@@ -1,36 +1,30 @@
var _ = require('lodash');
var utils = require('./utils');
var types = require('./config/resource-types');
var typesByHtmlData = require('./config/resource-types-by-tag');

/**
 * Finds the resource type whose rule list contains an entry matched by htmlData
 * (matching is delegated to lodash `_.find`).
 * @param {Object} htmlData - description of the html element where the resource was found
 * @returns {string} key of the matching rule list, or types.other when nothing matches
 */
function getTypeByHtmlData (htmlData) {
	function hasMatchingRule (rules) {
		return _.find(rules, htmlData);
	}

	var matchedType = _.findKey(typesByHtmlData, hasMatchingRule);
	if (matchedType) {
		return matchedType;
	}
	return types.other;
}

/**
 * A single resource handled by the scraper.
 * @param {string} url - url of the resource
 * @param {string} [filename] - filename of the resource
 * @constructor
 */
function Resource (url, filename) {
this.url = url;
this.filename = filename;

// resources created from this one via createChild
this.assets = [];
// resource type; null until setType is called
this.type = null;
// starts at 0; createChild gives children a larger depth
this.depth = 0;
// resource this one was created from; assigned in createChild
this.parent = null;
// changed by setSaved
this.saved = false;
}

/**
 * Creates a resource nested one level below this one and registers it among this resource's assets.
 * @param {string} url - url of the child resource
 * @param {string} [filename] - filename of the child resource
 * @returns {Resource} the new child; its `parent` is this resource and its `depth` is this resource's depth + 1
 */
Resource.prototype.createChild = function createChild (url, filename) {
	var child = new Resource(url, filename);

	// Assign parent and depth exactly once: incrementing the depth twice
	// would place the child two levels below its parent.
	child.parent = this;
	child.depth = this.getDepth() + 1;

	this.assets.push(child);

	return child;
};

Resource.prototype.updateChild = function updateChild (oldChild, newChild) {
var index = _.indexOf(this.assets, oldChild);
var index = this.assets.indexOf(oldChild);
if (index >= 0) {
this.assets[index] = newChild;
}
Expand Down Expand Up @@ -60,58 +54,16 @@ Resource.prototype.setText = function setText (text) {
this.text = text;
};

/**
 * Stores the given value as this resource's parent.
 * @param parent - value assigned to `this.parent`
 */
Resource.prototype.setParent = function setParent (parent) {
this.parent = parent;
};

/**
 * @returns {number} depth of this resource; 0 when no depth has been assigned
 */
Resource.prototype.getDepth = function getDepth () {
	return this.depth || 0;
};

/**
 * Stores the given value as this resource's depth.
 * @param depth - value assigned to `this.depth`
 */
Resource.prototype.setDepth = function setDepth (depth) {
this.depth = depth;
};

/**
* Html Data for resource, represents html element where resource was found
*
* @typedef {Object} HtmlData
* @property {string} tagName - tag of element
* @property {string} attributeName - attribute in tag where resource was found
* @property {string} attributeValue - attribute value, contains url of resources
*
* Example: for resource in <img src="/images/foo.png"> it will be
* {
* tagName: 'img',
* attributeName: 'src',
* attributeValue: '/images/foo.png'
* }
*/

/**
*
* @param {HtmlData} data
*/
Resource.prototype.setHtmlData = function setHtmlData (data) {
	// only the tag and attribute names are kept; any other property of data is dropped
	this.htmlData = _.pick(data, ['tagName', 'attributeName']);
};

/**
 * Stores the type of this resource.
 * @param {string} type - resource type
 */
Resource.prototype.setType = function setType (type) {
	this.type = type;
};

/**
 * Returns the type stored on this resource.
 * Nothing is inferred from the filename or the parent here: a resource whose type
 * has not been set reports a falsy value, so callers can detect that and set one.
 * @returns {string|null} value of `this.type`
 */
Resource.prototype.getType = function getType () {
	return this.type;
};

Resource.prototype.isHtml = function isHtml () {
Expand All @@ -127,11 +79,11 @@ Resource.prototype.toString = function toString () {
};

/**
 * @returns {boolean} true when `this.saved` holds a truthy value, false otherwise
 */
Resource.prototype.isSaved = function isSaved () {
	return Boolean(this.saved);
};

/**
 * Marks this resource as saved by setting `this.saved` to true.
 */
Resource.prototype.setSaved = function setSaved () {
	this.saved = true;
};

module.exports = Resource;
9 changes: 9 additions & 0 deletions lib/scraper.js
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,17 @@ Scraper.prototype.requestResource = function requestResource (resource) {
self.addRespondedResourcePromise(responseData.url, respondedResourcePromise);
}

resource.setType(utils.getTypeByMime(responseData.mimeType));

var filename = self.filenameGenerator.generateFilename(resource);
resource.setFilename(filename);

// if type was not determined by mime
// we can try to get it from filename after it was generated
if (!resource.getType()) {
resource.setType(utils.getTypeByFilename(filename));
}

resource.setText(responseData.body);

logger.debug('finish request for ' + resource);
Expand Down
15 changes: 14 additions & 1 deletion lib/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ var url = require('url');
var path = require('path');
var Promise = require('bluebird');
var normalizeUrl = require('normalize-url');
var typeByMime = require('./config/resource-type-by-mime');
var typeByExt = require('./config/resource-type-by-ext');

var logger = require('./logger');

Expand Down Expand Up @@ -91,6 +93,15 @@ function isUriSchemaSupported (path) {
return !protocol || protocol && isUrl(path);
}

/**
 * Looks up the resource type registered for a mime type.
 * @param {string} mimeType - mime type, e.g. 'text/css'
 * @returns {string|undefined} resource type, or undefined when the mime type is not registered
 */
function getTypeByMime (mimeType) {
	// Only own keys count: a value such as 'constructor' or 'toString' must not
	// resolve to a member inherited from Object.prototype.
	if (!Object.prototype.hasOwnProperty.call(typeByMime, mimeType)) {
		return undefined;
	}
	return typeByMime[mimeType];
}

/**
 * Looks up the resource type registered for the extension of a filename.
 * @param {string} filename
 * @returns {string|undefined} resource type, or undefined when the extension is not registered
 */
function getTypeByFilename (filename) {
	return typeByExt[getFilenameExtension(filename)];
}

module.exports = {
isUrl: isUrl,
getUrl: getUrl,
Expand All @@ -104,5 +115,7 @@ module.exports = {
waitAllFulfilled: waitAllFulfilled,
normalizeUrl: normalizeUrl,
urlsEqual: urlsEqual,
isUriSchemaSupported: isUriSchemaSupported
isUriSchemaSupported: isUriSchemaSupported,
getTypeByMime: getTypeByMime,
getTypeByFilename: getTypeByFilename
};
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,16 @@ describe('Functional resources without extensions', function() {
nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');

// mock for css fonts
nock('http://fonts.googleapis.com/').get('/css?family=Lato').replyWithFile(200, mockDirname + '/fonts.css');
nock('http://fonts.googleapis.com/').get('/css?family=Lato').replyWithFile(200, mockDirname + '/fonts.css', {
'content-type': 'text/css'
});
nock('http://fonts.gstatic.com/').get('/s/lato/v11/UyBMtLsHKBKXelqf4x7VRQ.woff2').reply(200, 'OK');
nock('http://fonts.gstatic.com/').get('/s/lato/v11/1YwB1sO8YE1Lyjf12WNiUA.woff2').reply(200, 'OK');

// mock for iframe
nock('http://example.com/').get('/iframe').replyWithFile(200, mockDirname + '/iframe.html');
nock('http://example.com/').get('/iframe').replyWithFile(200, mockDirname + '/iframe.html', {
'Content-Type': 'text/html'
});
nock('http://example.com/').get('/cat.png').reply(200, 'OK');

// mock for anchor
Expand Down
2 changes: 1 addition & 1 deletion test/unit/request-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ describe('Request', function () {
describe('#makeRequest', function () {

it('should call request with correct params', function(done) {
var responseMock = { request: {href: ''}, body: '' };
var responseMock = { request: {href: ''}, body: '', headers: {} };
var requestStub = sinon.stub().yields(null, responseMock);

var customRequest = proxyquire('../../lib/request', {
Expand Down
Loading