From 171f6d53a73914cce63eba86ef35a45b5a90b68f Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Fri, 30 Dec 2016 20:14:14 +0200 Subject: [PATCH 1/3] Get resource type by mime or filename --- lib/config/resource-ext-by-type.js | 8 ++++++++ lib/config/resource-extensions-by-type.js | 14 -------------- lib/config/resource-type-by-ext.js | 8 ++++++++ lib/config/resource-type-by-mime.js | 6 ++++++ lib/filename-generator/by-site-structure.js | 4 ++-- lib/filename-generator/by-type.js | 4 ++-- lib/request.js | 7 ++++++- lib/resource.js | 6 ++++++ lib/scraper.js | 2 ++ lib/utils.js | 15 ++++++++++++++- 10 files changed, 54 insertions(+), 20 deletions(-) create mode 100644 lib/config/resource-ext-by-type.js delete mode 100644 lib/config/resource-extensions-by-type.js create mode 100644 lib/config/resource-type-by-ext.js create mode 100644 lib/config/resource-type-by-mime.js diff --git a/lib/config/resource-ext-by-type.js b/lib/config/resource-ext-by-type.js new file mode 100644 index 00000000..eba3a1ed --- /dev/null +++ b/lib/config/resource-ext-by-type.js @@ -0,0 +1,8 @@ +var types = require('./resource-types'); +var defaultExtensions = {}; + +// should contain same data as ./resource-type-by-ext +defaultExtensions[types.html] = [ '.html', '.htm' ]; +defaultExtensions[types.css] = [ '.css' ]; + +module.exports = defaultExtensions; diff --git a/lib/config/resource-extensions-by-type.js b/lib/config/resource-extensions-by-type.js deleted file mode 100644 index e6a98672..00000000 --- a/lib/config/resource-extensions-by-type.js +++ /dev/null @@ -1,14 +0,0 @@ -var types = require('./resource-types'); - -var defaultExtensions = {}; - -defaultExtensions[types.html] = { - defaultExtension: '.html', - possibleExtensions: [ '.html', '.htm' ] -}; -defaultExtensions[types.css] = { - defaultExtension: '.css', - possibleExtensions: [ '.css' ] -}; - -module.exports = defaultExtensions; diff --git a/lib/config/resource-type-by-ext.js b/lib/config/resource-type-by-ext.js new file mode 100644 
index 00000000..2798b65c --- /dev/null +++ b/lib/config/resource-type-by-ext.js @@ -0,0 +1,8 @@ +var types = require('./resource-types'); + +// should contain same data as ./resource-ext-by-type +module.exports = { + '.html': types.html, + '.htm': types.html, + '.css': types.css +}; \ No newline at end of file diff --git a/lib/config/resource-type-by-mime.js b/lib/config/resource-type-by-mime.js new file mode 100644 index 00000000..ba8f3182 --- /dev/null +++ b/lib/config/resource-type-by-mime.js @@ -0,0 +1,6 @@ +var types = require('./resource-types'); + +module.exports = { + 'text/html': types.html, + 'text/css': types.css +}; \ No newline at end of file diff --git a/lib/filename-generator/by-site-structure.js b/lib/filename-generator/by-site-structure.js index 1ae5b5cf..16964007 100644 --- a/lib/filename-generator/by-site-structure.js +++ b/lib/filename-generator/by-site-structure.js @@ -2,7 +2,7 @@ var _ = require('lodash'); var path = require('path'); var utils = require('../utils'); var resourceTypes = require('../config/resource-types'); -var resourceTypeExtensions = require('../config/resource-extensions-by-type'); +var resourceTypeExtensions = require('../config/resource-ext-by-type'); module.exports = function generateFilename (resource, options) { var resourceUrl = resource.getUrl(); @@ -11,7 +11,7 @@ module.exports = function generateFilename (resource, options) { // If we have HTML from 'http://example.com/path' => set 'path/index.html' as filepath if (resource.isHtml()) { - var htmlExtensions = resourceTypeExtensions[resourceTypes.html].possibleExtensions; + var htmlExtensions = resourceTypeExtensions[resourceTypes.html]; var resourceHasHtmlExtension = _.includes(htmlExtensions, extension); // add index.html only if filepath has ext != html '/path/test.com' => '/path/test.com/index.html' if (!resourceHasHtmlExtension) { diff --git a/lib/filename-generator/by-type.js b/lib/filename-generator/by-type.js index 83a5971b..eb95ddc2 100644 --- 
a/lib/filename-generator/by-type.js +++ b/lib/filename-generator/by-type.js @@ -1,7 +1,7 @@ var _ = require('lodash'); var path = require('path'); var utils = require('../utils.js'); -var typeExtensions = require('../config/resource-extensions-by-type'); +var typeExtensions = require('../config/resource-ext-by-type'); module.exports = function generateFilename (resource, options, occupiedFileNames) { var occupiedNames = getSubDirectoryNames(options).concat(occupiedFileNames); @@ -33,7 +33,7 @@ function getFilenameForResource (resource, options) { var extension = utils.getFilenameExtension(filename); if (!extension && typeExtensions[resourceType]) { - extension = typeExtensions[resourceType].defaultExtension; + extension = typeExtensions[resourceType][0]; filename += extension; } diff --git a/lib/request.js b/lib/request.js index 9bb4c884..fe6d27cc 100644 --- a/lib/request.js +++ b/lib/request.js @@ -3,6 +3,10 @@ var Promise = require('bluebird'); var request = require('request'); var get = Promise.promisify(request.get); +function getMimeType (contentType) { + return contentType ? 
contentType.split(';')[0] : null; +} + function makeRequest (options, url) { var requestOptions = _.clone(options); requestOptions.url = url; @@ -10,7 +14,8 @@ function makeRequest (options, url) { return get(requestOptions).then(function handleResponse (data) { return { url: data.request.href, - body: data.body + mimeType: getMimeType(data.headers['content-type']), + body: data.body, }; }); } diff --git a/lib/resource.js b/lib/resource.js index 1329931e..dd83b8d9 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -96,7 +96,13 @@ Resource.prototype.setHtmlData = function setHtmlData (data) { this.htmlData = _.pick(data, ['tagName', 'attributeName']); }; +Resource.prototype.setType = function setType (type) { + this.type = type; +}; + Resource.prototype.getType = function getType () { + return this.type; + var ext = utils.getFilenameExtension(this.filename); var parent = this.parent; var hasHtmlData = !_.isEmpty(this.htmlData); diff --git a/lib/scraper.js b/lib/scraper.js index 978ca1b7..626999e0 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -127,7 +127,9 @@ Scraper.prototype.requestResource = function requestResource (resource) { } var filename = self.filenameGenerator.generateFilename(resource); + var type = utils.getTypeByMime(responseData.mimeType) || utils.getTypeByFilename(filename); resource.setFilename(filename); + resource.setType(type); resource.setText(responseData.body); logger.debug('finish request for ' + resource); diff --git a/lib/utils.js b/lib/utils.js index 4889a029..5b7d3161 100644 --- a/lib/utils.js +++ b/lib/utils.js @@ -2,6 +2,8 @@ var url = require('url'); var path = require('path'); var Promise = require('bluebird'); var normalizeUrl = require('normalize-url'); +var typeByMime = require('./config/resource-type-by-mime'); +var typeByExt = require('./config/resource-type-by-ext'); var logger = require('./logger'); @@ -91,6 +93,15 @@ function isUriSchemaSupported (path) { return !protocol || protocol && isUrl(path); } +function 
getTypeByMime (mimeType) { + return typeByMime[mimeType]; +} + +function getTypeByFilename (filename) { + var ext = getFilenameExtension(filename); + return typeByExt[ext]; +} + module.exports = { isUrl: isUrl, getUrl: getUrl, @@ -104,5 +115,7 @@ module.exports = { waitAllFulfilled: waitAllFulfilled, normalizeUrl: normalizeUrl, urlsEqual: urlsEqual, - isUriSchemaSupported: isUriSchemaSupported + isUriSchemaSupported: isUriSchemaSupported, + getTypeByMime: getTypeByMime, + getTypeByFilename: getTypeByFilename }; From 6c7571cf8385ba6ee31e55b43cecbdc5aafe8f51 Mon Sep 17 00:00:00 2001 From: s0ph1e Date: Fri, 30 Dec 2016 21:15:55 +0200 Subject: [PATCH 2/3] Remove determining of resource type by html markup --- lib/config/resource-type-by-ext.js | 2 +- lib/config/resource-type-by-mime.js | 2 +- lib/config/resource-types-by-tag.js | 14 --- lib/config/resource-types.js | 3 +- lib/resource-handler/html/index.js | 16 +--- lib/resource-handler/index.js | 7 +- lib/resource.js | 76 +++------------ lib/scraper.js | 11 ++- .../resource-without-ext.test.js | 8 +- test/unit/request-test.js | 2 +- test/unit/resource-test.js | 93 +------------------ 11 files changed, 36 insertions(+), 198 deletions(-) delete mode 100644 lib/config/resource-types-by-tag.js diff --git a/lib/config/resource-type-by-ext.js b/lib/config/resource-type-by-ext.js index 2798b65c..ab9b9df8 100644 --- a/lib/config/resource-type-by-ext.js +++ b/lib/config/resource-type-by-ext.js @@ -5,4 +5,4 @@ module.exports = { '.html': types.html, '.htm': types.html, '.css': types.css -}; \ No newline at end of file +}; diff --git a/lib/config/resource-type-by-mime.js b/lib/config/resource-type-by-mime.js index ba8f3182..ebc1f91b 100644 --- a/lib/config/resource-type-by-mime.js +++ b/lib/config/resource-type-by-mime.js @@ -3,4 +3,4 @@ var types = require('./resource-types'); module.exports = { 'text/html': types.html, 'text/css': types.css -}; \ No newline at end of file +}; diff --git a/lib/config/resource-types-by-tag.js 
b/lib/config/resource-types-by-tag.js deleted file mode 100644 index 72d42907..00000000 --- a/lib/config/resource-types-by-tag.js +++ /dev/null @@ -1,14 +0,0 @@ -var types = require('./resource-types'); - -var typesByHtmlTag = {}; - -typesByHtmlTag[types.css] = [ - { tagName: 'link', attributeName: 'href' } - -]; -typesByHtmlTag[types.html] = [ - { tagName: 'a', attributeName: 'href' }, - { tagName: 'iframe', attributeName: 'src' } -]; - -module.exports = typesByHtmlTag; diff --git a/lib/config/resource-types.js b/lib/config/resource-types.js index d31208d3..3e15125c 100644 --- a/lib/config/resource-types.js +++ b/lib/config/resource-types.js @@ -1,7 +1,6 @@ var types = { css: 'css', - html: 'html', - other: 'other' + html: 'html' }; module.exports = types; diff --git a/lib/resource-handler/html/index.js b/lib/resource-handler/html/index.js index d810307e..397d1eb5 100644 --- a/lib/resource-handler/html/index.js +++ b/lib/resource-handler/html/index.js @@ -19,7 +19,7 @@ HtmlResourceHandler.prototype.handle = function handle (resource) { }); }; -HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, resource, rule) { +HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRule ($, parentResource, rule) { var self = this; var promises = $(rule.selector).map(function loadForElement () { var el = new HtmlSourceElement($(this), rule); @@ -27,7 +27,7 @@ HtmlResourceHandler.prototype.loadResourcesForRule = function loadResourcesForRu if (!pathContainer) { return Promise.resolve(); } - return self.handleChildrenPaths(pathContainer, resource, createHtmlData(el)).then(el.setData.bind(el)); + return self.handleChildrenPaths(pathContainer, parentResource).then(el.setData.bind(el)); }).get(); return utils.waitAllFulfilled(promises); @@ -51,16 +51,4 @@ function loadTextToCheerio (text) { }); } -/** - * @param {HtmlSourceElement} htmlSourceEl - * @returns {HtmlData} - */ -function createHtmlData (htmlSourceEl) { - return { - 
tagName: htmlSourceEl.el[0].name, - attributeName: htmlSourceEl.rule.attr, - attributeValue: htmlSourceEl.el.attr(htmlSourceEl.rule.attr) - }; -} - module.exports = HtmlResourceHandler; diff --git a/lib/resource-handler/index.js b/lib/resource-handler/index.js index c152780a..6e3ba7e1 100644 --- a/lib/resource-handler/index.js +++ b/lib/resource-handler/index.js @@ -33,22 +33,19 @@ ResourceHandler.prototype.getResourceHandler = function getResourceHandler (reso /** * Request all resources from pathContainers paths - * @param pathContainer - instance of imgSrcsetTag or CommonTag or CssText, contains original paths for resources + * @param pathContainer - instance of ImgSrcsetTag or CommonTag or CssText, contains original paths for resources * @param {Resource} parentResource - * @param {HtmlData} [childResourceHtmlData] * @returns {Promise} - resolved when all resources from pathContainer were requested * and original paths in parentResource were updated with local paths for children resources */ -ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource, childResourceHtmlData) { +ResourceHandler.prototype.handleChildrenResources = function handleChildrenResources (pathContainer, parentResource) { var self = this; var childrenPaths = pathContainer.getPaths(); var pathsToUpdate = []; var childrenPromises = childrenPaths.map(function loadChildPath (childPath) { var childResourceUrl = utils.getUrl(parentResource.getUrl(), childPath); - var childResource = parentResource.createChild(childResourceUrl); - childResource.setHtmlData(childResourceHtmlData); return self.context.requestResource(childResource).then(function updateChildPath (respondedResource) { if (respondedResource) { diff --git a/lib/resource.js b/lib/resource.js index dd83b8d9..ea76646c 100644 --- a/lib/resource.js +++ b/lib/resource.js @@ -1,28 +1,22 @@ -var _ = require('lodash'); -var utils = require('./utils'); var types = 
require('./config/resource-types'); -var typesByHtmlData = require('./config/resource-types-by-tag'); - -function getTypeByHtmlData (htmlData) { - var type = _.findKey(typesByHtmlData, function containsHtmlData (rules) { - return _.find(rules, htmlData); - }); - return type || types.other; -} function Resource (url, filename) { this.url = url; this.filename = filename; + this.assets = []; + this.type = null; + this.depth = 0; + this.parent = null; + this.saved = false; } Resource.prototype.createChild = function createChild (url, filename) { var child = new Resource(url, filename); - var currentDepth = this.getDepth(); - child.setParent(this); - child.setDepth(++currentDepth); + child.parent = this; + child.depth = ++currentDepth; this.assets.push(child); @@ -30,7 +24,7 @@ Resource.prototype.createChild = function createChild (url, filename) { }; Resource.prototype.updateChild = function updateChild (oldChild, newChild) { - var index = _.indexOf(this.assets, oldChild); + var index = this.assets.indexOf(oldChild); if (index >= 0) { this.assets[index] = newChild; } @@ -60,40 +54,8 @@ Resource.prototype.setText = function setText (text) { this.text = text; }; -Resource.prototype.setParent = function setParent (parent) { - this.parent = parent; -}; - Resource.prototype.getDepth = function getDepth () { - return this.depth || 0; -}; - -Resource.prototype.setDepth = function setDepth (depth) { - this.depth = depth; -}; - -/** - * Html Data for resource, represents html element where resource was found - * - * @typedef {Object} HtmlData - * @property {string} tagName - tag of element - * @property {string} attributeName - attribute in tag where resource was found - * @property {string} attributeValue - attribute value, contains url of resources - * - * Example: for resource in it will be - * { - * tagName: 'img', - * attributeName: 'src', - * attributeValue: '/images/foo.png' - * } - */ - -/** - * - * @param {HtmlData} data - */ -Resource.prototype.setHtmlData = function 
setHtmlData (data) { - this.htmlData = _.pick(data, ['tagName', 'attributeName']); + return this.depth; }; Resource.prototype.setType = function setType (type) { @@ -102,22 +64,6 @@ Resource.prototype.setType = function setType (type) { Resource.prototype.getType = function getType () { return this.type; - - var ext = utils.getFilenameExtension(this.filename); - var parent = this.parent; - var hasHtmlData = !_.isEmpty(this.htmlData); - - switch (true) { - case ext === '.html' || ext === '.htm': - return types.html; - case ext === '.css': - case !ext && parent && parent.isCss(): - return types.css; - case !ext && parent && parent.isHtml() && hasHtmlData: - return getTypeByHtmlData(this.htmlData); - default: - return types.other; - } }; Resource.prototype.isHtml = function isHtml () { @@ -133,11 +79,11 @@ Resource.prototype.toString = function toString () { }; Resource.prototype.isSaved = function isSaved () { - return this.saved || 0; + return this.saved; }; Resource.prototype.setSaved = function setSaved () { - this.saved = 1; + this.saved = true; }; module.exports = Resource; diff --git a/lib/scraper.js b/lib/scraper.js index 626999e0..8f6cffde 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -126,10 +126,17 @@ Scraper.prototype.requestResource = function requestResource (resource) { self.addRespondedResourcePromise(responseData.url, respondedResourcePromise); } + resource.setType(utils.getTypeByMime(responseData.mimeType)); + var filename = self.filenameGenerator.generateFilename(resource); - var type = utils.getTypeByMime(responseData.mimeType) || utils.getTypeByFilename(filename); resource.setFilename(filename); - resource.setType(type); + + // if type was not determined by mime + // we can try to get it from filename after it was generated + if (!resource.getType()) { + resource.setType(utils.getTypeByFilename(filename)); + } + resource.setText(responseData.body); logger.debug('finish request for ' + resource); diff --git 
a/test/functional/resource-without-ext/resource-without-ext.test.js b/test/functional/resource-without-ext/resource-without-ext.test.js index 5b54cccd..4441b3ac 100644 --- a/test/functional/resource-without-ext/resource-without-ext.test.js +++ b/test/functional/resource-without-ext/resource-without-ext.test.js @@ -35,12 +35,16 @@ describe('Functional resources without extensions', function() { nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); // mock for css fonts - nock('http://fonts.googleapis.com/').get('/css?family=Lato').replyWithFile(200, mockDirname + '/fonts.css'); + nock('http://fonts.googleapis.com/').get('/css?family=Lato').replyWithFile(200, mockDirname + '/fonts.css', { + 'content-type': 'text/css' + }); nock('http://fonts.gstatic.com/').get('/s/lato/v11/UyBMtLsHKBKXelqf4x7VRQ.woff2').reply(200, 'OK'); nock('http://fonts.gstatic.com/').get('/s/lato/v11/1YwB1sO8YE1Lyjf12WNiUA.woff2').reply(200, 'OK'); // mock for iframe - nock('http://example.com/').get('/iframe').replyWithFile(200, mockDirname + '/iframe.html'); + nock('http://example.com/').get('/iframe').replyWithFile(200, mockDirname + '/iframe.html', { + 'Content-Type': 'text/html' + }); nock('http://example.com/').get('/cat.png').reply(200, 'OK'); // mock for anchor diff --git a/test/unit/request-test.js b/test/unit/request-test.js index 2c40a872..e354c685 100644 --- a/test/unit/request-test.js +++ b/test/unit/request-test.js @@ -21,7 +21,7 @@ describe('Request', function () { describe('#makeRequest', function () { it('should call request with correct params', function(done) { - var responseMock = { request: {href: ''}, body: '' }; + var responseMock = { request: {href: ''}, body: '', headers: {} }; var requestStub = sinon.stub().yields(null, responseMock); var customRequest = proxyquire('../../lib/request', { diff --git a/test/unit/resource-test.js b/test/unit/resource-test.js index 269d621c..e259432e 100644 --- a/test/unit/resource-test.js +++ 
b/test/unit/resource-test.js @@ -1,96 +1,7 @@ require('should'); - var Resource = require('../../lib/resource'); -var types = require('../../lib/config/resource-types'); describe('Resource', function() { - describe('#getType', function() { - it('should return correct type based on extension', function() { - var html = new Resource('http://example.com', 'index.html'); - var htm = new Resource('http://example.com', 'index.htm'); - var css = new Resource('http://example.com/style.css', 'style.css'); - var img = new Resource('http://example.com/img/logo.png', 'logo.png'); - - html.getType().should.be.eql(types.html); - htm.getType().should.be.eql(types.html); - css.getType().should.be.eql(types.css); - img.getType().should.be.eql(types.other); - }); - - it('should return other if resource has no extension', function() { - var unknown = new Resource('http://example.com/smthelse'); - unknown.getType().should.be.eql(types.other); - }); - - it('should return css if resource has no extension and parent is css', function() { - var css = new Resource('http://example.com/style.css', 'style.css'); - var res = new Resource('http://example.com/some-resource'); - res.setParent(css); - res.getType().should.be.eql(types.css); - }); - - it('should return css if resource has no extension and parent is html and resource is loading from link tag', function() { - var html = new Resource('http://example.com', 'index.html'); - var res = new Resource('http://example.com/some-resource'); - res.setParent(html); - res.setHtmlData({ tagName: 'link', attributeName: 'href' }); - res.getType().should.be.eql(types.css); - }); - - it('should return html if resource has no extension and parent is html and resource is loading from a tag', function() { - var html = new Resource('http://example.com', 'index.html'); - var res = new Resource('http://example.com/some-resource'); - res.setParent(html); - res.setHtmlData({ tagName: 'a', attributeName: 'href' }); - res.getType().should.be.eql(types.html); - 
}); - - it('should return html if resource has no extension and parent is html and resource is loading from iframe tag', function() { - var html = new Resource('http://example.com', 'index.html'); - var res = new Resource('http://example.com/some-resource'); - res.setParent(html); - res.setHtmlData({ tagName: 'iframe', attributeName: 'src' }); - res.getType().should.be.eql(types.html); - }); - - it('should return other if resource has no extension and parent is html and resource has no html tag', function() { - var html = new Resource('http://example.com', 'index.html'); - var res = new Resource('http://example.com/some-resource'); - res.setParent(html); - res.getType().should.be.eql(types.other); - }); - - it('should return other if resource has no extension and parent is html and html tag doesn\'t load html or css ', function() { - var html = new Resource('http://example.com', 'index.html'); - var res = new Resource('http://example.com/some-resource'); - res.setParent(html); - res.setHtmlData({ tagName: 'img', attributeName: 'src' }); - res.getType().should.be.eql(types.other); - }); - }); - - describe('#setDepth', function () { - it('should set depth', function() { - var o = new Resource('http://google.com'); - o.setDepth(555); - o.depth.should.be.eql(555); - }); - }); - - describe('#getDepth', function () { - it('should return depth if object has it', function() { - var o = new Resource('http://google.com'); - o.setDepth(123); - o.getDepth().should.be.eql(123); - }); - - it('should return 0 if object has no depth', function() { - var o = new Resource('http://google.com'); - o.getDepth().should.be.eql(0); - }); - - }); - describe('#createChild', function () { it('should return Resource', function() { var parent = new Resource('http://example.com'); @@ -114,10 +25,10 @@ describe('Resource', function() { it('should set depth', function() { var parent = new Resource('http://example.com'); var child = parent.createChild('http://google.com'); - 
child.getDepth().should.be.eql(1); + child.depth.should.be.eql(1); var childOfChild = child.createChild('http://google.com.ua'); - childOfChild.getDepth().should.be.eql(2); + childOfChild.depth.should.be.eql(2); }); }); }); From ff76eb752cafb87f103cd868a1438ba1de6cd3f3 Mon Sep 17 00:00:00 2001 From: Sophia Antipenko Date: Fri, 30 Dec 2016 21:30:12 +0200 Subject: [PATCH 3/3] Fix extra comma --- lib/request.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/request.js b/lib/request.js index fe6d27cc..1d2035c2 100644 --- a/lib/request.js +++ b/lib/request.js @@ -15,7 +15,7 @@ function makeRequest (options, url) { return { url: data.request.href, mimeType: getMimeType(data.headers['content-type']), - body: data.body, + body: data.body }; }); }