Skip to content

Commit

Permalink
Have ReSpec run in JSDOM, stop using RespecDocWriter
Browse files Browse the repository at this point in the history
Reffy needs to run ReSpec on Editor's Drafts. It relied on RespecDocWriter
until now, which uses Puppeteer, which in turn relies on a headless version
of Chrome. That requires downloading 100s of MB of dependencies and is a tad
heavy for the task at hand, especially as we consider running Reffy as an
HTTP service.

With this update, RespecDocWriter is no longer being used. Instead, Reffy makes
ReSpec run in JSDOM, which is much more lightweight (and JS-based).

Problem is that JSDOM does not yet support enough features to run ReSpec.
Concrete issues are documented in:
w3c/respec#1469 (comment)

This update monkey-patches JSDOM's code to add good-enough implementations of
missing features, as well as to regain control over network requests so that
all network requests go through our cache. It also monkey-patches ReSpec to:

1. drop a couple of modules that cannot run because they use non-implemented
features of JSDOM that cannot be easily patched
2. update the code of a couple of ReSpec dependencies for the same reason

The whole thing remains fragile: updates to ReSpec or to its dependencies may
well break the regular expressions used to monkey patch the code. Most of the
monkey-patching should resist minor changes. A couple of them may not though.
  • Loading branch information
tidoust committed May 15, 2018
1 parent ba8809a commit 3721d05
Show file tree
Hide file tree
Showing 6 changed files with 538 additions and 3,107 deletions.
48 changes: 48 additions & 0 deletions fetch.js
@@ -0,0 +1,48 @@
/**
* Wrapper around the fetch module to setup a few config parameters from
* config.json
*
* @module fetch
*/

const path = require('path');
const baseFetch = require('fetch-filecache-for-crawling');

// Read configuration parameters from `config.json` file
let config = null;
try {
config = require(path.resolve('config.json'));
}
catch (err) {
config = {};
}


/**
 * Fetch a URL through `fetch-filecache-for-crawling`, completing the caller's
 * options with the settings found in `config.json`.
 *
 * A setting from `config.json` is only applied when it is truthy and when the
 * caller did not provide the corresponding option. The `cacheRefresh` setting
 * is passed on as the `refresh` option; other settings keep their name.
 *
 * When no refresh strategy ends up being set, it is forced to "once" so that
 * only one HTTP request gets sent on a given URL per crawl.
 *
 * The `options` object received from the caller is never modified.
 *
 * @function
 * @param {String} url URL to fetch
 * @param {Object} options Fetch options (and options for node-fetch, and
 *   options for fetch-filecache-for-crawling)
 * @return {Promise(Response)} Promise to get an HTTP response
 */
function fetch(url, options) {
  // Name of the `config.json` setting => name of the matching fetch option
  const fetchParamFor = {
    cacheFolder: 'cacheFolder',
    resetCache: 'resetCache',
    cacheRefresh: 'refresh',
    logToConsole: 'logToConsole'
  };

  const fetchOptions = Object.assign({}, options);
  for (const configParam of Object.keys(fetchParamFor)) {
    const fetchParam = fetchParamFor[configParam];
    const setByCaller = Object.prototype.hasOwnProperty.call(fetchOptions, fetchParam);
    if (!setByCaller && config[configParam]) {
      fetchOptions[fetchParam] = config[configParam];
    }
  }

  if (!fetchOptions.refresh) {
    fetchOptions.refresh = 'once';
  }
  return baseFetch(url, fetchOptions);
}

module.exports = fetch;
308 changes: 308 additions & 0 deletions jsdom-monkeypatch.js
@@ -0,0 +1,308 @@
/**
* Reffy runs ReSpec in JSDOM to avoid having to download and run a complete
* headless browser.
*
* Problem is JSDOM does not yet support a number of features that ReSpec needs.
* Also, JSDOM does not allow interception of network requests anymore.
*
* This code monkey-patches JSDOM (and ReSpec) code before other modules make
* use of them, so that ReSpec can run in JSDOM and so that network requests
* use our fetch implementation.
*
* This module needs to be required before any other module that makes use of
* JSDOM.
*
* The whole thing is PRETTY UGLY.
*
* NB: This code WILL LIKELY BREAK when switching to new versions of JSDOM or
* ReSpec.
*/

const resourceLoader = require('jsdom/lib/jsdom/browser/resource-loader');
const Attr = require('jsdom/lib/jsdom/living/generated/Attr').interface;
const Element = require('jsdom/lib/jsdom/living/generated/Element').interface;
const fetch = require('./fetch.js');
const { Headers, Request, Response } = require('node-fetch');

// Replacement for the `download` function of JSDOM's resource loader. It
// decides whether a requested resource needs to be fetched at all, sends the
// request through our own `fetch` wrapper (so that it goes through the cache),
// and patches the code of ReSpec on the fly so that it can run in JSDOM.
//
// - `url`: object that describes the requested resource. This code reads its
//   `path`, `pathname` and `href` properties (presumably the parsed URL that
//   JSDOM hands to its loader - TODO confirm).
// - `options`: loader options. `options.referrer` is read to compute the
//   folder of the requesting document, and the whole object is forwarded to
//   `fetch` as fetch options.
// - `callback`: Node-style callback, called with `(null, text)` on success
//   (with an empty string when the resource is skipped), or with `(err)` when
//   something fails.
resourceLoader.download = function (url, options, callback) {
// Restrict resource loading to ReSpec and script resources that sit next
// to the spec under test, excluding scripts of WebIDL as well as the
// WHATWG annotate_spec script that JSDOM does not seem to like.
// Explicitly whitelist the "autolink" script of the shadow DOM spec which
// is needed to initialize respecConfig
const respecUrl = 'https://www.w3.org/Tools/respec/respec-w3c-common';

// Returns the URL that actually needs to be fetched for the requested
// resource, or `null` when the resource is to be replaced by an empty string.
function getUrlToFetch() {
// Reduce the referrer to its folder (everything up to the last "/")
// NOTE(review): this assumes that `options.referrer` is always a string; the
// call to `endsWith` throws otherwise - confirm against JSDOM's loader.
let referrer = options.referrer;
if (!referrer.endsWith('/')) {
referrer = referrer.substring(0, referrer.lastIndexOf('/') + 1);
}
// Any path that contains "/respec/" or "/respec-" (case-insensitive) is
// considered to target ReSpec, and gets replaced by the URL above.
if (/\/respec[\/\-]/i.test(url.path)) {
//console.log(`fetch ReSpec (force latest version)`);
return respecUrl;
}
// Skip resources whose path ends with a file extension other than ".js" or
// ".json" (paths without extension fall through to the next test).
else if (/\.[^\/\.]+$/.test(url.path) &&
!url.path.endsWith('.js') &&
!url.path.endsWith('.json')) {
//console.log(`fetch not needed for ${url.href} (not a JS/JSON file)`);
return null;
}
// Keep the whitelisted "autolink" script, as well as resources located under
// the folder of the referrer, unless their path matches one of the excluded
// patterns below.
else if ((url.pathname === '/webcomponents/assets/scripts/autolink.js') ||
(url.href.startsWith(referrer) &&
!(/annotate_spec/i.test(url.pathname)) &&
!(/expanders/i.test(url.pathname)) &&
!(/bug-assist/i.test(url.pathname)) &&
!(/dfn/i.test(url.pathname)) &&
!(/section-links/i.test(url.pathname)) &&
!(/^\/webidl\//i.test(url.pathname)))) {
//console.log(`fetch useful script at ${url.href}`);
return url.href;
}
//console.log(`fetch not needed for ${url.href}`);
return null;
}

let urlToFetch = getUrlToFetch();
if (!urlToFetch) {
// Resource not needed: report success with an empty content
return callback(null, '');
}
fetch(urlToFetch, options)
.then(response => response.text())
.then(data => {
// Only the code of ReSpec gets patched, other resources are returned as-is
if (urlToFetch !== respecUrl) {
return data;
}

////////////////////////////////////////////////////////////
// REALLY UGLY CODE WARNING
//
// Tweak Respec built code so that it can run in JSDOM.
//
// NB: Some of these lines will just break if Respec build
// produces slightly different code, e.g. if variables do
// not end up with the same name!
//
// NB: None of the regular expressions below is global, so
// each replacement only applies to the first match. When
// there is no match, the code is left untouched without
// any warning.
////////////////////////////////////////////////////////////

// Remove core/highlight module because JSDOM does not yet
// support URL.createObjectURL
// https://github.com/jsdom/jsdom/issues/1721
// Remove core/list-sorter module because JSDOM does not yet
// support document.createRange
// https://github.com/jsdom/jsdom/blob/master/lib/jsdom/living/nodes/Document.webidl#L39
// (the modules are dropped from the list of dependencies of the
// "profile-w3c-common" module definition)
["core/highlight", "core/list-sorter"]
.forEach(module => data = data.replace(
new RegExp('(define\\(\\s*"profile-w3c-common"\\s*,\\s*\\[[^\\]]+),\\s*"' + module + '"'),
'$1'));

// JSDOM's CSS parser does not quite like uncommon "@" rules
// so let's pretend they are just @media rules
// https://github.com/jsdom/jsdom/issues/2026
// (NB: this replacement is just for convenience, to avoid JSDOM reporting
// lengthy errors (including a full dump of the CSS) to stderr)
data = data.replace(/@keyframes \S+? {/, '@media all {');
data = data.replace(/@supports \(.+?\) {/, '@media all {');

// Respec drops blank lines in Markdown, but marked.js actually
// needs them around <pre> tags, otherwise it produces really weird
// HTML (with <pre> and <p> intertwined). For some reason, this does
// not bother regular browsers. It does bother JSDOM though.
// (this relies on the name of the minified variable `r`)
data = data.replace(/r\.createTextNode\("\\n"\)/, 'r.createTextNode("\\n\\n")');

// JSDOM does not support cloning of attributes yet, and polyfill
// only works for attributes that already belong to a document.
// HyperHTML needs to clone attributes that do not belong to the
// document, so let's intercept the call to `cloneNode` in HyperHTML
// and use `createAttributeNS` instead
// https://github.com/jsdom/jsdom/commit/acf0156b563b5e2ba606da36fd597e0a0b344f5a
// (this relies on the names of the minified variables `p` and `r`)
data = data.replace(/p=r\.cloneNode\(!0\);/,
`p = null;
if (r.ownerDocument) {
p = r.cloneNode(true);
} else {
p = document.createAttributeNS(r.namespaceURI,r.name);
p.value = r.value;
}`);

return data;
////////////////////////////////////////////////////////////
// END OF REALLY UGLY CODE WARNING
////////////////////////////////////////////////////////////
})
.then(data => callback(null, data))
// NOTE(review): this handler also catches exceptions thrown by the success
// callback above, in which case `callback` gets called a second time, with
// the error.
.catch(err => callback(err));
};


// JSDOM does not yet support innerText. Only used in Respec
// to set text of empty elements, so replacing with
// textContent should be good enough
// https://github.com/jsdom/jsdom/issues/1245
//
// NB: Detection uses `in` rather than reading the property. Reading an
// accessor directly on the prototype would run its getter with the prototype
// as `this`, and getters of generated DOM interfaces throw an "Illegal
// invocation" error in that case. The polyfill is thus skipped without
// crashing if JSDOM ever implements the property.
if (!('innerText' in Element.prototype)) {
  Object.defineProperty(Element.prototype, 'innerText', {
    // Returns the text content of the element (no layout-aware processing)
    get() {
      if (!this) {
        throw new TypeError("Illegal invocation");
      }
      return this.textContent;
    },

    // Replaces the content of the element with the given text
    set(V) {
      if (!this) {
        throw new TypeError("Illegal invocation");
      }
      this.textContent = V;
    },

    enumerable: true,
    configurable: true
  });
}


// Not yet supported in JSDOM
// https://github.com/jsdom/jsdom/issues/1890
//
// Inserts `element` relatively to the current element. `position` is one of
// "beforebegin", "afterbegin", "beforeend" or "afterend" (case-insensitive).
//
// Returns the inserted element, or `null` when the position targets the
// parent ("beforebegin", "afterend") and the current element has no parent.
// Unknown positions are ignored and the element is returned untouched.
if (!Element.prototype.insertAdjacentElement) {
  Element.prototype.insertAdjacentElement = function (position, element) {
    // Use the parent *node*, not the parent *element*: the parent may well be
    // a document fragment or a document, which are not elements.
    const parent = this.parentNode;

    switch (String(position).toLowerCase()) {
      case 'beforebegin':
        if (!parent) {
          return null;
        }
        parent.insertBefore(element, this);
        break;
      case 'afterbegin':
        // `firstChild` is null for an empty element, in which case
        // `insertBefore` appends the element
        this.insertBefore(element, this.firstChild);
        break;
      case 'beforeend':
        this.appendChild(element);
        break;
      case 'afterend':
        if (!parent) {
          return null;
        }
        // Single insertion, right before the next sibling (or at the end of
        // the parent when the current element is the last child)
        parent.insertBefore(element, this.nextSibling);
        break;
    }
    return element;
  };
}


// Not yet supported in JSDOM
// https://github.com/jsdom/jsdom/issues/1555
//
// Returns the closest inclusive ancestor of the current element that matches
// the given selector, or `null` when there is none.
//
// The lookup walks up the tree from the element itself and stops at the first
// node that is not an element. It does not require the element to be attached
// to a document, so it also works within detached subtrees.
if (!Element.prototype.closest) {
  Element.prototype.closest = function (selector) {
    let el = this;
    // nodeType 1 is ELEMENT_NODE
    while (el && el.nodeType === 1) {
      if (el.matches(selector)) {
        return el;
      }
      el = el.parentElement || el.parentNode;
    }
    return null;
  };
}


// Not yet supported in JSDOM for attributes
// (but needed by HyperHTML)
// https://github.com/jsdom/jsdom/commit/acf0156b563b5e2ba606da36fd597e0a0b344f5a
//
// The clone is a new attribute created by the document that owns the
// original one, with the same namespace, name and value.
if (!Attr.prototype.cloneNode) {
  Attr.prototype.cloneNode = function () {
    const doc = this.ownerDocument;
    if (doc) {
      const copy = doc.createAttributeNS(this.namespaceURI, this.name);
      copy.value = this.value;
      return copy;
    }

    // Cloning an attribute that does not yet belong to a document is possible
    // in theory, but there is no way to do it from here: the attribute gives
    // no pointer to the window/document that triggers the request, and
    // prototypes are shared across browser contexts in JSDOM.
    throw new Error('Cannot clone an attribute that does not belong to a document!');
  };
}


// That's it, JSDOM will now use our `download` function and all specs that
// reference ReSpec will download the latest version (with our monkey patch).
const { JSDOM } = require('jsdom');

// Window methods cannot be monkey-patched in the interface prototype, because
// `this` is not always set to the Window object when these methods are called,
// probably because the global object in Node.js is not the Window instance.
// For these methods, we need to monkey-patch the code in the JSDOM
// `beforeParse` method, called when the Window instance is created.

/**
 * Drop-in replacement for the `JSDOM` constructor that patches the Window
 * object of the document before parsing starts.
 *
 * The `options` object received from the caller is not modified. If it defines
 * a `beforeParse` function, that function still gets called, after the
 * patches have been applied.
 *
 * @function
 * @param {String} html Content to parse, passed as-is to `JSDOM`
 * @param {Object} options JSDOM options
 * @return {JSDOM} The JSDOM instance
 */
module.exports.JSDOM = function (html, options) {
  options = Object.assign({}, options);
  const beforeParse = options.beforeParse;
  options.beforeParse = function (window) {
    // Not yet supported in JSDOM and JSDOM defines them in the Window
    // constructor, so we need to override them after that.
    // (most are not used in our specs, but some still call "scrollBy")
    // https://github.com/jsdom/jsdom/blob/master/lib/jsdom/browser/Window.js#L570
    ['blur', 'focus', 'moveBy', 'moveTo', 'resizeBy', 'resizeTo', 'scroll', 'scrollBy', 'scrollTo']
      .forEach(method => window[method] = function () {});

    // Not yet supported in JSDOM
    // https://github.com/jsdom/jsdom/blob/master/test/web-platform-tests/to-upstream/html/browsers/the-window-object/window-properties-dont-upstream.html#L104
    // (the stub reports that no media query ever matches)
    if (!window.matchMedia) {
      window.matchMedia = function () {
        return {
          matches: false,
          addListener: () => {},
          removeListener: () => {},
          onchange: () => {}
        };
      };
    }

    // Not yet supported in JSDOM and cannot be directly monkey-patched.
    // (and actually, good for us since we want to control caching logic here)
    // https://github.com/jsdom/jsdom/issues/1724
    if (!window.fetch) {
      window.fetch = async function (url, options) {
        if (url && url.url) {
          // Called with a Request object
          // NOTE(review): only the URL and the headers of the Request are
          // taken into account, its method and body are not forwarded.
          if (url.headers) {
            options = Object.assign({}, options, {
              headers: url.headers
            });
          }
          url = url.url;
        }

        // The input may be anything that stringifies to a URL, typically a
        // `URL` object, which does not have a `startsWith` method
        url = String(url);

        if (!url.startsWith('http:') && !url.startsWith('https:')) {
          // Let an anchor element resolve the URL against the base URL of the
          // document
          let a = window.document.createElement('a');
          a.href = url;
          url = a.href;
        }

        // `fetch` is our own fetch wrapper here, not `window.fetch`
        return fetch(url, options);
      };

      window.Request = Request;
      window.Response = Response;
      window.Headers = Headers;
    }

    if (beforeParse) {
      return beforeParse(window);
    }
  };

  return new JSDOM(html, options);
};


// Not much we can do but:
// 1. JSDOM does not support `IndexedDB`, which Respec uses to
// store the biblio. No big deal as Respec degrades gracefully
// but that outputs errors to the console when trying to call
// `IndexedDB.open`
// https://github.com/jsdom/jsdom/issues/1748
// 2. JSDOM does not support the `whatToShow` filter in
// `TreeWalker`. As a result, HyperHTML fails to remove the
// `<!-- _hyper: xxxx -->` comments it adds while running.

// Also, Node.js is quick to output warnings on what it thinks are unhandled
// promise rejections but are usually rejections that are handled asynchronously
// (i.e. not on the same tick). Let's intercept these warnings so as not to
// output false positives.
/*process.on('unhandledRejection', () => {});
process.on('rejectionHandled', () => {});*/

0 comments on commit 3721d05

Please sign in to comment.