
Record and use HTTP cache information to skip unnecessary crawling and processing #856

Merged on Feb 7, 2022 (19 commits, changes shown from all commits).
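For context, the change builds on standard HTTP revalidation: a crawl records the Last-Modified (or ETag) response header for each spec, and the next crawl replays that value as If-Modified-Since (or If-None-Match). A 304 Not Modified response means the spec can be skipped and the previous crawl result reused. Below is a minimal sketch of that idea, assuming node-fetch and an illustrative helper name; it is not Reffy's actual code, which (as the diffs show) wires the same logic into Puppeteer request interception instead.

```js
// Minimal sketch of HTTP revalidation (illustrative helper, not Reffy code).
const fetch = require('node-fetch');

async function specHasChanged(url, lastModified) {
  const response = await fetch(url, {
    headers: lastModified ? { 'If-Modified-Since': lastModified } : {}
  });
  if (response.status === 304) {
    return false;   // server confirms the previously crawled copy is current
  }
  // Some servers ignore conditional headers and return 200 anyway;
  // comparing Last-Modified directly catches that case.
  return response.headers.get('last-modified') !== lastModified;
}
```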
2,258 changes: 2,163 additions & 95 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -34,7 +34,7 @@
"abortcontroller-polyfill": "1.7.3",
"browser-specs": "2.27.0",
"commander": "9.0.0",
"fetch-filecache-for-crawling": "4.0.2",
"fetch-filecache-for-crawling": "4.1.0",
"puppeteer": "13.1.3",
"semver": "^7.3.5",
"webidl2": "24.2.0"
4 changes: 2 additions & 2 deletions src/lib/fetch.js
@@ -33,7 +33,7 @@ catch (err) {
* @return {Promise(Response)} Promise to get an HTTP response
*/
async function fetch(url, options) {
options = Object.assign({}, options);
options = Object.assign({headers: {}}, options);
['cacheFolder', 'resetCache', 'cacheRefresh', 'logToConsole'].forEach(param => {
let fetchParam = (param === 'cacheRefresh') ? 'refresh' : param;
if (config[param] && !options.hasOwnProperty(fetchParam)) {
@@ -53,4 +53,4 @@ async function fetch(url, options) {
}


module.exports = fetch;
module.exports = fetch;
11 changes: 9 additions & 2 deletions src/lib/nock-server.js
@@ -98,7 +98,14 @@ nock("https://www.w3.org")
.get("/Tools/respec/respec-w3c").replyWithFile(200,
path.join(modulesFolder, "respec", "builds", "respec-w3c.js"),
{ "Content-Type": "application/js" })
.get("/TR/idontexist/").reply(404, '');
.get("/TR/idontexist/").reply(404, '')
.get("/TR/ididnotchange/").reply(function() {
if (this.req.headers['if-modified-since'][0] === "Fri, 11 Feb 2022 00:00:42 GMT") {
return [304, ''];
} else {
return [200, 'Unexpected path'];
}
});

nock("https://drafts.csswg.org")
.persist()
@@ -117,4 +124,4 @@ nock.emitter.on('no match', function(req, options, requestBody) {
}
});

module.exports = nock;
module.exports = nock;
25 changes: 21 additions & 4 deletions src/lib/specs-crawler.js
@@ -29,6 +29,7 @@ const {
createFolderIfNeeded
} = require('./util');

const {version: reffyVersion} = require('../../package.json');

/**
* Return the spec if crawl succeeded or crawl result from given fallback list
@@ -78,10 +79,15 @@ async function crawlSpec(spec, crawlOptions) {
path.dirname(crawlOptions.fallback) : '';

if (spec.error) {
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
}

try {
const fallback = crawlOptions.fallbackData?.results?.find(s => s.url === spec.url);
let cacheInfo = {};
if (crawlOptions.fallbackData?.crawler === `reffy-${reffyVersion}`) {
cacheInfo = Object.assign({}, fallback?.crawlCacheInfo);
}
const result = await processSpecification(
spec.crawled,
(spec, modules) => {
@@ -97,8 +103,14 @@
},
[spec, crawlOptions.modules],
{ quiet: crawlOptions.quiet,
forceLocalFetch: crawlOptions.forceLocalFetch }
forceLocalFetch: crawlOptions.forceLocalFetch,
...cacheInfo}
);
if (result.status === "notmodified" && fallback) {
crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
const copy = Object.assign({}, fallback);
return expandSpecResult(copy, fallbackFolder);
}

// Specific rule for IDL extracts:
// parse the extracted WebIdl content
@@ -169,6 +181,9 @@

// Copy results back into initial spec object
spec.crawled = result.crawled;
if (result.crawlCacheInfo) {
spec.crawlCacheInfo = result.crawlCacheInfo;
}
crawlOptions.modules.forEach(mod => {
if (result[mod.property]) {
spec[mod.property] = result[mod.property];
@@ -183,7 +198,7 @@
spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
}

return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
}


@@ -351,7 +366,7 @@ async function crawlList(speclist, crawlOptions) {
// Load fallback data if necessary
if (crawlOptions.fallback) {
try {
crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback)).results;
crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback));
} catch (e) {
throw new Error(`Could not parse fallback data file ${crawlOptions.fallback}`);
}
@@ -469,12 +484,14 @@ async function saveResults(data, settings) {

// Save all results to an index.json file
const indexFilename = path.join(settings.output, 'index.json');

const contents = {
type: 'crawl',
title: 'Reffy crawl',
date: (new Date()).toJSON(),
options: settings,
stats: {},
crawler: `reffy-${reffyVersion}`,
results: data
};
contents.options.modules = contents.options.modules.map(mod => mod.property);
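In short, the specs-crawler.js changes only trust cache hints when the fallback crawl was produced by the same Reffy version (the new crawler field written into index.json), since extraction modules may change between releases; and when processSpecification() reports "notmodified", the stored fallback entry is expanded and returned instead of reprocessing the spec. A condensed sketch of that gating decision, with simplified names:

```js
// Condensed sketch of the cache-reuse gate in crawlSpec() (simplified names).
const {version: reffyVersion} = require('../../package.json');

function cacheInfoFor(spec, fallbackData) {
  // Only reuse cache hints from a crawl made by this exact Reffy version;
  // otherwise results must be recomputed even for unchanged specs.
  if (fallbackData?.crawler !== `reffy-${reffyVersion}`) {
    return {};
  }
  const previous = fallbackData.results?.find(s => s.url === spec.url);
  return Object.assign({}, previous?.crawlCacheInfo);  // {lastModified} or {etag}
}
```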
75 changes: 63 additions & 12 deletions src/lib/util.js
@@ -13,7 +13,6 @@ const specEquivalents = require('../specs/spec-equivalents.json');

const reffyModules = require('../browserlib/reffy.json');


/**
* Maximum depth difference supported between Reffy's install path and custom
* modules that may be provided on the command-line
@@ -22,7 +21,6 @@ const reffyModules = require('../browserlib/reffy.json');
*/
const maxPathDepth = 20;


/**
* Returns a range array from 0 to the number provided (not included)
*/
@@ -325,14 +323,16 @@ async function teardownBrowser() {
* flag tells the function that all network requests need to be only handled
* by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
* network and caching logic), which is useful to keep full control of network
* requests in tests.
* requests in tests. The "etag" and "lastModified" options give input
* to the conditional fetch request sent for the primary crawled URL
* @return {Promise} The promise to get the results of the processing function
*/
async function processSpecification(spec, processFunction, args, options) {
spec = (typeof spec === 'string') ? { url: spec } : spec;
processFunction = processFunction || function () {};
args = args || [];
options = options || {};
let reuseExistingData = false;

if (!browser) {
throw new Error('Browser instance not initialized, setupBrowser() must be called before processSpecification().');
@@ -410,16 +410,47 @@ async function processSpecification(spec, processFunction, args, options) {
return;
}

const response = await fetch(request.url, { signal: controller.signal });
// If we have a fallback data source
// with a defined cache target for the primary url of the spec
// we set a conditional request header
if (request.url === spec.url) {
// Use If-Modified-Since in preference as it is in practice
// more reliable for conditional requests
if (options.lastModified) {
request.headers["If-Modified-Since"] = options.lastModified;
} else if (options.etag) {
request.headers["If-None-Match"] = options.etag;
}
}

const response = await fetch(request.url, { signal: controller.signal, headers: request.headers });

// If we hit a 304, or
// If the response Last-Modified / ETag header match
// what is already known (the server is drunk)
// failing the request triggers the error path in
// page.goto()
if (response.status === 304 ||
(options.lastModified && response.headers['last-modified'] === options.lastModified) ||
(options.etag && response.headers.etag === options.etag)
) {
await cdp.send('Fetch.failRequest', {
requestId,
errorReason: "Failed"
});
reuseExistingData = true;
Reviewer comment: It's a bit sad that there's no easy way to pass that reason within the Fetch.failRequest message. errorReason: "BlockedByClient" could perhaps be the closest reason to what we're doing here. In any case, #858 removes that part, so that comment will quickly become moot.

return;
}
const body = await response.buffer();

await cdp.send('Fetch.fulfillRequest', {
requestId,
responseCode: response.status,
responseHeaders: Object.keys(response.headers.raw()).map(header => {
return {
name: header,
value: response.headers.raw()[header].join(',')
};
return {
name: header,
value: response.headers.raw()[header].join(',')
};
}),
body: body.toString('base64')
});
@@ -442,8 +473,11 @@
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
else {
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
try {
if (reuseExistingData) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
await cdp.send('Fetch.continueRequest', { requestId });
}
catch (err) {
Expand Down Expand Up @@ -497,13 +531,30 @@ async function processSpecification(spec, processFunction, args, options) {

// Load the page
// (note HTTP status is 0 when `file://` URLs are loaded)
let cacheInfo;
if (spec.html) {
await page.setContent(spec.html, loadOptions);
}
else {
const result = await page.goto(spec.url, loadOptions);
let result;
try {
result = await page.goto(spec.url, loadOptions);
} catch (err) {
if (reuseExistingData) {
return {status: "notmodified"};
}
throw new Error(`Loading ${spec.url} triggered network error ${err}`);
}
if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
}
const responseHeaders = result.headers();
// Use Last-Modified in preference as it is in practice
// more reliable for conditional requests
if (responseHeaders['last-modified']) {
cacheInfo = {lastModified: responseHeaders['last-modified']};
} else if (responseHeaders.etag) {
cacheInfo = {etag: responseHeaders.etag};
}
}

@@ -613,7 +664,7 @@

// Run the processFunction method in the browser context
const results = await page.evaluate(processFunction, ...args);

results.crawlCacheInfo = cacheInfo;
// Pending network requests may still be in the queue, flag the page
// as closed not to send commands on a CDP session that's no longer
// attached to anything
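To recap the util.js flow: the conditional header is only attached to the spec's primary URL, and a 304 (or an unchanged Last-Modified / ETag on a 200) deliberately fails the intercepted request so that page.goto() throws and processSpecification() can return {status: "notmodified"}. The sketch below condenses that interception pattern; it is consistent with the diff above but simplified, and assumes a node-fetch-compatible fetch such as the wrapper in src/lib/fetch.js.

```js
// Condensed sketch of the CDP interception used above (simplified, not the
// verbatim implementation).
const fetch = require('node-fetch');   // stand-in for src/lib/fetch.js

async function interceptWithRevalidation(cdp, spec, options) {
  await cdp.send('Fetch.enable', { patterns: [{ requestStage: 'Request' }] });
  cdp.on('Fetch.requestPaused', async ({ requestId, request }) => {
    const headers = Object.assign({}, request.headers);
    if (request.url === spec.url && options.lastModified) {
      // Replay the cache info recorded by a previous crawl
      headers['If-Modified-Since'] = options.lastModified;
    }
    const response = await fetch(request.url, { headers });
    if (response.status === 304 ||
        (options.lastModified &&
         response.headers.get('last-modified') === options.lastModified)) {
      // Nothing changed: fail the request so page.goto() errors out and the
      // caller can return {status: "notmodified"}. There is no dedicated
      // errorReason for this; "BlockedByClient" would be the closest match.
      await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
      return;
    }
    const body = await response.buffer();
    await cdp.send('Fetch.fulfillRequest', {
      requestId,
      responseCode: response.status,
      responseHeaders: Object.keys(response.headers.raw()).map(name => ({
        name,
        value: response.headers.raw()[name].join(',')
      })),
      body: body.toString('base64')
    });
  });
}
```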
51 changes: 51 additions & 0 deletions tests/crawl-cache.json
@@ -0,0 +1,51 @@
{
"type": "crawl",
"title": "Reffy crawl",
"date": "2022-02-02T10:45:00.000Z",
"options": {
"output": "tests",
"specs": [
"httpswwww3orgTRididnotchange"
],
"modules": [
"title",
"generator",
"date",
"links",
"refs",
"idl",
"css",
"dfns",
"elements",
"headings",
"ids"
]
},
"stats": {
"crawled": 1,
"errors": 0
},
"results": [
{
"url": "https://www.w3.org/TR/ididnotchange/",
"nightly": {
"url": "https://www.w3.org/TR/ididnotchange/"
},
"shortname": "ididnotchange",
"series": {
"shortname": "ididnotchange"
},
"versions": [
"https://www.w3.org/TR/ididnotchange/"
],
"crawled": {
"url": "https://www.w3.org/TR/ididnotchange/"
},
"crawlCacheInfo": {
"lastModified": "Fri, 11 Feb 2022 00:00:42 GMT"
},
"title": "Change is the only constant",
"refs": "extracts/crawl-refs.json"
}
]
}
30 changes: 29 additions & 1 deletion tests/crawl.js
@@ -4,6 +4,8 @@ const fs = require("fs");
const path = require("path");
const os = require("os");

const {version: reffyVersion} = require('../package.json');

const specs = [
{url: "https://www.w3.org/TR/WOFF2/", nightly: {url: "https://w3c.github.io/woff/woff2/", pages:["https://w3c.github.io/woff/woff2/page.html"]}},
{url: "https://www.w3.org/TR/audio-output/", nightly: {url: "https://w3c.github.io/mediacapture-output/"}},
@@ -17,6 +19,16 @@ async function crawl() {
return results;
}

async function runWithAnnotatedCrawlData(path, fn) {
const rawCrawlData = fs.readFileSync(path);
let crawlData = JSON.parse(rawCrawlData);
crawlData.crawler = `reffy-${reffyVersion}`;
fs.writeFileSync(path, JSON.stringify(crawlData));
const res = await fn();
fs.writeFileSync(path, rawCrawlData);
return res;
}

if (global.describe && describe instanceof Function) {
const { assert } = require('chai');

@@ -86,6 +98,21 @@
assert.equal(results.results[0].title, 'A test spec');
});


it("skips processing and reuse fallback data when spec cache info indicates it has not changed", async () => {
const url = "https://www.w3.org/TR/ididnotchange/";
const fallback = path.resolve(__dirname, 'crawl-cache.json');
const results = await runWithAnnotatedCrawlData(fallback, async () => crawlList(
[{ url, nightly: { url } }],
{
forceLocalFetch: true,
fallback
}));
assert.equal(results[0].title, "Change is the only constant");
assert.isUndefined(results[0].error);
assert.equal(results[0].refs, "A useful list of refs");
})

it("reports HTTP error statuses", async () => {
const url = "https://www.w3.org/TR/idontexist/";
const results = await crawlList(
@@ -97,11 +124,12 @@

it("reports errors and returns fallback data when possible", async () => {
const url = "https://www.w3.org/TR/idontexist/";
const fallback = path.resolve(__dirname, 'crawl-fallback.json');
const results = await crawlList(
[{ url, nightly: { url } }],
{
forceLocalFetch: true,
fallback: path.resolve(__dirname, 'crawl-fallback.json')
fallback
});
assert.equal(results[0].title, "On the Internet, nobody knows you don't exist");
assert.include(results[0].error, "Loading https://www.w3.org/TR/idontexist/ triggered HTTP status 404");