Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Record and use HTTP cache information to skip unnecessary crawling and processing #856

Merged
merged 19 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/lib/fetch.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
const os = require('os');
const path = require('path');
const baseFetch = require('fetch-filecache-for-crawling');
const baseBaseFetch = require('node-fetch');

// Read configuration parameters from `config.json` file
let config = null;
Expand All @@ -33,7 +34,7 @@ catch (err) {
* @return {Promise(Response)} Promise to get an HTTP response
*/
async function fetch(url, options) {
options = Object.assign({}, options);
options = Object.assign({headers: {}}, options);
['cacheFolder', 'resetCache', 'cacheRefresh', 'logToConsole'].forEach(param => {
let fetchParam = (param === 'cacheRefresh') ? 'refresh' : param;
if (config[param] && !options.hasOwnProperty(fetchParam)) {
Expand All @@ -43,6 +44,10 @@ async function fetch(url, options) {
if (!options.refresh) {
options.refresh = 'once';
}
// If conditional headers are sent in, ignore any local caching
if (options.headers['If-Modified-Since'] || options.headers['If-None-Match']) {
return baseBaseFetch(url, options);
}

// Use cache folder in tmp folder by default
if (!options.cacheFolder) {
Expand All @@ -53,4 +58,4 @@ async function fetch(url, options) {
}


module.exports = fetch;
module.exports = fetch;
16 changes: 14 additions & 2 deletions src/lib/specs-crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ async function specOrFallback(spec, fallbackFolder, fallbackData) {
if (fallback) {
const copy = Object.assign({}, fallback);
const result = await expandSpecResult(copy, fallbackFolder);
result.error = spec.error;
if (!spec.ignoreError) {
result.error = spec.error;
}
return result;
}
}
Expand Down Expand Up @@ -82,6 +84,9 @@ async function crawlSpec(spec, crawlOptions) {
}

try {
const fallback = (crawlOptions.fallbackData || []).find(s => s.url === spec.url);
dontcallmedom marked this conversation as resolved.
Show resolved Hide resolved
const etag = fallback?.crawlCacheInfo?.etag;
const lastModified = fallback?.crawlCacheInfo?.lastModified;
const result = await processSpecification(
spec.crawled,
(spec, modules) => {
Expand All @@ -97,7 +102,8 @@ async function crawlSpec(spec, crawlOptions) {
},
[spec, crawlOptions.modules],
{ quiet: crawlOptions.quiet,
forceLocalFetch: crawlOptions.forceLocalFetch }
forceLocalFetch: crawlOptions.forceLocalFetch,
etag, lastModified}
);

// Specific rule for IDL extracts:
Expand Down Expand Up @@ -169,6 +175,7 @@ async function crawlSpec(spec, crawlOptions) {

// Copy results back into initial spec object
spec.crawled = result.crawled;
spec.crawlCacheInfo = result.crawlCacheInfo;
crawlOptions.modules.forEach(mod => {
if (result[mod.property]) {
spec[mod.property] = result[mod.property];
Expand All @@ -180,6 +187,10 @@ async function crawlSpec(spec, crawlOptions) {
}
catch (err) {
spec.title = spec.title || '[Could not be determined, see error]';
if (err.name === "ReuseExistingData") {
crawlOptions.quiet ?? console.warn(`${crawlOptions.logCounter} - ${spec.url} - skipping, no change`);
spec.ignoreError = true;
}
spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
}

Expand Down Expand Up @@ -392,6 +403,7 @@ async function crawlList(speclist, crawlOptions) {
const spec = specAndPromise.spec;
const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - crawling`);
crawlOptions.logCounter = logCounter;
dontcallmedom marked this conversation as resolved.
Show resolved Hide resolved
let result = await crawlSpec(spec, crawlOptions);
result = await saveSpecResults(result, crawlOptions);
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - done`);
Expand Down
65 changes: 55 additions & 10 deletions src/lib/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ const specEquivalents = require('../specs/spec-equivalents.json');

const reffyModules = require('../browserlib/reffy.json');


/**
* Maximum depth difference supported between Reffy's install path and custom
* modules that may be provided on the command-line
Expand All @@ -22,6 +21,13 @@ const reffyModules = require('../browserlib/reffy.json');
*/
const maxPathDepth = 20;

/**
 * Signal-style error thrown when a conditional fetch reports that the
 * crawled resource has not changed (HTTP 304), meaning previously saved
 * crawl data can be reused instead of re-processing the specification.
 *
 * Callers distinguish this signal from genuine crawl failures by testing
 * `err.name === "ReuseExistingData"` in their catch handlers.
 */
class ReuseExistingData extends Error {
  // Public class field: equivalent to assigning `this.name` in an explicit
  // constructor; the implicit derived constructor forwards the message to
  // `Error` unchanged.
  name = "ReuseExistingData";
}


/**
* Returns a range array from 0 to the number provided (not included)
Expand Down Expand Up @@ -325,14 +331,16 @@ async function teardownBrowser() {
* flag tells the function that all network requests need to be only handled
* by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
* network and caching logic), which is useful to keep full control of network
* requests in tests.
* requests in tests. The "etag" and "lastModified" options give input
* to the conditional fetch request sent for the primary crawled URL
* @return {Promise} The promise to get the results of the processing function
*/
async function processSpecification(spec, processFunction, args, options) {
spec = (typeof spec === 'string') ? { url: spec } : spec;
processFunction = processFunction || function () {};
args = args || [];
options = options || {};
let reuseExistingData = false;

if (!browser) {
throw new Error('Browser instance not initialized, setupBrowser() must be called before processSpecification().');
Expand Down Expand Up @@ -410,16 +418,37 @@ async function processSpecification(spec, processFunction, args, options) {
return;
}

const response = await fetch(request.url, { signal: controller.signal });
// If we have a fallback data source
// with a defined cache target for the said url,
// we set a conditional request header
if (options.etag) {
request.headers["If-None-Match"] = options.etag;
} else if (options.lastModified) {
request.headers["If-Modified-Since"] = options.lastModified;
}

const response = await fetch(request.url, { signal: controller.signal, headers: request.headers });

// If we hit a 304, skip any fetching and processing
// failing the request triggers the error path in
// page.goto()
if (response.status === 304) {
await cdp.send('Fetch.failRequest', {
requestId,
errorReason: "Failed"
});
reuseExistingData = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit sad that there's no easy way to pass that reason within the Fetch.failRequest message. errorReason: "BlockedByClient" could perhaps be the closest reason to what we're doing here.

In any case, #858 removes that part, so that comment will quickly become moot.

return;
}
const body = await response.buffer();
await cdp.send('Fetch.fulfillRequest', {
requestId,
responseCode: response.status,
responseHeaders: Object.keys(response.headers.raw()).map(header => {
return {
name: header,
value: response.headers.raw()[header].join(',')
};
return {
name: header,
value: response.headers.raw()[header].join(',')
};
}),
body: body.toString('base64')
});
Expand All @@ -442,8 +471,11 @@ async function processSpecification(spec, processFunction, args, options) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
else {
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
try {
if (reuseExistingData) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
await cdp.send('Fetch.continueRequest', { requestId });
}
catch (err) {
Expand Down Expand Up @@ -497,14 +529,27 @@ async function processSpecification(spec, processFunction, args, options) {

// Load the page
// (note HTTP status is 0 when `file://` URLs are loaded)
let cacheInfo;
if (spec.html) {
await page.setContent(spec.html, loadOptions);
}
else {
try {
const result = await page.goto(spec.url, loadOptions);
if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
}
const responseHeaders = result.headers();
if (responseHeaders.etag) {
cacheInfo = {etag: responseHeaders.etag};
} else if (responseHeaders['last-modified']) {
cacheInfo = {lastModified: responseHeaders['last-modified']};
}
} catch (err) {
if (reuseExistingData) {
throw new ReuseExistingData(`${spec.url} hasn't changed`);
dontcallmedom marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

// Handle multi-page specs
Expand Down Expand Up @@ -613,7 +658,7 @@ async function processSpecification(spec, processFunction, args, options) {

// Run the processFunction method in the browser context
const results = await page.evaluate(processFunction, ...args);

results.crawlCacheInfo = cacheInfo;
// Pending network requests may still be in the queue, flag the page
// as closed not to send commands on a CDP session that's no longer
// attached to anything
Expand Down