Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Record and use HTTP cache information to skip unnecessary crawling and processing #856

Merged
merged 19 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/lib/fetch.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
const os = require('os');
const path = require('path');
const baseFetch = require('fetch-filecache-for-crawling');
const baseBaseFetch = require('node-fetch');

// Read configuration parameters from `config.json` file
let config = null;
Expand All @@ -33,7 +34,7 @@ catch (err) {
* @return {Promise(Response)} Promise to get an HTTP response
*/
async function fetch(url, options) {
options = Object.assign({}, options);
options = Object.assign({headers: {}}, options);
['cacheFolder', 'resetCache', 'cacheRefresh', 'logToConsole'].forEach(param => {
let fetchParam = (param === 'cacheRefresh') ? 'refresh' : param;
if (config[param] && !options.hasOwnProperty(fetchParam)) {
Expand All @@ -43,6 +44,10 @@ async function fetch(url, options) {
if (!options.refresh) {
options.refresh = 'once';
}
// If conditional headers are sent in, ignore any local caching
if (options.headers['If-Modified-Since'] || options.headers['If-None-Match']) {
return baseBaseFetch(url, options);
}

// Use cache folder in tmp folder by default
if (!options.cacheFolder) {
Expand All @@ -53,4 +58,4 @@ async function fetch(url, options) {
}


module.exports = fetch;
module.exports = fetch;
16 changes: 14 additions & 2 deletions src/lib/specs-crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,9 @@ async function specOrFallback(spec, fallbackFolder, fallbackData) {
if (fallback) {
const copy = Object.assign({}, fallback);
const result = await expandSpecResult(copy, fallbackFolder);
result.error = spec.error;
if (!spec.ignoreError) {
result.error = spec.error;
}
return result;
}
}
Expand Down Expand Up @@ -82,6 +84,9 @@ async function crawlSpec(spec, crawlOptions) {
}

try {
const fallback = (crawlOptions.fallbackData || []).find(s => s.url === spec.url);
dontcallmedom marked this conversation as resolved.
Show resolved Hide resolved
const etag = fallback?.crawlCacheInfo?.etag;
const lastModified = fallback?.crawlCacheInfo?.lastModified;
const result = await processSpecification(
spec.crawled,
(spec, modules) => {
Expand All @@ -97,7 +102,8 @@ async function crawlSpec(spec, crawlOptions) {
},
[spec, crawlOptions.modules],
{ quiet: crawlOptions.quiet,
forceLocalFetch: crawlOptions.forceLocalFetch }
forceLocalFetch: crawlOptions.forceLocalFetch,
etag, lastModified}
);

// Specific rule for IDL extracts:
Expand Down Expand Up @@ -169,6 +175,7 @@ async function crawlSpec(spec, crawlOptions) {

// Copy results back into initial spec object
spec.crawled = result.crawled;
spec.crawlCacheInfo = result.crawlCacheInfo;
crawlOptions.modules.forEach(mod => {
if (result[mod.property]) {
spec[mod.property] = result[mod.property];
Expand All @@ -180,6 +187,10 @@ async function crawlSpec(spec, crawlOptions) {
}
catch (err) {
spec.title = spec.title || '[Could not be determined, see error]';
if (err.name === "ReuseExistingData") {
crawlOptions.quiet ?? console.warn(`${crawlOptions.logCounter} - ${spec.url} - skipping, no change`);
spec.ignoreError = true;
}
spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
}

Expand Down Expand Up @@ -392,6 +403,7 @@ async function crawlList(speclist, crawlOptions) {
const spec = specAndPromise.spec;
const logCounter = ('' + (idx + 1)).padStart(nbStr.length, ' ') + '/' + nbStr;
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - crawling`);
crawlOptions.logCounter = logCounter;
dontcallmedom marked this conversation as resolved.
Show resolved Hide resolved
let result = await crawlSpec(spec, crawlOptions);
result = await saveSpecResults(result, crawlOptions);
crawlOptions.quiet ?? console.warn(`${logCounter} - ${spec.url} - done`);
Expand Down
65 changes: 55 additions & 10 deletions src/lib/util.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ const specEquivalents = require('../specs/spec-equivalents.json');

const reffyModules = require('../browserlib/reffy.json');


/**
* Maximum depth difference supported between Reffy's install path and custom
* modules that may be provided on the command-line
Expand All @@ -22,6 +21,13 @@ const reffyModules = require('../browserlib/reffy.json');
*/
const maxPathDepth = 20;

/**
 * Signal-style error thrown when a conditional fetch reports that the
 * crawled resource has not changed (HTTP 304), meaning previously saved
 * crawl data can be reused instead of re-processing the specification.
 *
 * Callers distinguish this signal from genuine crawl failures by testing
 * `err.name === "ReuseExistingData"` in their catch handlers.
 */
class ReuseExistingData extends Error {
  // Public class field: equivalent to assigning `this.name` in an explicit
  // constructor; the implicit derived constructor forwards the message to
  // `Error` unchanged.
  name = "ReuseExistingData";
}


/**
* Returns a range array from 0 to the number provided (not included)
Expand Down Expand Up @@ -325,14 +331,16 @@ async function teardownBrowser() {
* flag tells the function that all network requests need to be only handled
* by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
* network and caching logic), which is useful to keep full control of network
* requests in tests.
* requests in tests. The "etag" and "lastModified" options give input
* to the conditional fetch request sent for the primary crawled URL
* @return {Promise} The promise to get the results of the processing function
*/
async function processSpecification(spec, processFunction, args, options) {
spec = (typeof spec === 'string') ? { url: spec } : spec;
processFunction = processFunction || function () {};
args = args || [];
options = options || {};
let reuseExistingData = false;

if (!browser) {
throw new Error('Browser instance not initialized, setupBrowser() must be called before processSpecification().');
Expand Down Expand Up @@ -410,16 +418,37 @@ async function processSpecification(spec, processFunction, args, options) {
return;
}

const response = await fetch(request.url, { signal: controller.signal });
// If we have a fallback data source
// with a defined cache target for the said url,
// we set a conditional request header
if (options.etag) {
request.headers["If-None-Match"] = options.etag;
} else if (options.lastModified) {
request.headers["If-Modified-Since"] = options.lastModified;
}

const response = await fetch(request.url, { signal: controller.signal, headers: request.headers });

// If we hit a 304, skip any fetching and processing
// failing the request triggers the error path in
// page.goto()
if (response.status === 304) {
await cdp.send('Fetch.failRequest', {
requestId,
errorReason: "Failed"
});
reuseExistingData = true;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a bit sad that there's no easy way to pass that reason within the Fetch.failRequest message. errorReason: "BlockedByClient" could perhaps be the closest reason to what we're doing here.

In any case, #858 removes that part, so that comment will quickly become moot.

return;
}
const body = await response.buffer();
await cdp.send('Fetch.fulfillRequest', {
requestId,
responseCode: response.status,
responseHeaders: Object.keys(response.headers.raw()).map(header => {
return {
name: header,
value: response.headers.raw()[header].join(',')
};
return {
name: header,
value: response.headers.raw()[header].join(',')
};
}),
body: body.toString('base64')
});
Expand All @@ -442,8 +471,11 @@ async function processSpecification(spec, processFunction, args, options) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
else {
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
try {
if (reuseExistingData) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
await cdp.send('Fetch.continueRequest', { requestId });
}
catch (err) {
Expand Down Expand Up @@ -497,14 +529,27 @@ async function processSpecification(spec, processFunction, args, options) {

// Load the page
// (note HTTP status is 0 when `file://` URLs are loaded)
let cacheInfo;
if (spec.html) {
await page.setContent(spec.html, loadOptions);
}
else {
try {
const result = await page.goto(spec.url, loadOptions);
if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
}
const responseHeaders = result.headers();
if (responseHeaders.etag) {
cacheInfo = {etag: responseHeaders.etag};
} else if (responseHeaders['last-modified']) {
cacheInfo = {lastModified: responseHeaders['last-modified']};
}
} catch (err) {
if (reuseExistingData) {
throw new ReuseExistingData(`${spec.url} hasn't changed`);
dontcallmedom marked this conversation as resolved.
Show resolved Hide resolved
}
}
}

// Handle multi-page specs
Expand Down Expand Up @@ -613,7 +658,7 @@ async function processSpecification(spec, processFunction, args, options) {

// Run the processFunction method in the browser context
const results = await page.evaluate(processFunction, ...args);

results.crawlCacheInfo = cacheInfo;
// Pending network requests may still be in the queue, flag the page
// as closed not to send commands on a CDP session that's no longer
// attached to anything
Expand Down