
Record and use HTTP cache information to skip unnecessary crawling and processing #856

Merged on Feb 7, 2022 (19 commits, changes shown from all commits).
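For context, the change builds on standard HTTP revalidation: a crawl records the Last-Modified (or ETag) response header for each spec, and the next crawl replays that value as If-Modified-Since (or If-None-Match). A 304 Not Modified response means the spec can be skipped and the previous crawl result reused. Below is a minimal sketch of that idea, assuming node-fetch and an illustrative helper name; it is not Reffy's actual code, which (as the diffs show) wires the same logic into Puppeteer request interception instead.

```js
// Minimal sketch of HTTP revalidation (illustrative helper, not Reffy code).
const fetch = require('node-fetch');

async function specHasChanged(url, lastModified) {
  const response = await fetch(url, {
    headers: lastModified ? { 'If-Modified-Since': lastModified } : {}
  });
  if (response.status === 304) {
    return false;   // server confirms the previously crawled copy is current
  }
  // Some servers ignore conditional headers and return 200 anyway;
  // comparing Last-Modified directly catches that case.
  return response.headers.get('last-modified') !== lastModified;
}
```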
2,258 changes: 2,163 additions & 95 deletions package-lock.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion package.json
@@ -34,7 +34,7 @@
"abortcontroller-polyfill": "1.7.3",
"browser-specs": "2.27.0",
"commander": "9.0.0",
"fetch-filecache-for-crawling": "4.0.2",
"fetch-filecache-for-crawling": "4.1.0",
"puppeteer": "13.1.3",
"semver": "^7.3.5",
"webidl2": "24.2.0"
4 changes: 2 additions & 2 deletions src/lib/fetch.js
@@ -33,7 +33,7 @@ catch (err) {
* @return {Promise(Response)} Promise to get an HTTP response
*/
async function fetch(url, options) {
options = Object.assign({}, options);
options = Object.assign({headers: {}}, options);
['cacheFolder', 'resetCache', 'cacheRefresh', 'logToConsole'].forEach(param => {
let fetchParam = (param === 'cacheRefresh') ? 'refresh' : param;
if (config[param] && !options.hasOwnProperty(fetchParam)) {
@@ -53,4 +53,4 @@ async function fetch(url, options) {
}


module.exports = fetch;
module.exports = fetch;
11 changes: 9 additions & 2 deletions src/lib/nock-server.js
@@ -98,7 +98,14 @@ nock("https://www.w3.org")
.get("/Tools/respec/respec-w3c").replyWithFile(200,
path.join(modulesFolder, "respec", "builds", "respec-w3c.js"),
{ "Content-Type": "application/js" })
.get("/TR/idontexist/").reply(404, '');
.get("/TR/idontexist/").reply(404, '')
.get("/TR/ididnotchange/").reply(function() {
if (this.req.headers['if-modified-since'][0] === "Fri, 11 Feb 2022 00:00:42 GMT") {
return [304, ''];
} else {
return [200, 'Unexpected path'];
}
});

nock("https://drafts.csswg.org")
.persist()
@@ -117,4 +124,4 @@ nock.emitter.on('no match', function(req, options, requestBody) {
}
});

module.exports = nock;
module.exports = nock;
25 changes: 21 additions & 4 deletions src/lib/specs-crawler.js
@@ -29,6 +29,7 @@ const {
createFolderIfNeeded
} = require('./util');

const {version: reffyVersion} = require('../../package.json');

/**
* Return the spec if crawl succeeded or crawl result from given fallback list
@@ -78,10 +79,15 @@ async function crawlSpec(spec, crawlOptions) {
path.dirname(crawlOptions.fallback) : '';

if (spec.error) {
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
}

try {
const fallback = crawlOptions.fallbackData?.results?.find(s => s.url === spec.url);
let cacheInfo = {};
if (crawlOptions.fallbackData?.crawler === `reffy-${reffyVersion}`) {
cacheInfo = Object.assign({}, fallback?.crawlCacheInfo);
}
const result = await processSpecification(
spec.crawled,
(spec, modules) => {
@@ -97,8 +103,14 @@
},
[spec, crawlOptions.modules],
{ quiet: crawlOptions.quiet,
forceLocalFetch: crawlOptions.forceLocalFetch }
forceLocalFetch: crawlOptions.forceLocalFetch,
...cacheInfo}
);
if (result.status === "notmodified" && fallback) {
crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
const copy = Object.assign({}, fallback);
return expandSpecResult(copy, fallbackFolder);
}

// Specific rule for IDL extracts:
// parse the extracted WebIdl content
@@ -169,6 +181,9 @@

// Copy results back into initial spec object
spec.crawled = result.crawled;
if (result.crawlCacheInfo) {
spec.crawlCacheInfo = result.crawlCacheInfo;
}
crawlOptions.modules.forEach(mod => {
if (result[mod.property]) {
spec[mod.property] = result[mod.property];
@@ -183,7 +198,7 @@
spec.error = err.toString() + (err.stack ? ' ' + err.stack : '');
}

return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData);
return specOrFallback(spec, fallbackFolder, crawlOptions.fallbackData?.results);
}


@@ -351,7 +366,7 @@ async function crawlList(speclist, crawlOptions) {
// Load fallback data if necessary
if (crawlOptions.fallback) {
try {
crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback)).results;
crawlOptions.fallbackData = JSON.parse(await fs.promises.readFile(crawlOptions.fallback));
} catch (e) {
throw new Error(`Could not parse fallback data file ${crawlOptions.fallback}`);
}
@@ -469,12 +484,14 @@ async function saveResults(data, settings) {

// Save all results to an index.json file
const indexFilename = path.join(settings.output, 'index.json');

const contents = {
type: 'crawl',
title: 'Reffy crawl',
date: (new Date()).toJSON(),
options: settings,
stats: {},
crawler: `reffy-${reffyVersion}`,
results: data
};
contents.options.modules = contents.options.modules.map(mod => mod.property);
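In short, the specs-crawler.js changes only trust cache hints when the fallback crawl was produced by the same Reffy version (the new crawler field written into index.json), since extraction modules may change between releases; and when processSpecification() reports "notmodified", the stored fallback entry is expanded and returned instead of reprocessing the spec. A condensed sketch of that gating decision, with simplified names:

```js
// Condensed sketch of the cache-reuse gate in crawlSpec() (simplified names).
const {version: reffyVersion} = require('../../package.json');

function cacheInfoFor(spec, fallbackData) {
  // Only reuse cache hints from a crawl made by this exact Reffy version;
  // otherwise results must be recomputed even for unchanged specs.
  if (fallbackData?.crawler !== `reffy-${reffyVersion}`) {
    return {};
  }
  const previous = fallbackData.results?.find(s => s.url === spec.url);
  return Object.assign({}, previous?.crawlCacheInfo);  // {lastModified} or {etag}
}
```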
75 changes: 63 additions & 12 deletions src/lib/util.js
@@ -13,7 +13,6 @@ const specEquivalents = require('../specs/spec-equivalents.json');

const reffyModules = require('../browserlib/reffy.json');


/**
* Maximum depth difference supported between Reffy's install path and custom
* modules that may be provided on the command-line
@@ -22,7 +21,6 @@ const reffyModules = require('../browserlib/reffy.json');
*/
const maxPathDepth = 20;


/**
* Returns a range array from 0 to the number provided (not included)
*/
@@ -325,14 +323,16 @@ async function teardownBrowser() {
* flag tells the function that all network requests need to be only handled
* by Node.js's "fetch" function (as opposed to falling back to Puppeteer's
* network and caching logic), which is useful to keep full control of network
* requests in tests.
* requests in tests. The "etag" and "lastModified" options give input
* to the conditional fetch request sent for the primary crawled URL
* @return {Promise} The promise to get the results of the processing function
*/
async function processSpecification(spec, processFunction, args, options) {
spec = (typeof spec === 'string') ? { url: spec } : spec;
processFunction = processFunction || function () {};
args = args || [];
options = options || {};
let reuseExistingData = false;

if (!browser) {
throw new Error('Browser instance not initialized, setupBrowser() must be called before processSpecification().');
@@ -410,16 +410,47 @@ async function processSpecification(spec, processFunction, args, options) {
return;
}

const response = await fetch(request.url, { signal: controller.signal });
// If we have a fallback data source
// with a defined cache target for the primary url of the spec
// we set a conditional request header
if (request.url === spec.url) {
// Use If-Modified-Since in preference as it is in practice
// more reliable for conditional requests
if (options.lastModified) {
request.headers["If-Modified-Since"] = options.lastModified;
} else if (options.etag) {
request.headers["If-None-Match"] = options.etag;
}
}

const response = await fetch(request.url, { signal: controller.signal, headers: request.headers });

// If we hit a 304, or
// If the response Last-Modified / ETag header match
// what is already known (the server is drunk)
// failing the request triggers the error path in
// page.goto()
if (response.status === 304 ||
(options.lastModified && response.headers['last-modified'] === options.lastModified) ||
(options.etag && response.headers.etag === options.etag)
) {
await cdp.send('Fetch.failRequest', {
requestId,
errorReason: "Failed"
});
reuseExistingData = true;
Reviewer comment: It's a bit sad that there's no easy way to pass that reason within the Fetch.failRequest message. errorReason: "BlockedByClient" could perhaps be the closest reason to what we're doing here. In any case, #858 removes that part, so that comment will quickly become moot.

return;
}
const body = await response.buffer();

await cdp.send('Fetch.fulfillRequest', {
requestId,
responseCode: response.status,
responseHeaders: Object.keys(response.headers.raw()).map(header => {
return {
name: header,
value: response.headers.raw()[header].join(',')
};
return {
name: header,
value: response.headers.raw()[header].join(',')
};
}),
body: body.toString('base64')
});
@@ -442,8 +473,11 @@
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
else {
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
try {
if (reuseExistingData) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
}
options.quiet ?? console.warn(`[warn] Fall back to regular network request for ${request.url}`, err);
await cdp.send('Fetch.continueRequest', { requestId });
}
catch (err) {
Expand Down Expand Up @@ -497,13 +531,30 @@ async function processSpecification(spec, processFunction, args, options) {

// Load the page
// (note HTTP status is 0 when `file://` URLs are loaded)
let cacheInfo;
if (spec.html) {
await page.setContent(spec.html, loadOptions);
}
else {
const result = await page.goto(spec.url, loadOptions);
let result;
try {
result = await page.goto(spec.url, loadOptions);
} catch (err) {
if (reuseExistingData) {
return {status: "notmodified"};
}
throw new Error(`Loading ${spec.url} triggered network error ${err}`);
}
if ((result.status() !== 200) && (!spec.url.startsWith('file://') || (result.status() !== 0))) {
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
throw new Error(`Loading ${spec.url} triggered HTTP status ${result.status()}`);
}
const responseHeaders = result.headers();
// Use Last-Modified in preference as it is in practice
// more reliable for conditional requests
if (responseHeaders['last-modified']) {
cacheInfo = {lastModified: responseHeaders['last-modified']};
} else if (responseHeaders.etag) {
cacheInfo = {etag: responseHeaders.etag};
}
}

@@ -613,7 +664,7 @@

// Run the processFunction method in the browser context
const results = await page.evaluate(processFunction, ...args);

results.crawlCacheInfo = cacheInfo;
// Pending network requests may still be in the queue, flag the page
// as closed not to send commands on a CDP session that's no longer
// attached to anything
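To recap the util.js flow: the conditional header is only attached to the spec's primary URL, and a 304 (or an unchanged Last-Modified / ETag on a 200) deliberately fails the intercepted request so that page.goto() throws and processSpecification() can return {status: "notmodified"}. The sketch below condenses that interception pattern; it is consistent with the diff above but simplified, and assumes a node-fetch-compatible fetch such as the wrapper in src/lib/fetch.js.

```js
// Condensed sketch of the CDP interception used above (simplified, not the
// verbatim implementation).
const fetch = require('node-fetch');   // stand-in for src/lib/fetch.js

async function interceptWithRevalidation(cdp, spec, options) {
  await cdp.send('Fetch.enable', { patterns: [{ requestStage: 'Request' }] });
  cdp.on('Fetch.requestPaused', async ({ requestId, request }) => {
    const headers = Object.assign({}, request.headers);
    if (request.url === spec.url && options.lastModified) {
      // Replay the cache info recorded by a previous crawl
      headers['If-Modified-Since'] = options.lastModified;
    }
    const response = await fetch(request.url, { headers });
    if (response.status === 304 ||
        (options.lastModified &&
         response.headers.get('last-modified') === options.lastModified)) {
      // Nothing changed: fail the request so page.goto() errors out and the
      // caller can return {status: "notmodified"}. There is no dedicated
      // errorReason for this; "BlockedByClient" would be the closest match.
      await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
      return;
    }
    const body = await response.buffer();
    await cdp.send('Fetch.fulfillRequest', {
      requestId,
      responseCode: response.status,
      responseHeaders: Object.keys(response.headers.raw()).map(name => ({
        name,
        value: response.headers.raw()[name].join(',')
      })),
      body: body.toString('base64')
    });
  });
}
```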
51 changes: 51 additions & 0 deletions tests/crawl-cache.json
@@ -0,0 +1,51 @@
{
"type": "crawl",
"title": "Reffy crawl",
"date": "2022-02-02T10:45:00.000Z",
"options": {
"output": "tests",
"specs": [
"httpswwww3orgTRididnotchange"
],
"modules": [
"title",
"generator",
"date",
"links",
"refs",
"idl",
"css",
"dfns",
"elements",
"headings",
"ids"
]
},
"stats": {
"crawled": 1,
"errors": 0
},
"results": [
{
"url": "https://www.w3.org/TR/ididnotchange/",
"nightly": {
"url": "https://www.w3.org/TR/ididnotchange/"
},
"shortname": "ididnotchange",
"series": {
"shortname": "ididnotchange"
},
"versions": [
"https://www.w3.org/TR/ididnotchange/"
],
"crawled": {
"url": "https://www.w3.org/TR/ididnotchange/"
},
"crawlCacheInfo": {
"lastModified": "Fri, 11 Feb 2022 00:00:42 GMT"
},
"title": "Change is the only constant",
"refs": "extracts/crawl-refs.json"
}
]
}
30 changes: 29 additions & 1 deletion tests/crawl.js
@@ -4,6 +4,8 @@ const fs = require("fs");
const path = require("path");
const os = require("os");

const {version: reffyVersion} = require('../package.json');

const specs = [
{url: "https://www.w3.org/TR/WOFF2/", nightly: {url: "https://w3c.github.io/woff/woff2/", pages:["https://w3c.github.io/woff/woff2/page.html"]}},
{url: "https://www.w3.org/TR/audio-output/", nightly: {url: "https://w3c.github.io/mediacapture-output/"}},
@@ -17,6 +19,16 @@ async function crawl() {
return results;
}

async function runWithAnnotatedCrawlData(path, fn) {
const rawCrawlData = fs.readFileSync(path);
let crawlData = JSON.parse(rawCrawlData);
crawlData.crawler = `reffy-${reffyVersion}`;
fs.writeFileSync(path, JSON.stringify(crawlData));
const res = await fn();
fs.writeFileSync(path, rawCrawlData);
return res;
}

if (global.describe && describe instanceof Function) {
const { assert } = require('chai');

@@ -86,6 +98,21 @@
assert.equal(results.results[0].title, 'A test spec');
});


it("skips processing and reuse fallback data when spec cache info indicates it has not changed", async () => {
const url = "https://www.w3.org/TR/ididnotchange/";
const fallback = path.resolve(__dirname, 'crawl-cache.json');
const results = await runWithAnnotatedCrawlData(fallback, async () => crawlList(
[{ url, nightly: { url } }],
{
forceLocalFetch: true,
fallback
}));
assert.equal(results[0].title, "Change is the only constant");
assert.isUndefined(results[0].error);
assert.equal(results[0].refs, "A useful list of refs");
})

it("reports HTTP error statuses", async () => {
const url = "https://www.w3.org/TR/idontexist/";
const results = await crawlList(
@@ -97,11 +124,12 @@

it("reports errors and returns fallback data when possible", async () => {
const url = "https://www.w3.org/TR/idontexist/";
const fallback = path.resolve(__dirname, 'crawl-fallback.json');
const results = await crawlList(
[{ url, nightly: { url } }],
{
forceLocalFetch: true,
fallback: path.resolve(__dirname, 'crawl-fallback.json')
fallback
});
assert.equal(results[0].title, "On the Internet, nobody knows you don't exist");
assert.include(results[0].error, "Loading https://www.w3.org/TR/idontexist/ triggered HTTP status 404");