Reduce crawler load on servers #1581

Merged: 1 commit, Jun 6, 2024

6 changes: 6 additions & 0 deletions src/lib/mock-server.js
@@ -123,6 +123,12 @@ mockAgent
.reply(200, '')
.persist();

mockAgent
.get("https://www.w3.org")
.intercept({ method: "GET", path: "/StyleSheets/TR/2021/dark.css" })
.reply(200, '')
.persist();

Comment on lines +126 to +131 (Member Author):

Forgot to add: this is unrelated but now needed (I suspect the style sheet was added recently) to avoid a test complaint that the request for this resource is not being mocked.

mockAgent
.get("https://www.w3.org")
.intercept({ method: "GET", path: "/Tools/respec/respec-highlight" })
168 changes: 148 additions & 20 deletions src/lib/specs-crawler.js
@@ -31,6 +31,88 @@ const {

const {version: reffyVersion} = require('../../package.json');

/**
* To be friendly with servers, requests are serialized per origin server,
* and the code sleeps a bit between requests to a given origin server.
* To achieve this, the code needs to take a lock on the origin it wants to
* send a request to.
*/
const originLocks = {};


/**
* Helper function to sleep for a specified number of milliseconds
*/
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
}


/**
* Helper function to interleave values of a list of arrays.
*
* For example:
* interleave([0, 2, 4, 6, 8], [1, 3, 5]) returns [0, 1, 2, 3, 4, 5, 6, 8]
* interleave([0, 3], [1, 4], [2, 5]) returns [0, 1, 2, 3, 4, 5]
*
* The function is used to sort the list of specs to crawl so as to distribute
* origins throughout the list.
*
* Note the function happily modifies (and empties in practice) the arrays
* it receives as arguments.
*/
function interleave(firstArray, ...furtherArrays) {
if (firstArray?.length > 0) {
// Return the concatenation of the first item in the first array,
// and of the result of interleaving remaining arrays, putting the
// first array last in the list.
const firstItem = firstArray.shift();
return [firstItem, ...interleave(...furtherArrays, firstArray)];
}
else {
// First array is empty, let's proceed with remaining arrays
// until there's nothing else to proceed.
if (furtherArrays.length > 0) {
return interleave(...furtherArrays);
}
else {
return [];
}
}
}


/**
* Helper function that returns the "origin" of a URL, defined in a loose way
* as the part of the true origin that identifies the server that's going to
* serve the resource.
*
* For example "github.io" for all specs under github.io, "whatwg.org" for
* all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
* and FXTF specs since they are served by the same server).
*/
function getOrigin(url) {
if (!url) {
return '';
}
const origin = (new URL(url)).origin;
if (origin.endsWith('.whatwg.org')) {
return 'whatwg.org';
}
else if (origin.endsWith('.github.io')) {
return 'github.io';
}
else if (origin.endsWith('.csswg.org') ||
origin.endsWith('.css-houdini.org') ||
origin.endsWith('.fxtf.org')) {
return 'csswg.org';
}
else {
return origin;
}
}


/**
* Return the spec if crawl succeeded or crawl result from given fallback list
* if crawl yielded an error (and fallback does exist).
@@ -95,24 +177,51 @@ async function crawlSpec(spec, crawlOptions) {
result = {};
}
else {
result = await processSpecification(
urlToCrawl,
(spec, modules) => {
const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
window.reffy.mapIdsToHeadings() : null;
const res = {
crawled: window.location.toString()
};
modules.forEach(mod => {
res[mod.property] = window.reffy[mod.name](spec, idToHeading);
});
return res;
},
[spec, crawlOptions.modules],
{ quiet: crawlOptions.quiet,
forceLocalFetch: crawlOptions.forceLocalFetch,
...cacheInfo}
);
// To be friendly with servers, requests are serialized per origin
// and only sent after a couple of seconds.
const origin = getOrigin(urlToCrawl.url);
let originLock = originLocks[origin];
if (!originLock) {
originLock = {
locked: false,
last: 0
};
originLocks[origin] = originLock;
}
// Wait for the "lock" on the origin. Once we can take it, sleep as
// needed to only send a request after enough time has elapsed.
while (originLock.locked) {
await sleep(100);
}
originLock.locked = true;
const now = Date.now();
if (now - originLock.last < 2000) {
await sleep(2000 - (now - originLock.last));
}
try {
result = await processSpecification(
urlToCrawl,
(spec, modules) => {
const idToHeading = modules.find(m => m.needsIdToHeadingMap) ?
window.reffy.mapIdsToHeadings() : null;
const res = {
crawled: window.location.toString()
};
modules.forEach(mod => {
res[mod.property] = window.reffy[mod.name](spec, idToHeading);
});
return res;
},
[spec, crawlOptions.modules],
{ quiet: crawlOptions.quiet,
forceLocalFetch: crawlOptions.forceLocalFetch,
...cacheInfo}
);
}
finally {
originLock.last = Date.now();
originLock.locked = false;
}
if (result.status === "notmodified" && fallback) {
crawlOptions.quiet ?? console.warn(`skipping ${spec.url}, no change`);
const copy = Object.assign({}, fallback);
@@ -343,14 +452,33 @@ async function crawlList(speclist, crawlOptions) {
return { spec, readyToCrawl, resolve, reject };
});

// While we want results to be returned following the initial order of the
// specs, to avoid sending too many requests at once to the same origin,
// we'll sort specs so that origins get interleaved.
// Note: there may be specs without URL (ISO specs)
const specsByOrigin = {};
for (const spec of list) {
const toCrawl = crawlOptions.publishedVersion ?
(spec.release ?? spec.nightly) :
spec.nightly;
const origin = getOrigin(toCrawl?.url);
if (!specsByOrigin[origin]) {
specsByOrigin[origin] = [];
}
specsByOrigin[origin].push(spec);
}
const spreadList = interleave(...Object.values(specsByOrigin));

Comment (Member):

I wonder how much the interleaving buys us vs managing a proper per-"origin" queue of requests that would let us process requests in parallel without pauses as long as there is no lock on the relevant origins.

I think e.g. https://github.com/w3c/cg-monitor/blob/5ceea24a1eaa7c1736d38909bcbb47df4ae297a3/lib/caching-queued-fetch.js#L68 accomplishes this for the CG monitor

Comment (Member Author):

Optimization-wise, I don't know if either approach really trumps the other. From a readability and maintenance perspective, though, a proper per-origin queue would be cleaner! I was more trying to patch the existing code than to rewrite it.

One constraint I think we need in Reffy is that parallelism needs to be kept to a reasonable level, say 4-5 specs in parallel maximum, because loading too many specs at once in Puppeteer may use a lot of RAM. If we just enqueue everything on a per-origin queue without throttling, we may end up in a situation where 10 specs from 10 different origins get processed at once.

Right now that throttling is done at the spec level, which is why I chose to shuffle the list instead, to reduce the likelihood that the 4 specs that get processed in parallel target the same origin server. We'd need to apply that throttling to the list of origins instead (process 4 origins in parallel among the list of origins). That can also very likely be done "properly" ;)
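
To make the alternative concrete, here is a minimal sketch of a per-origin queue with the throttling applied at the origin level, along the lines discussed above. It is not part of the PR; all names (enqueue, drainOrigin, MAX_PARALLEL_ORIGINS, ORIGIN_DELAY_MS) are hypothetical:

// Hypothetical sketch, not part of the PR: per-origin FIFO queues with a
// global cap on how many origins get drained in parallel.
const MAX_PARALLEL_ORIGINS = 4;   // parallelism applied to origins, not specs
const ORIGIN_DELAY_MS = 2000;     // pause between two requests to one origin

const queues = new Map();         // origin -> FIFO of pending tasks
const runningOrigins = new Set(); // origins currently being drained

function enqueue(origin, task) {
  return new Promise((resolve, reject) => {
    if (!queues.has(origin)) {
      queues.set(origin, []);
    }
    queues.get(origin).push({ task, resolve, reject });
    drainOrigin(origin);
  });
}

async function drainOrigin(origin) {
  // Respect the global cap and don't drain the same origin twice
  if (runningOrigins.has(origin) ||
      runningOrigins.size >= MAX_PARALLEL_ORIGINS) {
    return;
  }
  runningOrigins.add(origin);
  const queue = queues.get(origin);
  while (queue.length > 0) {
    const { task, resolve, reject } = queue.shift();
    try {
      resolve(await task());
    }
    catch (err) {
      reject(err);
    }
    // Courtesy delay before the next request to the same origin
    await new Promise(res => setTimeout(res, ORIGIN_DELAY_MS));
  }
  runningOrigins.delete(origin);
  // A parallelism slot opened up; resume any origin that was waiting for it
  for (const [waiting, pending] of queues) {
    if (pending.length > 0) {
      drainOrigin(waiting);
    }
  }
}

Each crawl could then be wrapped as enqueue(getOrigin(url), () => crawlSpec(spec, options)), preserving the two-second courtesy delay per origin while letting up to four origins progress in parallel.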


// In debug mode, specs are processed one by one. In normal mode,
// specs are processed in chunks
const chunkSize = Math.min((crawlOptions.debug ? 1 : 4), list.length);

let pos = 0;
function flagNextSpecAsReadyToCrawl() {
if (pos < listAndPromise.length) {
listAndPromise[pos].resolve();
if (pos < spreadList.length) {
const spec = spreadList[pos];
const specAndPromise = listAndPromise.find(sp => sp.spec === spec);
specAndPromise.resolve();
pos += 1;
}
}