Skip to content

Commit

Permalink
Add throttling logic per origin, skip SVG resources (#1356)
Browse files Browse the repository at this point in the history
* Add throttling logic per origin

This adds a generic throttled queue per origin class that replaces the previous
`throttle` function to add serialization (and sleep interval) per origin.

The code is inspired by:
https://github.com/w3c/cg-monitor/blob/5ceea24a1eaa7c1736d38909bcbb47df4ae297a3/lib/caching-queued-fetch.js#L68

(Note plan is to reuse the same class in Reffy)

* Intercept and skip network requests on SVG files

SVG files were not intercepted in Reffy because a couple of specs were using a
clunky dynamic PNG fallback mechanism that created an infinite loop. These
specs were fixed some time ago, so I don't think that's still an issue and,
for some reason, the drafts CSS server takes a lot of time to serve SVG files
linked from specs.

---------

Co-authored-by: Dominique Hazael-Massieux <dom@w3.org>
  • Loading branch information
tidoust and dontcallmedom authored Jun 7, 2024
1 parent 1c6f61e commit 7e3dc35
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 48 deletions.
1 change: 0 additions & 1 deletion src/build-index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ const determineTestPath = require("./determine-testpath.js");
const extractPages = require("./extract-pages.js");
const fetchInfo = require("./fetch-info.js");
const fetchGroups = require("./fetch-groups.js");
const throttle = require("./throttle");
const githubToken = (_ => {
try {
return require("../config.json").GITHUB_TOKEN;
Expand Down
27 changes: 16 additions & 11 deletions src/fetch-info.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,9 @@

const puppeteer = require("puppeteer");
const loadSpec = require("./load-spec");
const throttle = require("./throttle");
const throttledFetch = throttle(fetch, 2);
const computeShortname = require("./compute-shortname");
const Octokit = require("./octokit");
const ThrottledQueue = require("./throttled-queue");

// Map spec statuses returned by Specref to those used in specs
// Note we typically won't get /TR statuses from Specref, since all /TR URLs
Expand All @@ -56,6 +55,8 @@ const specrefStatusMapping = {
"cg-draft": "Draft Community Group Report"
};

const fetchQueue = new ThrottledQueue(2);

async function useLastInfoForDiscontinuedSpecs(specs) {
const results = {};
for (const spec of specs) {
Expand All @@ -80,7 +81,7 @@ async function fetchInfoFromW3CApi(specs, options) {
}

const url = `https://api.w3.org/specifications/${spec.shortname}/versions/latest`;
const res = await throttledFetch(url, options);
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status === 404) {
return;
}
Expand Down Expand Up @@ -135,7 +136,7 @@ async function fetchInfoFromW3CApi(specs, options) {
// Fetch info on the series
const seriesInfo = await Promise.all([...seriesShortnames].map(async shortname => {
const url = `https://api.w3.org/specification-series/${shortname}`;
const res = await throttledFetch(url, options);
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status === 404) {
return;
}
Expand Down Expand Up @@ -206,7 +207,7 @@ async function fetchInfoFromSpecref(specs, options) {
// API does not return the "source" field, so we need to retrieve the list
// ourselves from Specref's GitHub repository.
const specrefBrowserspecsUrl = "https://raw.githubusercontent.com/tobie/specref/main/refs/browser-specs.json";
const browserSpecsResponse = await throttledFetch(specrefBrowserspecsUrl, options);
const browserSpecsResponse = await fetch(specrefBrowserspecsUrl, options);
if (browserSpecsResponse.status !== 200) {
throw new Error(`Could not retrieve specs contributed by browser-specs to Speref, status code is ${browserSpecsResponse.status}`);
}
Expand All @@ -218,7 +219,7 @@ async function fetchInfoFromSpecref(specs, options) {
let specrefUrl = "https://api.specref.org/bibrefs?refs=" +
chunk.map(spec => spec.shortname).join(',');

const res = await throttledFetch(specrefUrl, options);
const res = await fetchQueue.runThrottled(fetch, specrefUrl, options);
if (res.status !== 200) {
throw new Error(`Could not query Specref, status code is ${res.status}`);
}
Expand Down Expand Up @@ -290,7 +291,7 @@ async function fetchInfoFromSpecref(specs, options) {
async function fetchInfoFromIETF(specs, options) {
async function fetchJSONDoc(draftName) {
const url = `https://datatracker.ietf.org/doc/${draftName}/doc.json`;
const res = await throttledFetch(url, options);
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status !== 200) {
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
}
Expand All @@ -303,7 +304,7 @@ async function fetchInfoFromIETF(specs, options) {
}

async function fetchRFCName(docUrl) {
const res = await fetch(docUrl, options);
const res = await fetchQueue.runThrottled(fetch, docUrl, options);
if (res.status !== 200) {
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
}
Expand All @@ -324,7 +325,7 @@ async function fetchInfoFromIETF(specs, options) {
return [];
}
const url = `https://datatracker.ietf.org/api/v1/doc/relateddocument/?format=json&relationship__slug__in=obs&target__name__in=${draftName}`;
const res = await throttledFetch(url, options);
const res = await fetchQueue.runThrottled(fetch, url, options);
if (res.status !== 200) {
throw new Error(`IETF datatracker returned an error for ${url}, status code is ${res.status}`);
}
Expand Down Expand Up @@ -361,6 +362,7 @@ async function fetchInfoFromIETF(specs, options) {
return paths.filter(p => p.path.match(/^specs\/rfc\d+\.html$/))
.map(p => p.path.match(/(rfc\d+)\.html$/)[1]);
}
const httpwgRFCs = await getHttpwgRFCs();

const info = await Promise.all(specs.map(async spec => {
// IETF can only provide information about IETF specs
Expand All @@ -387,7 +389,6 @@ async function fetchInfoFromIETF(specs, options) {
// Note we prefer the httpwg.org version for HTTP WG RFCs and drafts.
let nightly;
if (lastRevision.name.startsWith('rfc')) {
const httpwgRFCs = await getHttpwgRFCs();
if (httpwgRFCs.includes(lastRevision.name)) {
nightly = `https://httpwg.org/specs/${lastRevision.name}.html`
}
Expand Down Expand Up @@ -451,6 +452,7 @@ async function fetchInfoFromSpecs(specs, options) {
const page = await browser.newPage();

try {
console.warn(`- fetch spec info from ${url}`);
await loadSpec(url, page);

if (spec.url.startsWith("https://tc39.es/")) {
Expand Down Expand Up @@ -567,7 +569,10 @@ async function fetchInfoFromSpecs(specs, options) {
}

try {
const info = await Promise.all(specs.map(throttle(fetchInfoFromSpec, 2)));
const queue = new ThrottledQueue(4);
const info = await Promise.all(specs.map(spec =>
queue.runThrottledPerOrigin(spec.nightly?.url || spec.url, fetchInfoFromSpec, spec)
));
const results = {};
specs.forEach((spec, idx) => results[spec.shortname] = info[idx]);
return results;
Expand Down
2 changes: 1 addition & 1 deletion src/load-spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ module.exports = async function (url, page) {
return async function ({ requestId, request }) {
try {
// Abort network requests to common image formats
if (/\.(gif|ico|jpg|jpeg|png|ttf|woff)$/i.test(request.url)) {
if (/\.(gif|ico|jpg|jpeg|png|ttf|woff|svg)$/i.test(request.url)) {
await cdp.send('Fetch.failRequest', { requestId, errorReason: 'Failed' });
return;
}
Expand Down
35 changes: 0 additions & 35 deletions src/throttle.js

This file was deleted.

120 changes: 120 additions & 0 deletions src/throttled-queue.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/**
* Helper function to sleep for a specified number of milliseconds
*/
function sleep(ms) {
return new Promise(resolve => setTimeout(resolve, ms, 'slept'));
}


/**
* Helper function that returns the "origin" of a URL, defined in a loose way
* as the part of the true origin that identifies the server that's going to
* serve the resource.
*
* For example "github.io" for all specs under github.io, "whatwg.org" for
* all WHATWG specs, "csswg.org" for CSS specs at large (including Houdini
* and FXTF specs since they are served by the same server).
*/
function getOrigin(url) {
if (!url) {
return '';
}
const origin = (new URL(url)).origin;
if (origin.endsWith('.whatwg.org')) {
return 'whatwg.org';
}
else if (origin.endsWith('.github.io')) {
return 'github.io';
}
else if (origin.endsWith('.csswg.org') ||
origin.endsWith('.css-houdini.org') ||
origin.endsWith('.fxtf.org')) {
return 'csswg.org';
}
else {
return origin;
}
}


/**
* The ThrottledQueue class can be used to run a series of tasks that send
* network requests to an origin server in parallel, up to a certain limit,
* while guaranteeing that only one request will be sent to a given origin
* server at a time.
*/
module.exports = class ThrottledQueue {
originQueue = {};
maxParallel = 4;
ongoing = 0;
pending = [];

constructor(maxParallel) {
if (maxParallel >= 0) {
this.maxParallel = maxParallel;
}
}

/**
* Run the given processing function with the given parameters, immediately
* if possible or as soon as possible when too many tasks are already running
* in parallel.
*
* Note this function has no notion of origin. Users may call the function
* directly if they don't need any throttling per origin.
*/
async runThrottled(processFunction, ...params) {
if (this.ongoing >= this.maxParallel) {
return new Promise((resolve, reject) => {
this.pending.push({ params, resolve, reject });
});
}
else {
this.ongoing += 1;
const result = await processFunction.call(null, ...params);
this.ongoing -= 1;

// Done with current task, trigger next pending task in the background
setTimeout(_ => {
if (this.pending.length && this.ongoing < this.maxParallel) {
const next = this.pending.shift();
this.runThrottled(processFunction, ...next.params)
.then(result => next.resolve(result))
.catch(err => next.reject(err));
}
}, 0);

return result;
}
}

/**
* Run the given processing function with the given parameters, immediately
* if possible or as soon as possible when too many tasks are already running
* in parallel, or when there's already a task being run against the same
* origin as that of the provided URL.
*
* Said differently, the function serializes tasks per origin, and calls
* "runThrottled" to restrict the number of tasks that run in parallel to the
* requested maximum.
*
* Additionally, the function forces a 2 second sleep after processing to
* keep a low network profile.
*/
async runThrottledPerOrigin(url, processFunction, ...params) {
const origin = getOrigin(url);
if (!this.originQueue[origin]) {
this.originQueue[origin] = Promise.resolve(true);
}
return new Promise((resolve, reject) => {
this.originQueue[origin] = this.originQueue[origin]
.then(async _ => this.runThrottled(processFunction, ...params))
.then(async result => {
await sleep(2000);
return result;
})
.then(resolve)
.catch(reject);
});
}
}

0 comments on commit 7e3dc35

Please sign in to comment.