URL: percent-encoding test framework basics

web-platform-tests · Nov 3, 2020 · 09d8830 · 09d8830
1 parent 7714033
commit 09d8830
Show file tree

Hide file tree

Showing 4 changed files with 108 additions and 0 deletions.
diff --git a/url/README.md b/url/README.md
@@ -44,6 +44,10 @@ expected to fail.
 Tests in `/encoding` and `/html/infrastructure/urls/resolving-urls/query-encoding/` cover the
 encoding argument to the URL parser.
 
+There's also limited coverage in `resources/percent-encoding.json` for the "percent-encode after
+encoding" algorithm, with _percentEncodeSet_ set to the special-query percent-encode set and
+_spaceAsPlus_ set to false. (Improvements to expand coverage here are welcome.)
+
 ## Specification
 
 The tests in this directory assert conformance with [the URL Standard][URL].

diff --git a/url/percent-encoding.window.js b/url/percent-encoding.window.js
@@ -0,0 +1,33 @@
// Load the JSON test data first; runTests() then registers the individual tests from it.
promise_test(async () => {
  const response = await fetch("resources/percent-encoding.json");
  runTests(await response.json());
}, "Loading data…");
+
/**
 * Registers one async_test for every encoding listed in every test unit.
 *
 * Each test loads resources/percent-encoding.py in an iframe, passing the encoding label and the
 * base64-encoded input, and then compares the `hash` and `search` of the first <a> element in the
 * framed document against the expectations from the data file.
 *
 * @param {Array} testUnits - Entries from the data file. Strings are treated as comments and
 *     skipped; every other entry is read as `{ input, output }`, where `output` maps an encoding
 *     label to the expected percent-encoded string.
 */
function runTests(testUnits) {
  // Plain strings in the data are human-readable comments, not test units.
  const units = testUnits.filter(entry => typeof entry !== "string");

  for (const { input, output: expected } of units) {
    for (const [encoding, expectedQuery] of Object.entries(expected)) {
      const title = `Input ${input} with encoding ${encoding}`;

      async_test(t => {
        const frame = document.createElement("iframe");
        document.body.appendChild(frame);
        t.add_cleanup(() => frame.remove());

        const checkAnchor = () => {
          const anchor = frame.contentDocument.querySelector("a");
          // The fragment is expected to be UTF-8 encoded whatever the document's encoding is.
          assert_equals(anchor.hash, `#${expected["utf-8"]}`, "fragment");
          assert_equals(anchor.search, `?${expectedQuery}`, "query");
        };
        frame.onload = t.step_func_done(checkAnchor);

        // Assigning src last starts the load only once the load handler is in place.
        frame.src = `resources/percent-encoding.py?encoding=${encoding}&value=${toBase64(input)}`;
      }, title);
    }
  }
}
+
// The input is sent as base64 so that the test does not depend on the URL parser percent-encoding
// UTF-8 correctly. btoa cannot be applied to the input directly because it only accepts code
// points in the range U+0000 to U+00FF, inclusive; the input is therefore UTF-8 encoded first and
// each resulting byte is mapped to the code point of the same value.
/**
 * @param {string} input - Text to encode.
 * @returns {string} Base64 of the UTF-8 bytes of `input`.
 */
function toBase64(input) {
  let binary = "";
  for (const octet of new TextEncoder().encode(input)) {
    binary += String.fromCharCode(octet);
  }
  return self.btoa(binary);
}
diff --git a/url/resources/percent-encoding.json b/url/resources/percent-encoding.json
@@ -0,0 +1,48 @@
+[
+  "Tests for percent-encoding.",
+  {
+    "input": "\u2020",
+    "output": {
+      "big5": "%26%238224%3B",
+      "euc-kr": "%A2%D3",
+      "utf-8": "%E2%80%A0",
+      "windows-1252": "%86"
+    }
+  },
+  "This uses a trailing A to prevent the URL parser from trimming the C0 control.",
+  {
+    "input": "\u000EA",
+    "output": {
+      "big5": "%0EA",
+      "iso-2022-jp": "%26%2365533%3BA",
+      "utf-8": "%0EA"
+    }
+  },
+  {
+    "input": "\u203E\u005C",
+    "output": {
+      "iso-2022-jp": "%1B(J~%1B(B\\",
+      "utf-8": "%E2%80%BE\\"
+    }
+  },
+  {
+    "input": "\uE5E5",
+    "output": {
+      "gb18030": "%26%2358853%3B",
+      "utf-8": "%EE%97%A5"
+    }
+  },
+  {
+    "input": "\u2212",
+    "output": {
+      "shift_jis": "%81|",
+      "utf-8": "%E2%88%92"
+    }
+  },
+  {
+    "input": "á|",
+    "output": {
+      "utf-8": "%C3%A1|"
+    }
+  }
+]
diff --git a/url/resources/percent-encoding.py b/url/resources/percent-encoding.py
@@ -0,0 +1,23 @@
+import base64
+from wptserve.utils import isomorphic_decode
+
# Use numeric references to let the HTML parser take care of inserting the correct code points
# rather than trying to figure out the necessary bytes for each encoding. (The latter can be
# especially tricky given that Python does not implement the Encoding Standard.)
def numeric_references(input):
    """Return `input` as a byte string of hexadecimal numeric character references.

    Every item obtained by iterating over `input` is passed to ord() and emitted as
    b"&#xH;", where H is its value in uppercase hexadecimal without padding. An empty
    `input` yields b"".
    """
    # format() rejects a bytes format spec on Python 3 and returns text, which cannot be
    # concatenated with bytes. bytes %-formatting produces bytes on both Python 2 and 3.
    return b"".join(b"&#x%X;" % ord(cp) for cp in input)
+
def main(request, response):
    """Serve an HTML document, labelled with the requested encoding, containing a test link.

    Reads two query parameters from `request.GET`:
      * `value`: base64 of the UTF-8 encoded test input.
      * `encoding`: the label appended to the Content-Type `charset` parameter.

    Returns a `(headers, body)` tuple. The body contains one <a> element whose href carries
    the decoded input, written as numeric character references, in both the query and the
    fragment.

    The query values are handled as byte strings (note the bytes concatenation with
    `encoding` below), so every literal combined with them is a bytes literal.
    """
    # Undo the "magic" space with + replacement as otherwise base64 decoding will fail.
    # Both arguments must be bytes: str arguments raise TypeError on a bytes value on Python 3.
    value = request.GET.first(b"value").replace(b" ", b"+")
    encoding = request.GET.first(b"encoding")

    # The codec name is a native string literal; a bytes codec name raises TypeError on
    # Python 3.
    output_value = numeric_references(base64.b64decode(value).decode("utf-8"))
    return (
        [(b"Content-Type", b"text/html;charset=" + encoding)],
        b"""<!doctype html>
<a href="https://doesnotmatter.invalid/?%s#%s">test</a>
""" % (output_value, output_value))