From 09d8830be15da7e3a44f32a934609c25357d6ef3 Mon Sep 17 00:00:00 2001 From: Anne van Kesteren Date: Tue, 3 Nov 2020 17:00:10 +0100 Subject: [PATCH] URL: percent-encoding test framework basics --- url/README.md | 4 +++ url/percent-encoding.window.js | 33 ++++++++++++++++++++ url/resources/percent-encoding.json | 48 +++++++++++++++++++++++++++++ url/resources/percent-encoding.py | 23 ++++++++++++++ 4 files changed, 108 insertions(+) create mode 100644 url/percent-encoding.window.js create mode 100644 url/resources/percent-encoding.json create mode 100644 url/resources/percent-encoding.py diff --git a/url/README.md b/url/README.md index 823a8eec022282..50a71bb482df9e 100644 --- a/url/README.md +++ b/url/README.md @@ -44,6 +44,10 @@ expected to fail. Tests in `/encoding` and `/html/infrastructure/urls/resolving-urls/query-encoding/` cover the encoding argument to the URL parser. +There's also limited coverage in `resources/percent-encoding.json` for percent-encode after encoding +with _percentEncodeSet_ set to special-query percent-encode set and _spaceAsPlus_ set to false. +(Improvements to expand coverage here are welcome.) + ## Specification The tests in this directory assert conformance with [the URL Standard][URL]. 
diff --git a/url/percent-encoding.window.js b/url/percent-encoding.window.js new file mode 100644 index 00000000000000..dcb5c1e55b21b7 --- /dev/null +++ b/url/percent-encoding.window.js @@ -0,0 +1,33 @@ +promise_test(() => fetch("resources/percent-encoding.json").then(res => res.json()).then(runTests), "Loading data…"); + +function runTests(testUnits) { + for (const testUnit of testUnits) { + // Ignore comments + if (typeof testUnit === "string") { + continue; + } + for (const encoding of Object.keys(testUnit.output)) { + async_test(t => { + const frame = document.body.appendChild(document.createElement("iframe")); + t.add_cleanup(() => frame.remove()); + frame.onload = t.step_func_done(() => { + const output = frame.contentDocument.querySelector("a"); + // Test that the fragment is always UTF-8 encoded + assert_equals(output.hash, `#${testUnit.output["utf-8"]}`, "fragment"); + assert_equals(output.search, `?${testUnit.output[encoding]}`, "query"); + }); + frame.src = `resources/percent-encoding.py?encoding=${encoding}&value=${toBase64(testUnit.input)}`; + }, `Input ${testUnit.input} with encoding ${encoding}`); + } + } +} + +// Use base64 to avoid relying on the URL parser to get UTF-8 percent-encoding correctly. This does +// not use btoa directly as that only works with code points in the range U+0000 to U+00FF, +// inclusive. 
+function toBase64(input) { + const bytes = new TextEncoder().encode(input); + const byteString = Array.from(bytes, byte => String.fromCharCode(byte)).join(""); + const encoded = self.btoa(byteString); + return encoded; +} diff --git a/url/resources/percent-encoding.json b/url/resources/percent-encoding.json new file mode 100644 index 00000000000000..eccd1db62fe601 --- /dev/null +++ b/url/resources/percent-encoding.json @@ -0,0 +1,48 @@ +[ + "Tests for percent-encoding.", + { + "input": "\u2020", + "output": { + "big5": "%26%238224%3B", + "euc-kr": "%A2%D3", + "utf-8": "%E2%80%A0", + "windows-1252": "%86" + } + }, + "This uses a trailing A to prevent the URL parser from trimming the C0 control.", + { + "input": "\u000EA", + "output": { + "big5": "%0EA", + "iso-2022-jp": "%26%2365533%3BA", + "utf-8": "%0EA" + } + }, + { + "input": "\u203E\u005C", + "output": { + "iso-2022-jp": "%1B(J~%1B(B\\", + "utf-8": "%E2%80%BE\\" + } + }, + { + "input": "\uE5E5", + "output": { + "gb18030": "%26%2358853%3B", + "utf-8": "%EE%97%A5" + } + }, + { + "input": "\u2212", + "output": { + "shift_jis": "%81|", + "utf-8": "%E2%88%92" + } + }, + { + "input": "á|", + "output": { + "utf-8": "%C3%A1|" + } + } +] diff --git a/url/resources/percent-encoding.py b/url/resources/percent-encoding.py new file mode 100644 index 00000000000000..f7228871205a20 --- /dev/null +++ b/url/resources/percent-encoding.py @@ -0,0 +1,23 @@ +import base64 +from wptserve.utils import isomorphic_decode + +# Use numeric references to let the HTML parser take care of inserting the correct code points +# rather than trying to figure out the necessary bytes for each encoding. (The latter can be +# especially tricky given that Python does not implement the Encoding Standard.) +def numeric_references(input): + output = b"" + for cp in input: + output += b"&#x" + format(ord(cp), "X").encode() + b";" + return output + +def main(request, response): + # Undo the "magic" space with + replacement as otherwise base64 decoding will fail.
+ value = request.GET.first(b"value").replace(b" ", b"+") + encoding = request.GET.first(b"encoding") + + output_value = numeric_references(base64.b64decode(value).decode("utf-8")) + return ( + [(b"Content-Type", b"text/html;charset=" + encoding)], + b"""<!doctype html> +<a href="https://doesnotmatter.invalid/?%s#%s">test</a> +""" % (output_value, output_value))