From 09d8830be15da7e3a44f32a934609c25357d6ef3 Mon Sep 17 00:00:00 2001 From: Anne van Kesteren Date: Tue, 3 Nov 2020 17:00:10 +0100 Subject: [PATCH] URL: percent-encoding test framework basics --- url/README.md | 4 +++ url/percent-encoding.window.js | 33 ++++++++++++++++++++ url/resources/percent-encoding.json | 48 +++++++++++++++++++++++++++++ url/resources/percent-encoding.py | 23 ++++++++++++++ 4 files changed, 108 insertions(+) create mode 100644 url/percent-encoding.window.js create mode 100644 url/resources/percent-encoding.json create mode 100644 url/resources/percent-encoding.py diff --git a/url/README.md b/url/README.md index 823a8eec022282..50a71bb482df9e 100644 --- a/url/README.md +++ b/url/README.md @@ -44,6 +44,10 @@ expected to fail. Tests in `/encoding` and `/html/infrastructure/urls/resolving-urls/query-encoding/` cover the encoding argument to the URL parser. +There's also limited coverage in `resources/percent-encoding.json` for percent-encode after encoding +with _percentEncodeSet_ set to special-query percent-encode set and _spaceAsPlus_ set to false. +(Improvements to expand coverage here are welcome.) + ## Specification The tests in this directory assert conformance with [the URL Standard][URL]. 
diff --git a/url/percent-encoding.window.js b/url/percent-encoding.window.js new file mode 100644 index 00000000000000..dcb5c1e55b21b7 --- /dev/null +++ b/url/percent-encoding.window.js @@ -0,0 +1,33 @@ +promise_test(() => fetch("resources/percent-encoding.json").then(res => res.json()).then(runTests), "Loading data…"); + +function runTests(testUnits) { + for (const testUnit of testUnits) { + // Ignore comments + if (typeof testUnit === "string") { + continue; + } + for (const encoding of Object.keys(testUnit.output)) { + async_test(t => { + const frame = document.body.appendChild(document.createElement("iframe")); + t.add_cleanup(() => frame.remove()); + frame.onload = t.step_func_done(() => { + const output = frame.contentDocument.querySelector("a"); + // Test that the fragment is always UTF-8 encoded + assert_equals(output.hash, `#${testUnit.output["utf-8"]}`, "fragment"); + assert_equals(output.search, `?${testUnit.output[encoding]}`, "query"); + }); + frame.src = `resources/percent-encoding.py?encoding=${encoding}&value=${toBase64(testUnit.input)}`; + }, `Input ${testUnit.input} with encoding ${encoding}`); + } + } +} + +// Use base64 to avoid relying on the URL parser to get UTF-8 percent-encoding correctly. This does +// not use btoa directly as that only works with code points in the range U+0000 to U+00FF, +// inclusive. 
+function toBase64(input) { + const bytes = new TextEncoder().encode(input); + const byteString = Array.from(bytes, byte => String.fromCharCode(byte)).join(""); + const encoded = self.btoa(byteString); + return encoded; +} diff --git a/url/resources/percent-encoding.json b/url/resources/percent-encoding.json new file mode 100644 index 00000000000000..eccd1db62fe601 --- /dev/null +++ b/url/resources/percent-encoding.json @@ -0,0 +1,48 @@ +[ + "Tests for percent-encoding.", + { + "input": "\u2020", + "output": { + "big5": "%26%238224%3B", + "euc-kr": "%A2%D3", + "utf-8": "%E2%80%A0", + "windows-1252": "%86" + } + }, + "This uses a trailing A to prevent the URL parser from trimming the C0 control.", + { + "input": "\u000EA", + "output": { + "big5": "%0EA", + "iso-2022-jp": "%26%2365533%3BA", + "utf-8": "%0EA" + } + }, + { + "input": "\u203E\u005C", + "output": { + "iso-2022-jp": "%1B(J~%1B(B\\", + "utf-8": "%E2%80%BE\\" + } + }, + { + "input": "\uE5E5", + "output": { + "gb18030": "%26%2358853%3B", + "utf-8": "%EE%97%A5" + } + }, + { + "input": "\u2212", + "output": { + "shift_jis": "%81|", + "utf-8": "%E2%88%92" + } + }, + { + "input": "á|", + "output": { + "utf-8": "%C3%A1|" + } + } +] diff --git a/url/resources/percent-encoding.py b/url/resources/percent-encoding.py new file mode 100644 index 00000000000000..f7228871205a20 --- /dev/null +++ b/url/resources/percent-encoding.py @@ -0,0 +1,23 @@ +import base64 +from wptserve.utils import isomorphic_decode + +# Use numeric references to let the HTML parser take care of inserting the correct code points +# rather than trying to figure out the necessary bytes for each encoding. (The latter can be +# especially tricky given that Python does not implement the Encoding Standard.) +def numeric_references(input): + output = b"" + for cp in input: + output += b"&#x" + format(ord(cp), "X").encode() + b";" + return output + +def main(request, response): + # Undo the "magic" space with + replacement as otherwise base64 decoding will fail.
+ value = request.GET.first(b"value").replace(b" ", b"+") + encoding = request.GET.first(b"encoding") + + output_value = numeric_references(base64.b64decode(value).decode("utf-8")) + return ( + [(b"Content-Type", b"text/html;charset=" + encoding)], + b"""<!doctype html> +<a href="https://doesnotmatter.invalid/?%s#%s">test</a> +""" % (output_value, output_value))