From 03236685b6bf3b1422a3c2b90a51c8c94bd9a0ea Mon Sep 17 00:00:00 2001 From: Romain Deltour Date: Sat, 22 Apr 2023 04:02:27 +0200 Subject: [PATCH] fix: normalize URLs before checking if the resources exist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit URLs were not normalized before performing existence checks, so percent-encoded URLs sometimes triggered `RSC-001` or `RSC-007` errors. This commit introduces a new `normalize(URL)` method in the `URLUtils` class. Normalization is now used when checking a URL. This notably applies to resource and ID existence checks. Important Note: URL normalization is not well-defined. Some percent-encoding normalization is described in RFC 3986, but is not defined in the URL standard. Moreover, normalization (as useful for EPUBCheck) depends on the URL scheme. The normalization we apply is quite naïve and might need to be improved in the future. It should, however, cover the majority of HTTP URL real-world scenarios. 
Fix #1479 --- .../com/adobe/epubcheck/ocf/OCFContainer.java | 10 +++++-- .../epubcheck/core/references/URLChecker.java | 4 ++- .../org/w3c/epubcheck/util/url/URLUtils.java | 29 ++++++++++++++++--- .../EPUB/content&001.xhtml | 11 +++++++ .../url-percent-encoded-valid/EPUB/nav.xhtml | 14 +++++++++ .../EPUB/package.opf | 16 ++++++++++ .../META-INF/container.xml | 6 ++++ .../files/url-percent-encoded-valid/mimetype | 1 + src/test/resources/epub3/04-ocf/ocf.feature | 6 ++++ 9 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/content&001.xhtml create mode 100644 src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/nav.xhtml create mode 100644 src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/package.opf create mode 100644 src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/META-INF/container.xml create mode 100644 src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/mimetype diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java b/src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java index 6ebf1710c..ebaf4d087 100644 --- a/src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java +++ b/src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java @@ -79,7 +79,14 @@ public OCFContainer(Builder builder) public boolean contains(URL resource) { - return resources.containsKey(resource); + if (resources.containsKey(resource)) + { + return true; + } + else + { + return resources.containsKey(URLUtils.normalize(resource)); + } } @Override @@ -134,5 +141,4 @@ public boolean isRemote(URL url) } } - } diff --git a/src/main/java/org/w3c/epubcheck/core/references/URLChecker.java b/src/main/java/org/w3c/epubcheck/core/references/URLChecker.java index 080e6f03d..f7ba681a7 100644 --- a/src/main/java/org/w3c/epubcheck/core/references/URLChecker.java +++ b/src/main/java/org/w3c/epubcheck/core/references/URLChecker.java @@ -2,6 +2,8 @@ 
import java.net.URI; +import org.w3c.epubcheck.util.url.URLUtils; + import com.adobe.epubcheck.api.EPUBLocation; import com.adobe.epubcheck.api.Report; import com.adobe.epubcheck.messages.MessageId; @@ -64,7 +66,7 @@ public URL setBase(String newBase, EPUBLocation location) public URL checkURL(String string, EPUBLocation location) { - URL url = resolveURL(string, false, location); + URL url = URLUtils.normalize(resolveURL(string, false, location)); return url; } diff --git a/src/main/java/org/w3c/epubcheck/util/url/URLUtils.java b/src/main/java/org/w3c/epubcheck/util/url/URLUtils.java index 526e869d5..e58146ed4 100644 --- a/src/main/java/org/w3c/epubcheck/util/url/URLUtils.java +++ b/src/main/java/org/w3c/epubcheck/util/url/URLUtils.java @@ -14,6 +14,7 @@ import io.mola.galimatias.GalimatiasParseException; import io.mola.galimatias.ParseIssue; import io.mola.galimatias.URL; +import io.mola.galimatias.canonicalize.DecodeUnreservedCanonicalizer; //FIXME 2022 add unit tests public final class URLUtils @@ -79,9 +80,9 @@ else if (urlA.equals(urlB)) * in EPUB (to test for remote resources compared to container URLs). * * @param test - * the URL to test + * the URL to test * @param local - * the URL it is tested against + * the URL it is tested against * @return `true` if and only if `test` is remote compared to `local`. */ public static boolean isRemote(URL test, URL local) @@ -151,13 +152,33 @@ public static String decode(String string) return percentDecode(string); } + public static URL normalize(URL url) + { + URL normalized = url; + if (url != null) + { + try + { + if (url.isHierarchical() && url.path() != null) + { + normalized = url.withPath(URLUtils.encodePath(URLUtils.decode(url.path()))); + } + normalized = new DecodeUnreservedCanonicalizer().canonicalize(normalized); + } catch (GalimatiasParseException unexpected) + { + throw new AssertionError(unexpected); + } + } + return normalized; + } + /** * Returns the MIME type of a `data:` URL. 
* * @param url - * a URL, can be `null`. + * a URL, can be `null`. * @return the MIME type declared in the data URL (can be an empty string), or - * `null` if `url` is not a data URL. + * `null` if `url` is not a data URL. */ public static String getDataURLType(URL url) { diff --git a/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/content&001.xhtml b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/content&001.xhtml new file mode 100644 index 000000000..43a520ea2 --- /dev/null +++ b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/content&001.xhtml @@ -0,0 +1,11 @@ + + + + + Minimal EPUB + + +

Loomings

+

Call me Ishmael.

+ + diff --git a/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/nav.xhtml b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/nav.xhtml new file mode 100644 index 000000000..7f34f3024 --- /dev/null +++ b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/nav.xhtml @@ -0,0 +1,14 @@ + + + + + Minimal Nav + + + + + diff --git a/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/package.opf b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/package.opf new file mode 100644 index 000000000..552655bbd --- /dev/null +++ b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/EPUB/package.opf @@ -0,0 +1,16 @@ + + + + Minimal EPUB 3.0 + en + NOID + 2017-06-14T00:00:01Z + + + + + + + + + \ No newline at end of file diff --git a/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/META-INF/container.xml b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/META-INF/container.xml new file mode 100644 index 000000000..318782179 --- /dev/null +++ b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/META-INF/container.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/mimetype b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/mimetype new file mode 100644 index 000000000..57ef03f24 --- /dev/null +++ b/src/test/resources/epub3/04-ocf/files/url-percent-encoded-valid/mimetype @@ -0,0 +1 @@ +application/epub+zip \ No newline at end of file diff --git a/src/test/resources/epub3/04-ocf/ocf.feature b/src/test/resources/epub3/04-ocf/ocf.feature index 7a0dba39f..b5c11175c 100644 --- a/src/test/resources/epub3/04-ocf/ocf.feature +++ b/src/test/resources/epub3/04-ocf/ocf.feature @@ -114,6 +114,12 @@ Feature: EPUB 3 — Open Container Format When checking EPUB 'url-in-xhtml-valid.xhtml' And no errors or warnings are reported + @spec @xref:sec-container-iri + Scenario: Allow 
percent-encoded URLs + When checking EPUB 'url-percent-encoded-valid' + And no errors or warnings are reported + + #### Invalid container URLs @spec @xref:sec-container-iri