Skip to content

Commit

Permalink
fix: normalize URLs before checking if the resources exist
Browse files Browse the repository at this point in the history
URLs were not normalized before existence checks were performed, so percent-encoded URLs sometimes triggered `RSC-001` or `RSC-007` errors.

This commit introduces a new `normalize(URL)` method in the `URLUtils` class. Normalization is now used when checking a URL. This notably applies to resource and ID existence checks.

Important Note:
  URL normalization is not well-defined. Some percent-encoding normalization is described in RFC 3986, but it is not defined in the URL standard. In addition, the normalization that is useful for EPUBCheck depends on the URL scheme.
  The normalization we apply is quite naïve and might need to be improved in the future. It should, however, cover the majority of real-world HTTP URL scenarios.

Fix #1479
  • Loading branch information
rdeltour committed Apr 22, 2023
1 parent 0f0cece commit 0323668
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 7 deletions.
10 changes: 8 additions & 2 deletions src/main/java/com/adobe/epubcheck/ocf/OCFContainer.java
Expand Up @@ -79,7 +79,14 @@ public OCFContainer(Builder builder)

public boolean contains(URL resource)
{
return resources.containsKey(resource);
if (resources.containsKey(resource))
{
return true;
}
else
{
return resources.containsKey(URLUtils.normalize(resource));
}
}

@Override
Expand Down Expand Up @@ -134,5 +141,4 @@ public boolean isRemote(URL url)
}
}


}
Expand Up @@ -2,6 +2,8 @@

import java.net.URI;

import org.w3c.epubcheck.util.url.URLUtils;

import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.api.Report;
import com.adobe.epubcheck.messages.MessageId;
Expand Down Expand Up @@ -64,7 +66,7 @@ public URL setBase(String newBase, EPUBLocation location)

public URL checkURL(String string, EPUBLocation location)
{
URL url = resolveURL(string, false, location);
URL url = URLUtils.normalize(resolveURL(string, false, location));
return url;
}

Expand Down
29 changes: 25 additions & 4 deletions src/main/java/org/w3c/epubcheck/util/url/URLUtils.java
Expand Up @@ -14,6 +14,7 @@
import io.mola.galimatias.GalimatiasParseException;
import io.mola.galimatias.ParseIssue;
import io.mola.galimatias.URL;
import io.mola.galimatias.canonicalize.DecodeUnreservedCanonicalizer;

//FIXME 2022 add unit tests
public final class URLUtils
Expand Down Expand Up @@ -79,9 +80,9 @@ else if (urlA.equals(urlB))
* in EPUB (to test for remote resources compared to container URLs).
*
* @param test
* the URL to test
* the URL to test
* @param local
* the URL it is tested against
* the URL it is tested against
* @return `true` if and only if `test` is remote compared to `local`.
*/
public static boolean isRemote(URL test, URL local)
Expand Down Expand Up @@ -151,13 +152,33 @@ public static String decode(String string)
return percentDecode(string);
}

public static URL normalize(URL url)
{
URL normalized = url;
if (url != null)
{
try
{
if (url.isHierarchical() && url.path() != null)
{
normalized = url.withPath(URLUtils.encodePath(URLUtils.decode(url.path())));
}
normalized = new DecodeUnreservedCanonicalizer().canonicalize(normalized);
} catch (GalimatiasParseException unexpected)
{
throw new AssertionError(unexpected);
}
}
return normalized;
}

/**
* Returns the MIME type of a `data:` URL.
*
* @param url
* a URL, can be `null`.
* a URL, can be `null`.
* @return the MIME type declared in the data URL (can be an empty string), or
* `null` if `url` is not a data URL.
* `null` if `url` is not a data URL.
*/
public static String getDataURLType(URL url)
{
Expand Down
@@ -0,0 +1,11 @@
<!DOCTYPE html>
<html xmlns:epub="http://www.idpf.org/2007/ops" xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta charset="utf-8"/>
<title>Minimal EPUB</title>
</head>
<body>
<h1>Loomings</h1>
<p>Call me Ishmael.</p>
</body>
</html>
@@ -0,0 +1,14 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en" lang="en">
<head>
<meta charset="utf-8"/>
<title>Minimal Nav</title>
</head>
<body>
<nav epub:type="toc">
<ol>
<li><a href="content%26001.xhtml">content 001</a></li>
</ol>
</nav>
</body>
</html>
@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="q">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="q">NOID</dc:identifier>
<meta property="dcterms:modified">2017-06-14T00:00:01Z</meta>
</metadata>
<manifest>
<item id="content_001" href="content%26001.xhtml" media-type="application/xhtml+xml"/>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="EPUB/package.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
@@ -0,0 +1 @@
application/epub+zip
6 changes: 6 additions & 0 deletions src/test/resources/epub3/04-ocf/ocf.feature
Expand Up @@ -114,6 +114,12 @@ Feature: EPUB 3 — Open Container Format
When checking EPUB 'url-in-xhtml-valid.xhtml'
And no errors or warnings are reported

@spec @xref:sec-container-iri
Scenario: Allow percent-encoded URLs
When checking EPUB 'url-percent-encoded-valid'
And no errors or warnings are reported


#### Invalid container URLs

@spec @xref:sec-container-iri
Expand Down

0 comments on commit 0323668

Please sign in to comment.