Skip to content

Commit

Permalink
feat: improve checking of OCF file name characters
Browse files Browse the repository at this point in the history
- significantly refactored `OCFFilenameChecker`:
  - now based on ICU4J for more up-to-date Unicode support
  - iterate over the input string codepoints (`int`) instead of code
    units (`char`) for better support of charactres outside the BMP.
  - rely on ICU UnicodeSet to define the set of forbidden characters
  - improve the message reported for forbidden characters (notably for
    non-printable characters)
  - the class now implements the `Checker` interface
- add unit tests for the `OCFFilenameChecker` only in a new feature
  file `filename-checker.feature`, located along with the EPUB 3 OCF
  functional tests, for easier lookup.
- add a few functional tests, to make sure `OCFFilenameChecker` is
  correctly called when checking full publications or single package
  documents.
- update ICU4J to v72.1

Fixes #1240, fixes #1292, fixes #1302
  • Loading branch information
rdeltour committed Nov 17, 2022
1 parent 5c35fa0 commit b6ac8ea
Show file tree
Hide file tree
Showing 34 changed files with 515 additions and 248 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Expand Up @@ -173,7 +173,7 @@
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
<version>72.1</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
Expand Down
4 changes: 1 addition & 3 deletions src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java
Expand Up @@ -287,8 +287,6 @@ private boolean checkContainerStructure(OCFCheckerState state)
List<String> directories = new LinkedList<>();

// Loop through the entries
OCFFilenameChecker filenameChecker = new OCFFilenameChecker(state.context().build());
// FIXME catch IAE MALFORMED entries
for (OCFResource resource : resourcesProvider)
{
Preconditions.checkNotNull(resource.getPath());
Expand Down Expand Up @@ -318,7 +316,7 @@ else if (resources.containsKey(
else
{
// Check file name requirements
filenameChecker.checkCompatiblyEscaped(resource.getPath());
new OCFFilenameChecker(resource.getPath(), state.context().build()).check();;

// report entry metadata
reportFeatures(resource.getProperties());
Expand Down
211 changes: 123 additions & 88 deletions src/main/java/com/adobe/epubcheck/ocf/OCFFilenameChecker.java
@@ -1,127 +1,162 @@
package com.adobe.epubcheck.ocf;

import java.util.LinkedHashSet;
import java.util.Set;
import java.util.stream.Collectors;

import org.w3c.epubcheck.core.Checker;

import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.api.Report;
import com.adobe.epubcheck.messages.MessageId;
import com.adobe.epubcheck.opf.ValidationContext;
import com.adobe.epubcheck.util.EPUBVersion;
import com.google.common.collect.ImmutableSet;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UCharacterIterator;
import com.ibm.icu.text.UForwardCharacterIterator;
import com.ibm.icu.text.UnicodeSet;

//FIXME 2022 update related PKG-* messages to contain the file name string
public final class OCFFilenameChecker
public final class OCFFilenameChecker implements Checker
{
private static final Set<String> RESTRICTED_30_CHARACTER_SET = ImmutableSet.of("PRIVATE_USE_AREA",
"ARABIC_PRESENTATION_FORMS_A", "SPECIALS", "SUPPLEMENTARY_PRIVATE_USE_AREA_A",
"SUPPLEMENTARY_PRIVATE_USE_AREA_B", "VARIATION_SELECTORS_SUPPLEMENT", "TAGS");

private static final UnicodeSet ASCII = new UnicodeSet("[:ascii:]").freeze();

private static final UnicodeSet DISALLOWED_EPUB2 = new UnicodeSet()
// .add(0x002F)// SOLIDUS '/' -- allowed as path separator
.add(0x0022)// QUOTATION MARK '"'
.add(0x002A)// ASTERISK '*'
// .add(0x002E)// FULL STOP '.' -- only disallowed as the last character
.add(0x003A)// COLON ':'
.add(0x003C)// LESS-THAN SIGN '<'
.add(0x003E)// GREATER-THAN SIGN '>'
.add(0x003F)// QUESTION MARK '?'
.add(0x005C)// REVERSE SOLIDUS '\'
.freeze();

private static final ImmutableMap<String, UnicodeSet> DISALLOWED_EPUB3 = new ImmutableMap.Builder<String, UnicodeSet>()
.put("ASCII", new UnicodeSet() //
.addAll(DISALLOWED_EPUB2)// all disallowed in EPUB 2.0.1
.add(0x007C) // VERTICAL LINE '|'
.freeze())
.put("NON CHARACTER", new UnicodeSet("[:Noncharacter_Code_Point=Yes:]")//
.freeze())
.put("CONTROL", new UnicodeSet().add(0x007F) // DEL
.addAll(0x0000, 0x001F) // C0 range
.addAll(0x0080, 0x009F) // C1 range
.freeze())
.put("PRIVATE USE", new UnicodeSet() //
.addAll(0xE000, 0xF8FF) // Private Use Area
.addAll(0xF0000, 0xFFFFF) // Supplementary Private Use Area-A
.addAll(0x100000, 0x10FFFF) // Supplementary Private Use Area-B
.freeze())
.put("SPECIALS", new UnicodeSet() //
.addAll(0xFFF0, 0xFFFF) // Specials Blocks
.freeze())
.put("DEPRECATED", new UnicodeSet() //
.add(0xE0001)// LANGUAGE TAG
// .add(0xE007F)// CANCEL TAG -- reinstated in Emoji tag sequences
.freeze())
.build();

private static String toString(int codepoint, String setName)
{
assert setName != null;
StringBuilder result = new StringBuilder().append(String.format("U+%04X ", codepoint));
if ("ASCII".equals(setName))
{
result.append('(').append(UCharacter.toString(codepoint)).append(')');
}
else
{
String characterName = UCharacter.getName(codepoint);
if (characterName != null)
{
result.append(characterName).append(' ');
}
result.append('(').append(setName).append(')');
}
return result.toString();
}

private final Report report;
private final EPUBVersion version;
private final EPUBLocation location;
private final String filename;

public OCFFilenameChecker(String filename, ValidationContext context)
{
this(filename, context, null);
}

public OCFFilenameChecker(ValidationContext context)
public OCFFilenameChecker(String filename, ValidationContext context, EPUBLocation location)
{
Preconditions.checkArgument(filename != null);
Preconditions.checkArgument(context != null);
this.filename = filename;
this.report = context.report;
this.version = context.version;
this.location = EPUBLocation.of(context);
this.location = (location != null) ? location : EPUBLocation.of(context);
}

public String checkCompatiblyEscaped(final String str)
@Override
public void check()
{
// don't check remote resources
if (str.matches("^[^:/?#]+://.*"))
{
return "";
}

// the test string will be used to compare test result
String test = checkNonAsciiFilename(str);

if (str.endsWith("."))
{
report.message(MessageId.PKG_011, location, str);
test += ".";
}

boolean spaces = false;
final char[] ascciGraphic = new char[] { '<', '>', '"', '{', '}', '|', '^', '`', '*',
'?' /* , ':','/', '\\' */ };
String result = "";
char[] chars = str.toCharArray();
for (char c : chars)
// Iterate through the code points to search disallowed characters
UCharacterIterator chars = UCharacterIterator.getInstance(filename);
final Set<String> disallowed = new LinkedHashSet<>();
boolean hasSpaces = false;
boolean isASCIIOnly = true;
int codepoint;
while ((codepoint = chars.nextCodePoint()) != UForwardCharacterIterator.DONE)
{
for (char a : ascciGraphic)
// Check if the string has non-ASCII characters
isASCIIOnly = isASCIIOnly && ASCII.contains(codepoint);
// Check if the string has space characters
hasSpaces = hasSpaces || UCharacter.isUWhiteSpace(codepoint);
// Check for disallowed characters
switch (version)
{
if (c == a)
case VERSION_2:
if (DISALLOWED_EPUB2.contains(codepoint))
{
result += "\"" + Character.toString(c) + "\",";
test += Character.toString(c);
disallowed.add(toString(codepoint, "ASCII"));
}
}
if (Character.isSpaceChar(c))
{
spaces = true;
test += Character.toString(c);
break;
default:
for (String name : DISALLOWED_EPUB3.keySet())
{
if (DISALLOWED_EPUB3.get(name).contains(codepoint))
{
disallowed.add(toString(codepoint, name));
break;
}
}
break;
}
}
if (result.length() > 1)
// Check that FULL STOP is not used as the last character
if (chars.previousCodePoint() == 0x002E)
{
result = result.substring(0, result.length() - 1);
report.message(MessageId.PKG_009, location, str, result);
report.message(MessageId.PKG_011, location, filename);
}
if (spaces)
// Report if disallowed characters were found
if (!disallowed.isEmpty())
{
report.message(MessageId.PKG_010, location, str);
report.message(MessageId.PKG_009, location, filename,
disallowed.stream().collect(Collectors.joining(", ")));
}

if (version == EPUBVersion.VERSION_3)
// Report whitespace characters
if (hasSpaces)
{
checkCompatiblyEscaped30(str, test);
report.message(MessageId.PKG_010, location, filename);
}
return test;
}

private String checkNonAsciiFilename(final String str)
{
String nonAscii = str.replaceAll("[\\p{ASCII}]", "");
if (nonAscii.length() > 0)
// Report non-ASCII characters as usage
if (!isASCIIOnly)
{
report.message(MessageId.PKG_012, location, str, nonAscii);
report.message(MessageId.PKG_012, location, filename);
}
return nonAscii;
}

private String checkCompatiblyEscaped30(String str, String test)
{
String result = "";

char[] chars = str.toCharArray();
for (char c : chars)
{
if (Character.isISOControl(c))
{
result += "\"" + Character.toString(c) + "\",";
test += Character.toString(c);
}

// DEL (U+007F)
if (c == '\u007F')
{
result += "\"" + Character.toString(c) + "\",";
test += Character.toString(c);
}
String unicodeType = Character.UnicodeBlock.of(c).toString();
if (RESTRICTED_30_CHARACTER_SET.contains(unicodeType))
{
result += "\"" + Character.toString(c) + "\",";
}
}
if (result.length() > 1)
{
result = result.substring(0, result.length() - 1);
report.message(MessageId.PKG_009, location, str, result);
}
return test;
}
}
14 changes: 5 additions & 9 deletions src/main/java/com/adobe/epubcheck/opf/OPFChecker.java
Expand Up @@ -207,16 +207,12 @@ protected boolean checkContent()

for (OPFItem item : opfHandler.getItems())
{
// only check Filename CompatiblyEscaped when in "-mode opf"
// this is when 'xrefChecker' Object is null which is an indicator for
// single file validation
// (Had no better possibility in mind since "mode" isn't available in
// OPFChecker.java)
//
// bugfix for issue 239
if (!context.xrefChecker.isPresent())
// only check the filename in single-file mode
// (it is checked by the container checker in full-publication mode)
// and for local resources (i.e. computed to a file URL)
if (!context.container.isPresent() && !item.isRemote())
{
new OCFFilenameChecker(context).checkCompatiblyEscaped(item.getPath());
new OCFFilenameChecker(item.getPath(), context, item.getLocation()).check();
}
if (!item.equals(opfHandler.getItemByURL(item.getURL()).orNull()))
{
Expand Down
Expand Up @@ -297,10 +297,10 @@ PKG_005=The mimetype file has an extra field of length %1$s. The use of the extr
PKG_006=Mimetype file entry is missing or is not the first file in the archive.
PKG_007=Mimetype file should only contain the string "application/epub+zip" and should not be compressed.
PKG_008=Unable to read file "%1$s".
PKG_009=File name contains characters that are not allowed in OCF file names: "%1$s".
PKG_010=Filename contains spaces, therefore URI escaping is necessary. Consider removing spaces from filename.
PKG_011=Filename is not allowed to end with ".".
PKG_012=File name contains the following non-ascii characters: %1$s. Consider changing the filename.
PKG_009=The file name "%1$s" contains characters that are not allowed in OCF file names: %2$s.
PKG_010=The file name "%1$s" contains spaces, which may create interoperability issues with older reading systems.
PKG_011=The file name "%1$s" is not allowed to end with ".".
PKG_012=The file name "%1$s" contains non-ASCII characters, which might create interoperability issues with older reading systems.
PKG_013=The EPUB file includes multiple OPS renditions.
PKG_014=The EPUB contains empty directory "%1$s".
PKG_015=Unable to read EPUB contents: %1$s
Expand Down

0 comments on commit b6ac8ea

Please sign in to comment.