Skip to content

Commit

Permalink
feat: check file name uniqueness with Unicode canonical case fold nor…
Browse files Browse the repository at this point in the history
…malization

This commit changes the OCF container file name uniqueness check to
perform the Unicode canonical case fold normalization step defined in
https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep

That is, we normalize the file name to NFD then apply full case folding
before checking for uniqueness.

Previously the behavior was:
- we checked for uniqueness of the lower case form (String.toLowerCase)
- we checked for uniqueness of the NFC-normalized lower case

This was flawed, since String.toLowerCase is not equivalent to Unicode
full case folding.

Also, previously, only a warning (OPF-061) was reported when names
were not unique after NFC normalization. This is now an error, using
the same code as the other uniqueness failures (OPF-060).

This commit removes OPF-061, which is no longer used.

Fixes #1246
  • Loading branch information
rdeltour committed Nov 17, 2022
1 parent bfd30b5 commit 111e772
Show file tree
Hide file tree
Showing 10 changed files with 69 additions and 25 deletions.
Expand Up @@ -248,7 +248,6 @@ private void initialize()
severities.put(MessageId.OPF_058, Severity.SUPPRESSED);
severities.put(MessageId.OPF_059, Severity.SUPPRESSED);
severities.put(MessageId.OPF_060, Severity.ERROR);
severities.put(MessageId.OPF_061, Severity.WARNING);
severities.put(MessageId.OPF_062, Severity.USAGE);
severities.put(MessageId.OPF_063, Severity.WARNING);
severities.put(MessageId.OPF_064, Severity.INFO);
Expand Down
1 change: 0 additions & 1 deletion src/main/java/com/adobe/epubcheck/messages/MessageId.java
Expand Up @@ -242,7 +242,6 @@ public enum MessageId implements Comparable<MessageId>
OPF_058("OPF-058"),
OPF_059("OPF-059"),
OPF_060("OPF-060"),
OPF_061("OPF-061"),
OPF_062("OPF-062"),
OPF_063("OPF-063"),
OPF_064("OPF-064"),
Expand Down
29 changes: 13 additions & 16 deletions src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java
Expand Up @@ -25,18 +25,17 @@
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.w3c.epubcheck.constants.MIMEType;
import org.w3c.epubcheck.core.AbstractChecker;
import org.w3c.epubcheck.core.Checker;
import org.w3c.epubcheck.core.CheckerFactory;
import org.w3c.epubcheck.util.text.UnicodeUtils;

import com.adobe.epubcheck.api.EPUBLocation;
import com.adobe.epubcheck.api.EPUBProfile;
Expand Down Expand Up @@ -296,18 +295,6 @@ private boolean checkContainerStructure(OCFCheckerState state)

// FIXME 2022 report symbolic links and continue

// Check duplicate entries
if (normalizedPaths.contains(resource.getPath().toLowerCase(Locale.ROOT)))
{
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
}
// Check duplicate entries after NFC normalization
else if (normalizedPaths.contains(
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
{
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
}

// Store the resource in the data structure
if (resource.isDirectory())
{
Expand All @@ -318,9 +305,19 @@ else if (normalizedPaths.contains(
else
{
// The container resource is a file,
// sStore its path for later checking of empty directories
// store its path for later checking of empty directories
filePaths.add(resource.getPath());
normalizedPaths.add(resource.getPath().toLowerCase(Locale.ROOT));

// Check duplicate entries
String normalizedPath = UnicodeUtils.canonicalCaseFold(resource.getPath());
if (normalizedPaths.contains(normalizedPath))
{
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
}
else
{
normalizedPaths.add(normalizedPath);
}

// Check file name requirements
new OCFFilenameChecker(resource.getPath(), state.context().build()).check();
Expand Down
39 changes: 39 additions & 0 deletions src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java
@@ -0,0 +1,39 @@
package org.w3c.epubcheck.util.text;

import com.google.common.base.Preconditions;
import com.ibm.icu.text.CaseMap;
import com.ibm.icu.text.Normalizer2;

/**
 * Static helpers for Unicode-aware string normalization.
 */
public final class UnicodeUtils
{

  // Canonical *decomposition* (NFD) is required by the canonical case fold
  // normalization step; the NFC instance would recompose the string instead.
  private static final Normalizer2 NFD_NORMALIZER = Normalizer2.getNFDInstance();
  private static final CaseMap.Fold CASE_FOLDER = CaseMap.fold();

  private UnicodeUtils()
  {
    // static utility class
  }

  /**
   * Applies Unicode Canonical Case Fold Normalization as defined in
   * https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep
   *
   * <p>
   * This applies, in sequence:
   * </p>
   * <ul>
   * <li>canonical decomposition (NFD)</li>
   * <li>case folding</li>
   * </ul>
   *
   * <p>
   * Note that the result is <b>not</b> recomposed (NFC), i.e. the optional
   * post-folding NFC normalization is not applied.
   * </p>
   *
   * <p>
   * In other words, the result is suitable for case-insensitive string
   * comparison, but not for display.
   * </p>
   *
   * @param string
   *          the string to normalize, must not be <code>null</code>
   * @return the string normalized by applying NFD then case folding
   * @throws IllegalArgumentException
   *           if <code>string</code> is <code>null</code>
   */
  public static String canonicalCaseFold(String string)
  {
    Preconditions.checkArgument(string != null, "string must not be null");
    // Decompose first, then fold: the order is mandated by the specification.
    String decomposed = NFD_NORMALIZER.normalize(string);
    return CASE_FOLDER.apply(decomposed);
  }
}
Expand Up @@ -238,8 +238,7 @@ OPF_058=Spine item "%1$s" is not referenced from the TOC in the Nav Doc.
OPF_058_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the Nav Doc.
OPF_059=Spine item "%1$s" is not referenced from the TOC in the NCX.
OPF_059_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the NCX file.
OPF_060=Duplicate entry in the ZIP file: "%1$s".
OPF_061=Duplicate entry in the ZIP file (after Unicode NFC normalization) "%1$s".
OPF_060=Duplicate entry in the ZIP file: "%1$s" (file names must be unique after Unicode canonical normalization and full case folding).
OPF_062=Found Adobe page-map attribute on spine element in opf file.
OPF_063=Referenced Adobe page-map item "%1$s" was not found in the manifest.
OPF_064=OPF declares type "%1$s", validating using profile "%2$s".
Expand Down
Binary file not shown.
Binary file not shown.
21 changes: 16 additions & 5 deletions src/test/resources/epub3/04-ocf/ocf.feature
Expand Up @@ -29,17 +29,28 @@ Feature: EPUB 3 — Open Container Format
Then no errors or warnings are reported

@spec @xref:sec-container-filenames
Scenario: Report a duplicate filename if two files only differ by case
When checking EPUB 'ocf-filename-duplicate-after-case-normalization-error.epub'
Scenario: Report a duplicate filename after common case folding
When checking EPUB 'ocf-filename-duplicate-after-common-case-folding-error.epub'
Then error OPF-060 is reported
And no other errors or warnings are reported

@spec @xref:sec-container-filenames
Scenario: Report a duplicate filename if two files have the same name after Unicode normalization
When checking EPUB 'ocf-filename-duplicate-after-unicode-normalization-warning.epub'
Then warning OPF-061 is reported
Scenario: Report a duplicate filename after full case folding
When checking EPUB 'ocf-filename-duplicate-after-full-case-folding-error.epub'
Then error OPF-060 is reported
And no other errors or warnings are reported

@spec @xref:sec-container-filenames
Scenario: Report a duplicate filename after Unicode canonical normalization (NFC)
When checking EPUB 'ocf-filename-duplicate-after-canonical-normalization-error.epub'
Then error OPF-060 is reported
And no other errors or warnings are reported

@spec @xref:sec-container-filenames
Scenario: Allow a duplicate filename after Unicode compatibility normalization (NFKC)
When checking EPUB 'ocf-filename-duplicate-after-compatibility-normalization-valid.epub'
Then no other errors or warnings are reported

@spec @xref:sec-container-filenames
Scenario: Allow Unicode emoji tag set in file name
When checking EPUB 'ocf-filename-character-emoji-tag-sequence-valid'
Expand Down

0 comments on commit 111e772

Please sign in to comment.