Skip to content

Commit 111e772

Browse files
committed
feat: check file name uniqueness with Unicode canonical case fold normalization
This commit changes the OCF container file name uniqueness check to perform the Unicode canonical case fold normalization step defined in https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep . That is, we normalize the file name to NFD then apply full case folding before checking for uniqueness. Previously the behavior was: - we checked for uniqueness of the lower case form (String.toLowerCase) - we checked for uniqueness of the NFC-normalized lower case This was flawed, since String.toLowerCase is not equivalent to Unicode full case folding. Also, previously, only a warning (OPF-061) was reported when names were not unique after NFC normalization. This is now an error, using the same code as the other uniqueness failures (OPF-060). This commit removes OPF-061, which is no longer used. Fixes #1246
1 parent bfd30b5 commit 111e772

10 files changed

+69
-25
lines changed

src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,6 @@ private void initialize()
248248
severities.put(MessageId.OPF_058, Severity.SUPPRESSED);
249249
severities.put(MessageId.OPF_059, Severity.SUPPRESSED);
250250
severities.put(MessageId.OPF_060, Severity.ERROR);
251-
severities.put(MessageId.OPF_061, Severity.WARNING);
252251
severities.put(MessageId.OPF_062, Severity.USAGE);
253252
severities.put(MessageId.OPF_063, Severity.WARNING);
254253
severities.put(MessageId.OPF_064, Severity.INFO);

src/main/java/com/adobe/epubcheck/messages/MessageId.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ public enum MessageId implements Comparable<MessageId>
242242
OPF_058("OPF-058"),
243243
OPF_059("OPF-059"),
244244
OPF_060("OPF-060"),
245-
OPF_061("OPF-061"),
246245
OPF_062("OPF-062"),
247246
OPF_063("OPF-063"),
248247
OPF_064("OPF-064"),

src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,18 +25,17 @@
2525
import java.io.File;
2626
import java.io.IOException;
2727
import java.io.InputStream;
28-
import java.text.Normalizer;
2928
import java.util.HashSet;
3029
import java.util.LinkedList;
3130
import java.util.List;
32-
import java.util.Locale;
3331
import java.util.Map;
3432
import java.util.Set;
3533

3634
import org.w3c.epubcheck.constants.MIMEType;
3735
import org.w3c.epubcheck.core.AbstractChecker;
3836
import org.w3c.epubcheck.core.Checker;
3937
import org.w3c.epubcheck.core.CheckerFactory;
38+
import org.w3c.epubcheck.util.text.UnicodeUtils;
4039

4140
import com.adobe.epubcheck.api.EPUBLocation;
4241
import com.adobe.epubcheck.api.EPUBProfile;
@@ -296,18 +295,6 @@ private boolean checkContainerStructure(OCFCheckerState state)
296295

297296
// FIXME 2022 report symbolic links and continue
298297

299-
// Check duplicate entries
300-
if (normalizedPaths.contains(resource.getPath().toLowerCase(Locale.ROOT)))
301-
{
302-
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
303-
}
304-
// Check duplicate entries after NFC normalization
305-
else if (normalizedPaths.contains(
306-
Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC)))
307-
{
308-
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath());
309-
}
310-
311298
// Store the resource in the data structure
312299
if (resource.isDirectory())
313300
{
@@ -318,9 +305,19 @@ else if (normalizedPaths.contains(
318305
else
319306
{
320307
// The container resource is a file,
321-
// sStore its path for later checking of empty directories
308+
// store its path for later checking of empty directories
322309
filePaths.add(resource.getPath());
323-
normalizedPaths.add(resource.getPath().toLowerCase(Locale.ROOT));
310+
311+
// Check duplicate entries
312+
String normalizedPath = UnicodeUtils.canonicalCaseFold(resource.getPath());
313+
if (normalizedPaths.contains(normalizedPath))
314+
{
315+
context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath());
316+
}
317+
else
318+
{
319+
normalizedPaths.add(normalizedPath);
320+
}
324321

325322
// Check file name requirements
326323
new OCFFilenameChecker(resource.getPath(), state.context().build()).check();
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
package org.w3c.epubcheck.util.text;
2+
3+
import com.google.common.base.Preconditions;
4+
import com.ibm.icu.text.CaseMap;
5+
import com.ibm.icu.text.Normalizer2;
6+
7+
public final class UnicodeUtils
8+
{
9+
10+
private static final Normalizer2 NFD_NORMALIZER = Normalizer2.getNFCInstance();
11+
private static final CaseMap.Fold CASE_FOLDER = CaseMap.fold();
12+
13+
private UnicodeUtils()
14+
{
15+
// static utility class
16+
}
17+
18+
/**
19+
* Applies Unicode Canonical Case Fold Normalization as defined in
20+
* https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep
21+
*
22+
* This applies, in sequence: - canonical decomposition (NFD) - case folding
23+
*
24+
* Note that the result is **not** recomposed (NFC), i.e. the optional
25+
* post-folding NFC normalization is not applied.
26+
*
27+
* In other words, the result is suitable for string comparison for
28+
* case-insensitive string comparison, but not for display.
29+
*
30+
* @param string
31+
* the string to normalize
32+
* @return the string normalized by applying NFD then case folding
33+
*/
34+
public static String canonicalCaseFold(String string)
35+
{
36+
Preconditions.checkArgument(string != null);
37+
return CASE_FOLDER.apply(NFD_NORMALIZER.normalize(string));
38+
}
39+
}

src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,8 +238,7 @@ OPF_058=Spine item "%1$s" is not referenced from the TOC in the Nav Doc.
238238
OPF_058_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the Nav Doc.
239239
OPF_059=Spine item "%1$s" is not referenced from the TOC in the NCX.
240240
OPF_059_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the NCX file.
241-
OPF_060=Duplicate entry in the ZIP file: "%1$s".
242-
OPF_061=Duplicate entry in the ZIP file (after Unicode NFC normalization) "%1$s".
241+
OPF_060=Duplicate entry in the ZIP file: "%1$s" (file names must be unique after Unicode canonical normalization and full case folding).
243242
OPF_062=Found Adobe page-map attribute on spine element in opf file.
244243
OPF_063=Referenced Adobe page-map item "%1$s" was not found in the manifest.
245244
OPF_064=OPF declares type "%1$s", validating using profile "%2$s".

src/test/resources/epub3/04-ocf/ocf.feature

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,28 @@ Feature: EPUB 3 — Open Container Format
2929
Then no errors or warnings are reported
3030

3131
@spec @xref:sec-container-filenames
32-
Scenario: Report a duplicate filename if two files only differ by case
33-
When checking EPUB 'ocf-filename-duplicate-after-case-normalization-error.epub'
32+
Scenario: Report a duplicate filename after common case folding
33+
When checking EPUB 'ocf-filename-duplicate-after-common-case-folding-error.epub'
3434
Then error OPF-060 is reported
3535
And no other errors or warnings are reported
3636

3737
@spec @xref:sec-container-filenames
38-
Scenario: Report a duplicate filename if two files have the same name after Unicode normalization
39-
When checking EPUB 'ocf-filename-duplicate-after-unicode-normalization-warning.epub'
40-
Then warning OPF-061 is reported
38+
Scenario: Report a duplicate filename after full case folding
39+
When checking EPUB 'ocf-filename-duplicate-after-full-case-folding-error.epub'
40+
Then error OPF-060 is reported
41+
And no other errors or warnings are reported
42+
43+
@spec @xref:sec-container-filenames
44+
Scenario: Report a duplicate filename after Unicode canonical normalization (NFC)
45+
When checking EPUB 'ocf-filename-duplicate-after-canonical-normalization-error.epub'
46+
Then error OPF-060 is reported
4147
And no other errors or warnings are reported
4248

49+
@spec @xref:sec-container-filenames
50+
Scenario: Allow a duplicate filename after Unicode compatibility normalization (NFKC)
51+
When checking EPUB 'ocf-filename-duplicate-after-compatibility-normalization-valid.epub'
52+
Then no other errors or warnings are reported
53+
4354
@spec @xref:sec-container-filenames
4455
Scenario: Allow Unicode emoji tag set in file name
4556
When checking EPUB 'ocf-filename-character-emoji-tag-sequence-valid'

0 commit comments

Comments
 (0)