From 111e7725e3b21668b5454224d3c0bd75f70f7ef9 Mon Sep 17 00:00:00 2001 From: Romain Deltour Date: Sat, 29 Oct 2022 00:34:22 +0200 Subject: [PATCH] feat: check file name uniqueness with Unicode canonical case fold normalization This commit changes the OCF container file name uniqueness check to perform the Unicode canonical case fold normalization step defined in https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep That is, we normalize the file name to NFD then apply full case folding before checking for uniqueness. Previously the behavior was: - we checked for uniqueness of the lower case form (String.toLowerCase) - we checked for uniqueness of the NFC-normalized lower case This was flawed, since String.toLowerCase is not equivalent to Unicode full case folding. Also, previously, only a warning (OPF-061) was reported when names were not unique after NFC normalization. This is now an error, using the same code as the other uniqueness failures (OPF-060). This commit removes OPF-061, which is no longer used. 
Fixes #1246 --- .../epubcheck/messages/DefaultSeverities.java | 1 - .../adobe/epubcheck/messages/MessageId.java | 1 - .../com/adobe/epubcheck/ocf/OCFChecker.java | 29 ++++++------- .../w3c/epubcheck/util/text/UnicodeUtils.java | 39 ++++++++++++++++++ .../messages/MessageBundle.properties | 3 +- ...-after-canonical-normalization-error.epub} | Bin ...cate-after-common-case-folding-error.epub} | Bin ...ter-compatibility-normalization-valid.epub | Bin 0 -> 1827 bytes ...plicate-after-full-case-folding-error.epub | Bin 0 -> 1850 bytes src/test/resources/epub3/04-ocf/ocf.feature | 21 +++++++--- 10 files changed, 69 insertions(+), 25 deletions(-) create mode 100644 src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java rename src/test/resources/epub3/04-ocf/files/{ocf-filename-duplicate-after-unicode-normalization-warning.epub => ocf-filename-duplicate-after-canonical-normalization-error.epub} (100%) rename src/test/resources/epub3/04-ocf/files/{ocf-filename-duplicate-after-case-normalization-error.epub => ocf-filename-duplicate-after-common-case-folding-error.epub} (100%) create mode 100644 src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-compatibility-normalization-valid.epub create mode 100644 src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-full-case-folding-error.epub diff --git a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java index cd25d4799..b6ad8ffd1 100644 --- a/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java +++ b/src/main/java/com/adobe/epubcheck/messages/DefaultSeverities.java @@ -248,7 +248,6 @@ private void initialize() severities.put(MessageId.OPF_058, Severity.SUPPRESSED); severities.put(MessageId.OPF_059, Severity.SUPPRESSED); severities.put(MessageId.OPF_060, Severity.ERROR); - severities.put(MessageId.OPF_061, Severity.WARNING); severities.put(MessageId.OPF_062, Severity.USAGE); 
severities.put(MessageId.OPF_063, Severity.WARNING); severities.put(MessageId.OPF_064, Severity.INFO); diff --git a/src/main/java/com/adobe/epubcheck/messages/MessageId.java b/src/main/java/com/adobe/epubcheck/messages/MessageId.java index dd8486823..7724bb5d4 100644 --- a/src/main/java/com/adobe/epubcheck/messages/MessageId.java +++ b/src/main/java/com/adobe/epubcheck/messages/MessageId.java @@ -242,7 +242,6 @@ public enum MessageId implements Comparable OPF_058("OPF-058"), OPF_059("OPF-059"), OPF_060("OPF-060"), - OPF_061("OPF-061"), OPF_062("OPF-062"), OPF_063("OPF-063"), OPF_064("OPF-064"), diff --git a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java index 181a9a949..5839c3fdc 100755 --- a/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java +++ b/src/main/java/com/adobe/epubcheck/ocf/OCFChecker.java @@ -25,11 +25,9 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.text.Normalizer; import java.util.HashSet; import java.util.LinkedList; import java.util.List; -import java.util.Locale; import java.util.Map; import java.util.Set; @@ -37,6 +35,7 @@ import org.w3c.epubcheck.core.AbstractChecker; import org.w3c.epubcheck.core.Checker; import org.w3c.epubcheck.core.CheckerFactory; +import org.w3c.epubcheck.util.text.UnicodeUtils; import com.adobe.epubcheck.api.EPUBLocation; import com.adobe.epubcheck.api.EPUBProfile; @@ -296,18 +295,6 @@ private boolean checkContainerStructure(OCFCheckerState state) // FIXME 2022 report symbolic links and continue - // Check duplicate entries - if (normalizedPaths.contains(resource.getPath().toLowerCase(Locale.ROOT))) - { - context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath()); - } - // Check duplicate entries after NFC normalization - else if (normalizedPaths.contains( - Normalizer.normalize(resource.getPath().toLowerCase(Locale.ROOT), Normalizer.Form.NFC))) - { - 
context.report.message(MessageId.OPF_061, EPUBLocation.of(context), resource.getPath()); - } - // Store the resource in the data structure if (resource.isDirectory()) { @@ -318,9 +305,19 @@ else if (normalizedPaths.contains( else { // The container resource is a file, - // sStore its path for later checking of empty directories + // store its path for later checking of empty directories filePaths.add(resource.getPath()); - normalizedPaths.add(resource.getPath().toLowerCase(Locale.ROOT)); + + // Check duplicate entries + String normalizedPath = UnicodeUtils.canonicalCaseFold(resource.getPath()); + if (normalizedPaths.contains(normalizedPath)) + { + context.report.message(MessageId.OPF_060, EPUBLocation.of(context), resource.getPath()); + } + else + { + normalizedPaths.add(normalizedPath); + } // Check file name requirements new OCFFilenameChecker(resource.getPath(), state.context().build()).check(); diff --git a/src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java b/src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java new file mode 100644 index 000000000..4f280b33d --- /dev/null +++ b/src/main/java/org/w3c/epubcheck/util/text/UnicodeUtils.java @@ -0,0 +1,39 @@ +package org.w3c.epubcheck.util.text; + +import com.google.common.base.Preconditions; +import com.ibm.icu.text.CaseMap; +import com.ibm.icu.text.Normalizer2; + +public final class UnicodeUtils +{ + + private static final Normalizer2 NFD_NORMALIZER = Normalizer2.getNFDInstance(); + private static final CaseMap.Fold CASE_FOLDER = CaseMap.fold(); + + private UnicodeUtils() + { + // static utility class + } + + /** + * Applies Unicode Canonical Case Fold Normalization as defined in + * https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep + * + * This applies, in sequence: - canonical decomposition (NFD) - case folding + * + * Note that the result is **not** recomposed (NFC), i.e. the optional + * post-folding NFC normalization is not applied. 
+ * + * In other words, the result is suitable for case-insensitive string + * comparison, but not for display. + * + * @param string + * the string to normalize + * @return the string normalized by applying NFD then case folding + */ + public static String canonicalCaseFold(String string) + { + Preconditions.checkArgument(string != null); + return CASE_FOLDER.apply(NFD_NORMALIZER.normalize(string)); + } +} diff --git a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties index bed87d83a..0a78f94c6 100644 --- a/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties +++ b/src/main/resources/com/adobe/epubcheck/messages/MessageBundle.properties @@ -238,8 +238,7 @@ OPF_058=Spine item "%1$s" is not referenced from the TOC in the Nav Doc. OPF_058_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the Nav Doc. OPF_059=Spine item "%1$s" is not referenced from the TOC in the NCX. OPF_059_SUG=Every spine item in the manifest should be referenced by at least one TOC entry in the NCX file. -OPF_060=Duplicate entry in the ZIP file: "%1$s". -OPF_061=Duplicate entry in the ZIP file (after Unicode NFC normalization) "%1$s". +OPF_060=Duplicate entry in the ZIP file: "%1$s" (file names must be unique after Unicode canonical normalization and full case folding). OPF_062=Found Adobe page-map attribute on spine element in opf file. OPF_063=Referenced Adobe page-map item "%1$s" was not found in the manifest. OPF_064=OPF declares type "%1$s", validating using profile "%2$s". 
diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-unicode-normalization-warning.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-canonical-normalization-error.epub similarity index 100% rename from src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-unicode-normalization-warning.epub rename to src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-canonical-normalization-error.epub diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-case-normalization-error.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-common-case-folding-error.epub similarity index 100% rename from src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-case-normalization-error.epub rename to src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-common-case-folding-error.epub diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-compatibility-normalization-valid.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-compatibility-normalization-valid.epub new file mode 100644 index 0000000000000000000000000000000000000000..22a38b2dc281d96d657247e96ea165d6a6c903a1 GIT binary patch literal 1827 zcmWIWW@h1HVBlb2(5|iZ%THXbBf`MI0Kz~ahTP2D)RM}A)Wm{uLo`b>i7V}VnFN~5bC6#oS#>cnpYD4sHIo0BBLZX$F|>) z|B!-6%kx=36ScUOmq_7Lq zEbF#+O}x`~Guhuw-g)eau?_ z9n{eIA#k?bdcKNWUdX|)h^ry5Se)z`+N`uyUq&)rKDpSuA@Npc?}>d!cxF1kwrbqD zRqavD-nic?zVD`A?fKsHVBU1K_Q`j0m7i+_DoR;iIWTROnD-ZtMcbwbhjHpCyfxUK z)EefIc=Vj=q?;E{=&W93I#KdC(^Tc_Hea0gE#Fo$({)--V^0dxTfR%;mzKC3Jof%F z`?E(!{xs}TF=L#bcR-|++j3w00>72f{NjJUzOB3>O?oIIFjJwJej#u1<3j3zb&a-dd_?H@{(H zW$VwI;`PScS1~LLnDpVzwFIU^Gn$MN@6Lbp_G9mA*&p*y?O(e4aa{R@_k3>uw_Hdn zy;Ao3^;N#VLf5QC>(;zmw0eixl8^0MHy6K~9?SSRe1YGF8Skei@7u5X(i{;}DZrSj zWISGa4;Vupz}Vu0#Z+Em86*{kPS~4w*np=c{kdkyThqB^8My&bvlwE38JD~a(by)# zIYISs+M_3LwY?o1{oE(qf9Ub(?z_iNU;i#X&%^7NS0Gv@IcGkrxl1zh1|9F6Hq#QV 
z4sBbPy<-i}`GCd8FF%ye+`ZBNjdQ52qr`I;GrymUAKWVb7JkyV(QUO}R%5f;(Wu4C zA}2pkfAxO8U#RQflHsn%)Hbhy^7qN*uH~YhYSQ< z&VSQB6t~nZUuYIr(>&W3>}y_zHNVWu2)ej`d)JBznF{0lRae(?zw)a6bVy8Wv!zCo zdhheet%vKP^&Ho}T2ypM(l+=%dw;2)Slhhbgij|8%BJ{~xhWMY=e_9IT$%XHn5ooo zW!1zxzZ^1o?k~Be*kY7q7k%RmU$*SU*bgqoow_Ufr213$xo>JPWP5MWpT6aS;j&M? ze(usQmOK5o(E5Av`cdWpZ$>722Hb@*(2qy}RzxFoVK0yoTC9Lflw<%eo{`PKR^TDb zkOO8cuo+0jAF^(2nGm7-1QS#@JWC?$#g=RldM^PBUli9v6EU(0*pdapge@$%5=nqJ SD;r218xU3jeK?sF!~+2J*vkw6 literal 0 HcmV?d00001 diff --git a/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-full-case-folding-error.epub b/src/test/resources/epub3/04-ocf/files/ocf-filename-duplicate-after-full-case-folding-error.epub new file mode 100644 index 0000000000000000000000000000000000000000..eb531cc02cd50fa69f134b75025421372c134c25 GIT binary patch literal 1850 zcmWIWW@h1HVBlb2(5|iZ%THXbBf`MI0Kz~ahTP2D)RM}A)Wm{XD6no>g5-tEe-bcJ8U3wx0vg7 ze&C}e3J!sjynKYXxwjpnFY5U>-=X;S#jg+ky}Q>5_P`oBmvqM( ztG-1`Y9BqHvVDuMYR&DF+msfrbb0;F=fuKSk3_bX*&XTLdf>*EgGK*?cg(R0WenEv zWXVf4U{3ULZ7uj0|MA_A8{+(b)ZeadzxX4~q^>f){&HRX|KRBEO8r;g>%(t{d1u&% zR@}ek=)2N%$;a-k+dsdHk77Dj-~P7zf_3ha;C(w4f3gR7vvbT>F<8~Y$iNW9%)kJO zK~UUtFw6kPV6}Yw)4RZ!^90KB!D28ku}rTbqa-&cbb?|2VFiKK^ylFM-+s+0$;h1> zd8^OYy5wbupftM*&xx2vPwtC9UJ!j1B zff)?XD)pWpzvQoe<7z4I_11^lS5_3*{ZsapZ#(mD^()Srj+r@YN|sO4K7KP^)r>Vu z_4g9izAly5TjnfUG;MxnMa8oT>M19*KIYAlT$;*W=@=1xlCRueuD1U4mdWDV?p(Q` zyJFhp_tWN>d@##yyv8xb$3H8h^0}<0yS?GG*Q<~0J-jNbU4hl<&}R8-tQ?w(?&~MM zZ242T9}y6RQ7;1iXI>B701SW+Kr9Lih~)gdlGMDC_~K$nc-ZzE@*h$VX?Z^Dhx~-Q zla3vo70#B;F5D|yQsC0v%90UmaHT$9u(WFW?&|oW4L9bV`n+}Bgo3DxKY0$hE)F{; z7GZHcpX1}b<#kiboV_>MoIfdHWiiF9Yv#5CcM`sY3#%|5j1+cZnq}Sgu8DWrZYKM? 
z$vcld5mtS$BzTV8(uuP)L#|HOo$!K{?Wf~|Yo_Zxge7fD=1yC-HF%!u=H;rvT$z2} z!ZMD9axLlac>nxL@8^36FP0GU;^Fy2-HTyBFX~@j$Fdq2dq03!42XSQLmYKI{oM3H zF`1Z|ms+G(k((3ScaZClfq={TZ`z0Amb&E&&EjgBXZwPE&C9Unmw6dM7x!=PT2UcW zVVu9}>RRqsUbUYNiHU8t)JRhAeLlJMa9y;X7+r~ z6rVCTr9$Pr7af}`6Q3C~l^U+Bnt11zLnhDtCASn?jFRl4Z@l5lmc1DJ!Ns^!cSWC6 zf9gK>O$~-@?+yCXw_Gq>_Nmv;UHZjxr~ejOe=lA?${gU$$YjreyQ~KK5edLbZip^K zIgOnELFF_8SOJ;nr8dG&>}3Q(_a9)X0oDyKF_86Q%T@@ztiZAetQS3#Asc}$*&>X1 lfo=pW5hELcEm