unicode-org · pedberg-icu · May 12, 2022 · May 12, 2022 · btangmu · May 12, 2022
diff --git a/common/main/en.xml b/common/main/en.xml
diff --git a/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java b/tools/cldr-code/src/main/java/org/unicode/cldr/test/DisplayAndInputProcessor.java
@@ -15,6 +15,7 @@
 
 import org.unicode.cldr.test.CheckExemplars.ExemplarType;
 import org.unicode.cldr.util.Builder;
+import org.unicode.cldr.util.CLDRConfig;
 import org.unicode.cldr.util.CLDRFile;
 import org.unicode.cldr.util.CLDRLocale;
 import org.unicode.cldr.util.CldrUtility;
@@ -23,6 +24,7 @@
 import org.unicode.cldr.util.Emoji;
 import org.unicode.cldr.util.ICUServiceBuilder;
 import org.unicode.cldr.util.PatternCache;
+import org.unicode.cldr.util.SupplementalDataInfo;
 import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
 import org.unicode.cldr.util.With;
 import org.unicode.cldr.util.XPathParts;
@@ -82,6 +84,35 @@ public class DisplayAndInputProcessor {
     private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*");
     private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");
 
+    // Pattern to match against paths that might have time formats with h or K (12-hour cycles)
+    private static final Pattern HOUR_FORMAT_XPATHS = PatternCache
+        .get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
+            + "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
+            + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|"
+            + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)");
+
+    private static final Pattern AMPM_SPACE_BEFORE = PatternCache.get("([Khms])[ \\u00A0]+a"); // time, space, a
+    private static final Pattern AMPM_SPACE_AFTER = PatternCache.get("a[ \\u00A0]+([Kh])"); // a space, hour
+
+    // Pattern to match against paths that might have date formats with y
+    private static final Pattern YEAR_FORMAT_XPATHS = PatternCache
+        .get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
+            + "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
+            + "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|"
+            + "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)");
+
+    // Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436 \u043E \u0440 \u0441
+    private static final Pattern YEAR_SPACE_YEARMARKER = PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start
+
+    public static final Pattern UNIT_NARROW_XPATHS = PatternCache
+        .get("//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
+
+    public static final Pattern UNIT_SHORT_XPATHS = PatternCache
+        .get("//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");
+
+    private static final Pattern PLACEHOLDER_SPACE_AFTER = PatternCache.get("\\}[ \\u00A0\\u202F]+");
+    private static final Pattern PLACEHOLDER_SPACE_BEFORE = PatternCache.get("[ \\u00A0\\u202F]+\\{");
+
     /**
      * string of whitespace not including NBSP, i.e. [\t\n\r]+
      */
@@ -106,6 +137,7 @@ public class DisplayAndInputProcessor {
     private static final Pattern FINAL_NBSP = PatternCache.get("\\u00A0+$");
     private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+");
 
+    // The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000
     private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();
 
     private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
@@ -167,6 +199,7 @@ public class DisplayAndInputProcessor {
     private UnicodeSetPrettyPrinter pp = null;
 
     final private CLDRLocale locale;
+    private String scriptCode; // actual or default script code (not null after init)
     private boolean isPosix;
 
     /**
@@ -212,6 +245,18 @@ void init(CLDRLocale locale, boolean needsCollator) {
                 .setOrdering(col)
                 .setSpaceComparator(spaceCol);
         }
+        String script = locale.getScript();
+        if (script == null || script.length() < 4) {
+            SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo();
+            script = sdi.getDefaultScript(locale.getBaseName());
+            if (script == null || script.length() < 4 || script.equals("Zzzz")) {
+                script = sdi.getDefaultScript(locale.getLanguage());
+            }
+            if (script == null || script.length() < 4) {
+                script = "Zzzz";
+            }
+        }
+        scriptCode = script;
     }
 
     public UnicodeSetPrettyPrinter getPrettyPrinter() {
@@ -298,7 +343,7 @@ public synchronized String processForDisplay(String path, String value) {
         }
         // Fix up hyphens, replacing with N-dash as appropriate
         if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
-            value = normalizeIntervalHyphens(value);
+            value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
         } else {
             value = normalizeHyphens(value);
         }
@@ -470,7 +515,7 @@ public synchronized String processInput(String path, String value, Exception[] i
             }
             // Fix up hyphens, replacing with N-dash as appropriate
             if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
-                value = normalizeIntervalHyphens(value);
+                value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
             } else if (!isUnicodeSet) {
                 value = normalizeHyphens(value);
             }
@@ -646,20 +691,26 @@ private String normalizeApostrophes(String value) {
         }
     }
 
-    private String normalizeIntervalHyphens(String value) {
+    private String normalizeIntervalHyphensAndSpaces(String value) {
         DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
-        fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
+        fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); // first format & separator including spaces
         List<Object> items = fp.getItems();
         Object last = items.get(items.size() - 1);
         if (last instanceof String) {
-            String separator = last.toString();
-            if (separator.contains("-")) {
+            String separator = last.toString(); // separator including spaces
+            String replacement = separator;
+            if (scriptCode.equals("Latn") && (separator.equals(" - ") || separator.equals(" \u2013 "))) {
+                replacement = "\u2009\u2013\u2009"; // Per CLDR-14032
+            } else if (separator.contains("-")) {
+                replacement = separator.replace("-", "\u2013");
+            }
+            if (!replacement.equals(separator)) {
                 StringBuilder sb = new StringBuilder();
                 sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
                 if (sb.lastIndexOf(separator) >= 0) {
                     sb.delete(sb.lastIndexOf(separator), sb.length());
-                    sb.append(separator.replace("-", "\u2013"));
-                    sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart());
+                    sb.append(replacement);
+                    sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); // second format only
                     return sb.toString();
                 }
             }
@@ -1037,6 +1088,24 @@ private String normalizeWhitespace(String path, String value) {
         } else {
             throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
         }
+
+        // Further whitespace adjustments per CLDR-14032
+        if (HOUR_FORMAT_XPATHS.matcher(path).matches()) {
+            value = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1\u202Fa");
+            value = AMPM_SPACE_AFTER.matcher(value).replaceAll("a\u202F$1");
+        }
+        if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
+            value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
+        }
+        if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
+            value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
+            value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
+        }
+        if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
+            value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
+            value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
+        }
+
         return value;
     }
 

diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestDisplayAndInputProcessor.java
@@ -364,6 +364,10 @@ private String diff(String value, String input, String path) {
         if (path.contains("/foreignSpaceReplacement")) {
             return null; // CLDR-15384 typically inherited; no DAIP processing desired
         }
+        if (logKnownIssue("CLDR-15635", "Skip TestAll() for /intervalFormatItem until we update xml data") &&
+            (path.contains("/timeFormat") || path.contains("/dateFormatItem") || path.contains("/intervalFormatItem"))) {
+            return null; // CLDR-14032 changing normalization for intervalFormats but xml data not yet updated
+        }
         if (path.contains("/exemplarCharacters") || path.contains("/parseLenient")) {
             try {
                 UnicodeSet s1 = new UnicodeSet(value);
@@ -477,6 +481,64 @@ public static PathSpaceData[] getArray() {
         }
     }
 
+    /**
+     * Test whether DisplayAndInputProcessor.processInput correctly makes whitespace adjustments
+     */
+    public void TestWhitespaceAdjustments() {
+        class PathSpaceAdjustData {
+            String locale;
+            String xpath;
+            String rawValue;
+            String normValue;
+
+            PathSpaceAdjustData(String loc, String path, String raw, String norm) {
+                this.locale = loc;
+                this.xpath = path;
+                this.rawValue = raw;
+                this.normValue = norm;
+            }
+        }
+
+        PathSpaceAdjustData[] testItems = {
+            new PathSpaceAdjustData("en",
+                    "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]",
+                    "h:mm a", "h:mm a"), // \u202F
+            new PathSpaceAdjustData("ja",
+                    "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/availableFormats/dateFormatItem[@id=\"hm\"]",
+                    "a K:mm", "a K:mm"), // \u202F
+            new PathSpaceAdjustData("en",
+                    "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id=\"hm\"]/greatestDifference[@id=\"a\"]",
+                    "h:mm - h:m a", "h:mm – h:m a"), // \u2009\u2013\u2009, \u202F
+            new PathSpaceAdjustData("uk",
+                    "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateFormats/dateFormatLength[@type=\"medium\"]/dateFormat[@type=\"standard\"]/pattern[@type=\"standard\"]",
+                    "d MMM y 'р'.", "d MMM y 'р'."), // \u202F after y
+            new PathSpaceAdjustData("uk",
+                    "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/availableFormats/dateFormatItem[@id=\"yMMMd\"]",
+                    "d MMM y 'р'.", "d MMM y 'р'."), // \u202F after y
+            new PathSpaceAdjustData("uk",
+                    "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id=\"yMMMd\"]/greatestDifference[@id=\"M\"]",
+                    "d MMM - d MMM y 'р'.", "d MMM – d MMM y 'р'."), // \u2013, \u202F after y
+            new PathSpaceAdjustData("en",
+                    "//ldml/units/unitLength[@type=\"narrow\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
+                    "{0} g", "{0} g"), //  \u202F
+            new PathSpaceAdjustData("en",
+                    "//ldml/units/unitLength[@type=\"narrow\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
+                    "g {0}", "g {0}"), //  \u202F
+            new PathSpaceAdjustData("en",
+                    "//ldml/units/unitLength[@type=\"short\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
+                    "{0} g", "{0} g"), // \u00A0
+            new PathSpaceAdjustData("en",
+                    "//ldml/units/unitLength[@type=\"short\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
+                    "g {0}", "g {0}"), // \u00A0
+        };
+
+       for (PathSpaceAdjustData testItem: testItems) {
+           DisplayAndInputProcessor daip = new DisplayAndInputProcessor(info.getCLDRFile(testItem.locale, true), false);
+           String normValue = daip.processInput(testItem.xpath, testItem.rawValue, null);
+           assertEquals("Whitespace adjustment for " + testItem.xpath, testItem.normValue, normValue);
+       }
+    }
+
     /**
      * Test whether DisplayAndInputProcessor.processInput correctly normalizes annotations
      * containing “|” = U+007C VERTICAL LINE or its variations

diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestExampleGenerator.java
@@ -834,13 +834,13 @@ public void TestDayPeriods() {
         checkDayPeriod("pl", "format", "morning1", "〖06:00 – 10:00⁻〗〖❬8:00 ❭rano〗");
         checkDayPeriod("pl", "stand-alone", "morning1", "〖06:00 – 10:00⁻〗");
 
-        checkDayPeriod("en", "format", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗〖❬3:00 ❭at night〗");
+        checkDayPeriod("en", "format", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗〖❬3:00 ❭at night〗");
         checkDayPeriod("en", "stand-alone", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗");
 
-        checkDayPeriod("en", "format", "noon", "〖12:00〗〖❬12:00 ❭noon〗");
-        checkDayPeriod("en", "format", "midnight", "〖00:00〗〖❬12:00 ❭midnight〗");
-        checkDayPeriod("en", "format", "am", "〖00:00 – 12:00⁻〗〖❬6:00 ❭AM〗");
-        checkDayPeriod("en", "format", "pm", "〖12:00 – 24:00⁻〗〖❬6:00 ❭PM〗");
+        checkDayPeriod("en", "format", "noon", "〖12:00〗〖❬12:00 ❭noon〗");
+        checkDayPeriod("en", "format", "midnight", "〖00:00〗〖❬12:00 ❭midnight〗");
+        checkDayPeriod("en", "format", "am", "〖00:00 – 12:00⁻〗〖❬6:00 ❭AM〗");
+        checkDayPeriod("en", "format", "pm", "〖12:00 – 24:00⁻〗〖❬6:00 ❭PM〗");
     }
 
     private void checkDayPeriod(String localeId, String type, String dayPeriodCode, String expected) {

diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestPathHeader.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestPathHeader.java
@@ -346,7 +346,7 @@ public void TestAppendTimezone() {
         ExampleGenerator eg = new ExampleGenerator(cldrFile, cldrFile, CLDRPaths.SUPPLEMENTAL_DIRECTORY);
         String example = eg.getExampleHtml(APPEND_TIMEZONE, cldrFile.getStringValue(APPEND_TIMEZONE));
         String result = ExampleGenerator.simplify(example, false);
-        assertEquals("", "〖❬6:25:59 PM❭ ❬GMT❭〗", result);
+        assertEquals("", "〖❬6:25:59 PM❭ ❬GMT❭〗", result);
     }
 
     public void TestOptional() {

diff --git a/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestPseudolocalization.java b/tools/cldr-code/src/test/java/org/unicode/cldr/unittest/TestPseudolocalization.java
@@ -36,7 +36,7 @@ public void testConverter() {
             result.getStringValue("//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]"));
 
         assertEquals("Date and time placeholders should only be bracketed",
-            "[h:mm:ss a]",
+            "[h:mm:ss a]",
             result.getStringValue(
                 "//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/"
                     + "availableFormats/dateFormatItem[@id=\"hms\"]"));