Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CLDR-14032 More space adjustments in DAIP; add test. Run it on en.xml, adjust tests #2001

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
584 changes: 292 additions & 292 deletions common/main/en.xml

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import org.unicode.cldr.test.CheckExemplars.ExemplarType;
import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.CldrUtility;
Expand All @@ -23,6 +24,7 @@
import org.unicode.cldr.util.Emoji;
import org.unicode.cldr.util.ICUServiceBuilder;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SupplementalDataInfo;
import org.unicode.cldr.util.UnicodeSetPrettyPrinter;
import org.unicode.cldr.util.With;
import org.unicode.cldr.util.XPathParts;
Expand Down Expand Up @@ -82,6 +84,35 @@ public class DisplayAndInputProcessor {
private static final Pattern INTERVAL_FORMAT_PATHS = PatternCache.get("//ldml/dates/.+/intervalFormatItem.*");
private static final Pattern NON_DECIMAL_PERIOD = PatternCache.get("(?<![0#'])\\.(?![0#'])");

// Pattern to match against paths that might have time formats with h or K (12-hour cycles)
private static final Pattern HOUR_FORMAT_XPATHS = PatternCache
.get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
+ "timeFormats/timeFormatLength\\[@type=\"[^\"]*\"]/timeFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
+ "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*|"
+ "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-GL-Ma-gl-m]*[hK][A-Za-z]*\"].*)");

private static final Pattern AMPM_SPACE_BEFORE = PatternCache.get("([Khms])[ \\u00A0]+a"); // time, space, a
private static final Pattern AMPM_SPACE_AFTER = PatternCache.get("a[ \\u00A0]+([Kh])"); // a space, hour

// Pattern to match against paths that might have date formats with y
private static final Pattern YEAR_FORMAT_XPATHS = PatternCache
.get("//ldml/dates/calendars/calendar\\[@type=\"[^\"]*\"]/("
+ "dateFormats/dateFormatLength\\[@type=\"[^\"]*\"]/dateFormat\\[@type=\"standard\"]/pattern\\[@type=\"standard\"].*|"
+ "dateTimeFormats/availableFormats/dateFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*|"
+ "dateTimeFormats/intervalFormats/intervalFormatItem\\[@id=\"[A-XZa-xz]*y[A-Za-z]*\"].*)");

// Cyrillic year markers are or begin with (in various languages) \u0430 \u0433 \u0435 \u0436 \u043E \u0440 \u0441
private static final Pattern YEAR_SPACE_YEARMARKER = PatternCache.get("y[ \\u00A0]+('?[агежорс])"); // y, space, Cyrillic year marker start

public static final Pattern UNIT_NARROW_XPATHS = PatternCache
.get("//ldml/units/unitLength\\[@type=\"narrow\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");

public static final Pattern UNIT_SHORT_XPATHS = PatternCache
.get("//ldml/units/unitLength\\[@type=\"short\"]unit\\[@type=\"[^\"]*\"]/unitPattern.*");

private static final Pattern PLACEHOLDER_SPACE_AFTER = PatternCache.get("\\}[ \\u00A0\\u202F]+");
private static final Pattern PLACEHOLDER_SPACE_BEFORE = PatternCache.get("[ \\u00A0\\u202F]+\\{");

/**
* string of whitespace not including NBSP, i.e. [\t\n\r]+
*/
Expand All @@ -106,6 +137,7 @@ public class DisplayAndInputProcessor {
private static final Pattern FINAL_NBSP = PatternCache.get("\\u00A0+$");
private static final Pattern MULTIPLE_NBSP = PatternCache.get("\\u00A0\\u00A0+");

// The following includes (among others) \u0009, \u0020, \u00A0, \u2007, \u2009, \u202F, \u3000
private static final UnicodeSet UNICODE_WHITESPACE = new UnicodeSet("[:whitespace:]").freeze();

private static final CLDRLocale MALAYALAM = CLDRLocale.getInstance("ml");
Expand Down Expand Up @@ -167,6 +199,7 @@ public class DisplayAndInputProcessor {
private UnicodeSetPrettyPrinter pp = null;

final private CLDRLocale locale;
private String scriptCode; // actual or default script code (not null after init)
private boolean isPosix;

/**
Expand Down Expand Up @@ -212,6 +245,18 @@ void init(CLDRLocale locale, boolean needsCollator) {
.setOrdering(col)
.setSpaceComparator(spaceCol);
}
String script = locale.getScript();
if (script == null || script.length() < 4) {
SupplementalDataInfo sdi = CLDRConfig.getInstance().getSupplementalDataInfo();
script = sdi.getDefaultScript(locale.getBaseName());
if (script == null || script.length() < 4 || script.equals("Zzzz")) {
script = sdi.getDefaultScript(locale.getLanguage());
}
if (script == null || script.length() < 4) {
script = "Zzzz";
}
}
scriptCode = script;
}

public UnicodeSetPrettyPrinter getPrettyPrinter() {
Expand Down Expand Up @@ -298,7 +343,7 @@ public synchronized String processForDisplay(String path, String value) {
}
// Fix up hyphens, replacing with N-dash as appropriate
if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
value = normalizeIntervalHyphens(value);
value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
} else {
value = normalizeHyphens(value);
}
Expand Down Expand Up @@ -470,7 +515,7 @@ public synchronized String processInput(String path, String value, Exception[] i
}
// Fix up hyphens, replacing with N-dash as appropriate
if (INTERVAL_FORMAT_PATHS.matcher(path).matches()) {
value = normalizeIntervalHyphens(value);
value = normalizeIntervalHyphensAndSpaces(value); // This may also adjust spaces around en dash
} else if (!isUnicodeSet) {
value = normalizeHyphens(value);
}
Expand Down Expand Up @@ -646,20 +691,26 @@ private String normalizeApostrophes(String value) {
}
}

private String normalizeIntervalHyphens(String value) {
private String normalizeIntervalHyphensAndSpaces(String value) {
DateTimePatternGenerator.FormatParser fp = new DateTimePatternGenerator.FormatParser();
fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
fp.set(DateIntervalInfo.genPatternInfo(value, false).getFirstPart()); // first format & separator including spaces
List<Object> items = fp.getItems();
Object last = items.get(items.size() - 1);
if (last instanceof String) {
String separator = last.toString();
if (separator.contains("-")) {
String separator = last.toString(); // separator including spaces
String replacement = separator;
if (scriptCode.equals("Latn") && (separator.equals(" - ") || separator.equals(" \u2013 "))) {
replacement = "\u2009\u2013\u2009"; // Per CLDR-14032
} else if (separator.contains("-")) {
replacement = separator.replace("-", "\u2013");
}
if (!replacement.equals(separator)) {
StringBuilder sb = new StringBuilder();
sb.append(DateIntervalInfo.genPatternInfo(value, false).getFirstPart());
if (sb.lastIndexOf(separator) >= 0) {
sb.delete(sb.lastIndexOf(separator), sb.length());
sb.append(separator.replace("-", "\u2013"));
sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart());
sb.append(replacement);
sb.append(DateIntervalInfo.genPatternInfo(value, false).getSecondPart()); // second format only
return sb.toString();
}
}
Expand Down Expand Up @@ -1037,6 +1088,24 @@ private String normalizeWhitespace(String path, String value) {
} else {
throw new IllegalArgumentException("Unknown PathSpaceType " + pst);
}

// Further whitespace adjustments per CLDR-14032
if (HOUR_FORMAT_XPATHS.matcher(path).matches()) {
value = AMPM_SPACE_BEFORE.matcher(value).replaceAll("$1\u202Fa");
value = AMPM_SPACE_AFTER.matcher(value).replaceAll("a\u202F$1");
}
if (scriptCode.equals("Cyrl") && YEAR_FORMAT_XPATHS.matcher(path).matches()) {
value = YEAR_SPACE_YEARMARKER.matcher(value).replaceAll("y\u202F$1");
}
if (UNIT_NARROW_XPATHS.matcher(path).matches()) {
value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u202F"); // Narrow NBSP
value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u202F{");
}
if (UNIT_SHORT_XPATHS.matcher(path).matches()) {
value = PLACEHOLDER_SPACE_AFTER.matcher(value).replaceAll("}\u00A0"); // Regular NBSP
value = PLACEHOLDER_SPACE_BEFORE.matcher(value).replaceAll("\u00A0{");
}

return value;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,10 @@ private String diff(String value, String input, String path) {
if (path.contains("/foreignSpaceReplacement")) {
return null; // CLDR-15384 typically inherited; no DAIP processing desired
}
if (logKnownIssue("CLDR-15635", "Skip TestAll() for /intervalFormatItem until we update xml data") &&
(path.contains("/timeFormat") || path.contains("/dateFormatItem") || path.contains("/intervalFormatItem"))) {
return null; // CLDR-14032 changing normalization for intervalFormats but xml data not yet updated
}
if (path.contains("/exemplarCharacters") || path.contains("/parseLenient")) {
try {
UnicodeSet s1 = new UnicodeSet(value);
Expand Down Expand Up @@ -477,6 +481,64 @@ public static PathSpaceData[] getArray() {
}
}

/**
* Test whether DisplayAndInputProcessor.processInput correctly makes whitespace adjustments
*/
public void TestWhitespaceAdjustments() {
class PathSpaceAdjustData {
String locale;
String xpath;
String rawValue;
String normValue;

PathSpaceAdjustData(String loc, String path, String raw, String norm) {
this.locale = loc;
this.xpath = path;
this.rawValue = raw;
this.normValue = norm;
}
}

PathSpaceAdjustData[] testItems = {
new PathSpaceAdjustData("en",
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/timeFormats/timeFormatLength[@type=\"short\"]/timeFormat[@type=\"standard\"]/pattern[@type=\"standard\"]",
"h:mm a", "h:mm a"), // \u202F
new PathSpaceAdjustData("ja",
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/availableFormats/dateFormatItem[@id=\"hm\"]",
"a K:mm", "a K:mm"), // \u202F
new PathSpaceAdjustData("en",
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id=\"hm\"]/greatestDifference[@id=\"a\"]",
"h:mm - h:m a", "h:mm – h:m a"), // \u2009\u2013\u2009, \u202F
new PathSpaceAdjustData("uk",
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateFormats/dateFormatLength[@type=\"medium\"]/dateFormat[@type=\"standard\"]/pattern[@type=\"standard\"]",
"d MMM y 'р'.", "d MMM y 'р'."), // \u202F after y
new PathSpaceAdjustData("uk",
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/availableFormats/dateFormatItem[@id=\"yMMMd\"]",
"d MMM y 'р'.", "d MMM y 'р'."), // \u202F after y
new PathSpaceAdjustData("uk",
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/intervalFormats/intervalFormatItem[@id=\"yMMMd\"]/greatestDifference[@id=\"M\"]",
"d MMM - d MMM y 'р'.", "d MMM – d MMM y 'р'."), // \u2013, \u202F after y
new PathSpaceAdjustData("en",
"//ldml/units/unitLength[@type=\"narrow\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
"{0} g", "{0} g"), // \u202F
new PathSpaceAdjustData("en",
"//ldml/units/unitLength[@type=\"narrow\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
"g {0}", "g {0}"), // \u202F
new PathSpaceAdjustData("en",
"//ldml/units/unitLength[@type=\"short\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
"{0} g", "{0} g"), // \u00A0
new PathSpaceAdjustData("en",
"//ldml/units/unitLength[@type=\"short\"]unit[@type=\"mass-gram\"]/unitPattern[@count=\"other\"]",
"g {0}", "g {0}"), // \u00A0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the comments on lines 529 and 532 refer to \u00A0, which, however, doesn't seem to occur in the data on those lines

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, never mind, they do have \u00A0 but somehow it was changed to \u0020 when copy-pasting from browser

};

for (PathSpaceAdjustData testItem: testItems) {
DisplayAndInputProcessor daip = new DisplayAndInputProcessor(info.getCLDRFile(testItem.locale, true), false);
String normValue = daip.processInput(testItem.xpath, testItem.rawValue, null);
assertEquals("Whitespace adjustment for " + testItem.xpath, testItem.normValue, normValue);
}
}

/**
* Test whether DisplayAndInputProcessor.processInput correctly normalizes annotations
* containing “|” = U+007C VERTICAL LINE or its variations
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -834,13 +834,13 @@ public void TestDayPeriods() {
checkDayPeriod("pl", "format", "morning1", "〖06:00 – 10:00⁻〗〖❬8:00 ❭rano〗");
checkDayPeriod("pl", "stand-alone", "morning1", "〖06:00 – 10:00⁻〗");

checkDayPeriod("en", "format", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗〖❬3:00 ❭at night〗");
checkDayPeriod("en", "format", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗〖❬3:00❭at night〗");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if these are nbsp may be better to escape it in the java source for clarity. That might be a good sweeping change though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, filed a separate ticket for that: CLDR-15636

checkDayPeriod("en", "stand-alone", "night1", "〖00:00 – 06:00⁻; 21:00 – 24:00⁻〗");

checkDayPeriod("en", "format", "noon", "〖12:00〗〖❬12:00 ❭noon〗");
checkDayPeriod("en", "format", "midnight", "〖00:00〗〖❬12:00 ❭midnight〗");
checkDayPeriod("en", "format", "am", "〖00:00 – 12:00⁻〗〖❬6:00 ❭AM〗");
checkDayPeriod("en", "format", "pm", "〖12:00 – 24:00⁻〗〖❬6:00 ❭PM〗");
checkDayPeriod("en", "format", "noon", "〖12:00〗〖❬12:00❭noon〗");
checkDayPeriod("en", "format", "midnight", "〖00:00〗〖❬12:00❭midnight〗");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is probably a mistake to use a NBSP (or NNBSP) between the time (13:00) and an unabbreviated day period. The am/pm are fine, because they are short. But forcing a string like "12:00 midnight" to break as a unit could leave lines unnecessarily ragged.

However, I think that can be discussed and corrected afterwards if necessary.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@macchiati I did not do anything to the spacing in the patterns that have B (for day periods). I think the issue here is just that the ExampleGenerator code is using time patterns with 'a' to generate examples for dayPeriods. That seems like an ExampleGenerator problem.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, actually I think it might be in the DateTimePatternGenerator code which, if given a skeleton with 'B', will adjust a time pattern that has 'a' by substituting in the 'B'. Hmm, need to think about how to address that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could fix that in DTPG code, it can remove all NBSP in patterns that end up using B.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I do not think this is a necessarily problem in DTPG. We have separate availableFormats and intervalFormats for Bh, Bhm and Bhms - and this PR did not change those - so if Example Generator is correctly using DTPG and DIF it should not have gotten the NNBSP above. It may be an ExampleGenerator problem after all. I will investigate.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Problem is in ICUServiceBuilder.formatDayPeriod, filed https://unicode-org.atlassian.net/browse/CLDR-15645 to fix.

checkDayPeriod("en", "format", "am", "〖00:00 – 12:00⁻〗〖❬6:00❭AM〗");
checkDayPeriod("en", "format", "pm", "〖12:00 – 24:00⁻〗〖❬6:00❭PM〗");
}

private void checkDayPeriod(String localeId, String type, String dayPeriodCode, String expected) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ public void TestAppendTimezone() {
ExampleGenerator eg = new ExampleGenerator(cldrFile, cldrFile, CLDRPaths.SUPPLEMENTAL_DIRECTORY);
String example = eg.getExampleHtml(APPEND_TIMEZONE, cldrFile.getStringValue(APPEND_TIMEZONE));
String result = ExampleGenerator.simplify(example, false);
assertEquals("", "〖❬6:25:59 PM❭ ❬GMT❭〗", result);
assertEquals("", "〖❬6:25:59PM❭ ❬GMT❭〗", result);
}

public void TestOptional() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public void testConverter() {
result.getStringValue("//ldml/characters/exemplarCharacters[@type=\"auxiliary\"]"));

assertEquals("Date and time placeholders should only be bracketed",
"[h:mm:ss a]",
"[h:mm:ssa]",
result.getStringValue(
"//ldml/dates/calendars/calendar[@type=\"gregorian\"]/dateTimeFormats/"
+ "availableFormats/dateFormatItem[@id=\"hms\"]"));
Expand Down