ICU-21406 canonicalize -T- extension

See #1491
unicode-org · Dec 8, 2020 · caa52e3 · caa52e3
1 parent 917188d
commit caa52e3
Show file tree

Hide file tree

Showing 7 changed files with 251 additions and 22 deletions.
diff --git a/icu4c/source/common/locid.cpp b/icu4c/source/common/locid.cpp
@@ -1171,7 +1171,12 @@ class AliasReplacer {
     bool replaceVariant(UErrorCode& status);
 
     // Replace by using subdivisionAlias.
-    bool replaceSubdivision(CharString& subdivision, UErrorCode& status);
+    bool replaceSubdivision(StringPiece subdivision,
+                            CharString& output, UErrorCode& status);
+
+    // Replace transformed extensions.
+    bool replaceTransformedExtensions(
+        CharString& transformedExtensions, CharString& output, UErrorCode& status);
 };
 
 CharString&
@@ -1501,7 +1506,8 @@ AliasReplacer::replaceVariant(UErrorCode& status)
 }
 
 bool
-AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
+AliasReplacer::replaceSubdivision(
+    StringPiece subdivision, CharString& output, UErrorCode& status)
 {
     if (U_FAILURE(status)) {
         return false;
@@ -1514,13 +1520,84 @@ AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
             (firstSpace - replacement) : uprv_strlen(replacement);
         // Ignore len == 2, see CLDR-14312
         if (3 <= len && len <= 8) {
-            subdivision.clear().append(replacement, (int32_t)len, status);
+            output.append(replacement, (int32_t)len, status);
         }
         return true;
     }
     return false;
 }
 
+// Canonicalize the value of a "t" (transformed content) keyword.
+//
+// The value is split into an optional leading tlang and zero or more tfields
+// ("tkey-tvalue" groups). The tlang is canonicalized as a locale and
+// lowercased; the tfields are sorted by their text and each tvalue is passed
+// through ulocimp_toBcpType(). The result is appended to `output`.
+//
+// @param transformedExtensions the keyword value; its buffer is modified in
+//        place ('-' separators are overwritten with NUL) while it is split.
+// @param output receives the canonicalized value.
+// @param status set to U_ILLEGAL_ARGUMENT_ERROR when a tkey is not followed
+//        by a tvalue; otherwise carries any error from the called helpers.
+// @return true on success, false if `status` indicates a failure.
+bool
+AliasReplacer::replaceTransformedExtensions(
+    CharString& transformedExtensions, CharString& output, UErrorCode& status)
+{
+    // The content of the transformedExtensions will be modified in this
+    // function to NULL-terminating (tkey-tvalue) pairs.
+    if (U_FAILURE(status)) {
+        return false;
+    }
+    int32_t len = transformedExtensions.length();
+    const char* str = transformedExtensions.data();
+    const char* tkey = ultag_getTKeyStart(str);
+    // Everything before the first tkey (minus the separating '-') is the tlang.
+    int32_t tlangLen = (tkey == str) ? 0 :
+        ((tkey == nullptr) ? len : static_cast<int32_t>(tkey - str - 1));
+    if (tlangLen > 0) {
+        CharStringByteSink sink(&output);
+        Locale tlang = LocaleBuilder()
+            .setLanguageTag(StringPiece(str, tlangLen))
+            .build(status);
+        tlang.canonicalize(status);
+        tlang.toLanguageTag(sink, status);
+        if (U_FAILURE(status)) {
+            return false;
+        }
+        T_CString_toLowerCase(output.data());
+    }
+    if (tkey != nullptr) {
+        // We need to sort the tfields by tkey
+        UVector tfields(status);
+        if (U_FAILURE(status)) {
+            return false;
+        }
+        do {
+            const char* tvalue = uprv_strchr(tkey, '-');
+            if (tvalue == nullptr) {
+                // A tkey without a tvalue is ill-formed. Bail out right away:
+                // the scan below must not be handed a null pointer.
+                status = U_ILLEGAL_ARGUMENT_ERROR;
+                return false;
+            }
+            const char* nextTKey = ultag_getTKeyStart(tvalue);
+            if (nextTKey != nullptr) {
+                // NULL terminate tvalue. The buffer belongs to the non-const
+                // transformedExtensions, so writing through it is legitimate.
+                *const_cast<char*>(nextTKey - 1) = '\0';
+            }
+            tfields.insertElementAt(const_cast<char*>(tkey), tfields.size(), status);
+            if (U_FAILURE(status)) {
+                return false;
+            }
+            tkey = nextTKey;
+        } while (tkey != nullptr);
+        tfields.sort([](UElement e1, UElement e2) -> int8_t {
+            // Only the sign of the comparison is meaningful; clamp it so that
+            // narrowing to int8_t can never flip or zero the result.
+            int cmp = uprv_strcmp(
+                static_cast<const char*>(e1.pointer),
+                static_cast<const char*>(e2.pointer));
+            return static_cast<int8_t>((cmp > 0) - (cmp < 0));
+        }, status);
+        for (int32_t i = 0; i < tfields.size(); i++) {
+            if (output.length() > 0) {
+                output.append('-', status);
+            }
+            char* tfield = static_cast<char*>(tfields.elementAt(i));
+            char* tvalue = const_cast<char*>(uprv_strchr(tfield, '-'));
+            // Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue.
+            // The '-' is guaranteed to exist: it was checked while collecting the tfields.
+            U_ASSERT(tvalue != nullptr);
+            *tvalue++ = '\0'; // NULL terminate tkey
+            output.append(tfield, status).append('-', status);
+            // Fall back to the original tvalue when there is no BCP 47 replacement.
+            const char* bcpTValue = ulocimp_toBcpType(tfield, tvalue, nullptr, nullptr);
+            output.append((bcpTValue == nullptr) ? tvalue : bcpTValue, status);
+        }
+    }
+    if (U_FAILURE(status)) {
+        return false;
+    }
+    return true;
+}
+
 CharString&
 AliasReplacer::outputToString(
     CharString& out, UErrorCode status)
@@ -1661,18 +1738,28 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
         if (U_SUCCESS(status) && !iter.isNull()) {
             const char* key;
             while ((key = iter->next(nullptr, status)) != nullptr) {
-                if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0) {
+                if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 ||
+                        uprv_strcmp("t", key) == 0) {
                     CharString value;
                     CharStringByteSink valueSink(&value);
                     locale.getKeywordValue(key, valueSink, status);
                     if (U_FAILURE(status)) {
                         status = U_ZERO_ERROR;
                         continue;
                     }
-                    if (replaceSubdivision(value, status)) {
-                        changed++;
+                    CharString replacement;
+                    if (uprv_strlen(key) == 2) {
+                        if (replaceSubdivision(value.toStringPiece(), replacement, status)) {
+                            changed++;
+                            temp.setKeywordValue(key, replacement.data(), status);
+                        }
+                    } else {
+                        U_ASSERT(uprv_strcmp(key, "t") == 0);
+                        if (replaceTransformedExtensions(value, replacement, status)) {
+                            changed++;
+                            temp.setKeywordValue(key, replacement.data(), status);
+                        }
                     }
-                    temp.setKeywordValue(key, value.data(), status);
                     if (U_FAILURE(status)) {
                         return false;
                     }
@@ -1689,7 +1776,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
     }
     // If the tag is not changed, return.
     if (uprv_strcmp(out.data(), locale.getName()) == 0) {
-        U_ASSERT(changed == 0);
         out.clear();
         return false;
     }

diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp
@@ -646,6 +646,22 @@ _isTKey(const char* s, int32_t len)
     return FALSE;
 }
 
+// Walks the SEP-separated subtags of localeID from left to right and returns
+// a pointer to the first one that _isTKey() accepts, or nullptr if none does.
+// The final subtag (the one with no SEP after it) is tested with a length
+// argument of -1.
+U_CAPI const char * U_EXPORT2
+ultag_getTKeyStart(const char *localeID) {
+    const char *subtag = localeID;
+    for (;;) {
+        const char *end = uprv_strchr(subtag, SEP);
+        if (end == nullptr) {
+            // No further separator: this is the last subtag.
+            return _isTKey(subtag, -1) ? subtag : nullptr;
+        }
+        if (_isTKey(subtag, end - subtag)) {
+            return subtag;
+        }
+        // Step over the separator to the start of the next subtag.
+        subtag = end + 1;
+    }
+}
+
 static UBool
 _isTValue(const char* s, int32_t len)
 {

diff --git a/icu4c/source/common/ulocimp.h b/icu4c/source/common/ulocimp.h
@@ -286,6 +286,9 @@ ultag_isUnicodeLocaleType(const char* s, int32_t len);
 U_CFUNC UBool
 ultag_isVariantSubtags(const char* s, int32_t len);
 
+U_CAPI const char * U_EXPORT2
+ultag_getTKeyStart(const char *localeID);
+
 U_CFUNC const char*
 ulocimp_toBcpKey(const char* key);
 

diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp
@@ -4934,6 +4934,36 @@ void LocaleTest::TestCanonicalize(void)
         // ICU-21401
         { "cel-gaulish", "xtg"},
 
+        // ICU-21406
+        // Inside T extension
+        //  Case of Script and Region
+        { "ja-kana-jp-t-it-latn-it", "ja-Kana-JP-t-it-latn-it"},
+        { "und-t-zh-hani-tw", "und-t-zh-hani-tw"},
+        { "und-cyrl-t-und-Latn", "und-Cyrl-t-und-latn"},
+        //  Order of singleton
+        { "und-u-ca-roc-t-zh", "und-t-zh-u-ca-roc"},
+        //  Variant subtags are alphabetically ordered.
+        { "sl-t-sl-rozaj-biske-1994", "sl-t-sl-1994-biske-rozaj"},
+        // tfield subtags are alphabetically ordered.
+        // (Also tests subtag case normalisation.)
+        { "DE-T-lv-M0-DIN", "de-t-lv-m0-din"},
+        { "DE-T-M0-DIN-K0-QWERTZ", "de-t-k0-qwertz-m0-din"},
+        { "DE-T-lv-M0-DIN-K0-QWERTZ", "de-t-lv-k0-qwertz-m0-din"},
+        // "true" tvalue subtags aren't removed.
+        // (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
+        // tvalue, so that's likely a spec bug in UTS 35.)
+        { "en-t-m0-true", "en-t-m0-true"},
+        // tlang subtags are canonicalised.
+        { "en-t-iw", "en-t-he"},
+        { "en-t-hy-latn-SU", "en-t-hy-latn-am"},
+        { "ru-t-ru-cyrl-SU", "ru-t-ru-cyrl-ru"},
+        { "fr-t-fr-172", "fr-t-fr-ru"},
+        { "und-t-no-latn-BOKMAL", "und-t-nb-latn" },
+        { "und-t-sgn-qAAi-NL", "und-t-dse-zinh" },
+        // alias of tvalue should be replaced
+        { "en-t-m0-NaMeS", "en-t-m0-prprname" },
+        { "en-t-s0-ascii-d0-NaMe", "en-t-d0-charname-s0-ascii" },
+
     };
     int32_t i;
     for (i=0; i < UPRV_LENGTHOF(testCases); i++) {

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/locale/LanguageTag.java
@@ -595,6 +595,12 @@ public static boolean isVariant(String s) {
         return false;
     }
 
+    /**
+     * Tests whether a subtag has the shape of a tkey: exactly two characters,
+     * the first accepted by AsciiUtil.isAlpha and the second by
+     * AsciiUtil.isNumeric.
+     */
+    public static boolean isTKey(String s) {
+        // tkey        = alpha digit ;
+        if (s.length() != 2) {
+            return false;
+        }
+        return AsciiUtil.isAlpha(s.charAt(0)) && AsciiUtil.isNumeric(s.charAt(1));
+    }
+
     public static boolean isExtensionSingleton(String s) {
         // singleton     = DIGIT               ; 0 - 9
         //               / %x41-57             ; A - W
@@ -657,18 +663,20 @@ public static String canonicalizeVariant(String s) {
 
     public static String canonicalizeExtension(String s) {
         s = AsciiUtil.toLowerString(s);
-        int found;
-        while (s.endsWith("-true")) {
-            s = s.substring(0, s.length() - 5);  // length of "-true" is 5
-        }
-        while ((found = s.indexOf("-true-")) > 0) {
-            s = s.substring(0, found) + s.substring(found + 5);  // length of "-true" is 5
-        }
-        while (s.endsWith("-yes")) {
-            s = s.substring(0, s.length() - 4);  // length of "-yes" is 4
-        }
-        while ((found = s.indexOf("-yes-")) > 0) {
-            s = s.substring(0, found) + s.substring(found + 4);  // length of "-yes" is 5
+        if (s.startsWith("u-")) {  // "-true"/"-yes" are stripped only when s starts with "u-"
+            int found;
+            while (s.endsWith("-true")) {
+                s = s.substring(0, s.length() - 5);  // length of "-true" is 5
+            }
+            while ((found = s.indexOf("-true-")) > 0) {
+                s = s.substring(0, found) + s.substring(found + 5);  // length of "-true" is 5
+            }
+            while (s.endsWith("-yes")) {
+                s = s.substring(0, s.length() - 4);  // length of "-yes" is 4
+            }
+            while ((found = s.indexOf("-yes-")) > 0) {
+                s = s.substring(0, found) + s.substring(found + 4);  // length of "-yes" is 4
+            }
         }
         return s;
     }

diff --git a/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java b/icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java
@@ -1279,9 +1279,11 @@ public String replace() {
                 Iterator<String> keywords = temp.getKeywords();
                 while (keywords != null && keywords.hasNext()) {
                     String key = keywords.next();
-                    if (key.equals("rg") || key.equals("sd")) {
+                    if (key.equals("rg") || key.equals("sd") || key.equals("t")) {
                         String value = temp.getKeywordValue(key);
-                        String replacement = replaceSubdivision(value);
+                        String replacement = key.equals("t") ?
+                            replaceTransformedExtensions(value) :
+                            replaceSubdivision(value);
                         if (replacement != null) {
                             temp = temp.setKeywordValue(key, replacement);
                             keywordChanged = true;
@@ -1636,6 +1638,58 @@ private String replaceSubdivision(String subdivision) {
             return subdivisionAliasMap.get(subdivision);
         }
 
+        /**
+         * Canonicalizes the value of a "t" (transformed content) keyword.
+         *
+         * The value is split on LanguageTag.SEP. Subtags before the first tkey
+         * form the tlang, which is canonicalized as a locale and lowercased.
+         * Each tkey starts a tfield; its following subtags are mapped through
+         * toUnicodeLocaleType(). The tfields are then sorted and appended
+         * after the tlang.
+         *
+         * @param extensions the value of the "t" keyword
+         * @return the canonicalized value (never null)
+         */
+        private String replaceTransformedExtensions(String extensions) {
+            StringBuilder builder = new StringBuilder();
+            List<String> tfields = new ArrayList<>();
+            int processedLength = 0;
+            int tlangLength = 0;
+            String tkey = "";
+            for (String subtag : extensions.split(LanguageTag.SEP)) {
+                if (LanguageTag.isTKey(subtag)) {
+                    if (tlangLength == 0) {
+                        // Found the first tkey. Record the total length of the preceding
+                        // tlang subtags. -1 if there is no tlang before the first tkey.
+                        tlangLength = processedLength-1;
+                    }
+                    if (builder.length() > 0) {
+                        // Finish & store the previous tkey with its tvalue subtags.
+                        tfields.add(builder.toString());
+                        builder.setLength(0);
+                    }
+                    // Start collecting subtags for this new tkey.
+                    tkey = subtag;
+                    builder.append(subtag);
+                } else {
+                    // Subtags seen before any tkey belong to the tlang and are
+                    // handled below; only tvalue subtags are collected here.
+                    if (tlangLength != 0) {
+                        builder.append(LanguageTag.SEP).append(toUnicodeLocaleType(tkey, subtag));
+                    }
+                }
+                processedLength += subtag.length() + 1;
+            }
+            if (builder.length() > 0) {
+                // Finish & store the previous=last tkey with its tvalue subtags.
+                tfields.add(builder.toString());
+                builder.setLength(0);
+            }
+            String tlang = (tlangLength > 0) ? extensions.substring(0, tlangLength) :
+                ((tfields.size() == 0) ? extensions :  "");
+            if (tlang.length() > 0) {
+                // Canonicalize the tlang alone rather than the whole value, so the
+                // result does not depend on the tag parser discarding the tfields.
+                String canonicalized = ULocale.createCanonical(
+                    ULocale.forLanguageTag(tlang)).toLanguageTag();
+                builder.append(AsciiUtil.toLowerString(canonicalized));
+            }
+
+            if (tfields.size() > 0) {
+                if (builder.length() > 0) {
+                    builder.append(LanguageTag.SEP);
+                }
+                // tfields are sorted by alphabetical order of their keys
+                Collections.sort(tfields);
+                builder.append(Utility.joinStrings(LanguageTag.SEP, tfields));
+            }
+            return builder.toString();
+        }
     };
 
     /**

diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/util/ULocaleTest.java
@@ -5233,6 +5233,38 @@ public void TestCanonical() {
 
         // ICU-21401
         Assert.assertEquals("xtg", canonicalTag("cel-gaulish"));
+
+        // ICU-21406
+        // Inside T extension
+        //  Case of Script and Region
+        Assert.assertEquals("ja-Kana-JP-t-it-latn-it", canonicalTag("ja-kana-jp-t-it-latn-it"));
+        Assert.assertEquals("und-t-zh-hani-tw", canonicalTag("und-t-zh-hani-tw"));
+        Assert.assertEquals("und-Cyrl-t-und-latn", canonicalTag("und-cyrl-t-und-Latn"));
+        //  Order of singleton
+        Assert.assertEquals("und-t-zh-u-ca-roc", canonicalTag("und-u-ca-roc-t-zh"));
+        //  Variant subtags are alphabetically ordered.
+        Assert.assertEquals("sl-1994-biske-rozaj", canonicalTag("sl-rozaj-biske-1994"));
+        Assert.assertEquals("sl-t-sl-1994-biske-rozaj", canonicalTag("sl-t-sl-rozaj-biske-1994"));
+        // tfield subtags are alphabetically ordered.
+        // (Also tests subtag case normalisation.)
+        Assert.assertEquals("de-t-lv-m0-din", canonicalTag("DE-T-lv-M0-DIN"));
+        Assert.assertEquals("de-t-k0-qwertz-m0-din", canonicalTag("DE-T-M0-DIN-K0-QWERTZ"));
+        Assert.assertEquals("de-t-lv-k0-qwertz-m0-din", canonicalTag("DE-T-lv-M0-DIN-K0-QWERTZ"));
+        // "true" tvalue subtags aren't removed.
+        // (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
+        // tvalue, so that's likely a spec bug in UTS 35.)
+        Assert.assertEquals("en-t-m0-true", canonicalTag("en-t-m0-true"));
+        // tlang subtags are canonicalised.
+        Assert.assertEquals("en-t-he", canonicalTag("en-t-iw"));
+        Assert.assertEquals("en-t-hy-latn-am", canonicalTag("en-t-hy-latn-SU"));
+        Assert.assertEquals("ru-t-ru-cyrl-ru", canonicalTag("ru-t-ru-cyrl-SU"));
+        Assert.assertEquals("fr-t-fr-ru", canonicalTag("fr-t-fr-172"));
+        Assert.assertEquals("und-t-nb-latn", canonicalTag("und-t-no-latn-BOKMAL"));
+        Assert.assertEquals("und-t-dse-zinh", canonicalTag("und-t-sgn-qAAi-NL"));
+        // alias of tvalue should be replaced
+        Assert.assertEquals("en-t-m0-prprname", canonicalTag("en-t-m0-NaMeS"));
+        Assert.assertEquals("en-t-d0-charname-s0-ascii", canonicalTag("en-t-s0-ascii-d0-nAmE"));
+
     }
 
     @Test