Skip to content

Commit

Permalink
ICU-21406 canonicalize -T- extension
Browse files Browse the repository at this point in the history
See #1491
  • Loading branch information
FrankYFTang authored and Squash Bot committed Dec 8, 2020
1 parent 917188d commit caa52e3
Show file tree
Hide file tree
Showing 7 changed files with 251 additions and 22 deletions.
102 changes: 94 additions & 8 deletions icu4c/source/common/locid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1171,7 +1171,12 @@ class AliasReplacer {
bool replaceVariant(UErrorCode& status);

// Replace by using subdivisionAlias.
bool replaceSubdivision(CharString& subdivision, UErrorCode& status);
bool replaceSubdivision(StringPiece subdivision,
CharString& output, UErrorCode& status);

// Replace transformed extensions.
bool replaceTransformedExtensions(
CharString& transformedExtensions, CharString& output, UErrorCode& status);
};

CharString&
Expand Down Expand Up @@ -1501,7 +1506,8 @@ AliasReplacer::replaceVariant(UErrorCode& status)
}

bool
AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
AliasReplacer::replaceSubdivision(
StringPiece subdivision, CharString& output, UErrorCode& status)
{
if (U_FAILURE(status)) {
return false;
Expand All @@ -1514,13 +1520,84 @@ AliasReplacer::replaceSubdivision(CharString& subdivision, UErrorCode& status)
(firstSpace - replacement) : uprv_strlen(replacement);
// Ignore len == 2, see CLDR-14312
if (3 <= len && len <= 8) {
subdivision.clear().append(replacement, (int32_t)len, status);
output.append(replacement, (int32_t)len, status);
}
return true;
}
return false;
}

/**
 * Canonicalizes the value of a "t" (transformed content) extension.
 *
 * The value is treated as an optional leading tlang followed by zero or more
 * tfields ("tkey-tvalue"). The tlang, if present, is parsed as a language
 * tag, canonicalized, and written to output in lower case. The tfields are
 * then sorted and appended as "tkey-tvalue", with each tvalue replaced by the
 * result of ulocimp_toBcpType() whenever that lookup returns non-null.
 *
 * @param transformedExtensions the extension value. Its buffer is modified in
 *        place: separators are overwritten with NUL to split the tfields.
 * @param output receives the canonicalized value (appended to).
 * @param status ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR when a tkey
 *        is not followed by a tvalue.
 * @return true when the function finishes with a successful status,
 *         false otherwise.
 */
bool
AliasReplacer::replaceTransformedExtensions(
    CharString& transformedExtensions, CharString& output, UErrorCode& status)
{
    // The content of the transformedExtensions will be modified in this
    // function to NULL-terminating (tkey-tvalue) pairs.
    if (U_FAILURE(status)) {
        return false;
    }
    int32_t len = transformedExtensions.length();
    const char* str = transformedExtensions.data();
    const char* tkey = ultag_getTKeyStart(str);
    // Length of the tlang prefix: nothing when the value starts with a tkey,
    // everything when there is no tkey at all, otherwise the text before the
    // separator that precedes the first tkey.
    int32_t tlangLen = (tkey == str) ? 0 :
        ((tkey == nullptr) ? len : static_cast<int32_t>(tkey - str - 1));
    CharStringByteSink sink(&output);
    if (tlangLen > 0) {
        Locale tlang = LocaleBuilder()
            .setLanguageTag(StringPiece(str, tlangLen))
            .build(status);
        tlang.canonicalize(status);
        tlang.toLanguageTag(sink, status);
        if (U_FAILURE(status)) {
            return false;
        }
        // Lower-cases everything currently held in output.
        T_CString_toLowerCase(output.data());
    }
    if (tkey != nullptr) {
        // We need to sort the tfields by tkey
        UVector tfields(status);
        if (U_FAILURE(status)) {
            return false;
        }
        do {
            const char* tvalue = uprv_strchr(tkey, '-');
            if (tvalue == nullptr) {
                // A tkey without a tvalue is malformed. Bail out right away:
                // continuing would hand a null pointer to ultag_getTKeyStart().
                status = U_ILLEGAL_ARGUMENT_ERROR;
                return false;
            }
            const char* nextTKey = ultag_getTKeyStart(tvalue);
            if (nextTKey != nullptr) {
                // NULL terminate tvalue by overwriting the separator that
                // precedes the next tkey. The bytes belong to the mutable
                // transformedExtensions buffer.
                *const_cast<char*>(nextTKey - 1) = '\0';
            }
            tfields.insertElementAt(
                const_cast<char*>(tkey), tfields.size(), status);
            if (U_FAILURE(status)) {
                return false;
            }
            tkey = nextTKey;
        } while (tkey != nullptr);
        tfields.sort([](UElement e1, UElement e2) -> int8_t {
            // Reduce the comparison result to -1/0/1 so that narrowing it to
            // int8_t can never change its sign.
            const int cmp = uprv_strcmp(
                static_cast<const char*>(e1.pointer),
                static_cast<const char*>(e2.pointer));
            return static_cast<int8_t>((cmp > 0) - (cmp < 0));
        }, status);
        if (U_FAILURE(status)) {
            return false;
        }
        for (int32_t i = 0; i < tfields.size(); i++) {
            if (output.length() > 0) {
                output.append('-', status);
            }
            const char* tfield = static_cast<const char*>(tfields.elementAt(i));
            // Split the "tkey-tvalue" pair string so that we can canonicalize the tvalue.
            char* dash = const_cast<char*>(uprv_strchr(tfield, '-'));
            // Every stored tfield was checked to contain '-' in the loop above.
            U_ASSERT(dash != nullptr);
            *dash = '\0';  // NULL terminate tkey
            const char* tvalue = dash + 1;
            output.append(tfield, status).append('-', status);
            // Use the mapped type when there is one, else keep the tvalue as is.
            const char* bcpTValue = ulocimp_toBcpType(tfield, tvalue, nullptr, nullptr);
            output.append((bcpTValue == nullptr) ? tvalue : bcpTValue, status);
        }
    }
    return U_SUCCESS(status);
}

CharString&
AliasReplacer::outputToString(
CharString& out, UErrorCode status)
Expand Down Expand Up @@ -1661,18 +1738,28 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
if (U_SUCCESS(status) && !iter.isNull()) {
const char* key;
while ((key = iter->next(nullptr, status)) != nullptr) {
if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0) {
if (uprv_strcmp("sd", key) == 0 || uprv_strcmp("rg", key) == 0 ||
uprv_strcmp("t", key) == 0) {
CharString value;
CharStringByteSink valueSink(&value);
locale.getKeywordValue(key, valueSink, status);
if (U_FAILURE(status)) {
status = U_ZERO_ERROR;
continue;
}
if (replaceSubdivision(value, status)) {
changed++;
CharString replacement;
if (uprv_strlen(key) == 2) {
if (replaceSubdivision(value.toStringPiece(), replacement, status)) {
changed++;
temp.setKeywordValue(key, replacement.data(), status);
}
} else {
U_ASSERT(uprv_strcmp(key, "t") == 0);
if (replaceTransformedExtensions(value, replacement, status)) {
changed++;
temp.setKeywordValue(key, replacement.data(), status);
}
}
temp.setKeywordValue(key, value.data(), status);
if (U_FAILURE(status)) {
return false;
}
Expand All @@ -1689,7 +1776,6 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode status)
}
// If the tag is not changed, return.
if (uprv_strcmp(out.data(), locale.getName()) == 0) {
U_ASSERT(changed == 0);
out.clear();
return false;
}
Expand Down
16 changes: 16 additions & 0 deletions icu4c/source/common/uloc_tag.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,6 +646,22 @@ _isTKey(const char* s, int32_t len)
return FALSE;
}

/**
 * Scans localeID one SEP-delimited subtag at a time and returns a pointer to
 * the first subtag accepted by _isTKey(), or nullptr when no subtag is.
 */
U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char *localeID) {
    const char *subtag = localeID;
    for (;;) {
        const char *delimiter = uprv_strchr(subtag, SEP);
        if (delimiter == nullptr) {
            // No more separators: only the trailing subtag is left to test.
            break;
        }
        if (_isTKey(subtag, delimiter - subtag)) {
            return subtag;
        }
        // Continue with the subtag that follows the separator.
        subtag = delimiter + 1;
    }
    // The last subtag is NUL-terminated, so its length is passed as -1.
    return _isTKey(subtag, -1) ? subtag : nullptr;
}

static UBool
_isTValue(const char* s, int32_t len)
{
Expand Down
3 changes: 3 additions & 0 deletions icu4c/source/common/ulocimp.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ ultag_isUnicodeLocaleType(const char* s, int32_t len);
U_CFUNC UBool
ultag_isVariantSubtags(const char* s, int32_t len);

/**
 * Finds the first tkey subtag in a '-'-separated string.
 *
 * @param localeID NUL-terminated string to scan; must not be null.
 * @return pointer into localeID at the start of the first tkey subtag,
 *         or nullptr if the string contains no tkey.
 */
U_CAPI const char * U_EXPORT2
ultag_getTKeyStart(const char *localeID);

U_CFUNC const char*
ulocimp_toBcpKey(const char* key);

Expand Down
30 changes: 30 additions & 0 deletions icu4c/source/test/intltest/loctest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4934,6 +4934,36 @@ void LocaleTest::TestCanonicalize(void)
// ICU-21401
{ "cel-gaulish", "xtg"},

// ICU-21406
// Inside T extension
// Case of Script and Region
{ "ja-kana-jp-t-it-latn-it", "ja-Kana-JP-t-it-latn-it"},
{ "und-t-zh-hani-tw", "und-t-zh-hani-tw"},
{ "und-cyrl-t-und-Latn", "und-Cyrl-t-und-latn"},
// Order of singleton
{ "und-u-ca-roc-t-zh", "und-t-zh-u-ca-roc"},
// Variant subtags are alphabetically ordered.
{ "sl-t-sl-rozaj-biske-1994", "sl-t-sl-1994-biske-rozaj"},
// tfield subtags are alphabetically ordered.
// (Also tests subtag case normalisation.)
{ "DE-T-lv-M0-DIN", "de-t-lv-m0-din"},
{ "DE-T-M0-DIN-K0-QWERTZ", "de-t-k0-qwertz-m0-din"},
{ "DE-T-lv-M0-DIN-K0-QWERTZ", "de-t-lv-k0-qwertz-m0-din"},
// "true" tvalue subtags aren't removed.
// (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
// tvalue, so that's likely a spec bug in UTS 35.)
{ "en-t-m0-true", "en-t-m0-true"},
// tlang subtags are canonicalised.
{ "en-t-iw", "en-t-he"},
{ "en-t-hy-latn-SU", "en-t-hy-latn-am"},
{ "ru-t-ru-cyrl-SU", "ru-t-ru-cyrl-ru"},
{ "fr-t-fr-172", "fr-t-fr-ru"},
{ "und-t-no-latn-BOKMAL", "und-t-nb-latn" },
{ "und-t-sgn-qAAi-NL", "und-t-dse-zinh" },
// alias of tvalue should be replaced
{ "en-t-m0-NaMeS", "en-t-m0-prprname" },
{ "en-t-s0-ascii-d0-NaMe", "en-t-d0-charname-s0-ascii" },

};
int32_t i;
for (i=0; i < UPRV_LENGTHOF(testCases); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,12 @@ public static boolean isVariant(String s) {
return false;
}

/**
 * Tells whether the subtag has the shape of a T-extension key.
 * <pre>
 * tkey = alpha digit ;
 * </pre>
 *
 * @param s the subtag to test
 * @return true if s is exactly two characters long, the first an ASCII
 *         letter and the second an ASCII digit
 */
public static boolean isTKey(String s) {
    if (s.length() != 2) {
        return false;
    }
    final char first = s.charAt(0);
    if (!AsciiUtil.isAlpha(first)) {
        return false;
    }
    return AsciiUtil.isNumeric(s.charAt(1));
}

public static boolean isExtensionSingleton(String s) {
// singleton = DIGIT ; 0 - 9
// / %x41-57 ; A - W
Expand Down Expand Up @@ -657,18 +663,20 @@ public static String canonicalizeVariant(String s) {

// Lower-cases the extension string and drops "-true" / "-yes" subtags, both
// at the end of the string and between other subtags.
// NOTE(review): this span comes from a rendered diff whose +/- markers were
// lost. The unconditional stripping statements and the "u-"-guarded copy that
// follows look like the before and after versions of the same logic; confirm
// against the real file before relying on the exact statement list.
public static String canonicalizeExtension(String s) {
s = AsciiUtil.toLowerString(s);
int found;
while (s.endsWith("-true")) {
s = s.substring(0, s.length() - 5); // length of "-true" is 5
}
while ((found = s.indexOf("-true-")) > 0) {
s = s.substring(0, found) + s.substring(found + 5); // length of "-true" is 5
}
while (s.endsWith("-yes")) {
s = s.substring(0, s.length() - 4); // length of "-yes" is 4
}
while ((found = s.indexOf("-yes-")) > 0) {
s = s.substring(0, found) + s.substring(found + 4); // length of "-yes" is 4
}
// Only strip inside a Unicode ("u-") extension.
if (s.startsWith("u-")) {
int found;
while (s.endsWith("-true")) {
s = s.substring(0, s.length() - 5); // length of "-true" is 5
}
// Skipping 5 characters keeps the '-' that follows the removed subtag.
while ((found = s.indexOf("-true-")) > 0) {
s = s.substring(0, found) + s.substring(found + 5); // length of "-true" is 5
}
while (s.endsWith("-yes")) {
s = s.substring(0, s.length() - 4); // length of "-yes" is 4
}
while ((found = s.indexOf("-yes-")) > 0) {
s = s.substring(0, found) + s.substring(found + 4); // length of "-yes" is 4
}
}
return s;
}
Expand Down
58 changes: 56 additions & 2 deletions icu4j/main/classes/core/src/com/ibm/icu/util/ULocale.java
Original file line number Diff line number Diff line change
Expand Up @@ -1279,9 +1279,11 @@ public String replace() {
Iterator<String> keywords = temp.getKeywords();
while (keywords != null && keywords.hasNext()) {
String key = keywords.next();
if (key.equals("rg") || key.equals("sd")) {
if (key.equals("rg") || key.equals("sd") || key.equals("t")) {
String value = temp.getKeywordValue(key);
String replacement = replaceSubdivision(value);
String replacement = key.equals("t") ?
replaceTransformedExtensions(value) :
replaceSubdivision(value);
if (replacement != null) {
temp = temp.setKeywordValue(key, replacement);
keywordChanged = true;
Expand Down Expand Up @@ -1636,6 +1638,58 @@ private String replaceSubdivision(String subdivision) {
return subdivisionAliasMap.get(subdivision);
}

/**
 * Canonicalizes the value of a "t" (transformed content) extension.
 *
 * The value is split on LanguageTag.SEP into an optional leading tlang and
 * zero or more tfields (a tkey followed by its tvalue subtags). The tlang is
 * canonicalized as a language tag and lower-cased; each tvalue subtag is
 * passed through toUnicodeLocaleType(); the tfields are then sorted and
 * appended after the tlang.
 *
 * @param extensions the raw value of the "t" keyword
 * @return the canonicalized value
 */
private String replaceTransformedExtensions(String extensions) {
    StringBuilder builder = new StringBuilder();
    List<String> tfields = new ArrayList<>();
    int processedLength = 0;
    // 0 means "no tkey seen yet"; -1 means the value starts with a tkey.
    int tlangLength = 0;
    String tkey = "";
    for (String subtag : extensions.split(LanguageTag.SEP)) {
        if (LanguageTag.isTKey(subtag)) {
            if (tlangLength == 0) {
                // Found the first tkey. Record the total length of the preceding
                // tlang subtags. -1 if there is no tlang before the first tkey.
                tlangLength = processedLength - 1;
            }
            if (builder.length() > 0) {
                // Finish & store the previous tkey with its tvalue subtags.
                tfields.add(builder.toString());
                builder.setLength(0);
            }
            // Start collecting subtags for this new tkey.
            tkey = subtag;
            builder.append(subtag);
        } else if (tlangLength != 0) {
            // A tvalue subtag of the current tkey. Keep the subtag unchanged
            // when no mapped type is returned, instead of appending "null".
            String type = toUnicodeLocaleType(tkey, subtag);
            builder.append(LanguageTag.SEP).append(type != null ? type : subtag);
        }
        processedLength += subtag.length() + 1;
    }
    if (builder.length() > 0) {
        // Finish & store the previous=last tkey with its tvalue subtags.
        tfields.add(builder.toString());
        builder.setLength(0);
    }
    // With no tkey at all, the whole value is the tlang.
    String tlang = (tlangLength > 0) ? extensions.substring(0, tlangLength) :
        (tfields.isEmpty() ? extensions : "");
    if (tlang.length() > 0) {
        // Canonicalize only the tlang part, not the whole extension value.
        String canonicalized = ULocale.createCanonical(
            ULocale.forLanguageTag(tlang)).toLanguageTag();
        builder.append(AsciiUtil.toLowerString(canonicalized));
    }

    if (!tfields.isEmpty()) {
        if (builder.length() > 0) {
            builder.append(LanguageTag.SEP);
        }
        // tfields are sorted by alphabetical order of their keys
        Collections.sort(tfields);
        builder.append(Utility.joinStrings(LanguageTag.SEP, tfields));
    }
    return builder.toString();
}
};

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5233,6 +5233,38 @@ public void TestCanonical() {

// ICU-21401
Assert.assertEquals("xtg", canonicalTag("cel-gaulish"));

// ICU-21406
// Inside T extension
// Case of Script and Region
Assert.assertEquals("ja-Kana-JP-t-it-latn-it", canonicalTag("ja-kana-jp-t-it-latn-it"));
Assert.assertEquals("und-t-zh-hani-tw", canonicalTag("und-t-zh-hani-tw"));
Assert.assertEquals("und-Cyrl-t-und-latn", canonicalTag("und-cyrl-t-und-Latn"));
// Order of singleton
Assert.assertEquals("und-t-zh-u-ca-roc", canonicalTag("und-u-ca-roc-t-zh"));
// Variant subtags are alphabetically ordered.
Assert.assertEquals("sl-1994-biske-rozaj", canonicalTag("sl-rozaj-biske-1994"));
Assert.assertEquals("sl-t-sl-1994-biske-rozaj", canonicalTag("sl-t-sl-rozaj-biske-1994"));
// tfield subtags are alphabetically ordered.
// (Also tests subtag case normalisation.)
Assert.assertEquals("de-t-lv-m0-din", canonicalTag("DE-T-lv-M0-DIN"));
Assert.assertEquals("de-t-k0-qwertz-m0-din", canonicalTag("DE-T-M0-DIN-K0-QWERTZ"));
Assert.assertEquals("de-t-lv-k0-qwertz-m0-din", canonicalTag("DE-T-lv-M0-DIN-K0-QWERTZ"));
// "true" tvalue subtags aren't removed.
// (UTS 35 version 36, §3.2.1 claims otherwise, but tkey must be followed by
// tvalue, so that's likely a spec bug in UTS 35.)
Assert.assertEquals("en-t-m0-true", canonicalTag("en-t-m0-true"));
// tlang subtags are canonicalised.
Assert.assertEquals("en-t-he", canonicalTag("en-t-iw"));
Assert.assertEquals("en-t-hy-latn-am", canonicalTag("en-t-hy-latn-SU"));
Assert.assertEquals("ru-t-ru-cyrl-ru", canonicalTag("ru-t-ru-cyrl-SU"));
Assert.assertEquals("fr-t-fr-ru", canonicalTag("fr-t-fr-172"));
Assert.assertEquals("und-t-nb-latn", canonicalTag("und-t-no-latn-BOKMAL"));
Assert.assertEquals("und-t-dse-zinh", canonicalTag("und-t-sgn-qAAi-NL"));
// alias of tvalue should be replaced
Assert.assertEquals("en-t-m0-prprname", canonicalTag("en-t-m0-NaMeS"));
Assert.assertEquals("en-t-d0-charname-s0-ascii", canonicalTag("en-t-s0-ascii-d0-nAmE"));

}

@Test
Expand Down

0 comments on commit caa52e3

Please sign in to comment.