Skip to content

Commit

Permalink
ICU-9562 Update language tag mapping per the latest IANA registry
Browse files Browse the repository at this point in the history
uloc_forLanguageTag has a few mapping tables to map grandfathered
language tags and deprecated language subtags to their preferred or
modern values.

Update them based on the latest version of the IANA
language subtag registry. [1]

Five grandfathered tags without a preferred value are still mapped to
what ICU has mapped them to for backward compatibility until the
wisdom of continuing to do so is reviewed.

In addition, map redundant language tags to their preferred values
regardless of whether they're followed by other subtags or not. (e.g.
zh-yue vs zh-yue-u-co-pinyin) .

Similary, ja-latn-hepburn-heploc is mapped to ja-latn-alaic97 (the
variant subtag 'hepburn-helploc' with the prefix 'ja-latn' has the
preferred value, 'alaic97') .

Update the mapping for deprecated language subtags (e.g. 'jw' to
'jv' and a bunch of 3-letter language codes).

Add a new table for deprecated region subtags to map them to their
modern values. (e.g. 'DD' to 'DE').

Add a new test case for deprecated language and region mapping and
a few more cases for updated grandfathered and redundant tag mapping.

[1]
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
  • Loading branch information
jungshik authored and sffc committed Sep 27, 2018
1 parent ff98764 commit c71a1b4
Show file tree
Hide file tree
Showing 4 changed files with 281 additions and 14 deletions.
241 changes: 230 additions & 11 deletions icu4c/source/common/uloc_tag.cpp
Expand Up @@ -79,19 +79,34 @@ static const char LOCALE_TYPE_YES[] = "yes";

#define LANG_UND_LEN 3

/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
This table has 2 parts. The parts for Grandfathered tags is generated by the
following scripts from the IANA language tag registry.
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
egrep -A 7 'Type: grandfathered' | \
egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
tr 'A-Z' 'a-z'
The 2nd part is made of five ICU-specific entries. They're kept for
the backward compatibility for now, even though there are no preferred
values. They may have to be removed for the strict BCP 47 compliance.
*/
static const char* const GRANDFATHERED[] = {
/* grandfathered preferred */
"art-lojban", "jbo",
"cel-gaulish", "xtg-x-cel-gaulish",
"en-GB-oed", "en-GB-x-oed",
"en-gb-oed", "en-gb-oxendict",
"i-ami", "ami",
"i-bnn", "bnn",
"i-default", "en-x-i-default",
"i-enochian", "und-x-i-enochian",
"i-hak", "hak",
"i-klingon", "tlh",
"i-lux", "lb",
"i-mingo", "see-x-i-mingo",
"i-navajo", "nv",
"i-pwn", "pwn",
"i-tao", "tao",
Expand All @@ -104,17 +119,175 @@ static const char* const GRANDFATHERED[] = {
"sgn-ch-de", "sgg",
"zh-guoyu", "cmn",
"zh-hakka", "hak",
"zh-min", "nan-x-zh-min",
"zh-min-nan", "nan",
"zh-xiang", "hsn",
NULL, NULL

// Grandfathered tags with no preferred value in the IANA
// registry. Kept for now for the backward compatibility
// because ICU has mapped them this way.
"cel-gaulish", "xtg-x-cel-gaulish",
"i-default", "en-x-i-default",
"i-enochian", "und-x-i-enochian",
"i-mingo", "see-x-i-mingo",
"zh-min", "nan-x-zh-min",
};

/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
The table lists redundant tags with preferred value in the IANA languate tag registry.
It's generated with the following command:
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
tr 'A-Z' 'a-z'
In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
a variant tag 'hepburn-heploc' has the preferred subtag, 'alaic97'.
*/

static const char* const REDUNDANT[] = {
// redundant preferred
"sgn-br", "bzs",
"sgn-co", "csn",
"sgn-de", "gsg",
"sgn-dk", "dsl",
"sgn-es", "ssp",
"sgn-fr", "fsl",
"sgn-gb", "bfi",
"sgn-gr", "gss",
"sgn-ie", "isg",
"sgn-it", "ise",
"sgn-jp", "jsl",
"sgn-mx", "mfs",
"sgn-ni", "ncs",
"sgn-nl", "dse",
"sgn-no", "nsl",
"sgn-pt", "psr",
"sgn-se", "swl",
"sgn-us", "ase",
"sgn-za", "sfs",
"zh-cmn", "cmn",
"zh-cmn-hans", "cmn-hans",
"zh-cmn-hant", "cmn-hant",
"zh-gan", "gan",
"zh-wuu", "wuu",
"zh-yue", "yue",

// variant tag with preferred value
"ja-latn-hepburn-heploc", "ja-latn-alalc97",
};

/*
Updated on 2018-09-12 from
https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \
grep -B1 'Preferred' | grep -v '^--' | \
awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
Make sure that 2-letter language subtags come before 3-letter subtags.
*/
static const char DEPRECATEDLANGS[][4] = {
/* deprecated new */
"in", "id",
"iw", "he",
"ji", "yi",
"in", "id"
"jw", "jv",
"mo", "ro",
"aam", "aas",
"adp", "dz",
"aue", "ktz",
"ayx", "nun",
"bgm", "bcg",
"bjd", "drl",
"ccq", "rki",
"cjr", "mom",
"cka", "cmr",
"cmk", "xch",
"coy", "pij",
"cqu", "quh",
"drh", "khk",
"drw", "prs",
"gav", "dev",
"gfx", "vaj",
"ggn", "gvr",
"gti", "nyc",
"guv", "duz",
"hrr", "jal",
"ibi", "opa",
"ilw", "gal",
"jeg", "oyb",
"kgc", "tdf",
"kgh", "kml",
"koj", "kwv",
"krm", "bmf",
"ktr", "dtp",
"kvs", "gdj",
"kwq", "yam",
"kxe", "tvd",
"kzj", "dtp",
"kzt", "dtp",
"lii", "raq",
"lmm", "rmx",
"meg", "cir",
"mst", "mry",
"mwj", "vaj",
"myt", "mry",
"nad", "xny",
"ncp", "kdz",
"nnx", "ngv",
"nts", "pij",
"oun", "vaj",
"pcr", "adx",
"pmc", "huw",
"pmu", "phr",
"ppa", "bfy",
"ppr", "lcq",
"pry", "prt",
"puz", "pub",
"sca", "hle",
"skk", "oyb",
"tdu", "dtp",
"thc", "tpo",
"thx", "oyb",
"tie", "ras",
"tkk", "twm",
"tlw", "weo",
"tmp", "tyj",
"tne", "kak",
"tnf", "prs",
"tsf", "taj",
"uok", "ema",
"xba", "cax",
"xia", "acn",
"xkh", "waw",
"xsj", "suj",
"ybd", "rki",
"yma", "lrr",
"ymt", "mtm",
"yos", "zom",
"yuu", "yug",
};

/*
Updated on 2018-04-24 from
curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
grep -B1 'Preferred' | \
awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
*/
static const char DEPRECATEDREGIONS[][3] = {
/* deprecated new */
"BU", "MM",
"DD", "DE",
"FX", "FR",
"TP", "TL",
"YD", "YE",
"ZR", "CD",
};

/*
Expand Down Expand Up @@ -717,6 +890,11 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac
} else {
/* resolve deprecated */
for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
// 2-letter deprecated subtags are listede before 3-letter
// ones in DEPRECATEDLANGS[]. Get out of loop on coming
// across the 1st 3-letter subtag, if the input is a 2-letter code.
// to avoid continuing to try when there's no match.
if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
len = (int32_t)uprv_strlen(buf);
Expand Down Expand Up @@ -763,7 +941,6 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
*(appendAt + reslen) = SEP;
}
reslen++;

if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
}
Expand Down Expand Up @@ -805,6 +982,14 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
*(appendAt + reslen) = SEP;
}
reslen++;
/* resolve deprecated */
for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
len = (int32_t)uprv_strlen(buf);
break;
}
}

if (reslen < capacity) {
uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
Expand Down Expand Up @@ -1916,7 +2101,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}

/* check if the tag is grandfathered */
for (i = 0; GRANDFATHERED[i] != NULL; i += 2) {
for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) {
int32_t newTagLength;

Expand All @@ -1938,6 +2123,37 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}
}

size_t parsedLenDelta = 0;
if (grandfatheredLen == 0) {
for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
const char* redundantTag = REDUNDANT[i];
size_t redundantTagLen = uprv_strlen(redundantTag);
// The preferred tag for a redundant tag is always shorter than redundant
// tag. A redundant tag may or may not be followed by other subtags.
// (i.e. "zh-yue" or "zh-yue-u-co-pinyin").
if (uprv_strnicmp(redundantTag, tagBuf, redundantTagLen) == 0) {
const char* redundantTagEnd = tagBuf + redundantTagLen;
if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) {
const char* preferredTag = REDUNDANT[i + 1];
size_t preferredTagLen = uprv_strlen(preferredTag);
uprv_strncpy(t->buf, preferredTag, preferredTagLen);
if (*redundantTagEnd == SEP) {
uprv_memmove(tagBuf + preferredTagLen,
redundantTagEnd,
tagLen - redundantTagLen + 1);
} else {
tagBuf[preferredTagLen] = '\0';
}
// parsedLen should be the length of the input
// before redundantTag is replaced by preferredTag.
// Save the delta to add it back later.
parsedLenDelta = redundantTagLen - preferredTagLen;
break;
}
}
}
}

/*
* langtag = language
* ["-" script]
Expand Down Expand Up @@ -1978,6 +2194,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
if (next & LANG) {
if (_isLanguageSubtag(pSubtag, subtagLen)) {
*pSep = 0; /* terminate */
// TODO: move deprecated language code handling here.
t->language = T_CString_toLowerCase(pSubtag);

pLastGoodPosition = pSep;
Expand Down Expand Up @@ -2024,6 +2241,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
if (next & REGN) {
if (_isRegionSubtag(pSubtag, subtagLen)) {
*pSep = 0;
// TODO: move deprecated region code handling here.
t->region = T_CString_toUpperCase(pSubtag);

pLastGoodPosition = pSep;
Expand Down Expand Up @@ -2220,7 +2438,8 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
}

if (parsedLen != NULL) {
*parsedLen = (grandfatheredLen > 0) ? grandfatheredLen : (int32_t)(pLastGoodPosition - t->buf);
*parsedLen = (grandfatheredLen > 0) ? grandfatheredLen :
(int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
}

return t;
Expand Down

0 comments on commit c71a1b4

Please sign in to comment.