ICU-9562 Update language tag mapping per the latest IANA registry

uloc_forLanguageTag has a few mapping tables to map grandfathered language tags and deprecated language subtags to their preferred or modern values. Update them based on the latest version of the IANA language subtag registry. [1] Five grandfathered tags without a preferred value are still mapped to what ICU has mapped them to for backward compatibility until the wisdom of continuing to do so is reviewed. In addition, map redundant language tags to their preferred values regardless of whether they're followed by other subtags or not (e.g. zh-yue vs zh-yue-u-co-pinyin). Similarly, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 (the variant subtag 'hepburn-heploc' with the prefix 'ja-latn' has the preferred value, 'alalc97'). Update the mapping for deprecated language subtags (e.g. 'jw' to 'jv' and a bunch of 3-letter language codes). Add a new table for deprecated region subtags to map them to their modern values (e.g. 'DD' to 'DE'). Add a new test case for deprecated language and region mapping and a few more cases for updated grandfathered and redundant tag mapping. [1] https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry
unicode-org · Sep 27, 2018 · c71a1b4 · c71a1b4
1 parent ff98764
commit c71a1b4
Show file tree

Hide file tree

Showing 4 changed files with 281 additions and 14 deletions.
diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp
@@ -79,19 +79,34 @@ static const char LOCALE_TYPE_YES[] = "yes";
 
 #define LANG_UND_LEN 3
 
+/*
+ Updated on 2018-09-12 from
+ https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
+
+ This table has 2 parts. The part for grandfathered tags is generated by the
+ following script from the IANA language tag registry.
+
+ curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
+ egrep -A 7 'Type: grandfathered' | \
+ egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \
+ awk -n '/Tag/ {printf("    \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\
+ tr 'A-Z' 'a-z'
+
+
+ The 2nd part is made of five ICU-specific entries. They're kept for
+ backward compatibility for now, even though there are no preferred
+ values. They may have to be removed for strict BCP 47 compliance.
+
+*/
 static const char* const GRANDFATHERED[] = {
 /*  grandfathered   preferred */
     "art-lojban",   "jbo",
-    "cel-gaulish",  "xtg-x-cel-gaulish",
-    "en-GB-oed",    "en-GB-x-oed",
+    "en-gb-oed",    "en-gb-oxendict",
     "i-ami",        "ami",
     "i-bnn",        "bnn",
-    "i-default",    "en-x-i-default",
-    "i-enochian",   "und-x-i-enochian",
     "i-hak",        "hak",
     "i-klingon",    "tlh",
     "i-lux",        "lb",
-    "i-mingo",      "see-x-i-mingo",
     "i-navajo",     "nv",
     "i-pwn",        "pwn",
     "i-tao",        "tao",
@@ -104,17 +119,175 @@ static const char* const GRANDFATHERED[] = {
     "sgn-ch-de",    "sgg",
     "zh-guoyu",     "cmn",
     "zh-hakka",     "hak",
-    "zh-min",       "nan-x-zh-min",
     "zh-min-nan",   "nan",
     "zh-xiang",     "hsn",
-    NULL,           NULL
+
+    // Grandfathered tags with no preferred value in the IANA
+    // registry. Kept for now for backward compatibility
+    // because ICU has mapped them this way.
+    "cel-gaulish",  "xtg-x-cel-gaulish",
+    "i-default",    "en-x-i-default",
+    "i-enochian",   "und-x-i-enochian",
+    "i-mingo",      "see-x-i-mingo",
+    "zh-min",       "nan-x-zh-min",
+};
+
+/*
+ Updated on 2018-09-12 from
+ https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
+
+ The table lists redundant tags with a preferred value in the IANA language tag registry.
+ It's generated with the following command:
+
+ curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\
+ grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \
+ awk -n '/Tag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \
+ tr 'A-Z' 'a-z'
+
+ In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because
+ the variant subtag 'hepburn-heploc' has the preferred value, 'alalc97'.
+*/
+
+static const char* const REDUNDANT[] = {
+//  redundant       preferred
+    "sgn-br",       "bzs",
+    "sgn-co",       "csn",
+    "sgn-de",       "gsg",
+    "sgn-dk",       "dsl",
+    "sgn-es",       "ssp",
+    "sgn-fr",       "fsl",
+    "sgn-gb",       "bfi",
+    "sgn-gr",       "gss",
+    "sgn-ie",       "isg",
+    "sgn-it",       "ise",
+    "sgn-jp",       "jsl",
+    "sgn-mx",       "mfs",
+    "sgn-ni",       "ncs",
+    "sgn-nl",       "dse",
+    "sgn-no",       "nsl",
+    "sgn-pt",       "psr",
+    "sgn-se",       "swl",
+    "sgn-us",       "ase",
+    "sgn-za",       "sfs",
+    "zh-cmn",       "cmn",
+    "zh-cmn-hans",  "cmn-hans",
+    "zh-cmn-hant",  "cmn-hant",
+    "zh-gan",       "gan",
+    "zh-wuu",       "wuu",
+    "zh-yue",       "yue",
+
+    // variant tag with preferred value
+    "ja-latn-hepburn-heploc", "ja-latn-alalc97",
 };
 
+/*
+  Updated on 2018-09-12 from
+  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry .
+
+  grep 'Type: language' -A 7 language-subtag-registry  | egrep 'Subtag|Prefe' | \
+  grep -B1 'Preferred' | grep -v '^--' | \
+  awk -n '/Subtag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
+
+  Make sure that 2-letter language subtags come before 3-letter subtags.
+*/
 static const char DEPRECATEDLANGS[][4] = {
 /*  deprecated  new */
+    "in",       "id",
     "iw",       "he",
     "ji",       "yi",
-    "in",       "id"
+    "jw",       "jv",
+    "mo",       "ro",
+    "aam",       "aas",
+    "adp",       "dz",
+    "aue",       "ktz",
+    "ayx",       "nun",
+    "bgm",       "bcg",
+    "bjd",       "drl",
+    "ccq",       "rki",
+    "cjr",       "mom",
+    "cka",       "cmr",
+    "cmk",       "xch",
+    "coy",       "pij",
+    "cqu",       "quh",
+    "drh",       "khk",
+    "drw",       "prs",
+    "gav",       "dev",
+    "gfx",       "vaj",
+    "ggn",       "gvr",
+    "gti",       "nyc",
+    "guv",       "duz",
+    "hrr",       "jal",
+    "ibi",       "opa",
+    "ilw",       "gal",
+    "jeg",       "oyb",
+    "kgc",       "tdf",
+    "kgh",       "kml",
+    "koj",       "kwv",
+    "krm",       "bmf",
+    "ktr",       "dtp",
+    "kvs",       "gdj",
+    "kwq",       "yam",
+    "kxe",       "tvd",
+    "kzj",       "dtp",
+    "kzt",       "dtp",
+    "lii",       "raq",
+    "lmm",       "rmx",
+    "meg",       "cir",
+    "mst",       "mry",
+    "mwj",       "vaj",
+    "myt",       "mry",
+    "nad",       "xny",
+    "ncp",       "kdz",
+    "nnx",       "ngv",
+    "nts",       "pij",
+    "oun",       "vaj",
+    "pcr",       "adx",
+    "pmc",       "huw",
+    "pmu",       "phr",
+    "ppa",       "bfy",
+    "ppr",       "lcq",
+    "pry",       "prt",
+    "puz",       "pub",
+    "sca",       "hle",
+    "skk",       "oyb",
+    "tdu",       "dtp",
+    "thc",       "tpo",
+    "thx",       "oyb",
+    "tie",       "ras",
+    "tkk",       "twm",
+    "tlw",       "weo",
+    "tmp",       "tyj",
+    "tne",       "kak",
+    "tnf",       "prs",
+    "tsf",       "taj",
+    "uok",       "ema",
+    "xba",       "cax",
+    "xia",       "acn",
+    "xkh",       "waw",
+    "xsj",       "suj",
+    "ybd",       "rki",
+    "yma",       "lrr",
+    "ymt",       "mtm",
+    "yos",       "zom",
+    "yuu",       "yug",
+};
+
+/*
+  Updated on 2018-04-24 from
+
+  curl  https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \
+  grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \
+  grep -B1 'Preferred' | \
+  awk -n '/Subtag/ {printf("    \"%s\",       ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}'
+*/
+static const char DEPRECATEDREGIONS[][3] = {
+/*  deprecated  new */
+    "BU",       "MM",
+    "DD",       "DE",
+    "FX",       "FR",
+    "TP",       "TL",
+    "YD",       "YE",
+    "ZR",       "CD",
 };
 
 /*
@@ -717,6 +890,11 @@ _appendLanguageToLanguageTag(const char* localeID, char* appendAt, int32_t capac
     } else {
         /* resolve deprecated */
         for (i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) {
+            // 2-letter deprecated subtags are listed before 3-letter
+            // ones in DEPRECATEDLANGS[]. If the input is a 2-letter code,
+            // get out of the loop on coming across the 1st 3-letter subtag
+            // to avoid continuing to try when there's no match.
+            if (uprv_strlen(buf) < uprv_strlen(DEPRECATEDLANGS[i])) break;
             if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDLANGS[i]) == 0) {
                 uprv_strcpy(buf, DEPRECATEDLANGS[i + 1]);
                 len = (int32_t)uprv_strlen(buf);
@@ -763,7 +941,6 @@ _appendScriptToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
                 *(appendAt + reslen) = SEP;
             }
             reslen++;
-
             if (reslen < capacity) {
                 uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
             }
@@ -805,6 +982,14 @@ _appendRegionToLanguageTag(const char* localeID, char* appendAt, int32_t capacit
                 *(appendAt + reslen) = SEP;
             }
             reslen++;
+           /* resolve deprecated */
+            for (int i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) {
+                if (uprv_compareInvCharsAsAscii(buf, DEPRECATEDREGIONS[i]) == 0) {
+                    uprv_strcpy(buf, DEPRECATEDREGIONS[i + 1]);
+                    len = (int32_t)uprv_strlen(buf);
+                    break;
+                }
+            }
 
             if (reslen < capacity) {
                 uprv_memcpy(appendAt + reslen, buf, uprv_min(len, capacity - reslen));
@@ -1916,7 +2101,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
     }
 
     /* check if the tag is grandfathered */
-    for (i = 0; GRANDFATHERED[i] != NULL; i += 2) {
+    for (i = 0; i < UPRV_LENGTHOF(GRANDFATHERED); i += 2) {
         if (uprv_stricmp(GRANDFATHERED[i], tagBuf) == 0) {
             int32_t newTagLength;
 
@@ -1938,6 +2123,37 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
         }
     }
 
+    size_t parsedLenDelta = 0;
+    if (grandfatheredLen == 0) {
+        for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) {
+            const char* redundantTag = REDUNDANT[i];
+            size_t redundantTagLen = uprv_strlen(redundantTag);
+            // The preferred tag for a redundant tag is always shorter than the
+            // redundant tag. A redundant tag may or may not be followed by other
+            // subtags (e.g. "zh-yue" or "zh-yue-u-co-pinyin").
+            if (uprv_strnicmp(redundantTag, tagBuf, redundantTagLen) == 0) {
+                const char* redundantTagEnd = tagBuf + redundantTagLen;
+                if (*redundantTagEnd  == '\0' || *redundantTagEnd == SEP) {
+                    const char* preferredTag = REDUNDANT[i + 1];
+                    size_t preferredTagLen = uprv_strlen(preferredTag);
+                    uprv_strncpy(t->buf, preferredTag, preferredTagLen);
+                    if (*redundantTagEnd == SEP) {
+                        uprv_memmove(tagBuf + preferredTagLen,
+                                     redundantTagEnd,
+                                     tagLen - redundantTagLen + 1);
+                    } else {
+                        tagBuf[preferredTagLen] = '\0';
+                    }
+                    // parsedLen should be the length of the input
+                    // before redundantTag is replaced by preferredTag.
+                    // Save the delta to add it back later.
+                    parsedLenDelta = redundantTagLen - preferredTagLen;
+                    break;
+                }
+            }
+        }
+    }
+
     /*
      * langtag      =   language
      *                  ["-" script]
@@ -1978,6 +2194,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
         if (next & LANG) {
             if (_isLanguageSubtag(pSubtag, subtagLen)) {
                 *pSep = 0;  /* terminate */
+                // TODO: move deprecated language code handling here.
                 t->language = T_CString_toLowerCase(pSubtag);
 
                 pLastGoodPosition = pSep;
@@ -2024,6 +2241,7 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
         if (next & REGN) {
             if (_isRegionSubtag(pSubtag, subtagLen)) {
                 *pSep = 0;
+                // TODO: move deprecated region code handling here.
                 t->region = T_CString_toUpperCase(pSubtag);
 
                 pLastGoodPosition = pSep;
@@ -2220,7 +2438,8 @@ ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode* sta
     }
 
     if (parsedLen != NULL) {
-        *parsedLen = (grandfatheredLen > 0) ? grandfatheredLen : (int32_t)(pLastGoodPosition - t->buf);
+        *parsedLen = (grandfatheredLen > 0) ? grandfatheredLen :
+            (int32_t)(pLastGoodPosition - t->buf + parsedLenDelta);
     }
 
     return t;