Skip to content

Commit

Permalink
ICU-22707 fix hst=V: hst=NA for Kirat Rai
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Jun 5, 2024
1 parent 47e9389 commit 6543634
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 2 deletions.
11 changes: 10 additions & 1 deletion icu4c/source/common/uprops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,11 @@ static int32_t scriptGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*

/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
* Hangul_Syllable_Type is redundant with a subset of Grapheme_Cluster_Break.
*
* Starting with Unicode 16, there is an exception:
* Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
* they are not related to Hangul syllables, so their Hangul_Syllable_Type is NA.
*/
static const UHangulSyllableType gcbToHst[]={
U_HST_NOT_APPLICABLE, /* U_GCB_OTHER */
Expand All @@ -610,6 +614,11 @@ static const UHangulSyllableType gcbToHst[]={
};

static int32_t getHangulSyllableType(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
// Ignore supplementary code points: They all have HST=NA.
// This is a simple way to handle the GCB!=hst cases since Unicode 16 (Kirat Rai vowels).
if(c>0xffff) {
return U_HST_NOT_APPLICABLE;
}
/* see comments on gcbToHst[] above */
int32_t gcb=(int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
if(gcb<UPRV_LENGTHOF(gcbToHst)) {
Expand Down
4 changes: 4 additions & 0 deletions icu4c/source/test/cintltst/cucdtst.c
Original file line number Diff line number Diff line change
Expand Up @@ -2699,6 +2699,10 @@ TestAdditionalProperties(void) {

{ 0xd7a4, UCHAR_HANGUL_SYLLABLE_TYPE, 0 },

// GCB=V but hst=NA (exception to GCB=hst for relevant values)
{ 0x16D67, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_NOT_APPLICABLE },
{ 0x16D6A, UCHAR_HANGUL_SYLLABLE_TYPE, U_HST_NOT_APPLICABLE },

{ -1, 0x410, 0 }, /* version break for Unicode 4.1 */

{ 0x00d7, UCHAR_PATTERN_SYNTAX, true },
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -648,7 +648,11 @@ public int getType(int c) {

/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
* Hangul_Syllable_Type is redundant with a subset of Grapheme_Cluster_Break.
*
* Starting with Unicode 16, there is an exception:
* Some Kirat Rai vowels are given GCB=V for proper grapheme clustering, but
* they are not related to Hangul syllables, so their Hangul_Syllable_Type is NA.
*/
private static final int /* UHangulSyllableType */ gcbToHst[]={
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
Expand Down Expand Up @@ -809,6 +813,12 @@ int getMaxValue(int which) {
new IntProperty(SRC_PROPSVEC) { // HANGUL_SYLLABLE_TYPE
@Override
int getValue(int c) {
// Ignore supplementary code points: They all have HST=NA.
// This is a simple way to handle the GCB!=hst cases since Unicode 16
// (Kirat Rai vowels).
if(c>0xffff) {
return HangulSyllableType.NOT_APPLICABLE;
}
/* see comments on gcbToHst[] above */
int gcb=(getAdditional(c, 2)&GCB_MASK)>>>GCB_SHIFT;
if(gcb<gcbToHst.length) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2109,6 +2109,10 @@ public void TestAdditionalProperties()

{ 0xd7a4, UProperty.HANGUL_SYLLABLE_TYPE, 0 },

// GCB=V but hst=NA (exception to GCB=hst for relevant values)
{ 0x16D67, UProperty.HANGUL_SYLLABLE_TYPE, UCharacter.HangulSyllableType.NOT_APPLICABLE },
{ 0x16D6A, UProperty.HANGUL_SYLLABLE_TYPE, UCharacter.HangulSyllableType.NOT_APPLICABLE },

{ -1, 0x410, 0 }, /* version break for Unicode 4.1 */

{ 0x00d7, UProperty.PATTERN_SYNTAX, 1 },
Expand Down

0 comments on commit 6543634

Please sign in to comment.