Skip to content

Commit

Permalink
ICU-13637 Break Iterator Rule Updates for Indic Grapheme Clusters.
Browse files Browse the repository at this point in the history
  • Loading branch information
aheninger committed Jun 28, 2019
1 parent c434557 commit fa240d4
Show file tree
Hide file tree
Showing 14 changed files with 1,236 additions and 93 deletions.
4 changes: 2 additions & 2 deletions icu4c/source/data/BUILDRULES.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def generate(config, glob, common_vars):
exit(1)

requests += generate_cnvalias(config, glob, common_vars)
requests += generate_ulayout(config, glob, common_vars)
requests += generate_confusables(config, glob, common_vars)
requests += generate_conversion_mappings(config, glob, common_vars)
requests += generate_brkitr_brk(config, glob, common_vars)
Expand All @@ -31,7 +32,6 @@ def generate(config, glob, common_vars):
requests += generate_coll_ucadata(config, glob, common_vars)
requests += generate_full_unicore_data(config, glob, common_vars)
requests += generate_unames(config, glob, common_vars)
requests += generate_ulayout(config, glob, common_vars)
requests += generate_misc(config, glob, common_vars)
requests += generate_curr_supplemental(config, glob, common_vars)
requests += generate_translit(config, glob, common_vars)
Expand Down Expand Up @@ -189,7 +189,7 @@ def generate_brkitr_brk(config, glob, common_vars):
RepeatedExecutionRequest(
name = "brkitr_brk",
category = "brkitr_rules",
dep_targets = [DepTarget("cnvalias")],
dep_targets = [DepTarget("cnvalias"), DepTarget("ulayout")],
input_files = input_files,
output_files = output_files,
tool = IcuTool("genbrk"),
Expand Down
10 changes: 10 additions & 0 deletions icu4c/source/data/brkitr/rules/char.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,13 @@ $Prepend = [\p{Grapheme_Cluster_Break = Prepend}];
$SpacingMark = [\p{Grapheme_Cluster_Break = SpacingMark}];

#
# From cldr/common/properties/segments/
# and issue CLDR-10994
#
$Virama = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Virama}];
$LinkingConsonant = [\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}&\p{Indic_Syllabic_Category=Consonant}];
$ExtCccZwj = [[\p{gcb=Extend}-\p{ccc=0}] \p{gcb=ZWJ}];

# Korean Syllable Definitions
#
$L = [\p{Grapheme_Cluster_Break = L}];
Expand Down Expand Up @@ -57,6 +64,9 @@ $L ($L | $V | $LV | $LVT);
# GB 9b
$Prepend [^$Control $CR $LF];

# GB 9.3, from CLDR-10994
$LinkingConsonant $ExtCccZwj* $Virama $ExtCccZwj* $LinkingConsonant;

# GB 11 Do not break within emoji modifier sequences or emoji zwj sequences.
$Extended_Pict $Extend* $ZWJ $Extended_Pict;

Expand Down
10 changes: 5 additions & 5 deletions icu4c/source/test/intltest/rbbimonkeytest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ CharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeStri
printf("epandedDef: %s\n", CStr(expandedDef)());
}

UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
LocalPointer<UnicodeSet> s(new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status), status);
if (U_FAILURE(status)) {
IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
u_errorName(status), CStr(name)());
return NULL;
IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s\n Expanded set definition: %s",
__FILE__, __LINE__, u_errorName(status), CStr(name)(), CStr(expandedDef)());
return nullptr;
}
CharClass *cclass = new CharClass(name, definition, expandedDef, s);
CharClass *cclass = new CharClass(name, definition, expandedDef, s.orphan());
CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
new UnicodeString(name), // Key, owned by hash table.
cclass, // Value, owned by hash table.
Expand Down
31 changes: 30 additions & 1 deletion icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1611,6 +1611,9 @@ class RBBICharMonkey: public RBBIMonkeyKind {
UnicodeSet *fLVTSet;
UnicodeSet *fHangulSet;
UnicodeSet *fExtendedPictSet;
UnicodeSet *fViramaSet;
UnicodeSet *fLinkingConsonantSet;
UnicodeSet *fExtCccZwjSet;
UnicodeSet *fAnySet;

const UnicodeString *fText;
Expand Down Expand Up @@ -1643,6 +1646,11 @@ RBBICharMonkey::RBBICharMonkey() {
fHangulSet->addAll(*fLVTSet);

fExtendedPictSet = new UnicodeSet(u"[:Extended_Pictographic:]", status);
fViramaSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
"\\p{Indic_Syllabic_Category=Virama}]", status);
fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
"\\p{Indic_Syllabic_Category=Consonant}]", status);
fExtCccZwjSet = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
fAnySet = new UnicodeSet(0, 0x10ffff);

fSets = new UVector(status);
Expand All @@ -1658,6 +1666,9 @@ RBBICharMonkey::RBBICharMonkey() {
fSets->addElement(fAnySet, status);
fSets->addElement(fZWJSet, status);
fSets->addElement(fExtendedPictSet, status);
fSets->addElement(fViramaSet, status);
fSets->addElement(fLinkingConsonantSet, status);
fSets->addElement(fExtCccZwjSet, status);
if (U_FAILURE(status)) {
deferredStatus = status;
}
Expand Down Expand Up @@ -1777,6 +1788,22 @@ int32_t RBBICharMonkey::next(int32_t prevPos) {
continue;
}

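// Rule (GB9.3) LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant
// Note: Viramas are also members of the ExtCccZwj class, so the backward scan over
//       ExtCccZwj characters below also steps over any Virama and records that one was seen.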
if (fLinkingConsonantSet->contains(c2)) {
int pi = p1;
bool sawVirama = false;
while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
if (fViramaSet->contains(fText->char32At(pi))) {
sawVirama = true;
}
pi = fText->moveIndex32(pi, -1);
}
if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
continue;
}
}

// Rule (GB11) Extended_Pictographic Extend * ZWJ x Extended_Pictographic
if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
continue;
Expand Down Expand Up @@ -1827,7 +1854,9 @@ RBBICharMonkey::~RBBICharMonkey() {
delete fAnySet;
delete fZWJSet;
delete fExtendedPictSet;
}
delete fViramaSet;
delete fLinkingConsonantSet;
delete fExtCccZwjSet;}

//------------------------------------------------------------------------------------------
//
Expand Down
Loading

0 comments on commit fa240d4

Please sign in to comment.