Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP- ICU-13219 -u-dx #2676

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 5 additions & 2 deletions icu4c/source/common/brkeng.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,11 +163,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
}

const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char* locale) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
const LanguageBreakEngine *engine = nullptr;
if (DictionaryBreakEngine::suppressScriptBreak(locale, code)) {
return nullptr; // -u-dx was requested
}
// Try to use LSTM first
const LSTMData *data = CreateLSTMDataForScript(code, status);
if (U_SUCCESS(status)) {
Expand All @@ -188,7 +191,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
if (m != nullptr) {
switch(code) {
case USCRIPT_THAI:
case USCRIPT_THAI:
engine = new ThaiBreakEngine(m, status);
break;
case USCRIPT_LAO:
Expand Down
41 changes: 40 additions & 1 deletion icu4c/source/common/dictbe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "cstring.h"
#include "dictionarydata.h"

U_NAMESPACE_BEGIN
Expand All @@ -42,7 +43,11 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
}

UBool
DictionaryBreakEngine::handles(UChar32 c, const char*) const {
DictionaryBreakEngine::handles(UChar32 c, const char* locale) const {
if (DictionaryBreakEngine::suppressScriptBreak(locale, c)) {
// suppressed by ID
return false;
}
return fSet.contains(c);
}

Expand Down Expand Up @@ -85,6 +90,40 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
fSet.compact();
}

UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UScriptCode code) {
// get the keyword value
UErrorCode status = U_ZERO_ERROR;
char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
int32_t len = uloc_getKeywordValue(locale, "dx", buf, ULOC_KEYWORD_AND_VALUES_CAPACITY, &status);
if (U_FAILURE(status)) return false;
// loop over the keyword values
for(int32_t i =0; i<len; i+= 5) {
// turn hyphen into a null
if(buf[i+4] != 0 && buf[i+4] == '-') {
buf[i+4] = 0; // terminate (in buffer): 'hira-kata' -> 'hira\0kata'
} // else: possibly malformed, let match fail

const char *scriptName = buf+i;
if (!uprv_strncmp(scriptName, "zyyy", 4)) {
return true; // matched 'all'
} else if(!uprv_strnicmp(scriptName, uscript_getShortName(code), 4)) {
return true; // matched the specific script
}
}
return false;
}

UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UChar32 c) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_FAILURE(status)) {
return false;
} else {
return suppressScriptBreak(locale, code);
}
}


/*
******************************************************************
* PossibleWord
Expand Down
5 changes: 5 additions & 0 deletions icu4c/source/common/dictbe.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
UBool isPhraseBreaking,
UErrorCode& status) const = 0;

public:
/** @returns true if the specified code is suppressed by the specified locale, -u-dx */
static UBool suppressScriptBreak(const char *locale, UScriptCode code);
/** @returns true if the specified char is suppressed by the specified locale, -u-dx */
static UBool suppressScriptBreak(const char *locale, UChar32 c);
};

/*******************************************************************
Expand Down
53 changes: 53 additions & 0 deletions icu4c/source/test/intltest/rbbiapts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@
#include "unicode/ustring.h"
#include "unicode/utext.h"
#include "cmemory.h"
#include "dictbe.h"
#if !UCONFIG_NO_BREAK_ITERATION
#include "unicode/filteredbrk.h"
#include <stdio.h> // for snprintf
#endif
#include <unicode/uscript.h>
/**
* API Test the RuleBasedBreakIterator class
*/
Expand Down Expand Up @@ -1409,6 +1411,56 @@ void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
#endif
}



/** helper function for testing*/
const char *RBBIAPITest::forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale) {
if(!locale) return locale;
UErrorCode status = U_ZERO_ERROR;
int32_t parsedLength;
uloc_forLanguageTag(locale, buf, ULOC_FULLNAME_CAPACITY, &parsedLength, &status);
// verify no err
assertFalse(u_errorName(status), U_FAILURE(status));
return buf;
}

void RBBIAPITest::TestSuppressDictionary() {
char buf[ULOC_FULLNAME_CAPACITY];

// sanity checks of our internal function
{
const char *t = forLangTag(buf, "en");
assertEquals(WHERE, "en", t);
}
{
const char *t = forLangTag(buf, "en-u-dx-Thai");
assertEquals(WHERE, "en@dx=thai", t);
}
{
const char *t = forLangTag(buf, "sss-u-dx-Thai-Laoo");
assertEquals(WHERE, "sss@dx=thai-laoo", t);
}

assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(nullptr, USCRIPT_COMMON));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_THAI));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_HANGUL));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_COMMON));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_HANGUL));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_LAO));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_HANGUL));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Laoo-Zyyy"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo-tz-gblon"), USCRIPT_THAI));
assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_COMMON));

// try where there's no -u-dx
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh"), USCRIPT_MYANMAR));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-t-k0-plqdkbd"), USCRIPT_MYANMAR));
assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-u-tz-gblon-"), USCRIPT_MYANMAR));
}

//---------------------------------------------
// runIndexedTest
//---------------------------------------------
Expand Down Expand Up @@ -1439,6 +1491,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
#if !UCONFIG_NO_BREAK_ITERATION
TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
#endif
TESTCASE_AUTO(TestSuppressDictionary);
TESTCASE_AUTO_END;
}

Expand Down
4 changes: 4 additions & 0 deletions icu4c/source/test/intltest/rbbiapts.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ class RBBIAPITest: public IntlTest {

void TestRefreshInputText();

void TestSuppressDictionary();

/**
*Internal subroutines
**/
Expand All @@ -99,6 +101,8 @@ class RBBIAPITest: public IntlTest {
/*Internal subroutine used for comparison of expected and acquired results */
void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);

/** Helper: convert the language tag */
const char *forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale);

};

Expand Down
25 changes: 25 additions & 0 deletions icu4c/source/test/testdata/rbbitst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1525,6 +1525,31 @@ Bangkok)•</data>
#
####################################################################################

# -u-dx (exclude script)
#<locale th


#<line>

<locale sss@dx=thai>
<line>
# Should no longer break at the dictionary points - it's not Thai language
# Short Test
<data>•โอํน• อะไป •จู่วาม •โล่น•</data>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be

<data>•โอํน •อะไป •จู่วาม •โล่น•</data>
```
the line break should happen after the ' ' not before. 

#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should be

<data>•โอํน •อะไป •จู่วาม •โล่น •เปี่ยร •อะลู่วาง •แมะ, •ปาย •อัน •แบ็จ •อะโจํน •ซา •เมาะ. •อัน •ฮะบืน •ตะ •เวี่ยะ •ตะ •งี่ยาน, •อัน •ฮะบืน •อีว •อะปายฮ.•</data>

there should be no line break before the ' ' but only after. right? @eggrobin

#<word>
# Should no longer break at the dictionary points - it's not the Thai language
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For work break, this line should be

<data>•โอํน<200> •อะไป<200> •จู่วาม<200> •โล่น<200> •เปี่ยร<200> •อะลู่วาง<200> •แมะ<200>,• •ปาย<200> •อัน<200> •แบ็จ<200> •อะโจํน<200> •ซา<200> •เมาะ<200>.• •อัน<200> •ฮะบืน<200> •ตะ<200> •เวี่ยะ<200> •ตะ<200> •งี่ยาน<200>,• •อัน<200> •ฮะบืน<200> •อีว<200> •อะปายฮ<200>.•</data>
  1. the break should have status 200 (notice in the beginning of the test file it said " Break position, status == nnn" and ICU use 200 for "Tag value for words that contain letters, excluding hiragana, katakana or ideographic characters" https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ubrk_8h.html#af9836cc79482f82ac12eefb1f70b14b9a945eb6dd49881b36d4e2c9f592d97197 )
  2. the '.' (fullstop) and ',' (comma) should be a separated word, not attached to the previous token as part of the word, right? @eggrobin


#<locale sss@dx=zyyy>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

using dx=zyyy is very troublesome . I filed a CLDR bug in https://unicode-org.atlassian.net/browse/CLDR-17247 for that. From my point of view, that is a very bad way to exclude "all scripts"

#<line>
# Should no longer break at the dictionary points - it's not Thai language
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be

<data>•โอํน •อะไป •จู่วาม •โล่น •เปี่ยร •อะลู่วาง •แมะ, •ปาย •อัน •แบ็จ •อะโจํน •ซา •เมาะ. •อัน •ฮะบืน •ตะ •เวี่ยะ •ตะ •งี่ยาน, •อัน •ฮะบืน •อีว •อะปายฮ.•</data>

there should have no line break before the " " SPACE

#<word>
# Should no longer break at the dictionary points - it's not the Thai language
#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be

<data>•โอํน<200> •อะไป<200> •จู่วาม<200> •โล่น<200> •เปี่ยร<200> •อะลู่วาง<200> •แมะ<200>,• •ปาย<200> •อัน<200> •แบ็จ<200> •อะโจํน<200> •ซา<200> •เมาะ<200>.• •อัน<200> •ฮะบืน<200> •ตะ<200> •เวี่ยะ<200> •ตะ<200> •งี่ยาน<200>,• •อัน<200> •ฮะบืน<200> •อีว<200> •อะปายฮ<200>.•</data>

there should be a break before the . (fullstop) and , (comma), right? (see above)



# Japanese line break tailoring test

<locale ja>
Expand Down