From 3450333aaa2ebf127872b5cb42be3720cf400075 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 15 Sep 2023 10:13:45 +0100 Subject: [PATCH 1/2] ICU-13219 work on implementation of -u-dx --- icu4c/source/common/brkeng.cpp | 7 +++- icu4c/source/common/dictbe.cpp | 41 ++++++++++++++++++- icu4c/source/common/dictbe.h | 5 +++ icu4c/source/test/intltest/rbbiapts.cpp | 53 +++++++++++++++++++++++++ icu4c/source/test/intltest/rbbiapts.h | 4 ++ icu4c/source/test/testdata/rbbitst.txt | 21 ++++++++++ 6 files changed, 128 insertions(+), 3 deletions(-) diff --git a/icu4c/source/common/brkeng.cpp b/icu4c/source/common/brkeng.cpp index c8442310b8b3..0bb8a277eba7 100644 --- a/icu4c/source/common/brkeng.cpp +++ b/icu4c/source/common/brkeng.cpp @@ -163,11 +163,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) { } const LanguageBreakEngine * -ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) { +ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char* locale) { UErrorCode status = U_ZERO_ERROR; UScriptCode code = uscript_getScript(c, &status); if (U_SUCCESS(status)) { const LanguageBreakEngine *engine = nullptr; + if (DictionaryBreakEngine::suppressScriptBreak(locale, code)) { + return nullptr; // -u-dx was requested + } // Try to use LSTM first const LSTMData *data = CreateLSTMDataForScript(code, status); if (U_SUCCESS(status)) { @@ -188,7 +191,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) { DictionaryMatcher *m = loadDictionaryMatcherFor(code); if (m != nullptr) { switch(code) { - case USCRIPT_THAI: + case USCRIPT_THAI: engine = new ThaiBreakEngine(m, status); break; case USCRIPT_LAO: diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 3d672c03bfb3..61cd7ff3dc42 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -27,6 +27,7 @@ #include "uassert.h" #include "unicode/normlzr.h" #include "cmemory.h" +#include "cstring.h" #include "dictionarydata.h" U_NAMESPACE_BEGIN @@ -42,7 +43,11 @@ DictionaryBreakEngine::~DictionaryBreakEngine() { } UBool -DictionaryBreakEngine::handles(UChar32 c, const char*) const { +DictionaryBreakEngine::handles(UChar32 c, const char* locale) const { + if (DictionaryBreakEngine::suppressScriptBreak(locale, c)) { + // suppressed by ID + return false; + } return fSet.contains(c); } @@ -85,6 +90,40 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { fSet.compact(); } +UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UScriptCode code) { + // get the keyword value + UErrorCode status = U_ZERO_ERROR; + char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; + int32_t len = uloc_getKeywordValue(locale, "dx", buf, ULOC_KEYWORD_AND_VALUES_CAPACITY, &status); + if (U_FAILURE(status)) return false; + // loop over the keyword values + for(int32_t i =0; i 'hira\0kata' + } // else: possibly malformed, let match fail + + const char *scriptName = buf+i; + if (!uprv_strncmp(scriptName, "zyyy", 4)) { + return true; // matched 'all' + } else if(!uprv_strnicmp(scriptName, uscript_getShortName(code), 4)) { + return true; // matched the specific script + } + } + return false; +} + +UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UChar32 c) { + UErrorCode status = U_ZERO_ERROR; + UScriptCode code = uscript_getScript(c, &status); + if (U_FAILURE(status)) { + return false; + } else { + return suppressScriptBreak(locale, code); + } +} + + /* ****************************************************************** * PossibleWord diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h index e512071fa45d..04542f219323 100644 --- a/icu4c/source/common/dictbe.h +++ b/icu4c/source/common/dictbe.h @@ -113,6 +113,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine { UBool isPhraseBreaking, UErrorCode& status) const = 0; +public: + /** @returns true if the specified code is suppressed by the specified locale, -u-dx */ + static UBool suppressScriptBreak(const char *locale, UScriptCode code); + /** @returns true if the specified char is suppressed by the specified locale, -u-dx */ + static UBool suppressScriptBreak(const char *locale, UChar32 c); }; /******************************************************************* diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index d6d1a890d492..bb24f1168c5a 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -25,10 +25,12 @@ #include "unicode/ustring.h" #include "unicode/utext.h" #include "cmemory.h" +#include "dictbe.h" #if !UCONFIG_NO_BREAK_ITERATION #include "unicode/filteredbrk.h" #include // for snprintf #endif +#include /** * API Test the RuleBasedBreakIterator class */ @@ -1409,6 +1411,56 @@ void RBBIAPITest::TestFilteredBreakIteratorBuilder() { #endif } + + +/** helper function for testing*/ +const char *RBBIAPITest::forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale) { + if(!locale) return locale; + UErrorCode status = U_ZERO_ERROR; + int32_t parsedLength; + uloc_forLanguageTag(locale, buf, ULOC_FULLNAME_CAPACITY, &parsedLength, &status); + // verify no err + assertFalse(u_errorName(status), U_FAILURE(status)); + return buf; +} + +void RBBIAPITest::TestSuppressDictionary() { + char buf[ULOC_FULLNAME_CAPACITY]; + + // sanity checks of our internal function + { + const char *t = forLangTag(buf, "en"); + assertEquals(WHERE, "en", t); + } + { + const char *t = forLangTag(buf, "en-u-dx-Thai"); + assertEquals(WHERE, "en@dx=thai", t); + } + { + const char *t = forLangTag(buf, "sss-u-dx-Thai-Laoo"); + assertEquals(WHERE, "sss@dx=thai-laoo", t); + } + + assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(nullptr, USCRIPT_COMMON)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_THAI)); + assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_HANGUL)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_COMMON)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_THAI)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_HANGUL)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_THAI)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_LAO)); + assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_HANGUL)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Laoo-Zyyy"), USCRIPT_THAI)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_THAI)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo-tz-gblon"), USCRIPT_THAI)); + assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_COMMON)); + + // try where there's no -u-dx + assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh"), USCRIPT_MYANMAR)); + assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-t-k0-plqdkbd"), USCRIPT_MYANMAR)); + assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-u-tz-gblon-"), USCRIPT_MYANMAR)); +} + //--------------------------------------------- // runIndexedTest //--------------------------------------------- @@ -1439,6 +1491,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name, #if !UCONFIG_NO_BREAK_ITERATION TESTCASE_AUTO(TestFilteredBreakIteratorBuilder); #endif + TESTCASE_AUTO(TestSuppressDictionary); TESTCASE_AUTO_END; } diff --git a/icu4c/source/test/intltest/rbbiapts.h b/icu4c/source/test/intltest/rbbiapts.h index d65a2bc52197..15df5a5479f2 100644 --- a/icu4c/source/test/intltest/rbbiapts.h +++ b/icu4c/source/test/intltest/rbbiapts.h @@ -90,6 +90,8 @@ class RBBIAPITest: public IntlTest { void TestRefreshInputText(); + void TestSuppressDictionary(); + /** *Internal subroutines **/ @@ -99,6 +101,8 @@ class RBBIAPITest: public IntlTest { /*Internal subroutine used for comparison of expected and acquired results */ void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected); + /** Helper: convert the language tag */ + const char *forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale); }; diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 274b4051d249..84c1e4330286 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1525,6 +1525,27 @@ Bangkok)• # #################################################################################### +# -u-dx (exclude script) + + + + + +# Should no longer break at the dictionary points - it's not Thai language +•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.• +# +# Should no longer break at the dictionary points - it's not the Thai language +#•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.• + +# +# +# Should no longer break at the dictionary points - it's not Thai language +#•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.• +# +# Should no longer break at the dictionary points - it's not the Thai language +#•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.• + + # Japanese line break tailoring test From c140c6dde5e295066f0921e4f857ba782861b187 Mon Sep 17 00:00:00 2001 From: "Steven R. Loomis" Date: Fri, 6 Oct 2023 12:21:48 -0500 Subject: [PATCH 2/2] WIP --- icu4c/source/test/testdata/rbbitst.txt | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 84c1e4330286..b85c5b486255 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -1526,13 +1526,17 @@ Bangkok)• #################################################################################### # -u-dx (exclude script) - - +# # Should no longer break at the dictionary points - it's not Thai language -•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.• +# Short Test +•โอํน• อะไป •จู่วาม •โล่น• +#•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.• # # Should no longer break at the dictionary points - it's not the Thai language #•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•