unicode-org · srl295 · Sep 15, 2023 · Oct 6, 2023 · FrankYFTang · Nov 20, 2023
diff --git a/icu4c/source/common/brkeng.cpp b/icu4c/source/common/brkeng.cpp
@@ -163,11 +163,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, const char* locale) {
 }
 
 const LanguageBreakEngine *
-ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
+ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char* locale) {
     UErrorCode status = U_ZERO_ERROR;
     UScriptCode code = uscript_getScript(c, &status);
     if (U_SUCCESS(status)) {
         const LanguageBreakEngine *engine = nullptr;
+        if (DictionaryBreakEngine::suppressScriptBreak(locale, code)) {
+            return nullptr; // -u-dx was requested
+        }
         // Try to use LSTM first
         const LSTMData *data = CreateLSTMDataForScript(code, status);
         if (U_SUCCESS(status)) {
@@ -188,7 +191,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, const char*) {
         DictionaryMatcher *m = loadDictionaryMatcherFor(code);
         if (m != nullptr) {
             switch(code) {
-            case USCRIPT_THAI:
+            case USCRIPT_THAI:                
                 engine = new ThaiBreakEngine(m, status);
                 break;
             case USCRIPT_LAO:

diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp
@@ -27,6 +27,7 @@
 #include "uassert.h"
 #include "unicode/normlzr.h"
 #include "cmemory.h"
+#include "cstring.h"
 #include "dictionarydata.h"
 
 U_NAMESPACE_BEGIN
@@ -42,7 +43,11 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
 }
 
 UBool
-DictionaryBreakEngine::handles(UChar32 c, const char*) const {
+DictionaryBreakEngine::handles(UChar32 c, const char* locale) const {
+    // Test set membership first: it is cheap, while the -u-dx check re-reads the locale keyword.
+    if (fSet.contains(c) && DictionaryBreakEngine::suppressScriptBreak(locale, c)) {
+        return false; // suppressed by ID
+    }
     return fSet.contains(c);
 }
 
@@ -85,6 +90,40 @@ DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
     fSet.compact();
 }
 
+// Returns true when the -u-dx keyword of `locale` (legacy form "@dx=thai-laoo") lists the
+// script `code`, or lists "zyyy", which suppresses dictionary breaking for every script.
+UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UScriptCode code) {
+    UErrorCode status = U_ZERO_ERROR;
+    char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
+    int32_t len = uloc_getKeywordValue(locale, "dx", buf, ULOC_KEYWORD_AND_VALUES_CAPACITY, &status);
+    if (U_FAILURE(status)) return false;
+    const char *shortName = uscript_getShortName(code); // loop-invariant, so looked up once
+    // The value is a run of 4-letter script subtags joined by '-', so step 5 bytes at a time.
+    for (int32_t i = 0; i < len; i += 5) {
+        // A subtag must be exactly 4 letters. buf is uninitialized past the returned length, so
+        // the separator is inspected only when it lies inside the value. Malformed: match fails.
+        if (i + 4 > len || (i + 4 < len && buf[i + 4] != '-')) {
+            break;
+        }
+        // Script subtags are case-insensitive, so both candidates are compared the same way.
+        if (!uprv_strnicmp(buf + i, "zyyy", 4) || !uprv_strnicmp(buf + i, shortName, 4)) {
+            return true; // matched 'all', or matched the specific script
+        }
+    }
+    return false;
+}
+
+// Code-point overload: checks the script of c; a failed script lookup suppresses nothing.
+UBool DictionaryBreakEngine::suppressScriptBreak(const char *locale, UChar32 c) {
+    UErrorCode errorCode = U_ZERO_ERROR;
+    const UScriptCode script = uscript_getScript(c, &errorCode);
+    if (U_SUCCESS(errorCode)) {
+        return suppressScriptBreak(locale, script);
+    }
+    return false;
+}
+
+
 /*
  ******************************************************************
  * PossibleWord

diff --git a/icu4c/source/common/dictbe.h b/icu4c/source/common/dictbe.h
@@ -113,6 +113,11 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
                                            UBool isPhraseBreaking,
                                            UErrorCode& status) const = 0;
 
+public:
+   /** @returns true if the -u-dx keyword of the specified locale lists the given script code, or lists Zyyy (all scripts) */
+   static UBool suppressScriptBreak(const char *locale, UScriptCode code); 
+   /** @returns true if the -u-dx keyword of the specified locale suppresses the script of char c; false if that script cannot be determined */
+   static UBool suppressScriptBreak(const char *locale, UChar32 c);
 };
 
 /*******************************************************************

diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp
@@ -25,10 +25,12 @@
 #include "unicode/ustring.h"
 #include "unicode/utext.h"
 #include "cmemory.h"
+#include "dictbe.h"
 #if !UCONFIG_NO_BREAK_ITERATION
 #include "unicode/filteredbrk.h"
 #include <stdio.h> // for snprintf
 #endif
+#include <unicode/uscript.h>
 /**
  * API Test the RuleBasedBreakIterator class
  */
@@ -1409,6 +1411,56 @@ void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
 #endif
 }
 
+
+
+/** Test helper: runs uloc_forLanguageTag() on locale, writing into buf; a null locale stays null. */
+const char *RBBIAPITest::forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale) {
+    if (locale == nullptr) return locale;
+    UErrorCode errorCode = U_ZERO_ERROR;
+    int32_t consumed;
+    uloc_forLanguageTag(locale, buf, ULOC_FULLNAME_CAPACITY, &consumed, &errorCode);
+    // The conversion is expected to succeed; on failure the error name is reported.
+    assertFalse(u_errorName(errorCode), U_FAILURE(errorCode));
+    return buf;
+}
+
+void RBBIAPITest::TestSuppressDictionary() {
+    char buf[ULOC_FULLNAME_CAPACITY]; // scratch buffer, overwritten by every forLangTag() call below
+
+    // Sanity checks of the forLangTag() helper: -u-dx subtags come back as "@dx=..." in lowercase
+    {
+        const char *t = forLangTag(buf, "en");
+        assertEquals(WHERE, "en", t);
+    }
+    {
+        const char *t = forLangTag(buf, "en-u-dx-Thai");
+        assertEquals(WHERE, "en@dx=thai", t);
+    }
+    {
+        const char *t = forLangTag(buf, "sss-u-dx-Thai-Laoo");
+        assertEquals(WHERE, "sss@dx=thai-laoo", t);
+    }
+    // suppressScriptBreak(): expected true when the script, or Zyyy, is listed under -u-dx; else false
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(nullptr, USCRIPT_COMMON));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_THAI));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai"), USCRIPT_HANGUL));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_COMMON));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy"), USCRIPT_HANGUL));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_LAO));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Thai-Laoo"), USCRIPT_HANGUL));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Laoo-Zyyy"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo-tz-gblon"), USCRIPT_THAI));
+    assertTrue(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "en-u-dx-Zyyy-Laoo"), USCRIPT_COMMON));
+
+    // Locales that carry no -u-dx keyword (including ones with other extensions) must not suppress
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh"), USCRIPT_MYANMAR));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-t-k0-plqdkbd"), USCRIPT_MYANMAR));
+    assertFalse(WHERE, DictionaryBreakEngine::suppressScriptBreak(forLangTag(buf, "tlh-u-tz-gblon-"), USCRIPT_MYANMAR));
+}
+
 //---------------------------------------------
 // runIndexedTest
 //---------------------------------------------
@@ -1439,6 +1491,7 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
 #if !UCONFIG_NO_BREAK_ITERATION
     TESTCASE_AUTO(TestFilteredBreakIteratorBuilder);
 #endif
+    TESTCASE_AUTO(TestSuppressDictionary);
     TESTCASE_AUTO_END;
 }
 

diff --git a/icu4c/source/test/intltest/rbbiapts.h b/icu4c/source/test/intltest/rbbiapts.h
@@ -90,6 +90,8 @@ class RBBIAPITest: public IntlTest {
 
     void TestRefreshInputText();
 
+    void TestSuppressDictionary();
+
     /**
      *Internal subroutines
      **/
@@ -99,6 +101,8 @@ class RBBIAPITest: public IntlTest {
     /*Internal subroutine used for comparison of expected and acquired results */
     void doTest(UnicodeString& testString, int32_t start, int32_t gotoffset, int32_t expectedOffset, const char* expected);
 
+    /** Helper: convert the language tag */
+    const char *forLangTag(char buf[ULOC_FULLNAME_CAPACITY], const char *locale);
 
 };
 

diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt
@@ -1525,6 +1525,31 @@ Bangkok)•</data>
 #
 ####################################################################################
 
+# -u-dx (exclude script)
+#<locale th
+
+
+#<line>
+
+<locale sss@dx=thai>
+<line>
+# Should no longer break at the dictionary points - it's not the Thai language
+# Short Test
+<data>•โอํน• อะไป •จู่วาม •โล่น•</data>
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+#<word>
+# Should no longer break at the dictionary points - it's not the Thai language
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+
+#<locale sss@dx=zyyy>
+#<line>
+# Should no longer break at the dictionary points - it's not the Thai language
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+#<word>
+# Should no longer break at the dictionary points - it's not the Thai language
+#<data>•โอํน• •อะไป• •จู่วาม• •โล่น• •เปี่ยร• •อะลู่วาง• •แมะ,• •ปาย• •อัน• •แบ็จ• •อะโจํน• •ซา• •เมาะ.• •อัน• •ฮะบืน• •ตะ• •เวี่ยะ• •ตะ• •งี่ยาน,• •อัน• •ฮะบืน• •อีว• •อะปายฮ.•</data>
+
+
 # Japanese line break tailoring test
 
 <locale ja>