Skip to content

Commit

Permalink
ICU-22785 move Block bits from propsvec0 to new trie
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Jun 4, 2024
1 parent 81492ae commit 0d8a3cc
Show file tree
Hide file tree
Showing 14 changed files with 2,864 additions and 2,614 deletions.
3 changes: 3 additions & 0 deletions icu4c/source/common/characterproperties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,9 @@ void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) {
case UPROPS_SRC_ID_COMPAT_MATH:
uprops_addPropertyStarts(src, &sa, &errorCode);
break;
case UPROPS_SRC_BLOCK:
ublock_addPropertyStarts(&sa, errorCode);
break;
default:
errorCode = U_INTERNAL_PROGRAM_ERROR;
break;
Expand Down
28 changes: 27 additions & 1 deletion icu4c/source/common/uchar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/ucptrie.h"
#include "unicode/uscript.h"
#include "unicode/udata.h"
#include "uassert.h"
Expand Down Expand Up @@ -515,6 +516,8 @@ uprv_getMaxValues(int32_t column) {
return indexes[UPROPS_MAX_VALUES_INDEX];
case 2:
return indexes[UPROPS_MAX_VALUES_2_INDEX];
case UPROPS_MAX_VALUES_OTHER_INDEX:
return indexes[column];
default:
return 0;
}
Expand Down Expand Up @@ -618,7 +621,15 @@ uscript_getScriptExtensions(UChar32 c,

/**
 * Returns the Block property value for a code point.
 *
 * The value is read from the dedicated Block code point trie (block_trie),
 * not from the properties vectors word 0 where the Block bits used to live.
 *
 * @param c the code point to look up; values outside 0..0x10ffff
 *          (including negative ones) yield the trie's error value
 * @return the block code stored for c
 */
U_CAPI UBlockCode U_EXPORT2
ublock_getCode(UChar32 c) {
    // We store Block values indexed by the code point shifted right 4 bits
    // and use a "small" UCPTrie=CodePointTrie for minimal data size.
    // This works because blocks have xxx0..xxxF ranges.
    uint32_t c4 = c;  // unsigned so that shifting right does not worry the compiler
    // Shift unless out of range, in which case we fetch the trie's error value.
    // A negative c becomes a large unsigned value and is also left unshifted.
    if (c4 <= 0x10ffff) {
        c4 >>= 4;
    }
    return (UBlockCode)ucptrie_get(&block_trie, c4);
}

/* property starts for UnicodeSet ------------------------------------------- */
Expand Down Expand Up @@ -706,3 +717,18 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
/* add the start code point of each same-value range of the properties vectors trie */
utrie2_enum(&propsVectorsTrie, nullptr, _enumPropertyStartsRange, sa);
}

/**
 * Adds the start code point of each same-value range of the Block trie
 * to the given set adder.
 *
 * The trie is indexed by (code point >> 4), see ublock_getCode(),
 * so each range start is scaled back up by 4 bits before it is added.
 * The error code parameter is not used.
 */
U_CFUNC void U_EXPORT2
ublock_addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) {
    // Limit of the trie index space: (max code point + 1) >> 4
    constexpr UChar32 kIndexLimit = 0x11000;
    for (UChar32 rangeStart = 0; rangeStart < kIndexLimit;) {
        uint32_t rangeValue;
        const UChar32 rangeEnd =
            ucptrie_getRange(&block_trie, rangeStart, UCPMAP_RANGE_NORMAL, 0,
                             nullptr, nullptr, &rangeValue);
        if (rangeEnd < 0) {
            // No further range.
            break;
        }
        sa->add(sa->set, rangeStart << 4);
        rangeStart = rangeEnd + 1;
    }
}
5,213 changes: 2,647 additions & 2,566 deletions icu4c/source/common/uchar_props_data.h

Large diffs are not rendered by default.

10 changes: 9 additions & 1 deletion icu4c/source/common/uprops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,14 @@ static int32_t biDiGetMaxValue(const IntProperty &/*prop*/, UProperty which) {
return ubidi_getMaxValue(which);
}

static int32_t getBlock(const IntProperty &/*prop*/, UChar32 c, UProperty /*which*/) {
return (int32_t)ublock_getCode(c);
}

/**
 * IntProperty max-value getter for the Block property.
 * Reads the "other" max-values word and keeps only the bits selected by UPROPS_MAX_BLOCK.
 */
static int32_t blockGetMaxValue(const IntProperty &/*prop*/, UProperty /*which*/) {
    const auto otherMaxValues = uprv_getMaxValues(UPROPS_MAX_VALUES_OTHER_INDEX);
    return otherMaxValues & UPROPS_MAX_BLOCK;
}

#if UCONFIG_NO_NORMALIZATION
static int32_t getCombiningClass(const IntProperty &, UChar32, UProperty) {
return 0;
Expand Down Expand Up @@ -683,7 +691,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
* For them, column is the UPropertySource value.
*/
{ UPROPS_SRC_BIDI, 0, 0, getBiDiClass, biDiGetMaxValue },
{ 0, UPROPS_BLOCK_MASK, UPROPS_BLOCK_SHIFT, defaultGetValue, defaultGetMaxValue },
{ UPROPS_SRC_BLOCK, 0, 0, getBlock, blockGetMaxValue },
{ UPROPS_SRC_NFC, 0, 0xff, getCombiningClass, getMaxValueFromShift },
{ 2, UPROPS_DT_MASK, 0, defaultGetValue, defaultGetMaxValue },
{ 0, UPROPS_EA_MASK, UPROPS_EA_SHIFT, defaultGetValue, defaultGetMaxValue },
Expand Down
40 changes: 33 additions & 7 deletions icu4c/source/common/uprops.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,18 @@ enum {

UPROPS_SCRIPT_EXTENSIONS_INDEX,

UPROPS_RESERVED_INDEX_7,
UPROPS_BLOCK_TRIE_INDEX,
UPROPS_RESERVED_INDEX_8,

/* size of the data file (number of 32-bit units after the header) */
/** size of the data file (number of 32-bit units after the header) */
UPROPS_DATA_TOP_INDEX,

/* maximum values for code values in vector word 0 */
/** maximum values for code values in vector word 0 */
UPROPS_MAX_VALUES_INDEX=10,
/* maximum values for code values in vector word 2 */
/** maximum values for code values in vector word 2 */
UPROPS_MAX_VALUES_2_INDEX,
/** maximum values for other code values */
UPROPS_MAX_VALUES_OTHER_INDEX,

UPROPS_INDEX_COUNT=16
};
Expand Down Expand Up @@ -117,6 +119,7 @@ enum {
/* number of properties vector words */
#define UPROPS_VECTOR_WORDS 3

// TODO: merge scx+Script bit sets together
/*
* Properties in vector word 0
* Bits
Expand All @@ -129,7 +132,7 @@ enum {
* 0: Script=bits 21..20 & 7..0
* 21..20 Bits 9..8 of the UScriptCode, or index to Script_Extensions
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 16.. 8 reserved since format version 9; was UBlockCode
* 7.. 0 UScriptCode, or index to Script_Extensions
*/

Expand All @@ -150,8 +153,8 @@ enum {
#define UPROPS_EA_MASK 0x000e0000
#define UPROPS_EA_SHIFT 17

#define UPROPS_BLOCK_MASK 0x0001ff00
#define UPROPS_BLOCK_SHIFT 8
// #define UPROPS_BLOCK_MASK 0x0001ff00
// #define UPROPS_BLOCK_SHIFT 8

#define UPROPS_SCRIPT_LOW_MASK 0x000000ff

Expand Down Expand Up @@ -319,6 +322,17 @@ inline constexpr uint8_t uprops_idTypeToEncoded[] = {

#define UPROPS_DT_MASK 0x0000001f

// C++ only: the constant below uses constexpr, which is not available to C includers of this header.
#ifdef __cplusplus

namespace {

// Mask for the maximum Block value:
// Bits 9..0 in UPROPS_MAX_VALUES_OTHER_INDEX
inline constexpr uint32_t UPROPS_MAX_BLOCK = 0x3ff;

} // namespace

#endif // __cplusplus

/**
* Gets the main properties value for a code point.
* Implemented in uchar.c for uprops.cpp.
Expand Down Expand Up @@ -392,6 +406,8 @@ enum {
ZWNBSP =0xfeff
};

// TODO: Move these two functions into a different header file (new unames.h?) so that uprops.h
// need not be C-compatible any more.
/**
* Get the maximum length of a (regular/1.0/extended) character name.
* @return 0 if no character names available.
Expand Down Expand Up @@ -445,6 +461,7 @@ enum UPropertySource {
UPROPS_SRC_EMOJI,
UPROPS_SRC_IDSU,
UPROPS_SRC_ID_COMPAT_MATH,
UPROPS_SRC_BLOCK,
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
UPROPS_SRC_COUNT
};
Expand Down Expand Up @@ -476,6 +493,13 @@ upropsvec_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
U_CFUNC void U_EXPORT2
uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *pErrorCode);

// C++ only: the declaration below takes UErrorCode by reference.
#ifdef __cplusplus

/**
 * Adds the start code point of each same-value range of the Block data
 * to the set adder.
 *
 * @param sa the set adder that receives the range start code points
 * @param errorCode ICU error code, passed by reference
 */
U_CFUNC void U_EXPORT2
ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode);

#endif // __cplusplus

/**
* Return a set of characters for property enumeration.
* For each two consecutive characters (start, limit) in the set,
Expand All @@ -488,6 +512,8 @@ uprops_addPropertyStarts(UPropertySource src, const USetAdder *sa, UErrorCode *p
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
*/

// TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h
// need not be C-compatible any more.
/**
* Swap the ICU Unicode character names file. See uchar.c.
* @internal
Expand Down
Binary file modified icu4c/source/data/in/uprops.icu
Binary file not shown.
20 changes: 10 additions & 10 deletions icu4c/source/data/unidata/changes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ export CLDR_SRC=~/cldr/uni/src
export ICU_ROOT=~/icu/uni
export ICU_SRC=$ICU_ROOT/src
export ICU_OUT=$ICU_ROOT/dbg
export ICUDT=icudt75b
export ICUDT=icudt76b
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
export LD_LIBRARY_PATH=$ICU_OUT/icu4c/lib
Expand All @@ -86,7 +86,7 @@ export CLDR_SRC=~/oss/cldr/mine/src
export ICU_ROOT=~/oss/icu
export ICU_SRC=$ICU_ROOT
export ICU_OUT=$ICU_ROOT
export ICUDT=icudt75b
export ICUDT=icudt76b
export ICU4C_DATA_IN=$ICU_SRC/icu4c/source/data/in
export ICU4C_UNIDATA=$ICU_SRC/icu4c/source/data/unidata
export LD_LIBRARY_PATH=$ICU_OUT/icu4c/lib
Expand Down Expand Up @@ -326,14 +326,14 @@ TODO
output:
...
make[1]: Entering directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt75b
mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt75b
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt75l.dat ./out/icu4j/icudt75b.dat -s ./out/build/icudt75l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt75b
mv ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt75b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt75b"
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt75b/
mkdir -p ./out/icu4j/com/ibm/icu/impl/data/icudt76b
mkdir -p ./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt76b
LD_LIBRARY_PATH=../lib:../stubdata:../tools/ctestfw:$LD_LIBRARY_PATH ../bin/icupkg ./out/tmp/icudt76l.dat ./out/icu4j/icudt76b.dat -s ./out/build/icudt76l -x '*' -tb -d ./out/icu4j/com/ibm/icu/impl/data/icudt76b
mv ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/zoneinfo64.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/metaZones.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/timezoneTypes.res" ./out/icu4j/"com/ibm/icu/impl/data/icudt76b/windowsZones.res" "./out/icu4j/tzdata/com/ibm/icu/impl/data/icudt76b"
jar cf ./out/icu4j/icudata.jar -C ./out/icu4j com/ibm/icu/impl/data/icudt76b/
mkdir -p /tmp/icu4j/main/shared/data
cp ./out/icu4j/icudata.jar /tmp/icu4j/main/shared/data
jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt75b/
jar cf ./out/icu4j/icutzdata.jar -C ./out/icu4j/tzdata com/ibm/icu/impl/data/icudt76b/
mkdir -p /tmp/icu4j/main/shared/data
cp ./out/icu4j/icutzdata.jar /tmp/icu4j/main/shared/data
make[1]: Leaving directory '/usr/local/google/home/mscherer/icu/uni/dbg/icu4c/data'
Expand All @@ -343,8 +343,8 @@ TODO
cp -v com/ibm/icu/impl/data/$ICUDT/brkitr/* $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT/brkitr
cp -v com/ibm/icu/impl/data/$ICUDT/confusables.cfu $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT
cp -v com/ibm/icu/impl/data/$ICUDT/*.nrm $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT
cd com/ibm/icu/impl/data/$ICUDT/
ls *.icu | egrep -v "cnvalias.icu" | awk '{print "cp " $0 " $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/$ICUDT";}' | sh
cd com/ibm/icu/impl/data/icudata/
ls *.icu | egrep -v "cnvalias.icu" | awk '{print "cp " $0 " $ICU_SRC/icu4j/main/core/src/main/resources/com/ibm/icu/impl/data/icudata";}' | sh
- The procedure above is very conservative:
It refreshes only the parts of the ICU4J data that we think are affected by a Unicode data update.
It avoids dealing with any other discrepancies
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/test/cintltst/udatatst.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "ucol_imp.h"
#include "ucol_swp.h"
#include "ucnv_bld.h"
#include "udataswp.h"
#include "sprpimpl.h"
#include "rbbidata.h"

Expand Down
2 changes: 1 addition & 1 deletion icu4c/source/test/depstest/dependencies.txt
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ group: script_runs
group: uchar
uchar.o
deps
utrie2
ucptrie utrie2

group: messagepattern # for MessageFormat and tools
messagepattern.o
Expand Down
14 changes: 11 additions & 3 deletions icu4c/source/tools/toolutil/swapimpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,13 +296,21 @@ uprops_swap(const UDataSwapper *ds,
// SCX const uint16_t scriptExtensions[2*(i7-i6)];
ds->swapArray16(ds,
inData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
4*(dataIndexes[UPROPS_RESERVED_INDEX_7]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
4*(dataIndexes[UPROPS_BLOCK_TRIE_INDEX]-dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]),
outData32+dataIndexes[UPROPS_SCRIPT_EXTENSIONS_INDEX],
pErrorCode);

// Swap the Block UCPTrie=CodePointTrie.
int32_t partOffset = dataIndexes[UPROPS_BLOCK_TRIE_INDEX];
int32_t nextOffset = dataIndexes[UPROPS_RESERVED_INDEX_8];
int32_t partLength = 4 * (nextOffset - partOffset);
if (partLength >= 0) {
utrie_swapAnyVersion(ds, inData32 + partOffset, partLength,
outData32 + partOffset, pErrorCode);
}
}

/* i8 reservedIndex8; -- 32-bit unit index to the top of the Block trie data */
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_7];
return headerSize+4*dataIndexes[UPROPS_RESERVED_INDEX_8];
}

/* Unicode case mapping data swapping --------------------------------------- */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ private static UnicodeSet getInclusionsForSource(int src) {
case UCharacterProperty.SRC_ID_COMPAT_MATH:
UCharacterProperty.mathCompat_addPropertyStarts(incl);
break;
case UCharacterProperty.SRC_BLOCK:
UCharacterProperty.INSTANCE.ublock_addPropertyStarts(incl);
break;
default:
throw new IllegalStateException("getInclusions(unknown src " + src + ")");
}
Expand Down
Loading

0 comments on commit 0d8a3cc

Please sign in to comment.