Skip to content

Commit

Permalink
ICU-22758 Make icuexportdata compatible with Unicode 16 normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
hsivonen authored and markusicu committed May 6, 2024
1 parent 69cb085 commit 7eaefe0
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 26 deletions.
4 changes: 2 additions & 2 deletions .ci-builds/.azure-pipelines-icu4c.yml
Original file line number Diff line number Diff line change
Expand Up @@ -588,9 +588,9 @@ jobs:
- script: |
cd icu4c/source
mkdir -p icuexportdata/norm/fast
# TODO ./bin/icuexportdata --mode norm --index --copyright --verbose --destdir icuexportdata/norm/fast --trie-type fast --all
./bin/icuexportdata --mode norm --index --copyright --verbose --destdir icuexportdata/norm/fast --trie-type fast --all
mkdir -p icuexportdata/norm/small
# TODO ./bin/icuexportdata --mode norm --index --copyright --verbose --destdir icuexportdata/norm/small --trie-type small --all
./bin/icuexportdata --mode norm --index --copyright --verbose --destdir icuexportdata/norm/small --trie-type small --all
displayName: 'Build normalization data files'
env:
LD_LIBRARY_PATH: lib
Expand Down
58 changes: 34 additions & 24 deletions icu4c/source/tools/icuexportdata/icuexportdata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ void writeDecompositionData(const char* basename, uint32_t baseSize16, uint32_t
for (int32_t i = pendingTrieInsertions.size() - 1; i >= 0; --i) {
const PendingDescriptor& pending = pendingTrieInsertions[i];
uint32_t additional = 0;
if (!(pending.descriptor & 0xFFFE0000)) {
if (!(pending.descriptor & 0xFFFC0000)) {
uint32_t offset = pending.descriptor & 0xFFF;
if (!pending.supplementary) {
if (offset >= baseSize16) {
Expand Down Expand Up @@ -678,8 +678,19 @@ const int32_t BACKWARD_COMBINING_STARTER_MARKER = 1;

/// Flag recorded for a complex decomposition that does not
/// round-trip under re-composition.
///
/// This is the raw flag value (lowest bit); code that builds a
/// trie descriptor shifts it left by 16 before OR-ing it in.
///
/// TODO: When taking a data format break, swap this around with
/// `BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER`.
const uint32_t NON_ROUND_TRIP_MARKER = 1u << 0;

/// Flag recorded for a complex decomposition whose first
/// character is a starter that can combine backwards.
///
/// This is the raw flag value (second-lowest bit); code that
/// builds a trie descriptor shifts it left by 16 before OR-ing
/// it in.
///
/// TODO: When taking a data format break, swap this around with
/// `NON_ROUND_TRIP_MARKER` to use the same bit as with characters
/// that decompose to self but can combine backwards.
const uint32_t BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER = 1u << 1;

UBool permissibleBmpPair(UBool knownToRoundTrip, UChar32 c, UChar32 second) {
if (knownToRoundTrip) {
return true;
Expand Down Expand Up @@ -951,21 +962,18 @@ void computeDecompositions(const char* basename,
if (!nonNfdOrRoundTrips) {
compositionPassthroughBound = c;
}
if (len == 1 && utf32[0] <= 0xFFFF) {
if (startsWithBackwardCombiningStarter) {
if (mainNormalizer == nfdNormalizer) {
// Not supposed to happen in NFD
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
} else if (!((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
// Other than conjoining jamo vowels and trails
// unsupported for non-NFD.
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
if (len == 1 && ((utf32[0] >= 0x1161 && utf32[0] <= 0x1175) || (utf32[0] >= 0x11A8 && utf32[0] <= 0x11C2))) {
// Singleton decompositions to conjoining jamo.
if (mainNormalizer == nfdNormalizer) {
// Not supposed to happen in NFD
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
} else if (len == 2 &&
} else if (!startsWithBackwardCombiningStarter && len == 1 && utf32[0] <= 0xFFFF) {
pendingTrieInsertions.push_back({c, uint32_t(utf32[0]) << 16, false});
} else if (!startsWithBackwardCombiningStarter &&
len == 2 &&
utf32[0] <= 0xFFFF &&
utf32[1] <= 0xFFFF &&
!u_getCombiningClass(utf32[0]) &&
Expand All @@ -979,17 +987,8 @@ void computeDecompositions(const char* basename,
handleError(status, basename);
}
}
if (startsWithBackwardCombiningStarter) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}
pendingTrieInsertions.push_back({c, (uint32_t(utf32[0]) << 16) | uint32_t(utf32[1]), false});
} else {
if (startsWithBackwardCombiningStarter) {
status.set(U_INTERNAL_PROGRAM_ERROR);
handleError(status, basename);
}

UBool supplementary = false;
UBool nonInitialStarter = false;
for (int32_t i = 0; i < len; ++i) {
Expand All @@ -1011,6 +1010,13 @@ void computeDecompositions(const char* basename,
nonInitialStarter = true;
}
}
if (len == 1) {
// The format doesn't allow for length 1 for BMP,
// so if these ever occur, they need to be promoted
// to wider storage. As of Unicode 16 alpha, this
// case does not arise.
supplementary = true;
}
if (!supplementary) {
if (len > LONGEST_ENCODABLE_LENGTH_16 || !len || len == 1) {
if (len == 18 && c == 0xFDFA) {
Expand Down Expand Up @@ -1074,7 +1080,11 @@ void computeDecompositions(const char* basename,
if (!nonNfdOrRoundTrips) {
nonRoundTripMarker = (NON_ROUND_TRIP_MARKER << 16);
}
pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker, supplementary});
uint32_t canCombineBackwardsMarker = 0;
if (startsWithBackwardCombiningStarter) {
canCombineBackwardsMarker = (BACKWARD_COMBINING_STARTER_DECOMPOSITION_MARKER << 16);
}
pendingTrieInsertions.push_back({c, descriptor | nonRoundTripMarker | canCombineBackwardsMarker, supplementary});
}
}
if (storage16.size() + storage32.size() > 0xFFF) {
Expand Down

0 comments on commit 7eaefe0

Please sign in to comment.