Skip to content

Commit

Permalink
ICU-11548 Improve regex static UnicodeSets handling
Browse files Browse the repository at this point in the history
Compiled regular expression patterns make use of several shared common
UnicodeSets. This change simplifies the creation and use of these
static UnicodeSets.

- Pointer fields to the static sets are removed from the compiled patterns,
  and the static variables are accessed directly. The deleted pointers
  were a hold-over from earlier code that did not use shared statics.

- The UnicodeSet pattern literals are changed from hex constants to
  u"string literals".

- The size of fRuleSets (from regexst.h) is changed from a hard-coded 10
  to the number of UnicodeSets actually required. Doing this required
  a change to regexcst.pl to export the required size. Changing and
  rerunning this Perl script resulted in massive but benign changes to
  the generated file regexcst.h, because Perl's hash enumeration order
  has changed since the file was last regenerated.

- UnicodeSets are frozen when possible, which should result in faster matching.
  • Loading branch information
aheninger committed Jan 30, 2020
1 parent b9bb612 commit 54a60fe
Show file tree
Hide file tree
Showing 9 changed files with 254 additions and 374 deletions.
28 changes: 13 additions & 15 deletions icu4c/source/i18n/regexcmp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,6 @@ void RegexCompile::compile(
if (U_FAILURE(*fStatus)) {
return;
}
fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;


// Initialize the pattern scanning state machine
fPatternLength = utext_nativeLength(pat);
Expand Down Expand Up @@ -1565,15 +1562,15 @@ UBool RegexCompile::doParseActions(int32_t action)
case doSetBackslash_s:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
break;
}

case doSetBackslash_S:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
SSet.complement();
UnicodeSet SSet;
SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
set->addAll(SSet);
break;
}
Expand Down Expand Up @@ -1642,15 +1639,15 @@ UBool RegexCompile::doParseActions(int32_t action)
case doSetBackslash_w:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
break;
}

case doSetBackslash_W:
{
UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
SSet.complement();
UnicodeSet SSet;
SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
set->addAll(SSet);
break;
}
Expand Down Expand Up @@ -2425,6 +2422,7 @@ void RegexCompile::compileSet(UnicodeSet *theSet)
{
// The set contains two or more chars. (the normal case)
// Put it into the compiled pattern as a set.
theSet->freeze();
int32_t setNumber = fRXPat->fSets->size();
fRXPat->fSets->addElement(theSet, *fStatus);
appendOp(URX_SETREF, setNumber);
Expand Down Expand Up @@ -2818,8 +2816,8 @@ void RegexCompile::matchStartType() {
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
U_ASSERT(sn>0 && sn<URX_LAST_SET);
const UnicodeSet *s = fRXPat->fStaticSets[sn];
fRXPat->fInitialChars->addAll(*s);
const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
fRXPat->fInitialChars->addAll(s);
numInitialStrings += 2;
}
currentLen = safeIncrement(currentLen, 1);
Expand All @@ -2831,9 +2829,8 @@ void RegexCompile::matchStartType() {
case URX_STAT_SETREF_N:
if (currentLen == 0) {
int32_t sn = URX_VAL(op);
const UnicodeSet *s = fRXPat->fStaticSets[sn];
UnicodeSet sc(*s);
sc.complement();
UnicodeSet sc;
sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
fRXPat->fInitialChars->addAll(sc);
numInitialStrings += 2;
}
Expand Down Expand Up @@ -4420,7 +4417,8 @@ UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UB

status = U_ZERO_ERROR;
if (propName.caseCompare(u"word", -1, 0) == 0) {
set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
set.adoptInsteadAndCheckErrorCode(
RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
break;
}
if (propName.compare(u"all", -1) == 0) {
Expand Down
200 changes: 100 additions & 100 deletions icu4c/source/i18n/regexcst.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,117 +20,117 @@ U_NAMESPACE_BEGIN
//
// Character classes for regex pattern scanning.
//
static const uint8_t kRuleSet_ascii_letter = 128;
static const uint8_t kRuleSet_digit_char = 129;
static const uint8_t kRuleSet_digit_char = 128;
static const uint8_t kRuleSet_ascii_letter = 129;
static const uint8_t kRuleSet_rule_char = 130;

constexpr uint32_t kRuleSet_count = 131-128;

enum Regex_PatternParseAction {
doSetBackslash_V,
doSetBackslash_h,
doBeginNamedBackRef,
doSetMatchMode,
doEnterQuoteMode,
doOpenCaptureParen,
doContinueNamedCapture,
doSetBackslash_d,
doBeginMatchMode,
doBackslashX,
doSetPosixProp,
doIntervalError,
doSetBackslash_D,
doBackslashh,
doBackslashH,
doSetLiteralEscaped,
doSetBackslash_s,
doNOP,
doBackslashv,
doOpenLookBehind,
doPatStart,
doPossessiveInterval,
doOpenAtomicParen,
doOpenLookAheadNeg,
doBackslashd,
doCompleteNamedBackRef,
doPatStart,
doBackslashS,
doBackslashD,
doNGStar,
doNOP,
doBackslashX,
doSetLiteral,
doContinueNamedCapture,
doBackslashG,
doBackslashR,
doSetBegin,
doSetBackslash_v,
doPossessivePlus,
doPerlInline,
doBackslashZ,
doIntervalUpperDigit,
doBadNamedCapture,
doSetDifference2,
doSetAddAmp,
doSetNamedChar,
doNamedChar,
doSetBackslash_H,
doBackslashb,
doBackslashz,
doSetBeginDifference1,
doOpenLookAhead,
doMatchModeParen,
doBackslashV,
doIntevalLowerDigit,
doCaret,
doSetEnd,
doIntervalError,
doSetNegate,
doBackslashS,
doOrOperator,
doBackslashB,
doBackslashw,
doBackslashR,
doIntervalInit,
doSetIntersection2,
doPossessiveInterval,
doRuleError,
doDotAny,
doMatchMode,
doSetBackslash_W,
doNGPlus,
doSetBackslash_D,
doPossessiveOpt,
doSetNamedRange,
doBackslashW,
doContinueNamedBackRef,
doOpenNonCaptureParen,
doExit,
doSetNamedChar,
doSetBackslash_V,
doConditionalExpr,
doBackslashs,
doEscapeError,
doBadOpenParenType,
doPossessiveStar,
doSetAddDash,
doEscapedLiteralChar,
doSetBackslash_w,
doIntervalUpperDigit,
doBackslashv,
doSetBackslash_S,
doSetNoCloseError,
doSetProp,
doBackslashB,
doSetEnd,
doSetRange,
doMatchModeParen,
doPlus,
doBadOpenParenType,
doBackslashV,
doSetMatchMode,
doBackslashz,
doSetNamedRange,
doOpenLookBehindNeg,
doInterval,
doBadNamedCapture,
doBeginMatchMode,
doBackslashd,
doPatFinish,
doNamedChar,
doNGPlus,
doSetDifference2,
doSetBackslash_H,
doCloseParen,
doNGInterval,
doSetProp,
doDotAny,
doOpenCaptureParen,
doEnterQuoteMode,
doOpenAtomicParen,
doBadModeFlag,
doSetBackslash_d,
doSetFinish,
doProperty,
doBeginNamedBackRef,
doBackRef,
doSetBeginUnion,
doEscapeError,
doOpt,
doSetBeginIntersection1,
doPossessivePlus,
doBackslashD,
doOpenLookBehindNeg,
doSetBegin,
doSetIntersection2,
doCompleteNamedBackRef,
doSetRange,
doDollar,
doBackslashH,
doExit,
doNGOpt,
doOpenNonCaptureParen,
doBackslashA,
doSetBackslash_v,
doBackslashh,
doBadModeFlag,
doSetNoCloseError,
doIntervalSame,
doSetAddDash,
doBackslashW,
doPerlInline,
doSetOpError,
doSetLiteral,
doPatFinish,
doBeginNamedCapture,
doEscapedLiteralChar,
doNGInterval,
doSetOpError,
doSetPosixProp,
doSetBeginIntersection1,
doBackslashb,
doSetBeginUnion,
doIntevalLowerDigit,
doSetBackslash_h,
doStar,
doMatchMode,
doBackslashA,
doOpenLookBehind,
doPossessiveOpt,
doOrOperator,
doBackslashw,
doBackslashs,
doLiteralChar,
doSuppressComments,
doCaret,
doIntervalSame,
doNGOpt,
doOpenLookAhead,
doSetBackslash_W,
doMismatchedParenErr,
doNGStar,
doSetFinish,
doInterval,
doBackslashG,
doStar,
doSetBackslash_w,
doSetBackslash_S,
doProperty,
doContinueNamedBackRef,
doIntervalInit,
doSetBackslash_s,
rbbiLastAction};

//-------------------------------------------------------------------------------
Expand Down Expand Up @@ -197,7 +197,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBadOpenParenType, 255, 206,0, FALSE} // 45
, {doOpenLookBehind, 61 /* = */, 2, 20, TRUE} // 46 open-paren-lookbehind
, {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE} // 47
, {doBeginNamedCapture, 128, 64,0, FALSE} // 48
, {doBeginNamedCapture, 129, 64,0, FALSE} // 48
, {doBadOpenParenType, 255, 206,0, FALSE} // 49
, {doNOP, 41 /* ) */, 255,0, TRUE} // 50 paren-comment
, {doMismatchedParenErr, 253, 206,0, FALSE} // 51
Expand All @@ -213,8 +213,8 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doSetMatchMode, 41 /* ) */, 2,0, TRUE} // 61
, {doMatchModeParen, 58 /* : */, 2, 14, TRUE} // 62
, {doBadModeFlag, 255, 206,0, FALSE} // 63
, {doContinueNamedCapture, 128, 64,0, TRUE} // 64 named-capture
, {doContinueNamedCapture, 129, 64,0, TRUE} // 65
, {doContinueNamedCapture, 129, 64,0, TRUE} // 64 named-capture
, {doContinueNamedCapture, 128, 64,0, TRUE} // 65
, {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE} // 66
, {doBadNamedCapture, 255, 206,0, FALSE} // 67
, {doNGStar, 63 /* ? */, 20,0, TRUE} // 68 quant-star
Expand All @@ -226,13 +226,13 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doNGOpt, 63 /* ? */, 20,0, TRUE} // 74 quant-opt
, {doPossessiveOpt, 43 /* + */, 20,0, TRUE} // 75
, {doOpt, 255, 20,0, FALSE} // 76
, {doNOP, 129, 79,0, FALSE} // 77 interval-open
, {doNOP, 128, 79,0, FALSE} // 77 interval-open
, {doIntervalError, 255, 206,0, FALSE} // 78
, {doIntevalLowerDigit, 129, 79,0, TRUE} // 79 interval-lower
, {doIntevalLowerDigit, 128, 79,0, TRUE} // 79 interval-lower
, {doNOP, 44 /* , */, 83,0, TRUE} // 80
, {doIntervalSame, 125 /* } */, 86,0, TRUE} // 81
, {doIntervalError, 255, 206,0, FALSE} // 82
, {doIntervalUpperDigit, 129, 83,0, TRUE} // 83 interval-upper
, {doIntervalUpperDigit, 128, 83,0, TRUE} // 83 interval-upper
, {doNOP, 125 /* } */, 86,0, TRUE} // 84
, {doIntervalError, 255, 206,0, FALSE} // 85
, {doNGInterval, 63 /* ? */, 20,0, TRUE} // 86 interval-type
Expand Down Expand Up @@ -261,15 +261,15 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
, {doBackslashX, 88 /* X */, 14,0, TRUE} // 109
, {doBackslashZ, 90 /* Z */, 2,0, TRUE} // 110
, {doBackslashz, 122 /* z */, 2,0, TRUE} // 111
, {doBackRef, 129, 14,0, TRUE} // 112
, {doBackRef, 128, 14,0, TRUE} // 112
, {doEscapeError, 253, 206,0, FALSE} // 113
, {doEscapedLiteralChar, 255, 14,0, TRUE} // 114
, {doBeginNamedBackRef, 60 /* < */, 117,0, TRUE} // 115 named-backref
, {doBadNamedCapture, 255, 206,0, FALSE} // 116
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 117 named-backref-2
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 117 named-backref-2
, {doBadNamedCapture, 255, 206,0, FALSE} // 118
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 119 named-backref-3
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 120
, {doContinueNamedBackRef, 129, 119,0, TRUE} // 119 named-backref-3
, {doContinueNamedBackRef, 128, 119,0, TRUE} // 120
, {doCompleteNamedBackRef, 62 /* > */, 14,0, TRUE} // 121
, {doBadNamedCapture, 255, 206,0, FALSE} // 122
, {doSetNegate, 94 /* ^ */, 126,0, TRUE} // 123 set-open
Expand Down
7 changes: 5 additions & 2 deletions icu4c/source/i18n/regexcst.pl
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
# regexcst.pl
# Compile the regular expression paser state table data into initialized C data.
# Usage:
# cd icu/source/i18n
# cd icu4c/source/i18n
# perl regexcst.pl < regexcst.txt > regexcst.h
#
# The output file, regexcst.h, is included by some of the .cpp regex
# implementation files. This perl script is NOT run as part
# of a normal ICU build. It is run by hand when needed, and the
# regexcst.h generated file is put back into cvs.
# regexcst.h generated file is put back into the source code repository.
#
# See regexcst.txt for a description of the input format for this script.
#
Expand Down Expand Up @@ -201,6 +201,8 @@

die if ($errors>0);

print "// © 2016 and later: Unicode, Inc. and others.\n";
print "// License & terms of use: http://www.unicode.org/copyright.html\n";
print "//---------------------------------------------------------------------------------\n";
print "//\n";
print "// Generated Header File. Do not edit by hand.\n";
Expand Down Expand Up @@ -246,6 +248,7 @@
$i++;
}
}
print " constexpr uint32_t kRuleSet_count = $i-128;";
print "\n\n";

#
Expand Down

0 comments on commit 54a60fe

Please sign in to comment.