Skip to content

Commit

Permalink
Add experimental RegExp syntax (\h, \H, \i and \i?):
Browse files Browse the repository at this point in the history
\h    Start of a word, according to IsWordStartAt()
\H    End of a word, according to IsWordEndAt()
\i    Matches 1 or more characters to end of a word
\i?   Matches 0 or more characters to end of a word

Examples: \i matches a whole word; \hA\i matches a whole word that starts with A.
They are useful for processing documents in multi-byte encodings (UTF-8 and DBCS).
  • Loading branch information
zufuliu committed Jan 13, 2019
1 parent ce2c758 commit 05a1749
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 8 deletions.
22 changes: 21 additions & 1 deletion scintilla/src/Document.cxx
Expand Up @@ -2651,10 +2651,30 @@ class DocumentIndexer : public CharacterIndexer {

/**
 * Returns the character stored at @a index in the wrapped document.
 * Any index outside the half-open range [0, end) yields '\0' instead of
 * being forwarded to the document.
 */
char CharAt(Sci::Position index) const noexcept override {
	// Single guard with one return value: the previous text carried both the
	// old `return 0;` and the new `return '\0';`, leaving a dangling `else`.
	if (index < 0 || index >= end)
		return '\0';
	return pdoc->CharAt(index);
}

/// Asks the wrapped document whether a word starts at @a pos.
bool IsWordStartAt(Sci::Position pos) const noexcept override {
	const bool startsWord = pdoc->IsWordStartAt(pos);
	return startsWord;
}

/// Asks the wrapped document whether a word ends at @a pos.
bool IsWordEndAt(Sci::Position pos) const noexcept override {
	const bool endsWord = pdoc->IsWordEndAt(pos);
	return endsWord;
}

/// Forwards to the document's MovePositionOutsideChar() for @a pos and @a moveDir.
Sci::Position MovePositionOutsideChar(Sci::Position pos, Sci::Position moveDir) const noexcept override {
	// The literal `true` is passed as the document call's third argument
	// (presumably its line-end check flag -- TODO confirm against Document).
	const Sci::Position adjusted = pdoc->MovePositionOutsideChar(pos, moveDir, true);
	return adjusted;
}

/// Forwards to the document's NextPosition() for @a pos in direction @a moveDir.
Sci::Position NextPosition(Sci::Position pos, int moveDir) const noexcept override {
	const Sci::Position stepped = pdoc->NextPosition(pos, moveDir);
	return stepped;
}

/// Forwards to the document's ExtendWordSelect() for @a pos and @a delta.
Sci::Position ExtendWordSelect(Sci::Position pos, int delta) const noexcept override {
	// The literal `true` is passed as the document call's third argument
	// (presumably restricting the scan to word characters -- TODO confirm).
	const Sci::Position extended = pdoc->ExtendWordSelect(pos, delta, true);
	return extended;
}
};

#ifndef NO_CXX11_REGEX
Expand Down
64 changes: 59 additions & 5 deletions scintilla/src/RESearch.cxx
Expand Up @@ -233,6 +233,12 @@ using namespace Scintilla;
#define CLQ 12 /* 0 to 1 closure */
#define LCLO 13 /* lazy closure */

// Experimental opcodes, emitted by DoCompile() for the \h, \H, \i and \i? escapes
// and interpreted by PMatch().
#define EXP_MATCH_WORD_START 14 /* \h : start of a word, per IsWordStartAt() */
#define EXP_MATCH_WORD_END 15 /* \H : end of a word, per IsWordEndAt() */
#define EXP_MATCH_TO_WORD_END 16 /* \i : 1 or more characters up to the end of a word */
#define EXP_MATCH_TO_WORD_END_OPT 17 /* \i? : 0 or more characters up to the end of a word */

#define END 0

/*
Expand Down Expand Up @@ -642,6 +648,11 @@ const char *RESearch::DoCompile(const char *pattern, Sci::Position length, bool
break;
}

if (*p == '?' && *lp == EXP_MATCH_TO_WORD_END) {
*lp = EXP_MATCH_TO_WORD_END_OPT;
break;
}

if (*p == '+') {
for (sp = mp; lp < sp; lp++) {
*mp++ = *lp;
Expand Down Expand Up @@ -672,6 +683,17 @@ const char *RESearch::DoCompile(const char *pattern, Sci::Position length, bool
return badpat("Null pattern inside \\<\\>");
*mp++ = EOW;
break;
case 'h':
*mp++ = EXP_MATCH_WORD_START;
break;
case 'H':
if (*sp == EXP_MATCH_WORD_START)
return badpat("Null pattern inside \\h\\H");
*mp++ = EXP_MATCH_WORD_END;
break;
case 'i':
*mp++ = EXP_MATCH_TO_WORD_END;
break;
case '1':
case '2':
case '3':
Expand Down Expand Up @@ -828,10 +850,11 @@ int RESearch::Execute(const CharacterIndexer &ci, Sci::Position lp, Sci::Positio
// fall through
default: /* regular matching all the way. */
while (lp < endp) {
ep = PMatch(ci, lp, endp, ap);
Sci::Position offset = 1;
ep = PMatch(ci, lp, endp, ap, 1, &offset);
if (ep != NOTFOUND)
break;
lp++;
lp += offset;
}
break;
case END: /* munged automaton. fail always */
Expand Down Expand Up @@ -888,7 +911,7 @@ static inline int isinset(const char *ap, unsigned char c) noexcept {
#define CHRSKIP 3 /* [CLO] CHR chr END */
#define CCLSKIP 34 /* [CLO] CCL 32 bytes END */

Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap) {
Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap, int moveDir, Sci::Position *offset) {
int op;
int c;
int n;
Expand Down Expand Up @@ -938,6 +961,36 @@ Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci
if (lp == bol || !iswordc(ci.CharAt(lp - 1)) || iswordc(ci.CharAt(lp)))
return NOTFOUND;
break;
case EXP_MATCH_WORD_START:
if (!ci.IsWordStartAt(lp)) {
if (offset) {
e = ci.MovePositionOutsideChar(lp, moveDir);
*offset = (e == lp) ? ci.NextPosition(lp, moveDir) - lp : e - lp;
}
return NOTFOUND;
}
break;
case EXP_MATCH_WORD_END:
if (lp == bol || !ci.IsWordEndAt(lp)) {
if (offset) {
e = ci.MovePositionOutsideChar(lp, moveDir);
*offset = (e == lp) ? ci.NextPosition(lp, moveDir) - lp : e - lp;
}
return NOTFOUND;
}
break;
case EXP_MATCH_TO_WORD_END:
case EXP_MATCH_TO_WORD_END_OPT: {
e = ci.ExtendWordSelect(lp, moveDir);
const bool find = ci.IsWordEndAt(e);
if (offset) {
*offset = (e == lp) ? ci.NextPosition(lp, moveDir) - lp : e - lp;
}
if ((e == lp && op != EXP_MATCH_TO_WORD_END_OPT) || !find) {
return NOTFOUND;
}
lp = e;
} break;
case REF:
n = *ap++;
bp = bopat[n];
Expand Down Expand Up @@ -992,13 +1045,14 @@ Sci::Position RESearch::PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci
e = NOTFOUND;
while (llp >= are) {
Sci::Position q;
if ((q = PMatch(ci, llp, endp, ap)) != NOTFOUND) {
Sci::Position qoff = -1;
if ((q = PMatch(ci, llp, endp, ap, -1, &qoff)) != NOTFOUND) {
e = q;
lp = llp;
if (op != LCLO) return e;
}
if (*ap == END) return e;
--llp;
llp += qoff;
}
if (*ap == EOT)
PMatch(ci, lp, endp, ap);
Expand Down
7 changes: 6 additions & 1 deletion scintilla/src/RESearch.h
Expand Up @@ -22,6 +22,11 @@ namespace Scintilla {
/**
 * Read-only interface through which the regular expression matcher looks at
 * the text being searched. Every query is pure virtual; the concrete indexer
 * supplies them (DocumentIndexer forwards each one to its document).
 */
class CharacterIndexer {
public:
// Character at position index.
virtual char CharAt(Sci::Position index) const noexcept = 0;
// Word-boundary queries backing the experimental \h (word start) and \H (word end) escapes.
virtual bool IsWordStartAt(Sci::Position pos) const noexcept = 0;
virtual bool IsWordEndAt(Sci::Position pos) const noexcept = 0;
// Position helpers the matcher uses to decide how far to step after a failed
// \h / \H / \i match. NOTE(review): presumably these keep the step on
// multi-byte character boundaries -- confirm against the Document implementation.
virtual Sci::Position MovePositionOutsideChar(Sci::Position pos, Sci::Position moveDir) const noexcept = 0;
virtual Sci::Position NextPosition(Sci::Position pos, int moveDir) const noexcept = 0;
// Used by the matcher to locate the position a \i / \i? match extends to.
virtual Sci::Position ExtendWordSelect(Sci::Position pos, int delta) const noexcept = 0;
virtual ~CharacterIndexer() = default;
};

Expand Down Expand Up @@ -57,7 +62,7 @@ class RESearch {
int GetBackslashExpression(const char *pattern, int &incr) noexcept;

const char *DoCompile(const char *pattern, Sci::Position length, bool caseSensitive, bool posix) noexcept;
Sci::Position PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap);
Sci::Position PMatch(const CharacterIndexer &ci, Sci::Position lp, Sci::Position endp, char *ap, int moveDir = 1, Sci::Position *offset = nullptr);

Sci::Position bol;
Sci::Position tagstk[MAXTAG]; /* subpat tag stack */
Expand Down
2 changes: 1 addition & 1 deletion src/Notepad2.rc
Expand Up @@ -1738,7 +1738,7 @@ END
STRINGTABLE
BEGIN
IDS_BACKSLASHHELP "Backslash Transformations\n\n\\a\tAlert (BEL, ASCII 7)\n\\b\tBackspace (BS, ASCII 8)\n\\e\tEscape (ESC, ASCII 27 / 1B)\n\\f\tFormfeed (FF, ASCII 12 / 0C)\n\\n\tNewline (LF, ASCII 10 / 0A)\n\\r\tCarriage return (CR, ASCII 13 / 0D)\n\\t\tHorizontal Tab (HT, ASCII 9)\n\\v\tVertical Tab (VT, ASCII 11 / 0B)\n\\ooo\tOctal Value\n\\u####\tHexadecimal Value\n\\xhh\tHexadecimal Value\n\\\\\tBackslash"
IDS_REGEXPHELP "RegExp Syntax (Single Lines Only)\n\n.\tAny character\n^\tStart of a line\n$\tEnd of a line\n\\<\tStart of a word\n\\>\tEnd of a word\n[...]\tA set of chars ([abc]) or a range ([a-z])\n[^...]\tChars NOT in the set or range\n\\d\tAny decimal digit\n\\D\tAny non-digit char\n\\s\tAny whitespace char\n\\S\tNot a whitespace char\n\\w\tAny ""word"" char\n\\W\tAny ""non-word"" char\n\\x\tEscape character with otherwise special meaning\n\\xHH\tChar with hex code HH\n?\tMatches preceding 0 or 1 times\n*\tMatches preceding 0 or more times\n+\tMatches preceding 1 or more times\n*? or +?\tNon greedy matching of quantifiers ""?"" and ""+""\n(\tStart of a region\n)\tEnd of a region\n\\n\tRefers to a region when replacing (n is 1-9)\n"
// A literal backslash in the help text is written as \\ throughout this string; the experimental \h, \H, \i and \i? entries follow the same rule.
IDS_REGEXPHELP "RegExp Syntax (Single Lines Only)\n\n.\tAny character\n^\tStart of a line\n$\tEnd of a line\n\\<\tStart of a word\n\\>\tEnd of a word\n[...]\tA set of chars ([abc]) or a range ([a-z])\n[^...]\tChars NOT in the set or range\n\\d\tAny decimal digit\n\\D\tAny non-digit char\n\\s\tAny whitespace char\n\\S\tAny non-whitespace char\n\\w\tAny ""word"" char\n\\W\tAny ""non-word"" char\n\\x\tEscape character with otherwise special meaning\n\\xHH\tChar with hex code HH\n?\tMatches preceding 0 or 1 times\n*\tMatches preceding 0 or more times\n+\tMatches preceding 1 or more times\n*? or +?\tNon greedy matching of quantifiers ""?"" and ""+""\n(\tStart of a region\n)\tEnd of a region\n\\n\tRefers to a region when replacing (n is 1-9)\n\nExperimental Syntax:\n\\h\tStart of a word, according to IsWordStartAt()\n\\H\tEnd of a word, according to IsWordEndAt()\n\\i\tMatches 1 or more characters to end of a word\n\\i?\tMatches 0 or more characters to end of a word"
IDS_WILDCARDHELP "Wildcard Search\n\n*\tMatches zero or more characters.\n?\tMatches exactly one character. "
END

Expand Down

0 comments on commit 05a1749

Please sign in to comment.