Skip to content

Commit

Permalink
Merge pull request #1807 from b4n/utf8-pattern-truncation
Browse files Browse the repository at this point in the history
Avoid truncating a pattern in the middle of a UTF-8 character
  • Loading branch information
b4n committed Aug 6, 2018
2 parents 2f3b264 + 22b51fa commit 0d56013
Show file tree
Hide file tree
Showing 8 changed files with 74 additions and 10 deletions.
6 changes: 3 additions & 3 deletions Tmain/omit-long-patterns-etags.d/stdout-expected.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

input.sh,349
func96(func961,0
input.sh,351
func96()func961,0
func95()func955,110
func97func979,219
func97(func979,219
3 changes: 3 additions & 0 deletions Tmain/pattern-length-limit.d/input-iso-8859-1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# this is made to pass the `(c & 0xC0) == 0x80` UTF-8 sub-byte check to make
# sure we have a working hard limit in case of malicious input.
a='€‚ƒ€‚ƒ'
1 change: 1 addition & 0 deletions Tmain/pattern-length-limit.d/input-utf8.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
a='éàçè'
16 changes: 16 additions & 0 deletions Tmain/pattern-length-limit.d/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,20 @@ ${CTAGS} --quiet --options=NONE -o - \
--pattern-length-limit=0 \
--kinds-java=f ./input.java

for etags in '' '-e'; do
echo "--- multi-byte handling" $(test -n "$etags" && echo "(etags)")

# as the 7th byte is an inner byte, cutting at 6 and 7 should yield the same result
${CTAGS} --quiet --options=NONE $etags -o - \
--pattern-length-limit=6 \
--kinds-python=v ./input-utf8.py
${CTAGS} --quiet --options=NONE $etags -o - \
--pattern-length-limit=7 \
--kinds-python=v ./input-utf8.py

${CTAGS} --quiet --options=NONE $etags -o - \
--pattern-length-limit=4 \
--kinds-python=v ./input-iso-8859-1.py
done

exit $?
14 changes: 14 additions & 0 deletions Tmain/pattern-length-limit.d/stdout-expected.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,17 @@ a ./input.java /^public cla/;" f
b ./input.java /^public cla/;" f
a ./input.java /^public class Foo extends Bar {static Logger a = Logger.getLogger(Foo.class.getName()); static Logger b = Logger.getLogger(Foo.class.getName());}$/;" f
b ./input.java /^public class Foo extends Bar {static Logger a = Logger.getLogger(Foo.class.getName()); static Logger b = Logger.getLogger(Foo.class.getName());}$/;" f
--- multi-byte handling
a ./input-utf8.py /^a='éà/;" v
a ./input-utf8.py /^a='éà/;" v
a ./input-iso-8859-1.py /^a='����/;" v
--- multi-byte handling (etags)

input-utf8.py,14
a='éàa1,0

input-utf8.py,14
a='éàa1,0

input-iso-8859-1.py,16
a='����a3,141
13 changes: 13 additions & 0 deletions docs/news.rst
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,19 @@ To prevent generating overly large tags files, a pattern field is
truncated, by default, when its size exceeds 96 bytes. A different
limit can be specified with ``--pattern-length-limit=N``.

The truncation avoids cutting in the middle of a UTF-8 code point
spanning multiple bytes to prevent writing invalid byte sequences from
valid input files. This handling allows for an extra 3 bytes above the
configured limit in the worse case of a 4 byte code point starting
right before the limit. Please also note that this handling is fairly
naive and fast, and although it is resistant against any input, it
requires a valid input to work properly; it is not guaranteed to work
as the user expects when dealing with partially invalid UTF-8 input.
This also partially affect non-UTF-8 input, if the byte sequence at
the truncation length looks like a multibyte UTF-8 sequence. This
should however be rare, and in the worse case will lead to including
up to an extra 3 bytes above the limit.

An input source file with long lines and multiple tag matches per
line can generate an excessively large tags file with an
unconstrained pattern length. For example, running ctags on a
Expand Down
7 changes: 6 additions & 1 deletion main/entry.c
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,7 @@ static size_t appendInputLine (int putc_func (char , void *), const char *const
{
size_t length = 0;
const char *p;
int extraLength = 0;

/* Write everything up to, but not including, a line end character.
*/
Expand All @@ -666,7 +667,11 @@ static size_t appendInputLine (int putc_func (char , void *), const char *const
if (c == CRETURN || c == NEWLINE)
break;

if (Option.patternLengthLimit != 0 && length >= Option.patternLengthLimit)
if (Option.patternLengthLimit != 0 && length >= Option.patternLengthLimit &&
/* Do not cut inside a multi-byte UTF-8 character, but safe-guard it not to
* allow more than one extra valid UTF-8 character in case it's not actually
* UTF-8. To do that, limit to an extra 3 UTF-8 sub-bytes (0b10xxxxxx). */
((((unsigned char) c) & 0xc0) != 0x80 || ++extraLength > 3))
{
*omitted = true;
break;
Expand Down
24 changes: 18 additions & 6 deletions main/writer-etags.c
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,30 @@ static int writeEtagsEntry (tagWriter *writer,
long seekValue;
char *const line =
readLineFromBypassAnyway (etags->vLine, tag, &seekValue);
if (line == NULL)
if (line == NULL || line [0] == '\0')
return 0;

len = strlen (line);

if (tag->truncateLineAfterTag)
truncateTagLineAfterTag (line, tag->name, true);
else
line [len - 1] = '\0';

if (Option.patternLengthLimit < len)
line [Option.patternLengthLimit - 1] = '\0';
else if (line [len - 1] == '\n')
line [--len] = '\0';

if (Option.patternLengthLimit > 0 && Option.patternLengthLimit < len)
{
unsigned int truncationLength = Option.patternLengthLimit;

/* don't cut in the middle of a UTF-8 character, but don't allow
* for more than one extra character in case it actually wasn't
* UTF-8. See also entry.c:appendInputLine() */
while (truncationLength < len &&
truncationLength < Option.patternLengthLimit + 3 &&
(((unsigned char) line[truncationLength]) & 0xc0) == 0x80)
truncationLength++;

line [truncationLength] = '\0';
}

length = mio_printf (mio, "%s\177%s\001%lu,%ld\n", line,
tag->name, tag->lineNumber, seekValue);
Expand Down

0 comments on commit 0d56013

Please sign in to comment.