universal-ctags · b4n · Aug 6, 2018 · Jul 30, 2018 · Jul 30, 2018 · Jul 30, 2018
diff --git a/Tmain/omit-long-patterns-etags.d/stdout-expected.txt b/Tmain/omit-long-patterns-etags.d/stdout-expected.txt
@@ -1,5 +1,5 @@
 
-input.sh,349
-                                                                                                func96(func961,0
+input.sh,351
+                                                                                                func96()func961,0
                                                                                                func95()func955,110
-                                                                                                 func97func979,219
+                                                                                                 func97(func979,219
diff --git a/Tmain/pattern-length-limit.d/input-iso-8859-1.py b/Tmain/pattern-length-limit.d/input-iso-8859-1.py
@@ -0,0 +1,3 @@
+# this is made to pass the `(c & 0xC0) == 0x80` UTF-8 sub-byte check to make
+# sure we have a working hard limit in case of malicious input.
+a='��������'
diff --git a/Tmain/pattern-length-limit.d/input-utf8.py b/Tmain/pattern-length-limit.d/input-utf8.py
@@ -0,0 +1 @@
+a='éàçè'
diff --git a/Tmain/pattern-length-limit.d/run.sh b/Tmain/pattern-length-limit.d/run.sh
@@ -14,4 +14,20 @@ ${CTAGS} --quiet --options=NONE -o - \
 	 --pattern-length-limit=0 \
 	 --kinds-java=f  ./input.java
 
+for etags in '' '-e'; do
+	echo "--- multi-byte handling" $(test -n "$etags" && echo "(etags)")
+
+	# as the 7th byte is an inner byte, cutting at 6 and 7 should yield the same result
+	${CTAGS} --quiet --options=NONE $etags -o - \
+		 --pattern-length-limit=6 \
+		 --kinds-python=v  ./input-utf8.py
+	${CTAGS} --quiet --options=NONE $etags -o - \
+		 --pattern-length-limit=7 \
+		 --kinds-python=v  ./input-utf8.py
+
+	${CTAGS} --quiet --options=NONE $etags -o - \
+		 --pattern-length-limit=4 \
+		 --kinds-python=v  ./input-iso-8859-1.py
+done
+
 exit $?
diff --git a/Tmain/pattern-length-limit.d/stdout-expected.txt b/Tmain/pattern-length-limit.d/stdout-expected.txt
@@ -4,3 +4,17 @@ a	./input.java	/^public cla/;"	f
 b	./input.java	/^public cla/;"	f
 a	./input.java	/^public class Foo extends Bar {static Logger a = Logger.getLogger(Foo.class.getName()); static Logger b = Logger.getLogger(Foo.class.getName());}$/;"	f
 b	./input.java	/^public class Foo extends Bar {static Logger a = Logger.getLogger(Foo.class.getName()); static Logger b = Logger.getLogger(Foo.class.getName());}$/;"	f
+--- multi-byte handling
+a	./input-utf8.py	/^a='éà/;"	v
+a	./input-utf8.py	/^a='éà/;"	v
+a	./input-iso-8859-1.py	/^a='����/;"	v
+--- multi-byte handling (etags)
+
+input-utf8.py,14
+a='éàa1,0
+
+input-utf8.py,14
+a='éàa1,0
+
+input-iso-8859-1.py,16
+a='����a3,141
diff --git a/docs/news.rst b/docs/news.rst
@@ -928,6 +928,19 @@ To prevent generating overly large tags files, a pattern field is
 truncated, by default, when its size exceeds 96 bytes. A different
 limit can be specified with ``--pattern-length-limit=N``.
 
+The truncation avoids cutting in the middle of a UTF-8 code point
+spanning multiple bytes to prevent writing invalid byte sequences from
+valid input files. This handling allows for an extra 3 bytes above the
+configured limit in the worst case of a 4 byte code point starting
+right before the limit. Please also note that this handling is fairly
+naive and fast, and although it is resistant against any input, it
+requires a valid input to work properly; it is not guaranteed to work
+as the user expects when dealing with partially invalid UTF-8 input.
+This also partially affects non-UTF-8 input, if the byte sequence at
+the truncation length looks like a multibyte UTF-8 sequence. This
+should however be rare, and in the worst case will lead to including
+up to an extra 3 bytes above the limit.
+
 An input source file with long lines and multiple tag matches per
 line can generate an excessively large tags file with an
 unconstrained pattern length. For example, running ctags on a

diff --git a/main/entry.c b/main/entry.c
@@ -654,6 +654,7 @@ static size_t appendInputLine (int putc_func (char , void *), const char *const
 {
 	size_t length = 0;
 	const char *p;
+	int extraLength = 0;
 
 	/*  Write everything up to, but not including, a line end character.
 	 */
@@ -666,7 +667,11 @@ static size_t appendInputLine (int putc_func (char , void *), const char *const
 		if (c == CRETURN  ||  c == NEWLINE)
 			break;
 
-		if (Option.patternLengthLimit != 0 && length >= Option.patternLengthLimit)
+		if (Option.patternLengthLimit != 0 && length >= Option.patternLengthLimit &&
+			/* Do not cut inside a multi-byte UTF-8 character, but safe-guard it not to
+			 * allow more than one extra valid UTF-8 character in case it's not actually
+			 * UTF-8.  To do that, limit to an extra 3 UTF-8 sub-bytes (0b10xxxxxx). */
+			((((unsigned char) c) & 0xc0) != 0x80 || ++extraLength > 3))
 		{
 			*omitted = true;
 			break;

diff --git a/main/writer-etags.c b/main/writer-etags.c
@@ -98,18 +98,30 @@ static int writeEtagsEntry (tagWriter *writer,
 		long seekValue;
 		char *const line =
 				readLineFromBypassAnyway (etags->vLine, tag, &seekValue);
-		if (line == NULL)
+		if (line == NULL || line [0] == '\0')
 			return 0;
 
 		len = strlen (line);
 
 		if (tag->truncateLineAfterTag)
 			truncateTagLineAfterTag (line, tag->name, true);
-		else
-			line [len - 1] = '\0';
-
-		if (Option.patternLengthLimit < len)
-			line [Option.patternLengthLimit - 1] = '\0';
+		else if (line [len - 1] == '\n')
+			line [--len] = '\0';
+
+		if (Option.patternLengthLimit > 0 && Option.patternLengthLimit < len)
+		{
+			unsigned int truncationLength = Option.patternLengthLimit;
+
+			/* don't cut in the middle of a UTF-8 character, but don't allow
+			 * for more than one extra character in case it actually wasn't
+			 * UTF-8.  See also entry.c:appendInputLine() */
+			while (truncationLength < len &&
+			       truncationLength < Option.patternLengthLimit + 3 &&
+			       (((unsigned char) line[truncationLength]) & 0xc0) == 0x80)
+				truncationLength++;
+
+			line [truncationLength] = '\0';
+		}
 
 		length = mio_printf (mio, "%s\177%s\001%lu,%ld\n", line,
 				tag->name, tag->lineNumber, seekValue);