Split up the utf8 helper functions into a file of their own

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
torvalds · Jul 10, 2012 · e62cdf0 · e62cdf0
1 parent 12e4647
commit e62cdf0
Show file tree

Hide file tree

Showing 5 changed files with 119 additions and 67 deletions.
diff --git a/Makefile b/Makefile
@@ -20,13 +20,13 @@ SRC=ansi.c basic.c bind.c buffer.c crypt.c display.c eval.c exec.c \
 	file.c fileio.c ibmpc.c input.c isearch.c line.c lock.c main.c \
 	pklock.c posix.c random.c region.c search.c spawn.c tcap.c \
 	termio.c vmsvt.c vt52.c window.c word.c names.c globals.c version.c \
-	usage.c wrapper.c
+	usage.c wrapper.c utf8.c
 
 OBJ=ansi.o basic.o bind.o buffer.o crypt.o display.o eval.o exec.o \
 	file.o fileio.o ibmpc.o input.o isearch.o line.o lock.o main.o \
 	pklock.o posix.o random.o region.o search.o spawn.o tcap.o \
 	termio.o vmsvt.o vt52.o window.o word.o names.o globals.o version.o \
-	usage.o wrapper.o
+	usage.o wrapper.o utf8.o
 
 HDR=ebind.h edef.h efunc.h epath.h estruct.h evar.h util.h version.h
 
@@ -132,7 +132,7 @@ basic.o: basic.c estruct.h edef.h
 bind.o: bind.c estruct.h edef.h epath.h
 buffer.o: buffer.c estruct.h edef.h
 crypt.o: crypt.c estruct.h edef.h
-display.o: display.c estruct.h edef.h
+display.o: display.c estruct.h edef.h utf8.h
 eval.o: eval.c estruct.h edef.h evar.h
 exec.o: exec.c estruct.h edef.h
 file.o: file.c estruct.h edef.h
@@ -144,12 +144,14 @@ line.o: line.c estruct.h edef.h
 lock.o: lock.c estruct.h edef.h
 main.o: main.c estruct.h efunc.h edef.h ebind.h
 pklock.o: pklock.c estruct.h
+posix.o: posix.c estruct.h utf8.h
 random.o: random.c estruct.h edef.h
 region.o: region.c estruct.h edef.h
 search.o: search.c estruct.h edef.h
 spawn.o: spawn.c estruct.h edef.h
 tcap.o: tcap.c estruct.h edef.h
 termio.o: termio.c estruct.h edef.h
+utf8.o: utf8.c utf8.h
 vmsvt.o: vmsvt.c estruct.h edef.h
 vt52.o: vt52.c estruct.h edef.h
 window.o: window.c estruct.h edef.h

diff --git a/display.c b/display.c
@@ -19,8 +19,7 @@
 #include "line.h"
 #include "version.h"
 #include "wrapper.h"
-
-typedef unsigned int unicode_t;
+#include "utf8.h"
 
 struct video {
 	int v_flag;		/* Flags */
@@ -434,50 +433,6 @@ static int reframe(struct window *wp)
 	return TRUE;
 }
 
-static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
-{
-	unsigned value;
-	unsigned char c = line[index];
-	unsigned bytes, mask, i;
-
-	*res = c;
-	line += index;
-	len -= index;
-
-	/*
-	 * 0xxxxxxx is valid utf8
-	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
-	 */
-	if (c < 0xc0)
-		return 1;
-
-	/* Ok, it's 11xxxxxx, do a stupid decode */
-	mask = 0x20;
-	bytes = 2;
-	while (c & mask) {
-		bytes++;
-		mask >>= 1;
-	}
-
-	/* Invalid? Do it as a single byte Latin1 */
-	if (bytes > 6)
-		return 1;
-
-	value = c & (mask-1);
-
-	/* Ok, do the bytes */
-	for (i = 1; i < bytes; i++) {
-		if (i > len)
-			return 1;
-		c = line[i];
-		if ((c & 0xc0) != 0x80)
-			return 1;
-		value = (value << 6) | (c & 0x3f);
-	}
-	*res = value;
-	return bytes;
-}
-
 static void show_line(struct line *lp)
 {
 	unsigned i = 0, len = llength(lp);

diff --git a/posix.c b/posix.c
@@ -22,6 +22,7 @@
 #include "estruct.h"
 #include "edef.h"
 #include "efunc.h"
+#include "utf8.h"
 
 /* Since Mac OS X's termios.h doesn't have the following 2 macros, define them.
  */
@@ -106,24 +107,11 @@ void ttclose(void)
  */
 int ttputc(int c)
 {
-	unsigned char utf8[6], *p = utf8+5;
-	int bytes = 1;
-
-	if (c < 0)
-		return 0;
-	*p = c;
-	if (c > 0x7f) {
-		int prefix = 0x40;
-		do {
-			*p = 0x80 + (c & 0x3f);
-			--p;
-			bytes++;
-			prefix >>= 1;
-			c >>= 6;
-		} while (c > prefix);
-		*p = c - 2*prefix;
-	}
-	fwrite(p, 1, bytes, stdout);
+	char utf8[6];
+
+	/* Negative values are not characters and would need 7 bytes: drop them */
+	if (c >= 0)
+		fwrite(utf8, 1, unicode_to_utf8(c, utf8), stdout);
 	return 0;
 }
 

diff --git a/utf8.c b/utf8.c
@@ -0,0 +1,98 @@
+#include "utf8.h"
+
+/*
+ * utf8_to_unicode()
+ *
+ * Convert the UTF-8 sequence starting at line[index] to its unicode value
+ * (stored in *res), and return the length of the sequence in bytes.
+ * 'len' is the length of the whole line; the caller must pass index < len.
+ *
+ * NOTE! Invalid UTF-8, or a sequence cut short by the end of the line,
+ * will be converted to a one-byte sequence, so you can either use it as-is
+ * (i.e. as Latin1) or you can check for invalid UTF-8 by checking for a
+ * length of 1 and a result > 127.
+ *
+ * NOTE 2! This does *not* verify things like minimality. So overlong forms
+ * are happily accepted and decoded, as are the various "invalid values".
+ */
+unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
+{
+	unsigned value;
+	unsigned char c = line[index];
+	unsigned bytes, mask, i;
+
+	*res = c;
+	line += index;
+	len -= index;		/* bytes left, including the lead byte */
+
+	/*
+	 * 0xxxxxxx is valid utf8
+	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
+	 */
+	if (c < 0xc0)
+		return 1;
+
+	/* It's 11xxxxxx: each further leading 1 bit means one more byte */
+	mask = 0x20;
+	bytes = 2;
+	while (c & mask) {
+		bytes++;
+		mask >>= 1;
+	}
+
+	/* Invalid, or running past the end of the line? Treat as Latin1 */
+	if (bytes > 6 || bytes > len)
+		return 1;
+
+	value = c & (mask-1);
+
+	/* Fold in six payload bits from each 10xxxxxx continuation byte */
+	for (i = 1; i < bytes; i++) {
+		c = line[i];
+		if ((c & 0xc0) != 0x80)
+			return 1;
+		value = (value << 6) | (c & 0x3f);
+	}
+	*res = value;
+	return bytes;
+}
+
+/* Swap the bytes at begin and end, then step inward until they meet */
+static void reverse_string(char *begin, char *end)
+{
+	do {
+		char tmp = *begin;
+		*begin++ = *end, *end-- = tmp;
+	} while (begin < end);
+}
+
+/*
+ * unicode_to_utf8()
+ *
+ * Convert a unicode value to its canonical utf-8 sequence in utf8[] (no NUL
+ * is added) and return its length: at most 6 bytes below 0x80000000, else 7.
+ *
+ * NOTE! The "invalid" unicode values are not checked for.  Also, a utf-8 ->
+ * unicode -> utf-8 round trip may not give back the same bytes: this emits
+ * the shortest form, but utf8_to_unicode() accepts Latin1 and overlong forms.
+ */
+unsigned unicode_to_utf8(unsigned int c, char *utf8)
+{
+	int bytes = 1;
+
+	*utf8 = c;
+	if (c > 0x7f) {
+		unsigned int prefix = 0x40;
+		char *p = utf8;
+		/* Emit 10xxxxxx bytes low bits first; 'prefix' tracks the lead byte's room */
+		do {
+			*p++ = 0x80 + (c & 0x3f);
+			bytes++;
+			prefix >>= 1;
+			c >>= 6;
+		} while (c >= prefix);
+		*p = c - 2*prefix;	/* same as (0x100 - 2*prefix) + c: the 11..10 lead byte */
+		reverse_string(utf8, p);
+	}
+	return bytes;
+}
diff --git a/utf8.h b/utf8.h
@@ -0,0 +1,9 @@
+#ifndef UTF8_H
+#define UTF8_H
+
+typedef unsigned int unicode_t;	/* one decoded character value */
+
+unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res);
+unsigned unicode_to_utf8(unsigned int c, char *utf8);
+
+#endif /* UTF8_H */