Skip to content

Commit

Permalink
Split up the utf8 helper functions into a file of their own
Browse files Browse the repository at this point in the history
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
torvalds committed Jul 10, 2012
1 parent 12e4647 commit e62cdf0
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 67 deletions.
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ SRC=ansi.c basic.c bind.c buffer.c crypt.c display.c eval.c exec.c \
file.c fileio.c ibmpc.c input.c isearch.c line.c lock.c main.c \
pklock.c posix.c random.c region.c search.c spawn.c tcap.c \
termio.c vmsvt.c vt52.c window.c word.c names.c globals.c version.c \
usage.c wrapper.c
usage.c wrapper.c utf8.c

OBJ=ansi.o basic.o bind.o buffer.o crypt.o display.o eval.o exec.o \
file.o fileio.o ibmpc.o input.o isearch.o line.o lock.o main.o \
pklock.o posix.o random.o region.o search.o spawn.o tcap.o \
termio.o vmsvt.o vt52.o window.o word.o names.o globals.o version.o \
usage.o wrapper.o
usage.o wrapper.o utf8.o

HDR=ebind.h edef.h efunc.h epath.h estruct.h evar.h util.h version.h

Expand Down Expand Up @@ -132,7 +132,7 @@ basic.o: basic.c estruct.h edef.h
bind.o: bind.c estruct.h edef.h epath.h
buffer.o: buffer.c estruct.h edef.h
crypt.o: crypt.c estruct.h edef.h
display.o: display.c estruct.h edef.h
display.o: display.c estruct.h edef.h utf8.h
eval.o: eval.c estruct.h edef.h evar.h
exec.o: exec.c estruct.h edef.h
file.o: file.c estruct.h edef.h
Expand All @@ -144,12 +144,14 @@ line.o: line.c estruct.h edef.h
lock.o: lock.c estruct.h edef.h
main.o: main.c estruct.h efunc.h edef.h ebind.h
pklock.o: pklock.c estruct.h
posix.o: posix.c estruct.h utf8.h
random.o: random.c estruct.h edef.h
region.o: region.c estruct.h edef.h
search.o: search.c estruct.h edef.h
spawn.o: spawn.c estruct.h edef.h
tcap.o: tcap.c estruct.h edef.h
termio.o: termio.c estruct.h edef.h
utf8.o: utf8.c utf8.h
vmsvt.o: vmsvt.c estruct.h edef.h
vt52.o: vt52.c estruct.h edef.h
window.o: window.c estruct.h edef.h
Expand Down
47 changes: 1 addition & 46 deletions display.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
#include "line.h"
#include "version.h"
#include "wrapper.h"

typedef unsigned int unicode_t;
#include "utf8.h"

struct video {
int v_flag; /* Flags */
Expand Down Expand Up @@ -434,50 +433,6 @@ static int reframe(struct window *wp)
return TRUE;
}

/*
 * utf8_to_unicode()
 *
 * Decode the UTF-8 sequence that starts at line[index] and store the
 * resulting value in *res.  'len' is the total number of bytes in 'line';
 * the caller must pass index < len.
 *
 * Returns the number of bytes consumed.  Anything that is not a complete,
 * well-formed sequence (a stray continuation byte, a 0xfe/0xff lead byte,
 * a bad continuation byte, or a sequence cut short by the end of the
 * line) is returned as a single byte with *res set to that byte's value,
 * i.e. it is treated as Latin1.
 *
 * Overlong forms are not rejected.
 */
static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	*res = c;
	line += index;
	len -= index;	/* from here on: bytes available starting at line[0] */

	/*
	 * 0xxxxxxx is valid utf8
	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* Ok, it's 11xxxxxx: count the leading one bits to get the length */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/* Invalid lead byte (0xfe/0xff)? Do it as a single byte Latin1 */
	if (bytes > 6)
		return 1;

	/*
	 * Sequence runs past the end of the line?  Same thing.  Checking
	 * the whole length up front keeps the loop below from ever
	 * reading line[len].
	 */
	if (bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the mask bit */
	value = c & (mask-1);

	/* Ok, do the continuation bytes: each must be 10xxxxxx */
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}

static void show_line(struct line *lp)
{
unsigned i = 0, len = llength(lp);
Expand Down
24 changes: 6 additions & 18 deletions posix.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "estruct.h"
#include "edef.h"
#include "efunc.h"
#include "utf8.h"

/* Since Mac OS X's termios.h doesn't have the following 2 macros, define them.
*/
Expand Down Expand Up @@ -106,24 +107,11 @@ void ttclose(void)
*/
int ttputc(int c)
{
	char utf8[6];
	int bytes;

	/*
	 * unicode_to_utf8() takes an unsigned value, so a negative 'c'
	 * would turn into a huge code point that does not fit in the
	 * 6-byte buffer.  Negative values are not characters: write
	 * nothing.
	 */
	if (c < 0)
		return 0;

	/* Encode the character as UTF-8 and write the bytes to stdout */
	bytes = unicode_to_utf8(c, utf8);
	fwrite(utf8, 1, bytes, stdout);
	return 0;
}

Expand Down
98 changes: 98 additions & 0 deletions utf8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#include "utf8.h"

/*
 * utf8_to_unicode()
 *
 * Convert the UTF-8 sequence starting at line[index] to its unicode value
 * (stored in *res), and return the length of the sequence in bytes.
 * 'len' is the total number of bytes in 'line'; the caller must pass
 * index < len.
 *
 * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
 * either use it as-is (i.e. as Latin1) or you can check for invalid UTF-8
 * by checking for a length of 1 and a result > 127.  A sequence that is
 * cut short by the end of the line counts as invalid.
 *
 * NOTE 2! This does *not* verify things like minimality. So overlong forms
 * are happily accepted and decoded, as are the various "invalid values".
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	*res = c;
	line += index;
	len -= index;	/* from here on: bytes available starting at line[0] */

	/*
	 * 0xxxxxxx is valid utf8
	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* Ok, it's 11xxxxxx: count the leading one bits to get the length */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/* Invalid lead byte (0xfe/0xff)? Do it as a single byte Latin1 */
	if (bytes > 6)
		return 1;

	/*
	 * Sequence runs past the end of the line?  Same thing.  Checking
	 * the whole length up front keeps the loop below from ever
	 * reading line[len].
	 */
	if (bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the mask bit */
	value = c & (mask-1);

	/* Ok, do the continuation bytes: each must be 10xxxxxx */
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}

/*
 * reverse_string()
 *
 * Reverse, in place, the bytes of the inclusive range [begin, end].
 * A range with fewer than two bytes (begin >= end) is left untouched.
 */
static void reverse_string(char *begin, char *end)
{
	while (begin < end) {
		char tmp = *begin;

		*begin++ = *end;
		*end-- = tmp;
	}
}

/*
 * unicode_to_utf8()
 *
 * Convert a unicode value to its canonical utf-8 sequence, store it in
 * 'utf8' and return the number of bytes written (1 to 6).  The buffer
 * must have room for 6 bytes.
 *
 * Values above 0x7fffffff cannot be represented in 6 bytes; for those
 * nothing is written and 0 is returned.
 *
 * NOTE! This does not check for - or care about - the "invalid" unicode
 * values. Also, converting a utf-8 sequence to unicode and back does
 * *not* guarantee the same sequence, since this generates the shortest
 * possible sequence, while utf8_to_unicode() accepts both Latin1 and
 * overlong utf-8 sequences.
 */
unsigned unicode_to_utf8(unsigned int c, char *utf8)
{
	unsigned bytes = 1;
	unsigned prefix = 0x40;
	char *p = utf8;

	/* A 7th byte would be needed: refuse instead of overflowing */
	if (c > 0x7fffffff)
		return 0;

	/* Plain ASCII is a single byte */
	if (c <= 0x7f) {
		*utf8 = c;
		return 1;
	}

	/*
	 * Emit continuation bytes lowest-first.  After each one, 'prefix'
	 * is the first value that no longer fits in the payload bits of a
	 * lead byte for the current length, so we need another byte for
	 * as long as c >= prefix.
	 */
	do {
		*p++ = 0x80 | (c & 0x3f);
		bytes++;
		prefix >>= 1;
		c >>= 6;
	} while (c >= prefix);

	/* Lead byte: length marker bits (110..., 1110..., ...) plus payload */
	*p = (char)((0x100 - 2 * prefix) | c);

	/* The bytes were generated back to front */
	reverse_string(utf8, p);
	return bytes;
}
9 changes: 9 additions & 0 deletions utf8.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#ifndef UTF8_H
#define UTF8_H

/* A decoded character value, as produced by utf8_to_unicode(). */
typedef unsigned int unicode_t;

/*
 * Decode the sequence that starts at line[index] into *res and return the
 * number of bytes it occupies.  'len' bounds how far into 'line' the
 * decoder looks.  Input that is not valid UTF-8 is returned as a single
 * byte holding that byte's value.
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res);

/*
 * Encode 'c' as UTF-8 into the caller-supplied buffer 'utf8' and return the
 * number of bytes stored.
 */
unsigned unicode_to_utf8(unsigned int c, char *utf8);

#endif

0 comments on commit e62cdf0

Please sign in to comment.