Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 288 lines (267 sloc) 5.973 kb
1f1b68f Christopher Hall [grifo] tidy up spacing and remove CR characters
hxw authored
1 /*
2 * Copyright (c) 2009 Openmoko Inc.
3 *
4 * This program is free software: you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation, either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program. If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include "ustring.h"
19 #include "utf8.h"
20
21 ucs4_t UTF8_to_UCS4(const unsigned char **pUTF8)
22 {
23 ucs4_t c0, c1, c2, c3;
24
25 /* if 0 returned, it is not a invalid UTF8 character. The pointer moves to the second byte. */
26 c0 = 0;
27 if (**pUTF8)
28 {
29 c0 = (ucs4_t)**pUTF8;
30 (*pUTF8)++;
31 if (c0 & 0x80) /* multi-byte UTF8 char */
32 {
33 if ((c0 & 0xE0) == 0xC0) /* 2-byte UTF8 */
34 {
35 c1 = **pUTF8;
36 if ((c1 & 0xC0) == 0x80)
37 {
38 (*pUTF8)++;
39 c0 = ((c0 & 0x1F) << 6) + (c1 & 0x3F);
40 }
41 else
42 c0 = 0; /* invalid UTF8 character */
43 }
44 else if ((c0 & 0xF0) == 0xE0) /* 3-byte UTF8 */
45 {
46 c1 = **pUTF8;
47 c2 = *(*pUTF8 + 1);
48 if ((c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80)
49 {
50 (*pUTF8) += 2;
51 c0 = ((c0 & 0x0F) << 12) + ((c1 & 0x3F) << 6) + (c2 & 0x3F);
52 }
53 else
54 c0 = 0; /* invalid UTF8 character */
55 }
56 else if ((c0 & 0xF1) == 0xF0) /* 4-byte UTF8 */
57 {
58 c1 = **pUTF8;
59 c2 = *(*pUTF8 + 1);
60 c3 = *(*pUTF8 + 2);
61 if ((c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80 && (c3 & 0xC0) == 0x80)
62 {
63 (*pUTF8) += 3;
64 c0 = ((c0 & 0x07) << 18) + ((c1 & 0x3F) << 12) + ((c2 & 0x3F) << 6) + (c3 & 0x3F) ;
65 }
66 else
67 c0 = 0; /* invalid UTF8 character */
68 }
69 else
70 c0 = 0; /* invalid UTF8 character */
71 }
72 }
73 return c0;
74 }
75
76 void UCS4_to_UTF8(ucs4_t u, unsigned char *sUTF8)
77 {
78 if (u < 0x80)
79 {
80 sUTF8[0] = (unsigned char)u;
81 sUTF8[1] = '\0';
82 }
83 else if (u < 0x800)
84 {
85 sUTF8[0] = (unsigned char)(0xC0 | (u >> 6));
86 sUTF8[1] = (unsigned char)(0x80 | (u & 0x3F));
87 sUTF8[2] = '\0';
88 }
89 else if (u < 0x10000)
90 {
91 sUTF8[0] = (unsigned char)(0xE0 | (u >> 12));
92 sUTF8[1] = (unsigned char)(0x80 | ((u & 0xFFF) >> 6));
93 sUTF8[2] = (unsigned char)(0x80 | (u & 0x3F));
94 sUTF8[3] = '\0';
95 }
96 else if (u < 0x110000)
97 {
98 sUTF8[0] = (unsigned char)(0xF0 | (u >> 18));
99 sUTF8[1] = (unsigned char)(0x80 | ((u & 0x3FFFF) >> 12));
100 sUTF8[2] = (unsigned char)(0x80 | ((u & 0xFFF) >> 6));
101 sUTF8[3] = (unsigned char)(0x80 | (u & 0x3F));
102 sUTF8[4] = '\0';
103 }
104 else
105 {
106 sUTF8[0] = '\0';
107 }
108 }
109
/*
 * Copy the last UTF-8 character of a buffer into out_utf8_char as a
 * NUL-terminated string.
 *
 * out_utf8_char  destination; must hold at least 5 bytes.
 * utf8_str       source bytes (need not be NUL-terminated).
 * utf8_str_len   number of bytes in utf8_str; <= 0 yields an empty string.
 *
 * The buffer is scanned backwards over continuation bytes (10xxxxxx) to
 * find the start of the last character, then at most 4 bytes are copied.
 * The scan never moves before index 0, so a buffer that consists only of
 * continuation bytes is copied from its first byte instead of reading in
 * front of the buffer.
 */
void get_last_utf8_char(unsigned char *out_utf8_char, const unsigned char *utf8_str, int utf8_str_len)
{
	int start;
	int n = 0;

	if (utf8_str_len > 0)
	{
		start = utf8_str_len - 1;
		/* stop at index 0 even if that byte is itself a continuation byte */
		while (start > 0 && (utf8_str[start] & 0xC0) == 0x80)
			start--;

		while (start < utf8_str_len && n < 4)
			out_utf8_char[n++] = utf8_str[start++];
	}

	out_utf8_char[n] = '\0';
}
127
/*
 * Copy the first UTF-8 character of a buffer into out_utf8_char as a
 * NUL-terminated string.
 *
 * out_utf8_char  destination; must hold at least 5 bytes.
 * utf8_str       source bytes.
 * utf8_str_len   number of bytes available in utf8_str; <= 0 yields an
 *                empty string.
 *
 * The character length is taken from the lead byte alone (continuation
 * bytes are not validated) and is capped at utf8_str_len. Any byte that is
 * not a 2-, 3- or 4-byte lead is treated as a 1-byte character.
 */
void get_first_utf8_char(unsigned char *out_utf8_char, const unsigned char *utf8_str, int utf8_str_len)
{
	int len;
	int i = 0;

	if (utf8_str_len > 0)
	{
		const unsigned char lead = utf8_str[0];

		if ((lead & 0xE0) == 0xC0)	/* 110xxxxx: 2-byte UTF8 */
			len = 2;
		else if ((lead & 0xF0) == 0xE0)	/* 1110xxxx: 3-byte UTF8 */
			len = 3;
		else if ((lead & 0xF8) == 0xF0)	/* 11110xxx: 4-byte UTF8 */
			len = 4;
		else
			len = 1;

		for (i = 0; i < len && i < utf8_str_len; i++)
			out_utf8_char[i] = utf8_str[i];
	}

	out_utf8_char[i] = '\0';
}
156
/*
 * Return a pointer to the character that follows the UTF-8 character at
 * utf8_str.
 *
 * utf8_str  pointer into a NUL-terminated UTF-8 string.
 *
 * The step size is taken from the lead byte (2, 3 or 4 bytes, otherwise 1).
 * The walk stops early at the NUL terminator, so a truncated sequence never
 * moves the result past the end of the string; at the terminator itself the
 * input pointer is returned unchanged.
 */
const unsigned char *next_utf8_char(const unsigned char *utf8_str)
{
	const unsigned char lead = utf8_str[0];
	int len;

	if ((lead & 0xE0) == 0xC0)	/* 110xxxxx: 2-byte UTF8 */
		len = 2;
	else if ((lead & 0xF0) == 0xE0)	/* 1110xxxx: 3-byte UTF8 */
		len = 3;
	else if ((lead & 0xF8) == 0xF0)	/* 11110xxx: 4-byte UTF8 */
		len = 4;
	else
		len = 1;

	while (len && *utf8_str)
	{
		len--;
		utf8_str++;
	}
	return utf8_str;
}
183
/*
 * Write the upper-case form of the character at `in` to `out`.
 *
 * - ASCII 'a'..'z' become 'A'..'Z' (a single byte plus a NUL terminator).
 * - Input whose first 2 bytes match "æ", "å" or "ø" produces "Æ", "Å" or
 *   "Ø" respectively.
 * - Anything else is handed to ustrcpy(out, in) unchanged.
 */
void utf8_char_toupper(unsigned char *out, const unsigned char *in)
{
	if (*in >= 'a' && *in <= 'z')
	{
		out[0] = *in - ('a' - 'A');
		out[1] = '\0';
		return;
	}

	if (ustrncmp(in, "æ", 2) == 0)
	{
		ustrcpy(out, "Æ");
		return;
	}

	if (ustrncmp(in, "å", 2) == 0)
	{
		ustrcpy(out, "Å");
		return;
	}

	if (ustrncmp(in, "ø", 2) == 0)
	{
		ustrcpy(out, "Ø");
		return;
	}

	/* no case mapping known for this character */
	ustrcpy(out, in);
}
206
/*
 * Convert the character at the start of `full` to its ASCII counterpart
 * where one is known.
 *
 * full      NUL-terminated UTF-8 string.
 * used_len  optional (may be NULL); receives the number of bytes of `full`
 *           that the converted character occupied.
 *
 * Recognised 3-byte inputs:
 *   EF BD 81..9A  -> 'a'..'z'
 *   EF BC 8D      -> '-'
 *   EF BC 8C      -> ','
 *   E2 80 A7      -> '.'
 * Any other input yields its first UTF-8 character unchanged, as extracted
 * by get_first_utf8_char().
 *
 * Returns a pointer to a static buffer that is cleared and rewritten on
 * every call.
 */
unsigned char *full_alphabet_to_half(const unsigned char *full, int *used_len)
{
	static unsigned char half[5];
	unsigned char ascii = 0;	/* stays 0 when no mapping applies */
	int consumed = 3;		/* every mapped form is 3 bytes long */

	memset(half, 0, sizeof(half));

	if (full[0] == 0xEF)
	{
		if (full[1] == 0xBD && full[2] >= 0x81 && full[2] <= 0x9A)
			ascii = (unsigned char)('a' + (full[2] - 0x81));
		else if (full[1] == 0xBC && full[2] == 0x8D)
			ascii = '-';
		else if (full[1] == 0xBC && full[2] == 0x8C)
			ascii = ',';
	}
	else if (full[0] == 0xE2 && full[1] == 0x80 && full[2] == 0xA7)
	{
		ascii = '.';
	}

	if (ascii)
	{
		half[0] = ascii;
	}
	else
	{
		/* no mapping: pass the first character through untouched */
		unsigned char lead_char[5];

		get_first_utf8_char(lead_char, full, ustrlen(full));
		consumed = ustrlen(lead_char);
		memcpy(half, lead_char, consumed);
	}

	if (used_len)
		*used_len = consumed;
	return half;
}
249
/*
 * Map an ASCII character to the 3-byte UTF-8 form used as its counterpart
 * by this module.
 *
 *   'a'..'z' -> EF BD 81..9A
 *   '-'      -> EF BC 8D
 *   ','      -> EF BC 8C
 *   '.'      -> E2 80 A7
 *
 * Returns a pointer to a NUL-terminated string in a static buffer that is
 * overwritten by the next successful call, or NULL when c has no mapping
 * (the static buffer is left untouched in that case).
 */
unsigned char *half_alphabet_to_full(unsigned char c)
{
	static unsigned char full[5];
	unsigned char b0;
	unsigned char b1;
	unsigned char b2;

	if (c >= 'a' && c <= 'z')
	{
		b0 = 0xEF;
		b1 = 0xBD;
		b2 = 0x81 + (c - 'a');
	}
	else
	{
		switch (c)
		{
		case '-':
			b0 = 0xEF;
			b1 = 0xBC;
			b2 = 0x8D;
			break;
		case ',':
			b0 = 0xEF;
			b1 = 0xBC;
			b2 = 0x8C;
			break;
		case '.':
			b0 = 0xE2;
			b1 = 0x80;
			b2 = 0xA7;
			break;
		default:
			return NULL;
		}
	}

	full[0] = b0;
	full[1] = b1;
	full[2] = b2;
	full[3] = '\0';
	return full;
}
Something went wrong with that request. Please try again.