/
encoding.vala
116 lines (104 loc) · 4.35 KB
/
encoding.vala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
/*
* Copyright (C) 2011-2014 Daiki Ueno <ueno@gnu.org>
* Copyright (C) 2011-2014 Red Hat, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
namespace Skk {
// XXX: we use Vala string to represent byte array, assuming that
// it does not contain null element
class EncodingConverter : Object {
static const int BUFSIZ = 4096;
static const string INTERNAL_ENCODING = "UTF-8";
static const Entry<string,string> ENCODING_TO_CODING_SYSTEM_RULE[] = {
{ "UTF-8", "utf-8" },
{ "EUC-JP", "euc-jp" },
{ "Shift_JIS", "shift_jis" },
{ "ISO-2022-JP", "iso-2022-jp" },
{ "EUC-JISX0213", "euc-jisx0213" },
{ "EUC-JISX0213", "euc-jis-2004" },
{ "SHIFT_JISX0213", "shift_jis-2004" }
};
static Regex coding_cookie_regex;
static construct {
try {
coding_cookie_regex = new Regex (
"-\\*-.*[ \t]coding:[ \t]*([^ \t;]+?)[ \t;].*-\\*-");
} catch (GLib.RegexError e) {
assert_not_reached ();
}
}
internal static string? extract_coding_system (string line) {
MatchInfo info = null;
if (coding_cookie_regex.match (line, 0, out info)) {
return info.fetch (1);
}
return null;
}
internal string? get_coding_system () {
foreach (var entry in ENCODING_TO_CODING_SYSTEM_RULE) {
if (entry.key == encoding) {
return entry.value;
}
}
return null;
}
internal string encoding { get; private set; }
CharsetConverter encoder;
CharsetConverter decoder;
internal EncodingConverter (string encoding) throws GLib.Error {
this.encoding = encoding;
encoder = new CharsetConverter (encoding, INTERNAL_ENCODING);
decoder = new CharsetConverter (INTERNAL_ENCODING, encoding);
}
internal EncodingConverter.from_coding_system (string coding) throws GLib.Error {
foreach (var entry in ENCODING_TO_CODING_SYSTEM_RULE) {
if (entry.value == coding) {
this (entry.key);
return;
}
}
assert_not_reached ();
}
string convert (CharsetConverter converter, string str) throws GLib.Error {
uint8[] inbuf = str.data;
uint8[] outbuf = new uint8[BUFSIZ];
StringBuilder builder = new StringBuilder ();
size_t total_bytes_read = 0;
while (total_bytes_read < inbuf.length) {
size_t bytes_read, bytes_written;
// FIXME: we could split inbuf to small segments (<
// BUFSIZ) here to reduce memory copy. However it
// requires proper error handling when the end of
// segmented buffer lies across a UTF-8 character
// boundary.
converter.convert (inbuf[total_bytes_read : inbuf.length],
outbuf,
ConverterFlags.INPUT_AT_END,
out bytes_read,
out bytes_written);
for (int i = 0; i < bytes_written; i++)
builder.append_c ((char)outbuf[i]);
total_bytes_read += bytes_read;
}
return builder.str;
}
internal string encode (string internal_str) throws GLib.Error {
return convert (encoder, internal_str);
}
internal string decode (string external_str) throws GLib.Error {
return convert (decoder, external_str);
}
}
}