/
decode_autodetect.go
76 lines (69 loc) · 1.3 KB
/
decode_autodetect.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
package main
import (
"bytes"
"errors"
"golang.org/x/net/html/charset"
"golang.org/x/text/transform"
"unicode/utf8"
)
// encodings lists the charset labels that DecodeAutoDetect tries, in exactly
// this order; the first label whose decoder yields clean output is used.
// NOTE(review): the ordering looks deliberate — confirm before reordering.
var encodings = []string{"iso-2022-jp", "euc-jp", "utf-8", "sjis"}
// DecodeAutoDetect converts src to a UTF-8 string, guessing its charset.
//
// Every label in encodings is tried in order. A candidate is accepted when
// decoding reports no error, isInvalidRune finds nothing wrong with the
// decoded bytes, and the decoded bytes are valid UTF-8. A leading UTF-8 byte
// order mark is removed from the accepted result.
//
// If no candidate is accepted, the raw bytes of src are returned as a string
// together with a non-nil error.
func DecodeAutoDetect(src []byte) (string, error) {
	for _, label := range encodings {
		// The second result of Lookup (the encoding's name) is not needed.
		enc, _ := charset.Lookup(label)
		if enc == nil {
			// Lookup produced no encoding for this label; try the next one.
			continue
		}

		// Decode the whole input in one shot instead of streaming it through
		// an intermediate buffer and writer.
		decoded, _, err := transform.Bytes(enc.NewDecoder(), src)
		if err != nil {
			continue
		}

		// Reject the guess if the output contains a replacement character or
		// is otherwise not well-formed UTF-8.
		if isInvalidRune(decoded) || !utf8.Valid(decoded) {
			continue
		}

		if hasBom(decoded) {
			decoded = stripBom(decoded)
		}
		return string(decoded), nil
	}
	return string(src), errors.New("could not determine character code")
}
// utf8bom is the three-byte UTF-8 byte order mark (EF BB BF).
var utf8bom = []byte{0xEF, 0xBB, 0xBF}
// hasBom reports whether in starts with the byte sequence held in utf8bom.
func hasBom(in []byte) bool {
	n := len(utf8bom)
	return len(in) >= n && bytes.Equal(in[:n], utf8bom)
}
// stripBom returns in without a leading utf8bom sequence. When in does not
// start with utf8bom it is returned unchanged. The result is a sub-slice of
// in, not a copy.
func stripBom(in []byte) []byte {
	if !bytes.HasPrefix(in, utf8bom) {
		return in
	}
	return in[len(utf8bom):]
}
// isInvalidRune reports whether decoding in, rune by rune, ever yields
// utf8.RuneError (U+FFFD). That happens both for a literally encoded U+FFFD
// and for a malformed or truncated sequence beginning at a rune-start byte.
//
// Bytes that cannot start a rune (continuation bytes encountered outside a
// decoded sequence) are stepped over one at a time and are not reported.
func isInvalidRune(in []byte) bool {
	for pos := 0; pos < len(in); {
		if !utf8.RuneStart(in[pos]) {
			// Not the first byte of a rune: skip it without judging it.
			pos++
			continue
		}
		r, width := utf8.DecodeRune(in[pos:])
		if r == utf8.RuneError {
			return true
		}
		pos += width
	}
	return false
}