-
Notifications
You must be signed in to change notification settings - Fork 3
/
inquisitor.lisp
172 lines (155 loc) · 6.81 KB
/
inquisitor.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
(in-package :cl-user)
;;; Public package of the library (nickname INQ).  It re-exports selected
;;; helpers imported from the inquisitor.* sub-packages together with the
;;; detect-* generic functions defined in this file.
(defpackage inquisitor
  (:nicknames :inq)
  (:use :cl)
  (:import-from :inquisitor.encoding.guess
                :ces-guess-from-vector
                :list-available-scheme)
  (:import-from :inquisitor.eol
                :eol-available-p
                :eol-guess-from-vector)
  (:import-from :inquisitor.external-format
                :make-external-format)
  (:import-from :inquisitor.names
                :+available-encodings+
                :+available-eols+
                :dependent-name
                :independent-name
                :unicode-p)
  (:import-from :inquisitor.util
                :with-byte-array
                :byte-array-p
                :byte-input-stream-p)
  ;; `*default-buffer-size*` is the variable this file defines and reads, so
  ;; it is exported to make the buffer size tunable by users of the package.
  ;; `*detecting-buffer-size*` stays in the list so that existing references
  ;; to that exported symbol keep resolving at read time.
  (:export :*default-buffer-size*
           :*detecting-buffer-size*
           :make-external-format
           :list-available-scheme
           :eol-available-p
           :+available-encodings+
           :+available-eols+
           :independent-name
           :dependent-name
           :unicode-p
           :detect-encoding
           :detect-end-of-line
           :detect-external-format
           :detect-external-format-from-file))
(in-package :inquisitor)
(defparameter *default-buffer-size* 1000
"Length of the (unsigned-byte 8) buffer that the stream methods of
`detect-encoding`, `detect-end-of-line` and `detect-external-format` allocate
and fill on each read from the stream.")
;;; Generic entry point for character-encoding detection.  The methods in this
;;; file specialize the first argument on VECTOR, STREAM and PATHNAME, and the
;;; second argument (called `scheme` in the methods) on SYMBOL.
(defgeneric detect-encoding (input symbol))
(defmethod detect-encoding ((buffer vector) (scheme symbol))
  "Detect the character encoding scheme of the octets in `buffer` under `scheme`.

`buffer` must satisfy `byte-array-p`.  The result is whatever
`ces-guess-from-vector` returns for `buffer` and `scheme`.  Signals an error
when `buffer` is not a byte array."
  (unless (byte-array-p buffer)
    ;; The message is a constant, so it is passed to ERROR directly.
    (error "supplied vector is not a byte array."))
  (ces-guess-from-vector buffer scheme))
(defmethod detect-encoding ((stream stream) (scheme symbol))
  "Detect the character encoding scheme of the octets read from `stream` under
`scheme`.  Note that this method modifies `stream`'s file position.

The stream is consumed in chunks of `*default-buffer-size*` octets.  Each full
chunk is handed to `ces-guess-from-vector` together with the state value
returned for the previous chunk.  The first short chunk (fewer octets than the
buffer holds, possibly none) ends the scan, and the values of
`ces-guess-from-vector` for that chunk are returned.  Signals an error when
`stream` is not a byte input stream."
  (unless (byte-input-stream-p stream)
    (error "supplied stream is not a byte input stream."))
  (let* ((buffer-length *default-buffer-size*)
         (buffer (make-array buffer-length :element-type '(unsigned-byte 8)))
         (ces-state nil))
    (loop
      (let ((num-read (read-sequence buffer stream)))
        ;; A short read means the stream is exhausted: the guess for this last
        ;; chunk is the final answer.
        (when (< num-read buffer-length)
          (return-from detect-encoding
            (ces-guess-from-vector (subseq buffer 0 num-read) scheme ces-state)))
        ;; Full chunk: only the second value (the guesser state) is needed for
        ;; the next round.
        (setf ces-state
              (nth-value 1 (ces-guess-from-vector (subseq buffer 0 num-read)
                                                  scheme
                                                  ces-state)))))))
(defmethod detect-encoding ((path pathname) (scheme symbol))
  "Open the file designated by `path` for input as a stream of
(unsigned-byte 8) elements and return the result of calling `detect-encoding`
on that stream with `scheme`."
  (with-open-file (octets path
                          :element-type '(unsigned-byte 8)
                          :direction :input)
    (detect-encoding octets scheme)))
;;; Generic entry point for end-of-line style detection.  The methods in this
;;; file specialize the argument on VECTOR, STREAM and PATHNAME.
(defgeneric detect-end-of-line (input))
(defmethod detect-end-of-line ((buffer vector))
  "Detect the end-of-line style of the octets in `buffer`.

`buffer` must satisfy `byte-array-p`.  The result is whatever
`eol-guess-from-vector` returns for `buffer`.  Signals an error when `buffer`
is not a byte array."
  (unless (byte-array-p buffer)
    ;; The message is a constant, so it is passed to ERROR directly.
    (error "supplied vector is not a byte array."))
  (eol-guess-from-vector buffer))
(defmethod detect-end-of-line ((stream stream))
  "Detect the end-of-line style of the octets read from `stream`.  Note that
this method modifies `stream`'s file position.

The stream is read in chunks of `*default-buffer-size*` octets.  Scanning stops
at the first full chunk for which `eol-guess-from-vector` returns a non-NIL
value, or at the first short chunk, whose guess is returned as is.  Signals an
error when `stream` is not a byte input stream."
  (unless (byte-input-stream-p stream)
    (error "supplied stream is not a byte input stream."))
  (let* ((chunk-size *default-buffer-size*)
         (chunk (make-array chunk-size :element-type '(unsigned-byte 8))))
    ;; NOTE(review): each chunk is examined on its own, so a line terminator
    ;; that straddles two chunks is never seen as a whole -- confirm whether
    ;; that matters for `eol-guess-from-vector`.
    (loop
      (let ((filled (read-sequence chunk stream)))
        (if (< filled chunk-size)
            ;; Short read: the stream is exhausted, answer from what was read.
            (return-from detect-end-of-line
              (eol-guess-from-vector (subseq chunk 0 filled)))
            (let ((style (eol-guess-from-vector chunk)))
              (when style
                (return-from detect-end-of-line style))))))))
(defmethod detect-end-of-line ((path pathname))
  "Open the file designated by `path` for input as a stream of
(unsigned-byte 8) elements and return the result of calling
`detect-end-of-line` on that stream."
  (with-open-file (octets path
                          :element-type '(unsigned-byte 8)
                          :direction :input)
    (detect-end-of-line octets)))
;;; Generic entry point for external-format detection (encoding plus
;;; end-of-line style).  The methods in this file specialize the first argument
;;; on VECTOR, STREAM and PATHNAME, and the second argument (called `scheme`
;;; in the methods) on SYMBOL.
(defgeneric detect-external-format (input symbol))
(defmethod detect-external-format ((buffer vector) (scheme symbol))
  "Detect the external format of the octets in `buffer` under `scheme`.

Returns two values.  The second is always the list (ENCODING END-OF-LINE) as
guessed by `ces-guess-from-vector` and `eol-guess-from-vector`.  The first is
NIL when `dependent-name` maps either guess to :CANNOT-TREAT; otherwise it is
the result of `make-external-format` for the two dependent names, with :LF
used when the end-of-line name is NIL.  Signals an error when `buffer` is not
a byte array."
  (unless (byte-array-p buffer)
    ;; The message is a constant, so it is passed to ERROR directly.
    (error "supplied vector is not a byte array."))
  (let* ((enc (ces-guess-from-vector buffer scheme))
         (eol (eol-guess-from-vector buffer))
         (enc-impl (dependent-name enc))
         (eol-impl (dependent-name eol))
         (detected (list enc eol)))
    (if (or (eq enc-impl :cannot-treat)
            (eq eol-impl :cannot-treat))
        (values nil detected)
        ;; Fall back to :LF when no end-of-line name is available.
        (values (make-external-format enc-impl (or eol-impl :lf))
                detected))))
(defmethod detect-external-format ((stream stream) (scheme symbol))
  "Detect the external format of the octets read from `stream` under `scheme`.
Note that this method modifies `stream`'s file position.

The stream is read in chunks of `*default-buffer-size*` octets until a short
chunk is seen.  Every chunk is handed to `ces-guess-from-vector` together with
the state value returned for the previous chunk; the end-of-line style is
taken from the first chunk for which `eol-guess-from-vector` returns non-NIL.

Returns two values.  The second is always the list (ENCODING END-OF-LINE).
The first is NIL when `dependent-name` maps either guess to :CANNOT-TREAT;
otherwise it is the result of `make-external-format` for the two dependent
names, with :LF used when the end-of-line name is NIL.  Signals an error when
`stream` is not a byte input stream."
  (unless (byte-input-stream-p stream)
    (error "supplied stream is not a byte input stream."))
  (let* ((buffer-length *default-buffer-size*)
         (buffer (make-array buffer-length :element-type '(unsigned-byte 8)))
         (encoding nil)
         (ces-state nil)
         (end-of-line nil))
    (loop :for num-read := (read-sequence buffer stream)
          :for chunk := (subseq buffer 0 num-read)
          :do (multiple-value-bind (enc ces-st)
                  (ces-guess-from-vector chunk scheme ces-state)
                (setf encoding enc
                      ces-state ces-st))
              ;; Keep the first style found: the last chunk may be empty or
              ;; contain no line terminator, and must not erase an earlier
              ;; detection.
              (unless end-of-line
                (setf end-of-line (eol-guess-from-vector chunk)))
          ;; A short read means the stream is exhausted.
          :until (< num-read buffer-length))
    (let ((enc-impl (dependent-name encoding))
          (eol-impl (dependent-name end-of-line))
          (detected (list encoding end-of-line)))
      ;; Either part being untreatable makes the whole format unusable; this
      ;; matches the test used by the vector method.
      (if (or (eq enc-impl :cannot-treat)
              (eq eol-impl :cannot-treat))
          (values nil detected)
          (values (make-external-format enc-impl (or eol-impl :lf))
                  detected)))))
(defmethod detect-external-format ((path pathname) (scheme symbol))
  "Open the file designated by `path` for input as a stream of
(unsigned-byte 8) elements and return the result of calling
`detect-external-format` on that stream with `scheme`."
  (with-open-file (octets path
                          :element-type '(unsigned-byte 8)
                          :direction :input)
    (detect-external-format octets scheme)))