/
csv_reader_sequential.v
298 lines (263 loc) · 7.41 KB
/
csv_reader_sequential.v
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/*
csv serial reader 1.0 alpha
Copyright (c) 2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.
Known limitations:
*/
module csv
import os
// SequentialReaderConfig holds the optional parameters accepted by csv_sequential_reader.
// The data source is either the RAM buffer (scr_buf, scr_buf_len) or the file at file_path;
// the RAM buffer wins when both are set.
@[params]
pub struct SequentialReaderConfig {
pub:
scr_buf voidptr // pointer to the buffer of data
scr_buf_len i64 // if > 0 use the RAM pointed by scr_buf as source of data
file_path string // path of the csv file, used only when no RAM buffer is supplied
start_index i64 // offset in the source of the first byte to parse
end_index i64 = -1 // offset in the source where parsing stops, -1 = end of the source
mem_buf_size int = 1024 * 64 // default buffer size 64KByte
separator u8 = `,` // char that separates two cells
comment u8 = `#` // every line that starts with the comment char is ignored
default_cell string = '*' // return this string if out of the csv boundaries
empty_cell string // return this string if empty cell
end_line_len int = endline_cr_len // size of the endline rune
quote u8 = `"` // double quote is the standard quote char
}
// SequentialReader is the state of a forward-only csv parser created by csv_sequential_reader.
// start_index is advanced by every get_next_row call, so it always marks where the next
// row will be searched for.
pub struct SequentialReader {
pub mut:
index i64 // position set up by csv_sequential_reader (initial offset, moved past a BOM)
f os.File // source file, only used by file backed readers
f_len i64 // length of the source file in bytes
is_bom_present bool // true when an UTF-8 BOM was found at the start of the source
start_index i64 // offset of the next byte to parse
end_index i64 = -1 // offset where parsing stops
end_line u8 = `\n` // char that closes a row
end_line_len int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
separator u8 = `,` // comma is the default separator
separator_len int = 1 // size of the separator rune
quote u8 = `"` // double quote is the standard quote char
comment u8 = `#` // every line that starts with the comment char is ignored
default_cell string = '*' // return this string if out of the csv boundaries
empty_cell string = '#' // return this if empty cell
// ram buffer
mem_buf_type u32 // buffer type 0=File,1=RAM
mem_buf voidptr // buffer used to load chars from file
mem_buf_size i64 // size of the buffer
mem_buf_start i64 = -1 // start index in the file of the read buffer
mem_buf_end i64 = -1 // end index in the file of the read buffer
ch_buf []u8 = []u8{cap: 1024} // bytes of the cell currently being collected
// error management
row_count i64 // rows seen so far, reported in the error messages
col_count i64 // position inside the current row, reported in the error messages
}
// csv_sequential_reader creates a sequential csv reader from `cfg`.
// The source is the RAM buffer `cfg.scr_buf` when it is set and `cfg.scr_buf_len > 0`,
// otherwise the file at `cfg.file_path`. An UTF-8 BOM at the start of the source is
// detected and skipped.
// It returns an error when the file does not exist, when `cfg.mem_buf_size` is too small
// to be used as a read buffer, or when opening/seeking/reading the file fails; in the
// failure cases every resource acquired so far is released before returning.
// File backed readers must be released with `dispose_csv_reader` once they are no longer needed.
pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
	mut cr := &SequentialReader{}

	cr.start_index = cfg.start_index
	cr.end_index = cfg.end_index

	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
		// reading from a RAM buffer
		cr.mem_buf_type = ram_csv // RAM buffer
		cr.mem_buf = cfg.scr_buf
		cr.mem_buf_size = cfg.scr_buf_len
		// the parser must never walk past the end of the caller's buffer
		if cfg.end_index == -1 || cfg.end_index > cfg.scr_buf_len {
			cr.end_index = cfg.scr_buf_len
		}

		// check if BOM header is in the memory buffer,
		// a buffer shorter than the BOM can not contain it and must not be read past its end
		if cr.mem_buf_size >= 3 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		cr.mem_buf_start = 0
		cr.mem_buf_end = cr.mem_buf_size
	} else if cfg.file_path.len > 0 {
		// reading from a file
		if !os.exists(cfg.file_path) {
			return error('ERROR: file ${cfg.file_path} not found!')
		}
		// the BOM probe reads 4 bytes into the buffer, a smaller one would be overrun
		if cfg.mem_buf_size < 4 {
			return error('ERROR: mem_buf_size must be at least 4 bytes!')
		}
		cr.mem_buf_type = file_csv // File buffer
		cr.open_file_source(cfg) or {
			// do not leak the file handle or the read buffer on a half built reader
			cr.dispose_csv_reader()
			return err
		}
	}

	cr.default_cell = cfg.default_cell
	cr.empty_cell = cfg.empty_cell
	cr.end_line_len = cfg.end_line_len
	cr.separator = cfg.separator
	cr.comment = cfg.comment
	cr.quote = cfg.quote

	return cr
}

// open_file_source opens the file named by `cfg`, allocates the read buffer, measures the
// file, moves to `cfg.start_index` and skips the BOM when parsing starts at offset 0.
fn (mut cr SequentialReader) open_file_source(cfg SequentialReaderConfig) ! {
	// open first: when this fails there is nothing to release
	cr.f = os.open_file(cfg.file_path, 'rb')!

	// allocate the memory
	unsafe {
		cr.mem_buf = malloc(cfg.mem_buf_size)
	}
	cr.mem_buf_size = cfg.mem_buf_size

	// the file length is the position reached by seeking to its end
	cr.f.seek(0, .end)!
	cr.f_len = cr.f.tell()!

	cr.f.seek(cfg.start_index, .start)!
	cr.index = cr.f.tell()!

	if cfg.end_index == -1 {
		cr.end_index = cr.f_len
	}

	// check if BOM header is in the file
	if cr.index == 0 {
		if cr.f.read_into_ptr(cr.mem_buf, 4)! == 4 {
			unsafe {
				if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
					&& *(&u8(cr.mem_buf) + 2) == 0xBF {
					cr.is_bom_present = true
					cr.index += 3 // skip the BOM
					cr.start_index += 3 // skip the BOM
				}
			}
		}
		// undo the movement caused by the probe
		cr.f.seek(cfg.start_index, .start)!
	}
}
// dispose_csv_reader releases the resources owned by the reader: for a file backed reader
// it closes the file and frees the read buffer. A reader working on a RAM buffer owns
// nothing, so the call leaves it untouched.
pub fn (mut cr SequentialReader) dispose_csv_reader() {
	// only file backed readers hold a file handle and an allocated buffer
	if cr.mem_buf_type != file_csv {
		return
	}

	if cr.f.is_opened {
		cr.f.close()
	}

	// a zero size means that no buffer is currently allocated
	if cr.mem_buf_size <= 0 {
		return
	}
	unsafe {
		free(cr.mem_buf)
		cr.mem_buf = nil
	}
	cr.mem_buf_size = 0
}
// has_data returns the number of bytes between the current read position and the end index.
// The count includes any end of line byte that get_next_row has not consumed yet.
pub fn (mut cr SequentialReader) has_data() i64 {
	remaining := cr.end_index - cr.start_index
	return remaining
}
// fill_buffer loads the read buffer with the file content that starts at offset `index`
// and records which range of the file the buffer now holds.
// Nothing is done for a RAM source. Seek and read failures are returned to the caller.
fn (mut cr SequentialReader) fill_buffer(index i64) ! {
	if cr.mem_buf_type == ram_csv {
		// for now do nothing if ram buffer
		return
	}

	cr.f.seek(index, .start)!
	// IMPORTANT: add 64 bit support in vlib!!
	loaded := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
	cr.mem_buf_start = index
	cr.mem_buf_end = cr.mem_buf_start + loaded
}
// SequentialReadingState lists the states of the parser that get_next_row runs on every char.
enum SequentialReadingState as u16 {
comment // skipping a comment line, left when its end of line is reached
quote // collecting the chars of a quoted cell, left at the closing quote
after_quote // a quoted cell was closed, waiting for the separator or the end of line
cell // collecting the chars of an unquoted cell, it is the initial state of every row
newline // not used by get_next_row
}
// get_next_row gets the next row from the CSV source as a string array.
// Parsing starts at `cr.start_index` and stops at the end of the row or at `cr.end_index`,
// then `cr.start_index` is moved so that the following call continues from there.
// Empty rows and rows that start with the comment char are skipped, empty cells are
// returned as `cr.empty_cell`, quoted cells are returned without their quotes.
// When the data ends without an end of line the cells read so far, including the one
// still being collected, are returned as the last row.
// An empty array is returned when no row is left.
// An error is returned if a quoted cell is not closed before the end of its line,
// or if the read buffer can not be refilled from the file.
pub fn (mut cr SequentialReader) get_next_row() ![]string {
	mut row_res := []string{}
	// clear the cell buffer
	cr.ch_buf.clear()
	mut i := cr.start_index
	mut state := SequentialReadingState.cell
	// true when an end of line closed the row, false when the data simply ran out
	mut row_closed := false

	p := &u8(cr.mem_buf)
	for i < cr.end_index {
		// load the part of the file that holds the byte at i when it is not in memory
		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
			cr.fill_buffer(i)!
		}
		ch := unsafe { *(p + i - cr.mem_buf_start) }

		if state == .cell {
			if ch == cr.separator {
				row_res << cr.pop_cell()
			} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
				// the comment char is special only as the very first char of a row
				state = .comment
			} else if ch == cr.quote {
				state = .quote
				cr.ch_buf.clear()
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0
				// skip empty rows
				if !(row_res.len == 0 && cr.ch_buf.len < 1) {
					row_res << cr.pop_cell()
					i += cr.end_line_len - 1
					row_closed = true
					break
				}
			} else if ch == `\r` && cr.end_line_len == 2 {
				// skip CR
			} else { // normal char inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .comment {
			// everything up to the end of line is dropped, then a new row begins
			if ch == cr.end_line {
				state = .cell
			}
		}

		if state == .quote {
			if ch == cr.quote {
				// closing quote: the cell is complete
				row_res << cr.pop_cell()
				state = .after_quote
				cr.col_count++
				i++
				continue
			} else if ch == cr.end_line {
				return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
			} else { // normal char inside a quote inside a cell
				cr.ch_buf << ch
			}
		}

		if state == .after_quote {
			// chars between the closing quote and the separator are ignored
			if ch == cr.separator {
				state = .cell
			} else if ch == cr.end_line {
				cr.row_count++
				cr.col_count = 0
				cr.ch_buf.clear()
				i += cr.end_line_len - 1
				row_closed = true
				break
			}
		}

		cr.col_count++
		i++
	}

	// the data ended in the middle of a row: the pending cell was never stored because
	// no end of line arrived, store it now so the last row is not truncated
	if !row_closed && state == .cell && !(row_res.len == 0 && cr.ch_buf.len < 1) {
		row_res << cr.pop_cell()
	}

	cr.start_index = i
	return row_res
}

// pop_cell returns the cell collected in `ch_buf` as a new string, or `empty_cell` when
// nothing was collected, and empties the buffer for the next cell.
fn (mut cr SequentialReader) pop_cell() string {
	cell := if cr.ch_buf.len == 0 { cr.empty_cell } else { cr.ch_buf.bytestr() }
	cr.ch_buf.clear()
	return cell
}