Skip to content

Commit 5f08d45

Browse files
encoding.xml: update parser logic to skip BOM before prolog (#19858)
1 parent e0207b6 commit 5f08d45

File tree

5 files changed

+51
-2
lines changed

5 files changed

+51
-2
lines changed

vlib/encoding/xml/parser.v

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ const (
1818
double_dash = '--'.bytes()
1919
c_tag = '[C'.bytes()
2020
data_chars = 'DATA'.bytes()
21+
22+
byte_order_marking_first = u8(0xEF)
23+
byte_order_marking_bytes = [u8(0xBB), 0xBF]
2124
)
2225

2326
// Helper types to assist in parsing
@@ -296,18 +299,30 @@ fn parse_doctype(mut reader io.Reader) !DocumentType {
296299
}
297300

298301
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
299-
// Trim trailing whitespace
302+
// Skip leading whitespace and an optional UTF-8 BOM before the first `<`
300303
mut local_buf := [u8(0)]
301304
mut ch := next_char(mut reader, mut local_buf)!
302305
for {
303306
match ch {
304-
` `, `\t`, `\n` {
307+
` `, `\t`, `\r`, `\n` {
305308
ch = next_char(mut reader, mut local_buf)!
306309
continue
307310
}
308311
`<` {
309312
break
310313
}
314+
xml.byte_order_marking_first {
315+
// UTF-8 BOM
316+
mut bom_buf := [u8(0), 0]
317+
if reader.read(mut bom_buf)! != 2 {
318+
return error('Invalid UTF-8 BOM.')
319+
}
320+
if bom_buf != xml.byte_order_marking_bytes {
321+
return error('Invalid UTF-8 BOM.')
322+
}
323+
ch = next_char(mut reader, mut local_buf)!
324+
continue
325+
}
311326
else {
312327
return error('Expecting a prolog or root node starting with "<".')
313328
}
File renamed without changes.
File renamed without changes.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
module main
2+
3+
import os
4+
import encoding.xml
5+
6+
// test_valid_parsing loads `workbook.bin` from the directory containing this test
// file, parses it with `xml.XMLDocument.from_file`, and checks that the resulting
// document contains exactly one `sheet` element.
fn test_valid_parsing() {
7+
// We use a .bin file to avoid stripping the BOM from the XML file
8+
path := os.join_path(os.dir(@FILE), 'workbook.bin')
9+
10+
// Parsing must succeed: on error the `or` block fails the assertion and exits.
doc := xml.XMLDocument.from_file(path) or {
11+
assert false, 'Failed to parse workbook.bin'
12+
exit(1)
13+
}
14+
15+
// Look up elements by tag name; the test expects a single `sheet` match.
sheets := doc.get_elements_by_tag('sheet')
16+
assert sheets.len == 1, 'Expected 1 sheet, got ${sheets.len}'
17+
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
2+
<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" mc:Ignorable="x15" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main">
3+
<fileVersion appName="xl" lastEdited="6" lowestEdited="6" rupBuild="14420"/>
4+
<workbookPr defaultThemeVersion="164011"/>
5+
<bookViews>
6+
<workbookView xWindow="0" yWindow="0" windowWidth="22260" windowHeight="12645"/>
7+
</bookViews>
8+
<sheets>
9+
<sheet name="Sheet1" sheetId="1" r:id="rId1"/>
10+
</sheets>
11+
<calcPr calcId="162913"/>
12+
<extLst>
13+
<ext uri="{140A7094-0E35-4892-8432-C4D2E57EDEB5}" xmlns:x15="http://schemas.microsoft.com/office/spreadsheetml/2010/11/main">
14+
<x15:workbookPr chartTrackingRefBase="1"/>
15+
</ext>
16+
</extLst>
17+
</workbook>

0 commit comments

Comments
 (0)