Skip to content

Commit 35558df

Browse files
vlib: add an encoding.xml module with parser, validation, entity encoding, unit tests (#19708)
1 parent 01022e9 commit 35558df

File tree

48 files changed

+2004
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+2004
-1
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,4 +130,9 @@ vls.log
130130
wasm.v
131131
TAGS
132132
tags
133-
vlib/builtin/js/*.js
133+
134+
# ignore large GTK *.gir files
135+
Gtk-4.0.gir
136+
*.gir
137+
138+
vlib/builtin/js/*.js

vlib/encoding/xml/README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
## Description
2+
3+
`xml` is a module to parse XML documents into a tree structure. It also supports
4+
validation of XML documents against a DTD.
5+
6+
Note that this is not a streaming XML parser. It reads the entire document into
7+
memory and then parses it. This is not a problem for small documents, but it
8+
might be a problem for extremely large documents (several hundred megabytes or more).
9+
10+
## Usage
11+
12+
### Parsing XML Files
13+
14+
There are three different ways to parse an XML Document:
15+
16+
1. Pass the entire XML document as a string to `XMLDocument.from_string`.
17+
2. Specify a file path to `XMLDocument.from_file`.
18+
3. Use a source that implements `io.Reader` and pass it to `XMLDocument.from_reader`.
19+
20+
```v
21+
import encoding.xml
22+
23+
//...
24+
doc := xml.XMLDocument.from_file('test/sample.xml')!
25+
```
26+
27+
### Validating XML Documents
28+
29+
Simply call `validate` on the parsed XML document.
30+
31+
### Querying
32+
33+
Use the `get_element...` methods defined on the `XMLDocument` struct to look up elements.
34+
35+
### Escaping and Un-escaping XML Entities
36+
37+
When the `validate` method is called, the XML document is parsed and all text
38+
nodes are un-escaped. This means that the text nodes will contain the actual
39+
text and not the escaped version of the text.
40+
41+
When the XML document is serialized (using `str` or `pretty_str`), all text nodes are escaped.
42+
43+
The escaping and un-escaping can also be done manually using the `escape_text` and
44+
`unescape_text` functions.

vlib/encoding/xml/encoding.v

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
module xml
2+
3+
import strings
4+
5+
// pretty_str renders the node, and recursively all of its children, as indented XML text.
// `original_indent` is the string that represents a single indentation level, `depth` is
// the nesting level of this node, and `reverse_entities` maps raw strings to the entity
// names used when escaping text children.
pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_entities map[string]string) string {
	// The prefix of this node is the indentation unit repeated `depth` times.
	mut prefix_builder := strings.new_builder(original_indent.len * depth)
	for _ in 0 .. depth {
		prefix_builder.write_string(original_indent)
	}
	own_indent := prefix_builder.str()
	// Text, comment and CDATA children sit one level deeper than the node itself.
	child_indent := own_indent + original_indent

	mut out := strings.new_builder(1024)

	// Opening tag together with its attributes.
	out.write_string(own_indent + '<' + node.name)
	for attr_name, attr_value in node.attributes {
		out.write_string(' ' + attr_name + '="' + attr_value + '"')
	}
	out.write_string('>\n')

	// Every child is emitted on a line of its own.
	for child in node.children {
		rendered := match child {
			string {
				child_indent + escape_text(child, reverse_entities: reverse_entities)
			}
			XMLNode {
				// Nested nodes compute their own indentation from the increased depth.
				child.pretty_str(original_indent, depth + 1, reverse_entities)
			}
			XMLComment {
				child_indent + '<!--' + child.text + '-->'
			}
			XMLCData {
				child_indent + '<![CDATA[' + child.text + ']]>'
			}
		}
		out.write_string(rendered)
		out.write_u8(`\n`)
	}

	// Closing tag, aligned with the opening tag.
	out.write_string(own_indent + '</' + node.name + '>')
	return out.str()
}
63+
64+
// pretty_str renders the DTD items as a bracketed block with one declaration per line,
// each line prefixed with `indent`. An empty list yields an empty string.
fn (list []DTDListItem) pretty_str(indent string) string {
	if list.len == 0 {
		return ''
	}

	mut out := strings.new_builder(1024)
	out.write_string('[\n')
	for entry in list {
		declaration := match entry {
			DTDEntity { '<!ENTITY ${entry.name} "${entry.value}">' }
			DTDElement { '<!ELEMENT ${entry.name} ${entry.definition}>' }
		}
		out.write_string(indent)
		out.write_string(declaration)
		out.write_u8(`\n`)
	}
	out.write_u8(`]`)
	return out.str()
}
87+
88+
// pretty_str renders the DOCTYPE declaration. When `dtd` holds a string it is written as
// a SYSTEM reference; when it holds a DocumentTypeDefinition its items are written inline
// using `indent`. An empty string or an empty item list yields an empty string.
fn (doctype DocumentType) pretty_str(indent string) string {
	match doctype.dtd {
		string {
			system_id := doctype.dtd
			if system_id.len == 0 {
				return ''
			}
			return '<!DOCTYPE ${doctype.name} SYSTEM "${system_id}">'
		}
		DocumentTypeDefinition {
			if doctype.dtd.list.len == 0 {
				return ''
			}
			// Unlike the SYSTEM form, the inline form ends with a newline.
			return '<!DOCTYPE ' + doctype.name + ' ' + doctype.dtd.list.pretty_str(indent) + '>\n'
		}
	}
}
114+
115+
// pretty_str serializes the whole document: the XML prolog, the DOCTYPE declaration, the
// document-level comments and finally the root node. `indent` is the string used for one
// level of indentation.
pub fn (doc XMLDocument) pretty_str(indent string) string {
	mut out := strings.new_builder(1024)

	// Prolog and DOCTYPE, each followed by a newline.
	out.write_string('<?xml version="${doc.version}" encoding="${doc.encoding}"?>')
	out.write_u8(`\n`)
	out.write_string(doc.doctype.pretty_str(indent))
	out.write_u8(`\n`)

	// Document-level comments, one per line.
	for comment in doc.comments {
		out.write_string('<!--' + comment.text + '-->\n')
	}

	// The root node starts at depth 0.
	out.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities))
	return out.str()
}
143+
144+
// str returns a string representation of the XML document, pretty-printed with an
// indentation of two spaces per nesting level.
pub fn (doc XMLDocument) str() string {
	// Two spaces, as promised by the documented contract of this method.
	return doc.pretty_str('  ')
}

vlib/encoding/xml/entity.v

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
module xml
2+
3+
import strings
4+
5+
// default_entities maps each predefined entity name to the character it stands for.
// It is the default lookup table of UnescapeConfig.
pub const default_entities = {
6+
'lt': '<'
7+
'gt': '>'
8+
'amp': '&'
9+
'apos': "'"
10+
'quot': '"'
11+
}
12+
13+
// default_entities_reverse maps each character that needs escaping to the name of the
// entity that replaces it. It is the default lookup table of EscapeConfig.
pub const default_entities_reverse = {
14+
'<': 'lt'
15+
'>': 'gt'
16+
'&': 'amp'
17+
"'": 'apos'
18+
'"': 'quot'
19+
}
20+
21+
// EscapeConfig holds the optional settings accepted by escape_text.
[params]
22+
pub struct EscapeConfig {
23+
// reverse_entities maps each string to escape to its entity name (without `&` and `;`).
reverse_entities map[string]string = xml.default_entities_reverse
24+
}
25+
26+
// escape_text returns a copy of `content` in which every key of `config.reverse_entities`
// is replaced by `&name;`, where `name` is the value mapped to that key. When no
// configuration is passed, default_entities_reverse is used.
pub fn escape_text(content string, config EscapeConfig) string {
	// replace_each expects a flat list of alternating search and replacement strings.
	mut pairs := []string{cap: 2 * config.reverse_entities.len}
	for raw, entity_name in config.reverse_entities {
		pairs << [raw, '&${entity_name};']
	}
	return content.replace_each(pairs)
}
38+
39+
// UnescapeConfig holds the optional settings accepted by unescape_text.
[params]
40+
pub struct UnescapeConfig {
41+
// entities maps each entity name (without `&` and `;`) to its replacement text.
entities map[string]string = xml.default_entities
42+
}
43+
44+
// unescape_text returns a copy of `content` in which every `&name;` reference is replaced
// by the text that `config.entities` maps `name` to. When no configuration is passed,
// default_entities is used. An error is returned when a reference is not terminated by
// `;` before the end of the string, or when `name` is not a key of `config.entities`.
pub fn unescape_text(content string, config UnescapeConfig) !string {
	chars := content.runes()
	mut out := strings.new_builder(content.len)
	mut pos := 0
	for pos < chars.len {
		current := chars[pos]
		if current != `&` {
			// Ordinary character: copy it through unchanged.
			out.write_rune(current)
			pos++
			continue
		}

		// Locate the `;` that terminates the entity reference.
		mut end := pos + 1
		for end < chars.len && chars[end] != `;` {
			end++
		}
		if end == chars.len {
			return error('Unexpected end of string while parsing entity.')
		}

		// The entity name is everything between `&` and `;`.
		name := chars[pos + 1..end].string()
		replacement := config.entities[name] or { return error('Unknown entity: ' + name) }
		out.write_string(replacement)
		// Resume right after the terminating `;`.
		pos = end + 1
	}
	return out.str()
}

vlib/encoding/xml/entity_test.v

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
module main
2+
3+
import encoding.xml
4+
5+
// test_escape checks escape_text with the default entity table and with a custom table
// that additionally escapes the copyright sign.
fn test_escape() {
	// input => expected output when only the default entities are escaped
	default_cases := {
		'Normal string':                         'Normal string'
		'12 < 34':                               '12 &lt; 34'
		'12 > 34':                               '12 &gt; 34'
		'12 & 34':                               '12 &amp; 34'
		'He said, "Very well, let us proceed."': 'He said, &quot;Very well, let us proceed.&quot;'
		"He said, 'Very well, let us proceed.'": 'He said, &apos;Very well, let us proceed.&apos;'
		'Do not escape ©.':                      'Do not escape ©.'
	}
	for input, expected in default_cases {
		assert xml.escape_text(input) == expected
	}

	// A custom table extends the set of escaped characters.
	mut custom_table := xml.default_entities_reverse.clone()
	custom_table['©'] = 'copy'
	assert xml.escape_text('Do escape ©.', reverse_entities: custom_table) == 'Do escape &copy;.'
}
19+
20+
// test_unescape checks unescape_text with the default entity table, verifies that unknown
// entities are rejected, and checks that a custom table can add new entities.
fn test_unescape() ! {
	assert xml.unescape_text('Normal string')! == 'Normal string'
	assert xml.unescape_text('12 &lt; 34')! == '12 < 34'
	assert xml.unescape_text('12 &gt; 34')! == '12 > 34'
	assert xml.unescape_text('12 &amp; 34')! == '12 & 34'
	assert xml.unescape_text('He said, &quot;Very well, let us proceed.&quot;')! == 'He said, "Very well, let us proceed."'
	assert xml.unescape_text('He said, &apos;Very well, let us proceed.&apos;')! == "He said, 'Very well, let us proceed.'"

	// The negative cases must fail the test when no error is returned; a bare
	// `or { assert ... }` would pass silently if the call unexpectedly succeeded.
	if unexpected := xml.unescape_text('12 &invalid; 34') {
		assert false, 'expected an error for an unknown entity, got: ${unexpected}'
	} else {
		assert err.msg() == 'Unknown entity: invalid'
	}

	if unexpected := xml.unescape_text('Do not unescape &copy;') {
		assert false, 'expected an error for an unknown entity, got: ${unexpected}'
	} else {
		assert err.msg() == 'Unknown entity: copy'
	}

	// A custom table extends the set of recognized entities.
	mut entities := xml.default_entities.clone()
	entities['copy'] = '©'
	assert xml.unescape_text('Do unescape &copy;.', entities: entities)! == 'Do unescape ©.'
}

0 commit comments

Comments
 (0)