Skip to content

Commit

Permalink
encoding.html: implement unescape() (#19267)
Browse files Browse the repository at this point in the history
  • Loading branch information
ttytm committed Sep 5, 2023
1 parent c126450 commit 2733416
Show file tree
Hide file tree
Showing 3 changed files with 2,258 additions and 5 deletions.
85 changes: 80 additions & 5 deletions vlib/encoding/html/escape.v
@@ -1,13 +1,24 @@
module html

import encoding.hex
import strconv

// EscapeConfig holds the optional settings accepted by `escape`.
[params]
pub struct EscapeConfig {
// quote selects whether the quote replacement table (`"` and `'`) is applied
// in addition to the `&`, `<`, `>` table. It is on by default.
quote bool = true
}

// UnescapeConfig holds the optional settings accepted by `unescape`.
[params]
pub struct UnescapeConfig {
// Embedded so that `unescape` reads the same `quote` option as `escape`.
EscapeConfig
// all makes `unescape` delegate to `unescape_all` instead of using the fixed
// replacement tables. It is off unless set by the caller.
all bool
}

// Replacement tables in the flat, alternating `[from, to, from, to, ...]` layout
// that is passed to `string.replace_each`.
const (
html_replacement_table = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
html_quote_replacement_table = ['"', '&#34;', "'", '&#39;'] // `'&#34;'` is shorter than `'&quot;'`
// Character -> entity pairs.
escape_seq = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
escape_quote_seq = ['"', '&#34;', "'", '&#39;']
// The same pairs reversed: entity -> character.
unescape_seq = ['&amp;', '&', '&lt;', '<', '&gt;', '>']
unescape_quote_seq = ['&#34;', '"', '&#39;', "'"]
)

// escape converts special characters in the input, specifically "<", ">", and "&"
Expand All @@ -16,10 +27,74 @@ const (
// **Note:** escape() supports funky accents by doing nothing about them. V's UTF-8
// support through `string` is robust enough to deal with these cases.
pub fn escape(input string, config EscapeConfig) string {
tag_free_input := input.replace_each(html.html_replacement_table)
return if config.quote {
tag_free_input.replace_each(html.html_quote_replacement_table)
input.replace_each(html.escape_seq).replace_each(html.escape_quote_seq)
} else {
tag_free_input
input.replace_each(html.escape_seq)
}
}

// unescape converts entities like "&lt;" to "<". By default it is the converse of `escape`.
// If `all` is set to true, it handles named, numeric, and hex values - for example,
// `'&apos;'`, `'&#39;'`, and `'&#x27;'` then unescape to "'".
// If `quote` is set to false (and `all` is not set), `'&#34;'` and `'&#39;'` are left untouched.
pub fn unescape(input string, config UnescapeConfig) string {
	if config.all {
		return unescape_all(input)
	}
	if !config.quote {
		return input.replace_each(html.unescape_seq)
	}
	// Both tables must be applied in ONE pass. Chaining two `replace_each` calls
	// would unescape twice: `&amp;#39;` (the escaped form of the literal text
	// `&#39;`) would first become `&#39;` and then `'`, so that
	// `unescape(escape(s))` would no longer return `s`.
	mut table := html.unescape_seq.clone()
	table << html.unescape_quote_seq
	return input.replace_each(table)
}

// unescape_all replaces every `;`-terminated character reference in `input` with
// the character it denotes: named references found in `named_references`
// (e.g. `&lt;`), decimal references (e.g. `&#39;`) and hexadecimal references
// (e.g. `&#x27;`). Anything that cannot be decoded - an `&` without a closing `;`,
// an unknown name, or a malformed / out-of-range number - is copied to the
// output unchanged.
fn unescape_all(input string) string {
	mut result := []rune{}
	runes := input.runes()
	mut i := 0
	for i < runes.len {
		if runes[i] != `&` {
			result << runes[i]
			i++
			continue
		}
		// Look for the closing `;`. The scan also stops at the next `&`, so that
		// a stray ampersand cannot swallow a valid reference that follows it
		// (e.g. `a & b &lt; c`).
		mut j := i + 1
		for j < runes.len && runes[j] != `;` && runes[j] != `&` {
			j++
		}
		if j >= runes.len || runes[j] != `;` {
			// Not a terminated reference: keep the `&` itself and continue with
			// the next rune. This also avoids slicing past the end of `runes`.
			result << runes[i]
			i++
			continue
		}
		// Here runes[j] == `;` and j > i, so runes[i + 1] is always in bounds.
		mut decoded := false
		if runes[i + 1] == `#` {
			// Numeric reference (e.g., &#39; or &#x27;)
			if cp := decode_numeric_reference(runes[i + 2..j].string()) {
				result << cp
				decoded = true
			}
		} else {
			// Named reference (e.g., &lt;)
			entity := runes[i + 1..j].string()
			if v := named_references[entity] {
				result << v
				decoded = true
			}
		}
		if !decoded {
			// Leave invalid or unknown sequences unchanged
			result << runes[i..j + 1]
		}
		i = j + 1
	}
	return result.string()
}

// decode_numeric_reference parses the text found between `&#` and `;`.
// `code` is either decimal digits (`39`) or `x`/`X` followed by hexadecimal
// digits (`x27`). It returns `none` when there are no digits, when a character
// is not a digit of the selected base, or when the value is not a valid Unicode
// scalar value (above 0x10FFFF or inside the surrogate range).
fn decode_numeric_reference(code string) ?rune {
	is_hex := code.len > 0 && (code[0] == `x` || code[0] == `X`)
	digits := if is_hex { code[1..] } else { code }
	if digits.len == 0 {
		return none
	}
	// Validate explicitly so that signs, underscores or base prefixes accepted
	// by general-purpose number parsers are rejected here.
	for c in digits {
		valid := if is_hex { c.is_hex_digit() } else { c.is_digit() }
		if !valid {
			return none
		}
	}
	base := if is_hex { 16 } else { 10 }
	// The digits form a code point number, not a sequence of raw bytes.
	cp := strconv.parse_uint(digits, base, 32) or { return none }
	if cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF) {
		return none
	}
	return rune(u32(cp))
}
48 changes: 48 additions & 0 deletions vlib/encoding/html/escape_test.v
Expand Up @@ -20,3 +20,51 @@ fn test_escape_html() {
assert html.escape('café') == 'café'
assert html.escape('<p>façade</p>') == '&lt;p&gt;façade&lt;/p&gt;'
}

// test_unescape_html checks `unescape` with its default configuration against a
// table of `[input, expected]` pairs.
fn test_unescape_html() {
	cases := [
		// Only the decimal quote form is handled by default; hex and named forms stay as they are
		['&#39;&#x27;&apos;', "'&#x27;&apos;"],
		// Inputs mirroring the escape tests
		['&lt;&gt;&amp;', '<>&'],
		['No change', 'No change'],
		['&lt;b&gt;Bold text&lt;/b&gt;', '<b>Bold text</b>'],
		['&lt;img /&gt;', '<img />'],
		['&#39; onmouseover=&#39;alert(1)&#39;', "' onmouseover='alert(1)'"],
		['&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;', "<a href='http://www.example.com'>link</a>"],
		['&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;', "<script>alert('hello');</script>"],
		// Taken from the Apache Commons Lang test suite:
		// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
		['plain text', 'plain text'],
		['', ''],
		['bread &amp; butter', 'bread & butter'],
		['&#34;bread&#34; &amp; butter', '"bread" & butter'],
		['greater than &gt;', 'greater than >'],
		['&lt; less than', '< less than'],
		// Accented characters pass through untouched
		['café', 'café'],
		['&lt;p&gt;façade&lt;/p&gt;', '<p>façade</p>'],
	]
	for pair in cases {
		assert html.unescape(pair[0]) == pair[1]
	}
}

// test_unescape_all_html checks `unescape` with `all: true` against a table of
// `[input, expected]` pairs.
fn test_unescape_all_html() {
	cases := [
		// Decimal, hex and named forms of the same character
		['&#39;&#x27;&apos;', "'''"],
		// Inputs mirroring the escape tests
		['&lt;&gt;&amp;', '<>&'],
		['No change', 'No change'],
		['&lt;b&gt;Bold text&lt;/b&gt;', '<b>Bold text</b>'],
		['&lt;img /&gt;', '<img />'],
		['&#39; onmouseover=&#39;alert(1)&#39;', "' onmouseover='alert(1)'"],
		['&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;', "<a href='http://www.example.com'>link</a>"],
		['&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;', "<script>alert('hello');</script>"],
		// Taken from the Apache Commons Lang test suite:
		// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
		['plain text', 'plain text'],
		['', ''],
		['bread &amp; butter', 'bread & butter'],
		['&#34;bread&#34; &amp; butter', '"bread" & butter'],
		['greater than &gt;', 'greater than >'],
		['&lt; less than', '< less than'],
		// Accented characters pass through untouched
		['café', 'café'],
		['&lt;p&gt;façade&lt;/p&gt;', '<p>façade</p>'],
	]
	for pair in cases {
		assert html.unescape(pair[0], all: true) == pair[1]
	}
}

0 comments on commit 2733416

Please sign in to comment.