Skip to content

Commit

Permalink
scanner: implement support for UTF-32 escape codes in string literals (
Browse files Browse the repository at this point in the history
  • Loading branch information
igrekus committed Nov 17, 2023
1 parent 373da77 commit 76530de
Show file tree
Hide file tree
Showing 19 changed files with 169 additions and 44 deletions.
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u16_err_a.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u16_err_a.vv:2:15: error: `\u` incomplete 16 bit unicode character value
1 | fn main() {
2 | println('\u')
| ^
3 | }
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u16_err_b.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u16_err_b.vv:2:15: error: `\u` incomplete 16 bit unicode character value
1 | fn main() {
2 | println('\u345')
| ^
3 | }
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_a.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u32_err_a.vv:2:15: error: `\U` incomplete 32 bit unicode character value
1 | fn main() {
2 | println('\U')
| ^
3 | }
3 changes: 3 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_a.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
fn main() {
println('\U')
}
5 changes: 5 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_b.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
vlib/v/checker/tests/string_escape_u32_err_b.vv:2:15: error: `\U` incomplete 32 bit unicode character value
1 | fn main() {
2 | println('\U345')
| ^
3 | }
3 changes: 3 additions & 0 deletions vlib/v/checker/tests/string_escape_u32_err_b.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
fn main() {
println('\U345')
}
5 changes: 0 additions & 5 deletions vlib/v/checker/tests/string_escape_u_err_a.out

This file was deleted.

5 changes: 0 additions & 5 deletions vlib/v/checker/tests/string_escape_u_err_b.out

This file was deleted.

7 changes: 5 additions & 2 deletions vlib/v/gen/native/tests/string.vv
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ fn test_escape_codes() {
println(star1)
star2 := '\u2605'
println(star2)
star3 := '\U00002605'
println(star3)

aaa := '\x61\141a'
println(aaa)
Expand All @@ -33,13 +35,14 @@ fn test_runes() {

// should all print `★`
print(`\u2605`)
print(`\U00002605`)
print(`\xe2\x98\x85`)
println(`\xe2\x98\x85`)
println(`\xe2\x98\x85`)
}

fn main() {
test_unicode_characters()
test_escape_codes()
test_raw_string()
test_runes()
}
}
3 changes: 2 additions & 1 deletion vlib/v/gen/native/tests/string.vv.out
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
😀😆😎💻🌎
aaa
## #
### #
hello\tworld\n
V
😀
🚀
★★★
★★★
98 changes: 75 additions & 23 deletions vlib/v/scanner/scanner.v
Original file line number Diff line number Diff line change
Expand Up @@ -1216,7 +1216,8 @@ fn (mut s Scanner) ident_string() string {
s.inc_line_number()
}
s.is_inside_string = false
mut u_escapes_pos := []int{} // pos list of \uXXXX
mut u16_escapes_pos := []int{} // pos list of \uXXXX
mut u32_escapes_pos := []int{} // pos list of \UXXXXXXXX
mut h_escapes_pos := []int{} // pos list of \xXX
mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 }
for {
Expand Down Expand Up @@ -1247,7 +1248,7 @@ fn (mut s Scanner) ident_string() string {
if c == scanner.b_lf {
s.inc_line_number()
}
// Escape `\x` `\u`
// Escape `\x` `\u` `\U`
if backslash_count % 2 == 1 && !is_raw && !is_cstr {
// Escape `\x`
if c == `x` {
Expand All @@ -1263,9 +1264,23 @@ fn (mut s Scanner) ident_string() string {
|| s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote
|| !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit()
|| !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit() {
s.error(r'`\u` incomplete unicode character value')
s.error(r'`\u` incomplete 16 bit unicode character value')
}
u_escapes_pos << s.pos - 1
u16_escapes_pos << s.pos - 1
}
// Escape `\U`
if c == `U` {
if s.text[s.pos + 1] == s.quote || s.text[s.pos + 2] == s.quote
|| s.text[s.pos + 3] == s.quote || s.text[s.pos + 4] == s.quote
|| s.text[s.pos + 5] == s.quote || s.text[s.pos + 6] == s.quote
|| s.text[s.pos + 7] == s.quote || s.text[s.pos + 8] == s.quote
|| !s.text[s.pos + 1].is_hex_digit() || !s.text[s.pos + 2].is_hex_digit()
|| !s.text[s.pos + 3].is_hex_digit() || !s.text[s.pos + 4].is_hex_digit()
|| !s.text[s.pos + 5].is_hex_digit() || !s.text[s.pos + 6].is_hex_digit()
|| !s.text[s.pos + 7].is_hex_digit() || !s.text[s.pos + 8].is_hex_digit() {
s.error(r'`\U` incomplete 32 bit unicode character value')
}
u32_escapes_pos << s.pos - 1
}
// Unknown escape sequence
if !is_escape_sequence(c) && !c.is_digit() {
Expand Down Expand Up @@ -1307,19 +1322,26 @@ fn (mut s Scanner) ident_string() string {
if !s.is_fmt {
mut segment_idx := 0
mut str_segments := []string{}
if u_escapes_pos.len + h_escapes_pos.len > 0 {
if u16_escapes_pos.len + h_escapes_pos.len + u32_escapes_pos.len > 0 {
mut all_pos := []int{}
all_pos << u_escapes_pos
all_pos << u16_escapes_pos
all_pos << u32_escapes_pos
all_pos << h_escapes_pos
if u_escapes_pos.len != 0 && h_escapes_pos.len != 0 {
all_pos.sort()
}
all_pos.sort()

for pos in all_pos {
str_segments << string_so_far[segment_idx..(pos - start)]
segment_idx = pos - start

if pos in u_escapes_pos {
end_idx, segment := s.decode_u_escape_single(string_so_far, segment_idx)
if pos in u16_escapes_pos {
end_idx, segment := s.decode_u16_escape_single(string_so_far,
segment_idx)
str_segments << segment
segment_idx = end_idx
}
if pos in u32_escapes_pos {
end_idx, segment := s.decode_u32_escape_single(string_so_far,
segment_idx)
str_segments << segment
segment_idx = end_idx
}
Expand Down Expand Up @@ -1407,7 +1429,7 @@ fn (mut s Scanner) decode_o_escapes(sinput string, start int, escapes_pos []int)
return ss.join('')
}

fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) {
fn (mut s Scanner) decode_u16_escape_single(str string, idx int) (int, string) {
end_idx := idx + 6 // "\uXXXX".len == 6
escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
// Check if Escaped Code Point is invalid or not
Expand All @@ -1418,9 +1440,32 @@ fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) {
return end_idx, utf32_to_str(u32(escaped_code_point))
}

// decode a single unicode escaped rune into its utf-8 bytes
fn (mut s Scanner) decode_uerune(str string) string {
end_idx, segment := s.decode_u_escape_single(str, 0)
// decode a single 16 bit unicode escaped rune into its utf-8 bytes
// Decoding starts at index 0, so `str` is expected to begin with the `\uXXXX` escape;
// any text that follows the escape is appended unchanged after the decoded segment.
fn (mut s Scanner) decode_u16erune(str string) string {
end_idx, segment := s.decode_u16_escape_single(str, 0)
// the escape spans the whole input: nothing left to append
if str.len == end_idx {
return segment
}
// keep whatever follows the escape as-is
mut ss := []string{cap: 2}
ss << segment
ss << str[end_idx..]
return ss.join('')
}

// decode_u32_escape_single decodes the `\UXXXXXXXX` escape that starts at `idx` in `str`.
// It returns the index just past the escape, together with the string produced by
// `utf32_to_str` for the parsed code point. An invalid code point is reported via `s.error`.
fn (mut s Scanner) decode_u32_escape_single(str string, idx int) (int, string) {
end_idx := idx + 10 // "\UXXXXXXXX".len == 10
// skip the 2 byte `\U` prefix and parse the 8 hex digits; a parse failure falls back to 0
escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
// Check if Escaped Code Point is invalid or not
if rune(escaped_code_point).length_in_bytes() == -1 {
// NOTE: the message interpolates the whole `str`, not only the escape at `idx`
s.error('invalid unicode point `${str}`')
}

return end_idx, utf32_to_str(u32(escaped_code_point))
}

// decode a single 32 bit unicode escaped rune into its utf-8 bytes
fn (mut s Scanner) decode_u32erune(str string) string {
end_idx, segment := s.decode_u32_escape_single(str, 0)
if str.len == end_idx {
return segment
}
Expand Down Expand Up @@ -1448,7 +1493,7 @@ fn trim_slash_line_break(s string) string {
@[inline]
fn is_escape_sequence(c u8) bool {
return c in [`x`, `u`, `e`, `n`, `r`, `t`, `v`, `a`, `f`, `b`, `\\`, `\``, `$`, `@`, `?`, `{`,
`}`, `'`, `"`]
`}`, `'`, `"`, `U`]
}

/// ident_char is called when a backtick "single-char" is parsed from the code
Expand All @@ -1460,6 +1505,7 @@ fn is_escape_sequence(c u8) bool {
/// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
/// escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a'
/// escaped unicode literals like `\u2605`
/// escaped unicode 32 literals like `\U00002605`
/// escaped utf8 runes in hex like `\xe2\x98\x85` => (★)
/// escaped utf8 runes in octal like `\342\230\205` => (★)
fn (mut s Scanner) ident_char() string {
Expand All @@ -1475,8 +1521,10 @@ fn (mut s Scanner) ident_char() string {

// set flags for advanced escapes first
escaped_hex := s.expect('\\x', start + 1)
escaped_unicode := s.expect('\\u', start + 1)
escaped_octal := !escaped_hex && !escaped_unicode && s.expect('\\', start + 1)
escaped_unicode_16 := s.expect('\\u', start + 1)
escaped_unicode_32 := s.expect('\\U', start + 1)
escaped_octal := !escaped_hex && !escaped_unicode_16 && !escaped_unicode_32
&& s.expect('\\', start + 1)

// walk the string to get characters up to the next backtick
for {
Expand Down Expand Up @@ -1505,13 +1553,17 @@ fn (mut s Scanner) ident_char() string {
// the string inside the backticks is longer than one character
// but we might only have one rune... attempt to decode escapes
// if the content expresses an escape code, it will have an even number of characters
// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605
// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605
// we don't handle binary escape codes in rune literals
orig := c
if c.len % 2 == 0 && (escaped_hex || escaped_unicode || escaped_octal) {
if escaped_unicode {
if c.len % 2 == 0
&& (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) {
if escaped_unicode_16 {
// there can only be one, so attempt to decode it now
c = s.decode_u16erune(c)
} else if escaped_unicode_32 {
// there can only be one, so attempt to decode it now
c = s.decode_uerune(c)
c = s.decode_u32erune(c)
} else {
// find escape sequence start positions
mut escapes_pos := []int{}
Expand All @@ -1530,7 +1582,7 @@ fn (mut s Scanner) ident_char() string {

u := c.runes()
if u.len != 1 {
if escaped_hex || escaped_unicode {
if escaped_hex || escaped_unicode_16 || escaped_unicode_32 {
s.error_with_pos('invalid character literal `${orig}` => `${c}` (${u}) (escape sequence did not refer to a singular rune)',
lspos)
} else if u.len == 0 {
Expand Down
55 changes: 51 additions & 4 deletions vlib/v/scanner/scanner_test.v
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ fn test_escape_rune() {
// will not work until v compiler on github is updated
// assert `\x61` == `a`
// assert `\u0061` == `a`
// assert `\U00000061` == `a`

// will not work until PR is accepted
// assert `\141` == `a`
Expand All @@ -180,11 +181,16 @@ fn test_escape_rune() {
assert result[0].kind == .chartoken
assert result[0].lit == r'\\'

// SINGLE CHAR UNICODE ESCAPE
// SINGLE CHAR 16-bit UNICODE ESCAPE
result = scan_tokens(r'`\u2605`')
assert result[0].kind == .chartoken
assert result[0].lit == r'★'

// SINGLE CHAR 32-bit UNICODE ESCAPE
result = scan_tokens(r'`\U00002605`')
assert result[0].kind == .chartoken
assert result[0].lit == r'★'

// SINGLE CHAR ESCAPED ASCII
result = scan_tokens(r'`\x61`')
assert result[0].kind == .chartoken
Expand All @@ -207,6 +213,7 @@ fn test_escape_string() {
assert '\x61' == 'a'
assert '\x62' == 'b'
assert '\u0061' == 'a'
assert '\U00000061' == 'a'
assert '\141' == 'a'
assert '\xe2\x98\x85' == '★'
assert '\342\230\205' == '★'
Expand All @@ -230,14 +237,22 @@ fn test_escape_string() {
assert result[0].kind == .string
assert result[0].lit == r'\\'

// STRING UNICODE ESCAPE
// STRING 16-bit UNICODE ESCAPE
result = scan_tokens(r"'\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'★'
result = scan_tokens(r"'H\u2605H'")
assert result[0].kind == .string
assert result[0].lit == r'H★H'

// STRING 32-bit UNICODE ESCAPE
result = scan_tokens(r"'\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'★'
result = scan_tokens(r"'H\U00002605H'")
assert result[0].kind == .string
assert result[0].lit == r'H★H'

// STRING ESCAPED ASCII
result = scan_tokens(r"'\x61'")
assert result[0].kind == .string
Expand All @@ -249,22 +264,54 @@ fn test_escape_string() {
assert result[0].kind == .string
assert result[0].lit.bytes() == [u8(0xe2), `9`, `8`, `8`, `5`]

// MIX STRING ESCAPES
// MIX STRING ESCAPES with UTF-16 escapes
result = scan_tokens(r"'\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'a★'
result = scan_tokens(r"'\u2605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'★a'

// MIX STRING ESCAPES with offset
// MIX STRING ESCAPES with UTF-16 escapes with offset
result = scan_tokens(r"'x \x61\u2605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a'
result = scan_tokens(r"'x \u2605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'x ★a★'

// MIX STRING ESCAPES with UTF-32 escapes
result = scan_tokens(r"'\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'a★'
result = scan_tokens(r"'\U00002605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'★a'

// MIX STRING ESCAPES with UTF-32 escapes with offset
result = scan_tokens(r"'x \x61\U00002605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a'
result = scan_tokens(r"'x \U00002605\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'x ★a★'

// MIX STRING ESCAPES with UTF-16 and UTF-32 escapes
result = scan_tokens(r"'\u2605\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'★a★'
result = scan_tokens(r"'\U00002605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'★a★'

// MIX STRING ESCAPES with UTF-16 and UTF-32 escapes with offset
result = scan_tokens(r"'x \x61\U00002605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a★'
result = scan_tokens(r"'x \x61\u2605\x61\U00002605'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a★'

// SHOULD RESULT IN ERRORS
// result = scan_tokens(r'`\x61\x61`') // should always result in an error
// result = scan_tokens(r"'\x'") // should always result in an error
Expand Down
4 changes: 4 additions & 0 deletions vlib/v/scanner/tests/invalid_unicode_16_err.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
vlib/v/scanner/tests/invalid_unicode_16_err.vv:1:13: error: invalid unicode point `\uD8FF`
1 | a := '\uD8FF'
| ^
2 | println(a)
File renamed without changes.
4 changes: 4 additions & 0 deletions vlib/v/scanner/tests/invalid_unicode_32_err.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
vlib/v/scanner/tests/invalid_unicode_32_err.vv:1:17: error: invalid unicode point `\U0000D8FF`
1 | a := '\U0000D8FF'
| ^
2 | println(a)
2 changes: 2 additions & 0 deletions vlib/v/scanner/tests/invalid_unicode_32_err.vv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a := '\U0000D8FF'
println(a)

0 comments on commit 76530de

Please sign in to comment.